# Capstone project - Analytics in agriculture

### This file contains the ETL process that our project follows to go from the raw data located in 'data/' to the curated data stored in our RDBMS. For this first version, the RDBMS will be PostgreSQL.

In [19]:
import psycopg2
from psycopg2 import extras
import pandas as pd
import time
import configparser
import json

# 1. Extraction

### We are not starting from the very first stage. The extraction phase begins with downloading the data from the source database, but since that first step only needs to be done yearly due to the refresh schedule this data follows, we performed a manual step before the one described below (manual step: download files > uncompress files).

### After this short explanation, we proceed with the extraction of the data. The data that our source provides are CSV files. Since the data is completely untouched, we will need to select the files/tables that are useful for our project and rearrange the structure of the columns because, as we will see during the ETL, the given structure is optimized for storage but not for a more advanced data model.

In [2]:
# Encoding used for every source CSV. The original value "ANSI" is only an
# alias of the Windows-only "mbcs" codec, so reading failed with LookupError on
# any other platform. "latin-1" is portable and decodes every byte value.
# NOTE(review): assumes the files use a Western European single-byte
# encoding -- confirm against the data provider's documentation.
CSV_ENCODING = "latin-1"

# Raw wide-format tables (one pair of columns per reported year).
crops_data = pd.read_csv("data/Production_Crops_E_All_Data.csv", encoding=CSV_ENCODING)
trade_data = pd.read_csv("data/Trade_Crops_Livestock_E_All_Data.csv", encoding=CSV_ENCODING)

# Flag lookup tables shipped alongside each data file.
crops_flags = pd.read_csv("data/Production_Crops_E_Flags.csv", encoding=CSV_ENCODING)
trade_flags = pd.read_csv("data/Trade_Crops_Livestock_E_Flags.csv", encoding=CSV_ENCODING)

# Connection settings for the target cluster; the keys read later in the
# notebook are endpoint, database, username, password and port.
with open("credentials/redshift.json", 'r') as j:
    redshift = json.load(j)

# Deletes temporary variable j
del j

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# 2. Transformation

## Creation of dimension tables

In [3]:
# Build each dimension as the distinct (code, label) pairs found in the source
# tables. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): duplicates are dropped on the full row, so a code that appears
# with two different labels would survive twice -- verify before relying on the
# code column as a primary key.
dim_countries = pd.concat(
    [crops_data[["Area Code", "Area"]], trade_data[["Area Code", "Area"]]]
).drop_duplicates()

# Items come from the crops file only; trade rows are later filtered against it.
dim_items = crops_data[["Item Code", "Item"]].drop_duplicates()

dim_elements = pd.concat(
    [crops_data[["Element Code", "Element"]], trade_data[["Element Code", "Element"]]]
).drop_duplicates()

dim_flags = pd.concat([crops_flags, trade_flags]).drop_duplicates()

#Delete original flags dataframes
del crops_flags, trade_flags

## Clean dataframes

### Trade data mixes crops and other products. To improve the performance of the next steps, we first need to remove the rows that are not crops.

### Dimensions contain lots of duplicated data, therefore they will be trimmed as well.

In [4]:
# Keep only the trade rows whose item is one of the crop items collected in
# dim_items; every other product is discarded.
is_crop_item = trade_data["Item Code"].isin(dim_items["Item Code"])
trade_data = trade_data.loc[is_crop_item]

# Delete temporary mask
del is_crop_item

## Modify Flags dataframe

### The flags table uses the string "&lt;blank&gt;" as the primary key associated with "Official data", but in the fact table the corresponding value is blank. So we need to change the string "&lt;blank&gt;" to an empty string.

In [5]:
# Swap the literal "<blank>" placeholder for an empty string in the flags table.
dim_flags = dim_flags.replace(to_replace="<blank>", value="")

## rearrange the dataframe structures and creation of the fact table

### The source structure makes the data grow horizontally, but our SQL schema cannot keep growing in that direction. To rearrange the tables, we have divided the data into 2 groups: keys and values.
* keys: data that is repeated in every iteration and serves as an identifier for the values
* values: data reported yearly, which makes the dataframe grow by 2 more columns each year

In [6]:
def _wide_to_long(wide, source_name):
    """Reshape a wide yearly table into the long fact-table layout.

    The identifying columns (Area Code, Item Code, Element Code, Unit) are
    repeated next to every (value, flag) column pair, producing one row per
    original row and year.

    Parameters:
        wide: source dataframe with the key columns, the Area/Item/Element
            label columns, and then consecutive (value, flag) column pairs.
        source_name: label used only in the progress messages.

    Returns:
        A dataframe with columns Area Code, Item Code, Element Code, Unit,
        Year, Value, Flag, in that order (the load step inserts positionally).

    Raises:
        Exception: if the number of yearly columns is odd, i.e. they cannot be
            split into pairs.
        ValueError: if a value column name is not "Y" followed by an integer.
    """
    key_columns = ["Area Code", "Item Code", "Element Code", "Unit"]
    label_columns = ["Area", "Item", "Element"]
    fact_columns = key_columns + ["Year", "Value", "Flag"]

    keys = wide[key_columns]
    yearly = wide.drop(columns=key_columns + label_columns)

    if len(yearly.columns) % 2 == 1:
        print(yearly.columns)
        raise Exception("Unexpected column found, columns number must be even as they consist of pairs. Please check out the dataframe structure")

    frames = []
    # zip over the same iterator twice walks the columns two at a time.
    for value_col, flag_col in zip(*[iter(yearly.columns)] * 2):
        # Join column-wise: keys and values share the index of `wide`.
        # (The previous row-wise append left keys and values on separate rows.)
        frame = pd.concat([keys, yearly[[value_col, flag_col]]], axis=1)
        frame = frame.rename(columns={value_col: "Value", flag_col: "Flag"})
        # Value columns are named like "Y1961"; the year was previously never
        # written to the fact table.
        frame = frame.assign(Year=int(value_col[1:]))
        print(f"evaluated from {source_name}: ", value_col)
        frames.append(frame[fact_columns])

    if not frames:
        return pd.DataFrame(columns=fact_columns)

    # One concat at the end instead of growing the result inside the loop.
    return pd.concat(frames, ignore_index=True)


start = time.time()

fact_crops_data = _wide_to_long(crops_data, "crops_data")
fact_trade_data = _wide_to_long(trade_data, "trade_data")

end = time.time()

# Delete original dataframes
del crops_data, trade_data

print("elapsed time: ", end - start)

# Delete chrono temporary variables
del start, end

evaluated from crops_data:  Y1961
evaluated from crops_data:  Y1962
evaluated from crops_data:  Y1963
evaluated from crops_data:  Y1964
evaluated from crops_data:  Y1965
evaluated from crops_data:  Y1966
evaluated from crops_data:  Y1967
evaluated from crops_data:  Y1968
evaluated from crops_data:  Y1969
evaluated from crops_data:  Y1970
evaluated from crops_data:  Y1971
evaluated from crops_data:  Y1972
evaluated from crops_data:  Y1973
evaluated from crops_data:  Y1974
evaluated from crops_data:  Y1975
evaluated from crops_data:  Y1976
evaluated from crops_data:  Y1977
evaluated from crops_data:  Y1978
evaluated from crops_data:  Y1979
evaluated from crops_data:  Y1980
evaluated from crops_data:  Y1981
evaluated from crops_data:  Y1982
evaluated from crops_data:  Y1983
evaluated from crops_data:  Y1984
evaluated from crops_data:  Y1985
evaluated from crops_data:  Y1986
evaluated from crops_data:  Y1987
evaluated from crops_data:  Y1988
evaluated from crops_data:  Y1989
evaluated from

# 3. Load

## Load the tables into our redshift cluster

In [7]:
# Open the database connection from the loaded credentials. Keyword arguments
# are used instead of an interpolated DSN string so that values containing
# spaces, quotes or backslashes (typically the password) are passed intact.
conn = psycopg2.connect(
    host=redshift["endpoint"],
    dbname=redshift["database"],
    user=redshift["username"],
    password=redshift["password"],
    port=redshift["port"],
)

In [58]:
# Enable autocommit on the connection itself. psycopg2 manages transactions on
# the client side, so sending "SET AUTOCOMMIT = TRUE" as SQL did not change how
# statements were committed. Set it before any statement runs on the connection.
conn.autocommit = True

# Cursor shared by every following cell.
cur = conn.cursor()

In [118]:
# List every table registered in the 'public' schema.
cur.execute("SELECT * FROM information_schema.tables WHERE table_schema = 'public'")

# Pre-bind the loop variable so the cleanup below also works when the query
# returns no rows (e.g. right after the tables have been dropped).
row = None
for row in cur.fetchall():
    print(row)

# Delete temporary value
del row

('dev', 'public', 'items', 'BASE TABLE', None, None, None, None, None)
('dev', 'public', 'flags', 'BASE TABLE', None, None, None, None, None)
('dev', 'public', 'elements', 'BASE TABLE', None, None, None, None, None)
('dev', 'public', 'countries', 'BASE TABLE', None, None, None, None, None)
('dev', 'public', 'fact_trade', 'BASE TABLE', None, None, None, None, None)
('dev', 'public', 'fact_crops', 'BASE TABLE', None, None, None, None, None)


In [153]:
# ONLY EXECUTE IF DATA IS WRONG
# Removes all dimension and fact tables so they can be recreated and reloaded.
cur.execute(
    "DROP TABLE IF EXISTS countries, elements, flags, items, fact_crops, fact_trade;"
)

In [154]:
# DDL for the two fact tables and the four dimension tables, executed in the
# order listed. Every statement is idempotent thanks to IF NOT EXISTS.
ddl_statements = (
    # Crops fact table
    """
    CREATE TABLE IF NOT EXISTS fact_crops(
        Area_code int not null,
        Item_Code int not null,
        Element_Code int not null,
        Unit varchar(50),
        Year int not null,
        Value float,
        Flag varchar(25)
    );
""",
    # Trade fact table
    """
    CREATE TABLE IF NOT EXISTS fact_trade(
        Area_code int not null,
        Item_Code int not null,
        Element_Code int not null,
        Unit varchar(50),
        Year int not null,
        Value float,
        Flag varchar(25)
    );
""",
    # Countries dimension table
    """
    CREATE TABLE IF NOT EXISTS countries(
        Area_Code int not null UNIQUE PRIMARY KEY,
        Area varchar(100)
    );
""",
    # Elements dimension table
    """
    CREATE TABLE IF NOT EXISTS elements(
        Element_Code int not null UNIQUE PRIMARY KEY,
        Element varchar(100)
    );
""",
    # Flags dimension table
    """
    CREATE TABLE IF NOT EXISTS flags(
        Flag varchar(25) UNIQUE PRIMARY KEY,
        Description varchar(100)
    );
""",
    # Items dimension table
    """
    CREATE TABLE IF NOT EXISTS items(
        Item_Code int not null UNIQUE PRIMARY KEY,
        Item varchar(100)
    );
""",
)

for ddl in ddl_statements:
    cur.execute(ddl)

# Delete temporary variables
del ddl_statements, ddl

In [155]:
start = time.time()

# (target table, dataframe) pairs, dimensions first and facts last. Rows are
# inserted positionally, so each dataframe's column order must match its table.
load_plan = (
    ("countries", dim_countries),
    ("flags", dim_flags),
    ("elements", dim_elements),
    ("items", dim_items),
    ("fact_crops", fact_crops_data),
    ("fact_trade", fact_trade_data),
)

try:
    for table_name, frame in load_plan:
        psycopg2.extras.execute_values(
            cur,
            f"INSERT INTO {table_name} VALUES %s",
            frame.itertuples(index=False),
        )
except Exception:
    # Undo the partial load, then surface the error: the previous bare
    # `except:` hid every failure and the cell looked successful.
    conn.rollback()
    raise
else:
    # Persist the load explicitly; harmless if the connection already
    # commits each statement on its own.
    conn.commit()
finally:
    print("elapsed time: ", time.time() - start)

    # Delete chrono temporary variable
    del start

# Delete temporary variables
del load_plan, table_name, frame

In [156]:
# Print every row of the countries dimension as a quick check of the load.
cur.execute("SELECT * FROM countries;")

# Pre-bind the loop variable so the cleanup below also works when the table
# is empty.
row = None
for row in cur.fetchall():
    print(row)

# Delete temporary value
del row

(2, 'Afghanistan')
(3, 'Albania')
(4, 'Algeria')
(7, 'Angola')
(8, 'Antigua and Barbuda')
(9, 'Argentina')
(1, 'Armenia')
(10, 'Australia')
(11, 'Austria')
(52, 'Azerbaijan')
(12, 'Bahamas')
(13, 'Bahrain')
(16, 'Bangladesh')
(14, 'Barbados')
(57, 'Belarus')
(255, 'Belgium')
(15, 'Belgium-Luxembourg')
(23, 'Belize')
(53, 'Benin')
(18, 'Bhutan')
(19, 'Bolivia (Plurinational State of)')
(80, 'Bosnia and Herzegovina')
(20, 'Botswana')
(21, 'Brazil')
(26, 'Brunei Darussalam')
(27, 'Bulgaria')
(233, 'Burkina Faso')
(29, 'Burundi')
(35, 'Cabo Verde')
(115, 'Cambodia')
(32, 'Cameroon')
(33, 'Canada')
(37, 'Central African Republic')
(39, 'Chad')
(40, 'Chile')
(351, 'China')
(96, 'China, Hong Kong SAR')
(128, 'China, Macao SAR')
(41, 'China, mainland')
(214, 'China, Taiwan Province of')
(44, 'Colombia')
(45, 'Comoros')
(46, 'Congo')
(47, 'Cook Islands')
(48, 'Costa Rica')
(107, "Côte d'Ivoire")
(98, 'Croatia')
(49, 'Cuba')
(50, 'Cyprus')
(167, 'Czechia')
(51, 'Czechoslovakia')
(116, "Democrati

In [125]:
# Show the column definitions the database recorded for the countries table.
cur.execute("""
    SELECT 
        column_name, data_type, character_maximum_length, column_default, is_nullable
    FROM 
        information_schema.columns
    WHERE 
        table_name = 'countries';
""")

# Pre-bind the loop variable so the cleanup below also works when the table
# does not exist and the query returns no rows.
row = None
for row in cur.fetchall():
    print(row)

# Delete temporary value
del row

('area_code', 'integer', None, None, 'NO')
('area', 'character varying', 100, None, 'YES')
