#### This notebook provides some unit tests for processed data versus data in the Data Warehouse 

In [1]:
# import libraries
import pandas as pd
import re
import numpy as np
import psycopg2

In [2]:
def main_1(query, dsn="host=127.0.0.1 dbname=capstone user=postgres password=postgres port=5433"):
    """
    Run `query` against postgres and return its first row as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement that returns at least one row.
    dsn : str, optional
        Connection string passed to psycopg2.connect; defaults to the
        local capstone database.

    Returns
    -------
    pandas.DataFrame
        One-row frame (index 0) whose columns are named after the
        columns of the query result.

    Raises
    ------
    IndexError
        If the query returns no rows.
    """
    # NOTE(review): the default DSN embeds a password; outside local testing
    # pass `dsn` explicitly (e.g. built from environment variables).
    conn = psycopg2.connect(dsn)
    try:
        conn.set_session(autocommit=True)
        cur = conn.cursor()

        cur.execute(query)
        # only the first row is used, so do not materialise the whole result
        record = cur.fetchone()
        if record is None:
            raise IndexError("query returned no rows")

        colnames = [desc[0] for desc in cur.description]
    finally:
        # release the connection even when the query fails
        conn.close()

    return pd.DataFrame([record], columns=colnames, index=[0])

#### QUALITY CHECK 1

In [3]:
# read processed data, parsing the three code columns with str
raw_df_ccodes = pd.read_csv(
    './output/df_ccodes.csv',
    converters=dict.fromkeys(
        ["country_code", "country_region_code", "country_sub_region_code"],
        str,
    ),
)

In [4]:
# make a copy so the frame as read from disk (raw_df_ccodes) stays untouched
df_ccodes = raw_df_ccodes.copy()

In [5]:
# preview the first two rows of the working copy
df_ccodes.head(2)

Unnamed: 0,i94_country_code,country_name,country_alpha_2,country_alpha_3,country_code,country_iso_3166_2,country_region,country_sub_region,country_region_code,country_sub_region_code
0,236,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34
1,101,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,150,39


#### quality check 1
Several columns must be uploaded in the right format (```astype(str)```):
df_ccodes contains information about countries, such as country_code. This column, together with country_region_code and country_sub_region_code, holds values that start with zeros.

This check will be executed as follows:
* select the row where country_name == 'Afghanistan' from df_ccodes
* execute an SQL query to find row with the same value as country_name
* validate correctness by comparing two rows

In [6]:
# boolean-mask selection of the Afghanistan row from the processed data
record_processed = df_ccodes.loc[df_ccodes["country_name"] == 'Afghanistan']

In [7]:
# show the row selected from the processed data
record_processed

Unnamed: 0,i94_country_code,country_name,country_alpha_2,country_alpha_3,country_code,country_iso_3166_2,country_region,country_sub_region,country_region_code,country_sub_region_code
0,236,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34


In [8]:
# plain string literal: the query has no placeholders, so no f-prefix is needed
query_ccodes = "SELECT * FROM ccodes WHERE ccodes.country_name = 'Afghanistan'"

In [9]:
# fetch the matching row from postgres; main_1 returns it as a one-row DataFrame
record_postgres = main_1(query_ccodes)

In [10]:
# show the row fetched from postgres
record_postgres

Unnamed: 0,i94_country_code,country_name,country_alpha_2,country_alpha_3,country_code,country_iso_3166_2,country_region,country_sub_region,country_region_code,country_sub_region_code
0,236,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34


#### both processed data (.csv) and Data Warehouse records are identical

In [11]:
# fail loudly on a mismatch: the printed label alone reads "records are equal"
# even when the comparison is False
records_match = record_processed.equals(record_postgres)
assert records_match, 'processed record and Data Warehouse record differ'
print(f'{records_match} <]-- records are equal')

True <]-- records are equal


#### QUALITY CHECK 2

In [12]:
import glob
# glob returns paths in arbitrary order; sort so files[0] is the same file on every run
files = sorted(glob.glob('./output_test/*'))

In [13]:
# load the first matched file, read as a gzip-compressed CSV
df_processed = pd.read_csv(files[0], compression='gzip')

In [14]:
# preview the first two rows of the processed file
df_processed.head(2)

Unnamed: 0,CoC,CoR,PoE,state_landing,age,visapost,occup,year_birth,gender,airline_used,num_flight,visatype,dt_arrival,dt_departure,month
0,101,101,WAS,MI,55,,,1961,M,OS,93,B2,2016-04-01,2016-08-25,4
1,101,101,BOS,MA,58,TIA,,1958,M,LH,422,B1,2016-04-01,2016-04-05,4


#### quality check 2
Values from num_flight must be uploaded in the right format (```astype(str)```).
If the upload was done correctly, a simple query will give the same result.

This check will be executed as follows:
1. compare formats
* select the row where PoE == 'BOS' from df_processed
* execute an SQL query to find row with the same value as PoE
* validate correctness by comparing two rows

2. simple query results
* ```value_counts()``` will be applied to the column PoE
* value for MIA will be extracted as test_value
* query will be executed in order to count MIA PoE records in the Data Warehouse as tested_value
* both values will be compared

In [15]:
# first four entries of the PoE value counts
df_processed.PoE.value_counts().head(4)

NYC    22578
MIA    16302
LOS    14640
ORL     9617
Name: PoE, dtype: int64

In [16]:
# look MIA up by label: an integer key on this string-indexed Series relies on
# positional fallback and on MIA happening to be the second most frequent port
test_value = df_processed.PoE.value_counts()['MIA']

In [17]:
# count taken from the processed file
test_value

16302

In [18]:
def main_2(query, dsn="host=127.0.0.1 dbname=capstone user=postgres password=postgres port=5433"):
    """
    Run `query` against postgres and return its first result row.

    Parameters
    ----------
    query : str
        SQL statement that returns at least one row.
    dsn : str, optional
        Connection string passed to psycopg2.connect; defaults to the
        local capstone database.

    Returns
    -------
    tuple
        The first row of the result, one element per selected column.

    Raises
    ------
    IndexError
        If the query returns no rows.
    """
    # NOTE(review): the default DSN embeds a password; outside local testing
    # pass `dsn` explicitly (e.g. built from environment variables).
    conn = psycopg2.connect(dsn)
    try:
        conn.set_session(autocommit=True)
        cur = conn.cursor()

        cur.execute(query)
        # only the first row is used, so do not materialise the whole result
        record = cur.fetchone()
        if record is None:
            raise IndexError("query returned no rows")
    finally:
        # release the connection even when the query fails
        conn.close()

    return record

In [19]:
# plain string literal: the query has no placeholders, so no f-prefix is needed
query_processed = "SELECT count(poe) FROM immigrants WHERE poe = 'MIA';"

In [20]:
# main_2 returns the first result row; its single column is count(poe)
tested_value = main_2(query_processed)[0]

In [21]:
## both values must be equal; fail the notebook run if they are not
values_match = test_value == tested_value
assert values_match, f'processed count {test_value} != Data Warehouse count {tested_value}'
values_match

True