# Final Project Code

In [19]:
# run in base directory
import os
os.chdir("/home/awesome")

# datatypes
import json
import yaml
# database connection
import psycopg2
import psycopg2.extras
import psycopg2.extensions as psql_ext
from psycopg2 import sql
# custom etl functions
from etl import etl
# respective datasets
from etl import station_info as info
# computation
import pandas as pd
# utilities
from pathlib import Path
import itertools
# typing
from typing import Union

# set up directories
HOME_PATH = Path.cwd()
DATA_PATH = HOME_PATH / 'etl' / 'data'
SCHEMAS_PATH = HOME_PATH / 'etl' / 'schemas'

PROJECT_SCHEMA = 'citibike_project'

# Set Up Database

### Connect, Set up schema

In [20]:
# PSQL db connection using psycopg2
conn = psycopg2.connect(
    dbname='new_db', 
    user='awesome_user', 
    password='awesome_password', 
    host='postgres', 
    port='5432'
)

In [21]:
etl.drop_recreate_schema(conn, PROJECT_SCHEMA)

Table 'irs_codes' dropped.
Table 'irs_data' dropped.
Table 'station_info' dropped.
All tables in citibike_project dropped successfully.
Dropped Schema citibike_project.
Created Schema citibike_project.


### Clean all tables

In [22]:
schema_files = [item for item in SCHEMAS_PATH.iterdir() if item.is_file()]
tables_schemas = list(itertools.chain(*[etl.read_yaml_to_dict(schema_file)["tables"] for schema_file in schema_files]))

for table_schema in tables_schemas:
    etl.drop_recreate_table(
        db_schema=PROJECT_SCHEMA,
        table_schema=table_schema,
        conn=conn,
    )


Dropping citibike_project.irs_codes
Creating citibike_project.irs_codes
Dropping citibike_project.irs_data
Creating citibike_project.irs_data
Dropping citibike_project.station_info
Creating citibike_project.station_info


# Upload Data

### Station Info

In [23]:
df_station_status = info.get_station_info_data()
df_station_status.head()

Unnamed: 0,station_id,short_name,name,lat,lon,region_id,capacity,rental_uris
0,19b61564-7629-41b4-80c8-1756135a5442,7913.15,Cauldwell Ave & E 158 St,40.81989,-73.908351,71,19,{'android': 'https://bkn.lft.to/lastmile_qr_sc...
1,7d5fa1f0-4069-4d5d-b735-5d500597394a,5569.07,W Broadway & Watts St,40.72323,-74.00314,71,51,{'android': 'https://bkn.lft.to/lastmile_qr_sc...
2,40043e8a-6ef6-428d-b585-a7c23cdb06d4,6098.12,Broadway & E 19 St,40.738661,-73.989873,71,65,{'android': 'https://bkn.lft.to/lastmile_qr_sc...
3,c1a4d909-0a00-475a-8e82-18ed13a4eb01,4962.02,Whitehall St & Bridge St,40.703662,-74.013181,71,44,{'android': 'https://bkn.lft.to/lastmile_qr_sc...
4,7779e057-b33e-46a7-8a1e-8c28bcd9a558,4325.03,Thomas S. Boyland St & Macon St,40.68491,-73.91493,71,21,{'android': 'https://bkn.lft.to/lastmile_qr_sc...


In [24]:
etl.upload_dataframe(
    conn=conn,
    dataframe=df_station_status,
    db_schema=PROJECT_SCHEMA,
    table_name='station_info'
)

Uploaded 2217 records to citibike_project.station_info


In [27]:
pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.station_info LIMIT 3", conn)

  pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.station_info LIMIT 3", conn)


Unnamed: 0,short_name,station_id,name,lat,lon,region_id,capacity,rental_uris
0,7913.15,19b61564-7629-41b4-80c8-1756135a5442,Cauldwell Ave & E 158 St,40.81989,-73.908351,71,19,{'android': 'https://bkn.lft.to/lastmile_qr_sc...
1,5569.07,7d5fa1f0-4069-4d5d-b735-5d500597394a,W Broadway & Watts St,40.72323,-74.00314,71,51,{'android': 'https://bkn.lft.to/lastmile_qr_sc...
2,6098.12,40043e8a-6ef6-428d-b585-a7c23cdb06d4,Broadway & E 19 St,40.738661,-73.989873,71,65,{'android': 'https://bkn.lft.to/lastmile_qr_sc...


### IRS Data

In [31]:
df_irs_codes = pd.read_csv(DATA_PATH / "irs_codes.csv")
df_irs_codes.head()

Unnamed: 0.1,Unnamed: 0,Code,Description
0,0,STATEFIPS,State Federal Information Processing System
1,1,STATE,State associated with zip code
2,2,ZIPCODE,5-digit zip code
3,3,AGI_STUB,Size of adjusted gross income
4,4,N1,Number of returns


In [32]:
etl.upload_dataframe(
    conn=conn,
    dataframe=df_irs_codes[["Code", "Description"]],
    db_schema=PROJECT_SCHEMA,
    table_name='irs_codes'
)

Uploaded 167 records to citibike_project.irs_codes


In [33]:
pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.irs_codes LIMIT 3", conn)

  pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.irs_codes LIMIT 3", conn)


Unnamed: 0,code,description
0,STATEFIPS,State Federal Information Processing System
1,STATE,State associated with zip code
2,ZIPCODE,5-digit zip code


### Close the connection

In [26]:
# conn.close()

# Report Generation