# Final Project Code

In [10]:
# run in base directory
import os
os.chdir("/home/awesome")

# datatypes
import json
import yaml
# database connection
import psycopg2
import psycopg2.extras
import psycopg2.extensions as psql_ext
from psycopg2 import sql
# custom etl functions
from etl import etl
# respective datasets
from etl import station_info as info
# computation
import pandas as pd
# utilities
from pathlib import Path
import itertools
# typing
from typing import Union

# set up directories
HOME_PATH = Path.cwd()
DATA_PATH = HOME_PATH / 'etl' / 'processed_data'
SCHEMAS_PATH = HOME_PATH / 'etl' / 'schemas'

PROJECT_SCHEMA = 'citibike_project'

# Set Up Database

### Connect, Set up schema

In [2]:
# PSQL db connection using psycopg2
conn = psycopg2.connect(
    dbname='new_db', 
    user='awesome_user', 
    password='awesome_password', 
    host='postgres', 
    port='5432'
)

In [3]:
etl.drop_recreate_schema(conn, PROJECT_SCHEMA)

Table 'irs_codes' dropped.
Table 'irs_data' dropped.
Table 'station_info' dropped.
Table 'general' dropped.
All tables in citibike_project dropped successfully.
Dropped Schema citibike_project.
Created Schema citibike_project.


### Clean all tables

In [20]:
schema_files = [item for item in SCHEMAS_PATH.iterdir() if item.is_file()]
tables_schemas = list(itertools.chain(*[etl.read_yaml_to_dict(schema_file)["tables"] for schema_file in schema_files]))
tables_schemas = {k: v for d in tables_schemas for k, v in d.items()}

for table_name, table_schema in tables_schemas.items():
    etl.drop_recreate_table(
        db_schema=PROJECT_SCHEMA,
        table_name=table_name,
        table_schema=table_schema,
        conn=conn,
    )


Dropping citibike_project.irs_codes
Creating citibike_project.irs_codes
Dropping citibike_project.irs_data
Creating citibike_project.irs_data
Dropping citibike_project.station_info
Creating citibike_project.station_info
Dropping citibike_project.weather_general
Creating citibike_project.weather_general
Dropping citibike_project.weather_precip
Creating citibike_project.weather_precip


# Upload Data

### Weather Data

In [21]:
for file in [
    "weather_general",
    "weather_precip",
]:
    df = pd.read_csv(DATA_PATH / "weather" / f"{file}.csv")
    etl.upload_dataframe(
        conn=conn,
        dataframe=df,
        db_schema=PROJECT_SCHEMA,
        table_name=file,
        table_schema=tables_schemas[file]
    )

Uploaded 11272 records to citibike_project.weather_general
Uploaded 11272 records to citibike_project.weather_precip


### Station Info

In [23]:
df_station_status = info.get_station_info_data()

etl.upload_dataframe(
    conn=conn,
    dataframe=df_station_status,
    db_schema=PROJECT_SCHEMA,
    table_name='station_info',
    table_schema=tables_schemas["station_info"]
)

Uploaded 2221 records to citibike_project.station_info


In [22]:
# pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.station_info LIMIT 3", conn)

### IRS Data

In [25]:

df_irs_codes.head()

Unnamed: 0.1,Unnamed: 0,Code,Description
0,0,STATEFIPS,State Federal Information Processing System
1,1,STATE,State associated with zip code
2,2,ZIPCODE,5-digit zip code
3,3,AGI_STUB,Size of adjusted gross income
4,4,N1,Number of returns


In [28]:
df_irs_codes = pd.read_csv(DATA_PATH / "irs" / "irs_codes.csv")
etl.upload_dataframe(
    conn=conn,
    dataframe=df_irs_codes[["Code", "Description"]],
    db_schema=PROJECT_SCHEMA,
    table_name='irs_codes',
    table_schema=tables_schemas["irs_codes"]
)

Uploaded 167 records to citibike_project.irs_codes


In [33]:
pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.irs_codes LIMIT 3", conn)

  pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.irs_codes LIMIT 3", conn)


Unnamed: 0,code,description
0,STATEFIPS,State Federal Information Processing System
1,STATE,State associated with zip code
2,ZIPCODE,5-digit zip code


### Close the connection

In [26]:
# conn.close()

# Report Generation