# Final Project Code

In [6]:
# run in base directory
WORKDIR = "/home/awesome"
import os
os.chdir(WORKDIR)

# datatypes
import json
import yaml
# database connection
import psycopg2
import psycopg2.extras
import psycopg2.extensions as psql_ext
from psycopg2 import sql
# custom etl functions
from etl_process import utils as etl
# respective datasets
from etl_process.data_processing import station_info as info
# computation
import pandas as pd
# utilities
from pathlib import Path
import itertools
# typing
from typing import Union

# set up directories
WORKDIR_PATH = Path.cwd()
DATA_PATH = WORKDIR_PATH / 'etl_process' / 'processed_data'
SCHEMAS_PATH = WORKDIR_PATH / 'etl_process' / 'schemas'

PROJECT_SCHEMA = 'citibike_project'

# Set Up Database

### Connect, Set up schema

In [13]:
# PSQL db connection using psycopg2
conn = psycopg2.connect(
    dbname='new_db', 
    user='awesome_user', 
    password='awesome_password', 
    host='postgres', 
    port='5432'
)

In [14]:
etl.drop_recreate_schema(conn, PROJECT_SCHEMA)

All tables in citibike_project dropped successfully.
Dropped Schema citibike_project.
Created Schema citibike_project.


### Clean all tables

In [15]:
schema_files = [item for item in SCHEMAS_PATH.iterdir() if item.is_file()]
tables_schemas = list(itertools.chain(*[etl.read_yaml_to_dict(schema_file)["tables"] for schema_file in schema_files]))
tables_schemas = {k: v for d in tables_schemas for k, v in d.items()}

for table_name, table_schema in tables_schemas.items():
    etl.drop_recreate_table(
        db_schema=PROJECT_SCHEMA,
        table_name=table_name,
        table_schema=table_schema,
        conn=conn,
    )


FileNotFoundError: [Errno 2] No such file or directory: '/root/etl_process/schemas'

# Upload Data

### Aggregated Ride Data

In [8]:
traffic_data = [item for item in (DATA_PATH / "rides" / "station_traffic").iterdir() if item.is_file()]

for file in traffic_data:
    df = pd.read_csv(file)
    etl.upload_dataframe(
        conn=conn,
        dataframe=df,
        db_schema=PROJECT_SCHEMA,
        table_name="citibike_station_history",
        table_schema=tables_schemas["citibike_station_history"]
    )


Uploaded 893703 records to citibike_project.citibike_station_history
Uploaded 955062 records to citibike_project.citibike_station_history
Uploaded 917794 records to citibike_project.citibike_station_history


KeyboardInterrupt: 

### Weather Data

In [None]:
for file in [
    "weather_general",
    "weather_precip",
]:
    df = pd.read_csv(DATA_PATH / "weather" / f"{file}.csv")
    etl.upload_dataframe(
        conn=conn,
        dataframe=df,
        db_schema=PROJECT_SCHEMA,
        table_name=file,
        table_schema=tables_schemas[file]
    )

### Station Info

In [None]:
df_station_info = info.get_station_info_data()

etl.upload_dataframe(
    conn=conn,
    dataframe=df_station_info,
    db_schema=PROJECT_SCHEMA,
    table_name='station_info',
    table_schema=tables_schemas["station_info"]
)

In [None]:
# pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.station_info LIMIT 3", conn)

### IRS Data

In [None]:
for file in [
    "irs_codes",
    "nyc_irs",
]:
    df = pd.read_csv(DATA_PATH / "irs" / f"{file}.csv")
    etl.upload_dataframe(
        conn=conn,
        dataframe=df,
        db_schema=PROJECT_SCHEMA,
        table_name=file,
        table_schema=tables_schemas[file]
    )

### Close the connection

In [None]:
# conn.close()

# Report Generation

In [None]:
df_stations = pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.station_info", conn)
df_largest_stations = df_stations.sort_values(by="capacity", ascending=False).iloc[:50]

In [None]:
df_weather_precip = pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.weather_precip", conn)
df_weather_precip["one_hour_precip_amount"] = pd.to_numeric(df_weather_precip['one_hour_precip_amount'], errors='coerce')
df_weather_precip_daily = df_weather_precip.groupby(by="date")[["one_hour_precip_amount"]].sum()

In [None]:
import plotly.graph_objs as go

df_stations = pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.station_info", conn)

df_weather_precip = pd.read_sql_query(f"SELECT * FROM {PROJECT_SCHEMA}.weather_precip", conn)
df_weather_precip["one_hour_precip_amount"] = pd.to_numeric(df_weather_precip['one_hour_precip_amount'], errors='coerce')
df_weather_precip_daily = df_weather_precip.groupby(by="date")[["one_hour_precip_amount"]].sum()

# Create simple plots
bar1 = go.Bar(x=df_weather_precip_daily.index, y=df_weather_precip_daily.one_hour_precip_amount)
layout1 = go.Layout()
fig1 = go.Figure([bar1], layout1)

# Create another simple plot
# Create a table
fig2 = go.Figure(data=[go.Table(
    header=dict(
        values=list(df_largest_stations.to_dict().keys()),
        fill_color='paleturquoise',
        align='left'
    ),
    cells=dict(
        values=df_largest_stations.values.T,
        fill_color='lavender',
        align='left'
    ))
])

# Convert figures to HTML strings
fig1_html = fig1.to_html(full_html=False, include_plotlyjs='cdn')
fig2_html = fig2.to_html(full_html=False, include_plotlyjs='cdn')

# Create the HTML template
template = """
<html>

df_largest_stations = df_stations.sort_values(by="capacity", ascending=False).iloc[:50]
<head>
    <title>Plotly Report</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
</head>
<body>
    <h1>CitiBike Report</h1>
    <div id='divPlotly1'>
        <h2>Daily Precipitation</h2>
        {fig1_html}
    </div>
    <div id='divPlotly2'>
        <h2>Largest Stations</h2>
        {fig2_html}
    </div>
</body>
</html>
"""

# Write the HTML report to a file
with open('report.html', 'w') as f:
    f.write(template.format(fig1_html=fig1_html, fig2_html=fig2_html))
