<a href="https://colab.research.google.com/github/ath0217/hello-github/blob/main/session2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
!mkdir data

In [None]:
import gdown

# Google Drive file id for every CSV, keyed by the local file name it is saved under.
# The share page for an id is https://drive.google.com/file/d/<id>/view?usp=sharing
drive_file_ids = {
    'chicago_train_data.csv': '1GiEi6LVHBAAX4IWVA2gRH1gJZ1vLYghG',
    'train_days.csv': '1LIK0YFER5ve9Wn7Uap4Zb6f5pXcAExZ-',
    'extra_data_chicago.csv': '18pIOZQXBOSwT-t69icoGZgZh-kUUFMrN',
    'chicago_gas_prices.csv': '13ofbMM3hiY44MlHmWWQw9_6WGGoV4J4o',
    'lag_14_data.csv': '1Zd_v6ufxJ8FEHSVgq9jKQbpPu5dW1rFX',
}

# Same two parallel lists as before, derived from the mapping (dicts keep insertion order).
outputs = list(drive_file_ids)
urls = [
    f'https://drive.google.com/uc?export=download&id={file_id}'
    for file_id in drive_file_ids.values()
]

# Fetch each file into data/, showing gdown's progress output.
for url, output in zip(urls, outputs):
    gdown.download(url, f'data/{output}', quiet=False)

In [None]:
# Read three of the downloaded CSVs. In each, one column (chosen by position) becomes the
# index, and parse_dates=True asks pandas to parse that index as dates.
df = pd.read_csv(
    'data/chicago_train_data.csv',
    index_col=2,
    parse_dates=True,
)
train_days = pd.read_csv(
    'data/train_days.csv',
    index_col=1,
    parse_dates=True,
)
extra_data = pd.read_csv(
    'data/extra_data_chicago.csv',
    index_col=0,
    parse_dates=True,
)

In [None]:
# Preview the first rows of the table as loaded from the CSV.
df.head()

In [None]:
# Summary of the frame as loaded: index range, column dtypes and non-null counts.
df.info()

In [None]:
# Order the rows by their index labels, then print the summary again for the sorted frame.
df = df.sort_index()
df.info()

In [None]:
# Preview again now that the rows are sorted: the smallest index labels come first.
df.head()

In [None]:
# Label-based lookup: the rows of df whose index labels are listed in train_days' index.
df.loc[train_days.index]

In [None]:
# Distinct calendar years found in the index.
df.index.year.unique()

In [None]:
# Focus on the Clark/Lake station, keeping only the day type and the ride counts.
# One .loc selection followed by .copy() yields an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning, which chained
# df[mask][columns] indexing would.
is_clark_lake = df['stationname'] == 'Clark/Lake'
df_cl = df.loc[is_clark_lake, ['daytype', 'rides']].copy()

# Ride counts expressed in thousands.
df_cl['rides_x1000'] = df_cl['rides'] / 1000

# DatetimeIndex.weekday counts Monday as 0 and Sunday as 6, so values below 5 are Mon-Fri.
df_cl['part_of_the_week'] = np.where(df_cl.index.weekday < 5, 'Weekday', 'Weekend')

In [None]:
# Preview the Clark/Lake subset together with its two derived columns.
df_cl.head()

In [None]:
# Number of index labels that repeat an earlier label; anything above 0 means duplicated dates.
df_cl.index.duplicated().sum()

In [None]:
# Drop rows whose date repeats an earlier one, keeping the first occurrence.
# (Author's note: there was a single duplicated record, index 3815, value '2011-07-03'.)
is_repeat = df_cl.index.duplicated(keep='first')
df_cl = df_cl[~is_repeat]

# Sanity check: prints 0 now that no index label repeats.
print(df_cl.index.duplicated().sum())

# Training data: the Clark/Lake rows for the dates listed in train_days.
train_df_cl = df_cl.loc[train_days.index]

In [None]:
# Apply seaborn's "darkgrid" style to the figures drawn from here on.
sns.set_style("darkgrid")

In [None]:
# Four views of the same variable (rides in thousands) laid out on a 2x2 grid:
# box plot and histogram on top, violin plot and kernel density estimate below.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
(box_ax, hist_ax), (violin_ax, kde_ax) = axs

sns.boxplot(data=train_df_cl, x='rides_x1000', orient='h', ax=box_ax)
sns.histplot(data=train_df_cl, x='rides_x1000', binwidth=0.7, ax=hist_ax)
sns.violinplot(data=train_df_cl, x='rides_x1000', ax=violin_ax)
sns.kdeplot(data=train_df_cl, x='rides_x1000', ax=kde_ax)

# Adjust spacing so the panels do not overlap.
plt.tight_layout()

In [None]:
# One histogram row per part of the week, each with its own y axis and colour.
g = sns.FacetGrid(
    train_df_cl,
    row='part_of_the_week',
    hue='part_of_the_week',
    sharey=False,
    height=3,
    aspect=3,
)
# Raw ride counts, binned on a logarithmic scale.
g.map_dataframe(sns.histplot, x='rides', log_scale=True, binwidth=0.03)

In [None]:
# Lagged ridership: each row receives the ride count from 14 rows earlier (two weeks when
# the rows are consecutive days). The first 14 rows have no predecessor and are filled with 0.
train_df_cl['two_week_lag_rides'] = train_df_cl['rides'].shift(14, fill_value=0)

# Same lagged value expressed in thousands, matching rides_x1000.
train_df_cl['two_week_lag_rides_x1000'] = train_df_cl['two_week_lag_rides'].div(1000)

In [None]:
# Show 16 rows: the 14 zero-filled lag rows plus the first two that carry a real lagged value.
train_df_cl.head(16)

In [None]:
# Current ridership against ridership two weeks earlier, coloured by part of the week.
plt.figure(figsize=(10, 10))
sns.scatterplot(
    data=train_df_cl,
    x='two_week_lag_rides_x1000',
    y='rides_x1000',
    hue='part_of_the_week',
)

In [None]:
# 1 for days with at most 10,000 rides, 0 otherwise (boolean cast to int).
train_df_cl['less_than_10000'] = train_df_cl['rides'].le(10000).astype(int)

# Calendar fields derived from the date index.
dates = train_df_cl.index
train_df_cl['month_name'] = dates.month_name()
train_df_cl['month'] = dates.month
train_df_cl['day'] = dates.day
train_df_cl['month_day'] = dates.strftime('%m-%d')  # zero-padded mm-dd label
train_df_cl['year'] = dates.year

In [None]:
# Peek at February 2001. Partial-date strings select rows through .loc only:
# train_df_cl['2001-02'] is treated as a column lookup and raises KeyError on pandas 2.x.
train_df_cl.loc['2001-02'].head()

In [None]:
# Weekday rows only, reshaped to month-day (rows) by year (columns). Each cell aggregates
# the 0/1 low-ridership flag with pivot_table's default (mean); empty cells become 0.
weekday_rows = train_df_cl[train_df_cl['part_of_the_week'] == 'Weekday']
heat_map_data = weekday_rows.pivot_table(
    values='less_than_10000',
    index='month_day',
    columns='year',
    fill_value=0,
)

In [None]:
# First ten month-day rows of the pivoted low-ridership flags.
heat_map_data.head(10)

In [None]:
# Two-colour heat map of the low-ridership flag: white and red, no colour bar,
# with a y tick label on every 15th row.
plt.figure(figsize=(10, 10))
ax_hm = sns.heatmap(
    data=heat_map_data,
    cmap=['white', 'red'],
    cbar=False,
    yticklabels=15,
)
# Flip the y axis so the first month-day rows sit at the bottom.
ax_hm.invert_yaxis()

In [None]:
from scipy.stats.mstats import gmean

In [None]:
def _monthly_rides_summary(daily, rides_agg):
    """Summarise daily ridership per calendar month and part of the week.

    Parameters
    ----------
    daily : pandas.DataFrame
        Frame with a datetime index and the columns 'part_of_the_week',
        'rides_x1000', 'month_name' and 'year'. The index must be named 'date',
        because the result is re-indexed on that column after the group keys
        are turned back into columns.
    rides_agg : str or callable
        Aggregation applied to 'rides_x1000' within each group (for example
        'mean', or a function such as scipy's gmean).

    Returns
    -------
    pandas.DataFrame
        One row per (month, part of the week), indexed by 'date', with the
        aggregated 'rides_x1000' plus the group's 'month_name' and 'year'.
    """
    return (
        daily
        .groupby([pd.Grouper(freq='M'), 'part_of_the_week'])
        # 'max' is given by name: passing the builtin max makes recent pandas emit a FutureWarning.
        .agg({'rides_x1000': rides_agg, 'month_name': 'max', 'year': 'max'})
        .reset_index()
        .set_index('date')
    )


# The same monthly summary twice; only the average used for the rides differs.
line_plot_data_gmean = _monthly_rides_summary(train_df_cl, gmean)
line_plot_data_mean = _monthly_rides_summary(train_df_cl, 'mean')

In [None]:
# Preview the monthly geometric-mean summary.
line_plot_data_gmean.head()

In [None]:
# Monthly geometric-mean ridership, one line per year, with weekday and weekend side by side.
gline = sns.FacetGrid(
    line_plot_data_gmean,
    col='part_of_the_week',
    hue='year',
    palette='gist_heat_r',
    sharex=False,
    height=6,
    aspect=2,
)
gline.map_dataframe(sns.lineplot, x='month_name', y='rides_x1000', legend='full')

# Draw the legend on the second column's axes only, then tilt the month labels.
gline.axes[0, 1].legend()
gline.set_xticklabels(rotation=30)

In [None]:
# Same figure as the geometric-mean version, built from the arithmetic-mean summary.
gline = sns.FacetGrid(
    line_plot_data_mean,
    col='part_of_the_week',
    hue='year',
    palette='gist_heat_r',
    sharex=False,
    height=6,
    aspect=2,
)
gline.map_dataframe(sns.lineplot, x='month_name', y='rides_x1000', legend='full')

# Draw the legend on the second column's axes only.
gline.axes[0, 1].legend()

In [None]:
# Interactive version of the geometric-mean line plot: one facet per part of the week,
# one colour per year.
fig = px.line(
    line_plot_data_gmean,
    x='month_name',
    y='rides_x1000',
    color='year',
    facet_col='part_of_the_week',
    color_discrete_sequence=px.colors.qualitative.Alphabet,
)
fig.update_xaxes(tickangle=-45)
fig.show()

In [None]:
# Load the gas-price CSV, using its first column as the index and parsing that index as dates.
gas_price_df = pd.read_csv('data/chicago_gas_prices.csv',index_col=0, parse_dates=True)

In [None]:
# Order the rows by index so the date-range slice taken later works on a sorted index.
gas_price_df = gas_price_df.sort_index()

In [None]:
# Preview the sorted gas-price table.
gas_price_df.head()

In [None]:
# Monthly gas-price summary for January 2001 through August 2016: mean price per month,
# with the month name and year carried along. The aggregations are given by name
# ('max', not the builtin max) because recent pandas warns when builtins are passed to agg.
gas_price_window = gas_price_df['2001-01':'2016-08']
line_plot_gas = (
    gas_price_window
    .groupby(pd.Grouper(freq='M'))
    .agg({'gas_price': 'mean', 'month_name': 'max', 'year': 'max'})
    .reset_index()
    .set_index('date')  # relies on the index being named 'date'
)

# Later cells read `line_plot_gas_lagged`, and no other visible cell assigns it, so it is
# built here: every observation is moved 14 days later on the calendar (shift with a
# frequency moves the index, not the values), then the shifted prices are averaged per
# month over the same date window.
# NOTE(review): the 14-day lag is an assumption that mirrors the two-week ridership lag
# used earlier in this notebook. Confirm the intended lag before relying on the plot.
line_plot_gas_lagged = (
    gas_price_df['gas_price']
    .shift(periods=14, freq='D')
    .loc['2001-01':'2016-08']
    .groupby(pd.Grouper(freq='M'))
    .mean()
)

In [None]:
# Preview the lagged gas-price data; `line_plot_gas_lagged` must have been defined by an earlier cell.
line_plot_gas_lagged.head()

In [None]:
# Write the lagged gas price into a 'gas_price' column, first for the weekday rows and
# then for the weekend rows.
# NOTE(review): presumably line_plot_gas_lagged is a Series indexed by the same monthly
# dates, so pandas aligns it on the index. Confirm against the cell that defines it.
for week_part in ('Weekday', 'Weekend'):
    rows = line_plot_data_gmean['part_of_the_week'] == week_part
    line_plot_data_gmean.loc[rows, 'gas_price'] = line_plot_gas_lagged

In [None]:
# Preview the monthly summary again, now including the gas_price column.
line_plot_data_gmean.head()

In [None]:
# Store the year as text so that it is treated as a categorical variable when plotting.
line_plot_data_gmean['year'] = line_plot_data_gmean['year'].astype(str)

In [None]:
# Monthly ridership against gas price, one facet per part of the week and one colour
# per year, with an ordinary-least-squares trendline.
fig = px.scatter(
    line_plot_data_gmean,
    x='gas_price',
    y='rides_x1000',
    color='year',
    facet_col='part_of_the_week',
    color_discrete_sequence=px.colors.qualitative.Alphabet,
    trendline='ols',
)
fig.show()