In [71]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (15, 8)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_log_error

In [85]:
# Load the bike-sharing training data; the first CSV column becomes the index and is
# parsed as datetimes, which add_date_time_features() relies on.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv('/Users/alantrinh/Code/Spiced Academy/garlic-boosting-student-code/03_week_project/bike-sharing-demand/train.csv', index_col=0, parse_dates=True)
# Every column except the target is kept as a candidate feature.
# NOTE(review): this still includes `casual` and `registered`, which do not appear in the
# test set shown further down — they must not be selected as model features.
X = df.loc[:, df.columns != 'count']
y = df['count']
# Fixed seed: without it the 80/20 split (and every score printed below) changes on each
# kernel restart, so results cannot be reproduced or compared between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [92]:
def add_date_time_features(dataframe):
    """Add calendar feature columns derived from the frame's DatetimeIndex.

    The columns are written onto ``dataframe`` itself (in place). Calling the
    function again on the same frame simply overwrites them with identical values.

    Columns added (all strings):
        year_month -- e.g. '2011_July'
        year       -- e.g. '2011'
        month      -- month name, e.g. 'July'
        day        -- weekday name, e.g. 'Friday'
        hour       -- hour of day with an 'h' suffix, e.g. '18h'

    Parameters:
        dataframe: pandas DataFrame indexed by a ``pd.DatetimeIndex``.

    Returns:
        The same ``dataframe`` object, to allow chaining. Existing callers that
        ignore the return value are unaffected.

    Raises:
        TypeError: if the index is not a ``pd.DatetimeIndex`` (for example when the
            CSV was read without ``parse_dates=True``).
    """
    index = dataframe.index
    if not isinstance(index, pd.DatetimeIndex):
        raise TypeError(
            f'add_date_time_features requires a DatetimeIndex, got {type(index).__name__}'
        )

    # Compute each component once and reuse it for the combined column.
    year = index.year.astype(str)
    month = index.month_name()

    dataframe['year_month'] = year + '_' + month
    dataframe['year'] = year
    dataframe['month'] = month
    dataframe['day'] = index.day_name()
    dataframe['hour'] = index.hour.astype(str) + 'h'
    return dataframe

# Derive the calendar columns on the training split (this writes onto X_train itself),
# then show the frame as the cell output.
add_date_time_features(X_train)
X_train

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,year_month_hour,day,year_month,year,month,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-07-15 18:00:00,3,0,1,1,29.52,32.575,45,15.0013,102,394,2011_July_18h,Friday,2011_July,2011,July,18h
2012-02-05 02:00:00,1,0,0,2,9.84,12.880,93,7.0015,4,59,2012_February_2h,Sunday,2012_February,2012,February,2h
2012-06-19 23:00:00,2,0,1,1,29.52,34.850,79,8.9981,23,102,2012_June_23h,Tuesday,2012_June,2012,June,23h
2011-05-16 11:00:00,2,0,1,1,26.24,31.060,57,0.0000,56,129,2011_May_11h,Monday,2011_May,2011,May,11h
2011-03-07 19:00:00,1,0,1,1,12.30,14.395,33,15.0013,11,134,2011_March_19h,Monday,2011_March,2011,March,19h
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-04-05 01:00:00,2,0,1,1,19.68,23.485,27,22.0028,4,9,2012_April_1h,Thursday,2012_April,2012,April,1h
2011-09-11 23:00:00,3,0,0,3,23.78,27.275,78,6.0032,12,33,2011_September_23h,Sunday,2011_September,2011,September,23h
2012-03-16 23:00:00,1,0,1,2,18.04,21.970,94,0.0000,22,104,2012_March_23h,Friday,2012_March,2012,March,23h
2011-12-14 07:00:00,4,0,1,1,11.48,15.910,75,0.0000,4,243,2011_December_7h,Wednesday,2011_December,2011,December,7h


In [57]:
# Baseline feature set: calendar-derived columns plus the holiday flag, no weather.
date_time_only_columns = ['year_month', 'day', 'hour', 'holiday']
X_train_date_time_only = X_train[date_time_only_columns]

In [49]:
# Sub-pipeline 1: one-hot encode the calendar strings. handle_unknown='ignore' keeps
# predict() from failing on a category that was not present when fitting.
categorical_steps = [('onehotcat', OneHotEncoder(handle_unknown='ignore'))]

categorical_transformer = Pipeline(categorical_steps)
# Columns not listed here (in this cell: `holiday`) are passed through untouched.
preprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, ['year_month', 'day', 'hour'])
], remainder='passthrough')


# NOTE(review): a classifier treats every distinct count as its own class label;
# later cells switch to RandomForestRegressor.
final_steps = [
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(max_depth=25, n_estimators=1000))
]

pipeline = Pipeline(final_steps)
pipeline.fit(X_train_date_time_only, y_train)
y_pred = pipeline.predict(X_train_date_time_only)

# The validation split needs the same derived calendar columns before prediction.
add_date_time_features(X_val)
X_val_date_time_only = X_val[['year_month', 'day', 'hour', 'holiday']]

y_pred_val = pipeline.predict(X_val_date_time_only)
# RMSLE is the square root of MSLE. Taking the root explicitly avoids the
# `squared=False` keyword, which scikit-learn deprecated in 1.4 and removed in 1.6.
rmsle_train = np.sqrt(mean_squared_log_error(y_train, y_pred))
rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
print(f'{rmsle_train}, {rmsle_val}')

0.35521070376889374, 0.5231890792264544


In [95]:
# Read the test set with the same index handling as the training data
# (first column as a parsed datetime index), then derive the calendar columns.
test_csv_path = '/Users/alantrinh/Code/Spiced Academy/garlic-boosting-student-code/03_week_project/bike-sharing-demand/test.csv'
X_test = pd.read_csv(
    test_csv_path,
    index_col=0,
    parse_dates=True,
)
add_date_time_features(X_test)
X_test

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year_month,year,month,day,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011_January,2011,January,Thursday,0h
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011_January,2011,January,Thursday,1h
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011_January,2011,January,Thursday,2h
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011_January,2011,January,Thursday,3h
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011_January,2011,January,Thursday,4h
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012_December,2012,December,Monday,19h
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012_December,2012,December,Monday,20h
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012_December,2012,December,Monday,21h
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012_December,2012,December,Monday,22h


In [52]:
# Predict on the test set using the same four columns the current pipeline was fitted on,
# and show the raw predictions as a single-column frame.
test_feature_columns = ['year_month', 'day', 'hour', 'holiday']
X_test_selected = X_test[test_feature_columns]
y_pred_test = pipeline.predict(X_test_selected)
pd.DataFrame(y_pred_test)

Unnamed: 0,0
0,7
1,2
2,2
3,3
4,4
...,...
6488,340
6489,200
6490,245
6491,126


In [53]:
# Two-column submission frame: the test timestamps next to the predicted counts.
# reset_index() gives both frames a 0..n-1 index, so the join lines rows up by position.
test_datetimes = X_test.reset_index()[['datetime']]
predicted_counts = pd.DataFrame(y_pred_test, columns=['count'])
submission_test = test_datetimes.join(predicted_counts)
submission_test

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,7
1,2011-01-20 01:00:00,2
2,2011-01-20 02:00:00,2
3,2011-01-20 03:00:00,3
4,2011-01-20 04:00:00,4
...,...,...
6488,2012-12-31 19:00:00,340
6489,2012-12-31 20:00:00,200
6490,2012-12-31 21:00:00,245
6491,2012-12-31 22:00:00,126


In [54]:
# Persist the submission without the positional index column.
submission_csv_path = '/Users/alantrinh/Code/Spiced Academy/garlic-boosting-student-code/03_week_project/bike-sharing-demand/test_submission_depth_25.csv'
submission_test.to_csv(submission_csv_path, index=False)

In [72]:
# EDA showed temp and humidity correlate strongly with count, so extend the
# calendar-only feature set with those two weather columns.
date_time_weather_columns = [
    'year_month', 'day', 'hour', 'holiday',
    'temp', 'humidity',
]
X_train_date_time_weather = X_train[date_time_weather_columns]
X_train_date_time_weather

Unnamed: 0_level_0,year_month,day,hour,holiday,temp,humidity
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-07-11 23:00:00,2011_July,Monday,23h,0,28.70,79
2011-06-03 02:00:00,2011_June,Friday,2h,0,22.96,43
2011-06-14 18:00:00,2011_June,Tuesday,18h,0,25.42,51
2011-05-09 13:00:00,2011_May,Monday,13h,0,25.42,38
2012-01-12 17:00:00,2012_January,Thursday,17h,0,18.86,63
...,...,...,...,...,...,...
2012-09-04 19:00:00,2012_September,Tuesday,19h,0,31.16,70
2012-02-15 06:00:00,2012_February,Wednesday,6h,0,12.30,61
2012-10-08 14:00:00,2012_October,Monday,14h,1,17.22,58
2011-01-07 07:00:00,2011_January,Friday,7h,0,8.20,69


In [106]:
# Sub-pipeline 2: standardise the numeric weather columns.
numerical_steps = [('scaler', StandardScaler())]
numerical_transformer = Pipeline(numerical_steps)

# One-hot encode the calendar strings, scale temp/humidity, pass `holiday` through.
preprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, ['year_month', 'day', 'hour']),
    ('numerical', numerical_transformer, ['temp', 'humidity'])
], remainder='passthrough')

# The validation features do not depend on the depth being tried, so build them once
# instead of on every loop iteration.
add_date_time_features(X_val)
X_val_date_time_weather = X_val[['year_month', 'day', 'hour', 'holiday', 'temp', 'humidity']]

# Sweep max_depth and print "<depth>: <train RMSLE>, <validation RMSLE>" per setting.
for i in range(10, 26):
    final_steps = [
        ('preprocessor', preprocessor),
        ('clf', RandomForestClassifier(max_depth=i, n_estimators=100))
    ]

    pipeline = Pipeline(final_steps)
    pipeline.fit(X_train_date_time_weather, y_train)
    y_pred = pipeline.predict(X_train_date_time_weather)
    y_pred_val = pipeline.predict(X_val_date_time_weather)

    # RMSLE = sqrt(MSLE); the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so take the root explicitly.
    rmsle_train = np.sqrt(mean_squared_log_error(y_train, y_pred))
    rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
    print(f'{i}: {rmsle_train}, {rmsle_val}')

10: 0.8920086022775187, 1.1260812199215013
11: 0.6887674133704943, 0.9206415722792969
12: 0.39253893820230906, 0.7766529213325543
13: 0.28180587212160774, 0.7300949046581826
14: 0.1795967216621052, 0.687709192221872
15: 0.1308351965774999, 0.6255117877196886
16: 0.11477278702900587, 0.6121580795735585
17: 0.09935851571037391, 0.6179378220123858
18: 0.050193708384210105, 0.5817191710044611
19: 0.039577184932837274, 0.5659961481960192
20: 0.03407401599000412, 0.5892106380507895
21: 0.027280824290512998, 0.5931172970373052
22: 0.026788077016705047, 0.5598209579994871
23: 0.02354164145282194, 0.563757183356423
24: 0.023433642356558624, 0.545277570846225
25: 0.023569686563538944, 0.5477895680371977


In [76]:
# The validation features do not depend on the depth being tried, so build them once
# instead of on every loop iteration.
add_date_time_features(X_val)
X_val_date_time_weather = X_val[['year_month', 'day', 'hour', 'holiday', 'temp', 'humidity']]

# Same sweep as the classifier cell, but with a regressor; prints
# "<depth>: <train RMSLE>, <validation RMSLE>" per setting.
# Relies on `preprocessor` defined in the classifier sweep cell.
for i in range(10, 26):
    final_steps = [
        ('preprocessor', preprocessor),
        ('rgrs', RandomForestRegressor(max_depth=i, n_estimators=100))
    ]

    pipeline = Pipeline(final_steps)
    pipeline.fit(X_train_date_time_weather, y_train)
    y_pred = pipeline.predict(X_train_date_time_weather)
    y_pred_val = pipeline.predict(X_val_date_time_weather)

    # RMSLE = sqrt(MSLE); the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so take the root explicitly.
    rmsle_train = np.sqrt(mean_squared_log_error(y_train, y_pred))
    rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
    print(f'{i}: {rmsle_train}, {rmsle_val}')

10: 1.0058678436528872, 1.0481259128662053
11: 0.9506247175409868, 0.9966907027396704
12: 0.8803531563630539, 0.9333684063164914
13: 0.8078647669229497, 0.8674406526145101
14: 0.7481841489435441, 0.8220742651577639
15: 0.6928447607134074, 0.7739995100538651
16: 0.628566282299463, 0.7224899790126669
17: 0.5830640956662654, 0.6853346707903314
18: 0.5433076821692479, 0.6612954087592782
19: 0.4955364416955014, 0.6212557806735403
20: 0.4587888479592414, 0.6002252277131174
21: 0.42377922568320997, 0.5739451201029709
22: 0.3940345659684689, 0.5484484678747531
23: 0.3670815882105367, 0.53934670435002
24: 0.3413501540232797, 0.5171470937774872
25: 0.31574377049858326, 0.5083839834249804


In [77]:
# The validation features do not depend on the depth being tried, so build them once
# instead of on every loop iteration.
add_date_time_features(X_val)
X_val_date_time_weather = X_val[['year_month', 'day', 'hour', 'holiday', 'temp', 'humidity']]

# Continue the regressor sweep for depths 26-30; prints
# "<depth>: <train RMSLE>, <validation RMSLE>" per setting.
for i in range(26, 31):
    final_steps = [
        ('preprocessor', preprocessor),
        ('rgrs', RandomForestRegressor(max_depth=i, n_estimators=100))
    ]

    pipeline = Pipeline(final_steps)
    pipeline.fit(X_train_date_time_weather, y_train)
    y_pred = pipeline.predict(X_train_date_time_weather)
    y_pred_val = pipeline.predict(X_val_date_time_weather)

    # RMSLE = sqrt(MSLE); the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so take the root explicitly.
    rmsle_train = np.sqrt(mean_squared_log_error(y_train, y_pred))
    rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
    print(f'{i}: {rmsle_train}, {rmsle_val}')

26: 0.3041716626693416, 0.5087438311799993
27: 0.2861385439426883, 0.49896087452489823
28: 0.27923163151333974, 0.4967268104249721
29: 0.268243636152978, 0.4923684823072772
30: 0.2670890328250638, 0.4955544049616659


In [78]:
# Refit at the depth chosen from the sweep (29), with ten times as many trees.
final_steps = [
    ('preprocessor', preprocessor),
    ('rgrs', RandomForestRegressor(max_depth=29, n_estimators=1000))
]

pipeline = Pipeline(final_steps)
pipeline.fit(X_train_date_time_weather, y_train)
y_pred = pipeline.predict(X_train_date_time_weather)

# The validation split needs the same derived calendar columns before prediction.
add_date_time_features(X_val)
X_val_date_time_weather = X_val[['year_month', 'day', 'hour', 'holiday', 'temp', 'humidity']]

y_pred_val = pipeline.predict(X_val_date_time_weather)
# RMSLE = sqrt(MSLE); the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the root explicitly.
rmsle_train = np.sqrt(mean_squared_log_error(y_train, y_pred))
rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
print(f'{rmsle_train}, {rmsle_val}')

0.26581029251227645, 0.49151163832445977


In [79]:
# Predict on the test set with the calendar + weather columns, pair the predictions
# with the test timestamps, and write the submission without the positional index.
weather_feature_columns = ['year_month', 'day', 'hour', 'holiday', 'temp', 'humidity']
y_pred_test = pipeline.predict(X_test[weather_feature_columns])

test_datetimes = X_test.reset_index()[['datetime']]
predicted_counts = pd.DataFrame(y_pred_test, columns=['count'])
submission_test = test_datetimes.join(predicted_counts)

submission_csv_path = '/Users/alantrinh/Code/Spiced Academy/garlic-boosting-student-code/03_week_project/bike-sharing-demand/test_submission_depth_29_weather_rgrs.csv'
submission_test.to_csv(submission_csv_path, index=False)

In [146]:
# Feature set for this experiment: `workingday` replaces `day`, `atemp` replaces `temp`.
cols = [
    'year_month',
    'workingday',
    'hour',
    'holiday',
    'atemp',
    'humidity',
]
# Subset of `cols` to one-hot encode.
cat_cols_for_transform = [
    'year_month',
    'hour',
]
# Subset of `cols` to standardise.
num_cols_for_transform = [
    'atemp',
    'humidity',
]

def run_pipeline(features, cat_features_to_transform, num_features_to_transform, depth, n_estimators, min_samples_leaf):
    """Fit a preprocessing + RandomForestRegressor pipeline and print its RMSLE scores.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_val``, ``y_val``,
    ``categorical_transformer`` and ``numerical_transformer``.

    Parameters:
        features: column names selected from X_train / X_val as model input.
        cat_features_to_transform: subset of ``features`` sent through the
            categorical transformer.
        num_features_to_transform: subset of ``features`` sent through the
            numerical transformer.
        depth: ``max_depth`` of the forest.
        n_estimators: number of trees.
        min_samples_leaf: ``min_samples_leaf`` of the forest.

    Returns:
        The fitted ``Pipeline``.

    Side effects:
        Prints ``"<depth>: <train RMSLE>, <validation RMSLE>"`` and adds the derived
        calendar columns to the global ``X_val``.
    """
    # Features not named in either list are passed through unchanged.
    preprocessor = ColumnTransformer(transformers=[
        ('categorical', categorical_transformer, cat_features_to_transform),
        ('numerical', numerical_transformer, num_features_to_transform)
    ], remainder='passthrough')

    final_steps = [
        ('preprocessor', preprocessor),
        ('rgrs', RandomForestRegressor(max_depth=depth, n_estimators=n_estimators, min_samples_leaf=min_samples_leaf))
    ]

    pipeline = Pipeline(final_steps)
    # Select columns via the `features` argument. The previous body read the global
    # `cols` here, so the argument was silently ignored.
    pipeline.fit(X_train[features], y_train)
    y_pred = pipeline.predict(X_train[features])

    # Make sure the validation split carries the derived calendar columns.
    add_date_time_features(X_val)

    y_pred_val = pipeline.predict(X_val[features])
    # RMSLE = sqrt(MSLE); the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so take the root explicitly.
    rmsle_train = np.sqrt(mean_squared_log_error(y_train, y_pred))
    rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
    print(f'{depth}: {rmsle_train}, {rmsle_val}')
    return pipeline
    
# Fit at depth 29 with 100 trees and keep the fitted pipeline for the submission step.
pipeline = run_pipeline(
    features=cols,
    cat_features_to_transform=cat_cols_for_transform,
    num_features_to_transform=num_cols_for_transform,
    depth=29,
    n_estimators=100,
    min_samples_leaf=1,
)

29: 0.25525282435662355, 0.45298736107515764


In [126]:
# Sweep max_depth from 15 to 30 on the current feature set; each call prints its own
# score line and the returned pipelines are discarded.
for tree_depth in range(15, 31):
    run_pipeline(
        features=cols,
        cat_features_to_transform=cat_cols_for_transform,
        num_features_to_transform=num_cols_for_transform,
        depth=tree_depth,
        n_estimators=100,
        min_samples_leaf=1,
    )

15: 0.6773598499674482, 0.7249667666574562
16: 0.59454482707809, 0.6507868190216158
17: 0.5534022947766677, 0.618580262216439
18: 0.5236942970040281, 0.5987297293111757
19: 0.4773742283930195, 0.5645543431176502
20: 0.4462911554773352, 0.5442594191537792
21: 0.40072895480786425, 0.5149739502556335
22: 0.3834198464988097, 0.5088273282937967
23: 0.3531207570056962, 0.4952194211488127
24: 0.33298676613370665, 0.4818684604852459
25: 0.300976725230134, 0.46327018083969673
26: 0.2857945917774307, 0.4621871021063866
27: 0.27492155823294245, 0.45722481702871903
28: 0.2624297593041644, 0.4554654049859097
29: 0.25789444743389534, 0.45434615524417865
30: 0.25096318680415863, 0.4542125844211479


In [147]:
def create_submission(filename='test_submission_depth_29_weather_atemp_workingday_rgrs.csv'):
    """Predict on the test set with the current global pipeline and write a submission CSV.

    Reads the notebook globals ``pipeline``, ``X_test`` and ``cols``.

    Parameters:
        filename: name of the CSV written into the bike-sharing-demand project folder.
            Defaults to the name previously hard-coded here, so calling the function
            with no arguments behaves as before.

    Side effects:
        Writes a two-column (datetime, count) CSV without the positional index.
    """
    y_pred_test = pipeline.predict(X_test[cols])
    # reset_index() gives both frames a 0..n-1 index, so the join lines rows up by position.
    submission_test = X_test.reset_index()[['datetime']].join(pd.DataFrame(y_pred_test, columns=['count']))
    # NOTE(review): absolute, machine-specific output directory.
    submission_test.to_csv(f'/Users/alantrinh/Code/Spiced Academy/garlic-boosting-student-code/03_week_project/bike-sharing-demand/{filename}', index=False)

create_submission()