In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
pd.options.display.max_columns = None

import altair as alt
alt.renderers.enable("notebook")

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error


In [2]:
chicago_trips = pd.read_csv('../Data/taxi_model_sample001.csv')

In [3]:
chicago_trips.sample(5)

Unnamed: 0,trip_id,trip_seconds,pickup_community_area,dropoff_community_area,fare,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,humidity,pressure,temperature,weather_description,wind_direction,wind_speed,year,month,day,week_day,hour,minute,distance_miles,distance_ord,distance_mdw,taxi_id_ind,payment_type_ind
233091,de61404ae87423550f5ae731c5837610b4ba6332,780,32,8,9.45,Cash,Dispatch Taxi Affiliation,41.870607,-87.622173,41.902788,-87.626146,100.0,1008.0,283.6,light rain,290.0,1.0,2013,10,17,5,20,0,2.239155,19.96803,10.738379,1316,0
219883,6fc502edc19e940995c3463cf79b858f2dc62e99,2220,32,76,44.25,Credit Card,Taxi Affiliation Services,41.880994,-87.632746,41.979071,-87.90304,51.0,1027.0,262.57,sky is clear,360.0,3.0,2016,2,11,5,15,45,19.885248,0.120623,10.56154,313,1
79643,59637bfac464a525fc5e272fa235ee96c882634a,1080,32,8,9.85,Cash,Taxi Affiliation Services,41.880994,-87.632746,41.898332,-87.620763,39.0,1017.0,300.91,scattered clouds,0.0,3.0,2014,7,31,5,19,15,1.456177,19.983967,10.56154,1482,0
222730,b335380290a28e81276b0f2b0fa8a84fb38b934e,240,32,8,5.25,Cash,Taxi Affiliation Services,41.884987,-87.620993,41.899156,-87.626211,34.0,1009.0,293.22,scattered clouds,260.0,7.0,2014,4,25,6,20,0,1.04284,20.030945,11.373879,87,0
223724,831787ef50a9113524c937ec85f79f796f82102c,240,4,4,4.85,Cash,Dispatch Taxi Affiliation,41.975171,-87.687516,41.975171,-87.687516,62.0,1025.0,283.254,scattered clouds,277.0,4.0,2014,5,4,1,3,0,0.0,15.026208,13.807652,57,0


In [4]:
chicago_trips.dtypes

trip_id                        object
trip_seconds                    int64
pickup_community_area           int64
dropoff_community_area          int64
fare                          float64
payment_type                   object
company                        object
pickup_centroid_latitude      float64
pickup_centroid_longitude     float64
dropoff_centroid_latitude     float64
dropoff_centroid_longitude    float64
humidity                      float64
pressure                      float64
temperature                   float64
weather_description            object
wind_direction                float64
wind_speed                    float64
year                            int64
month                           int64
day                             int64
week_day                        int64
hour                            int64
minute                          int64
distance_miles                float64
distance_ord                  float64
distance_mdw                  float64
taxi_id_ind 

In [5]:
chicago_trips = chicago_trips.drop(['trip_id','fare'],axis=1)

In [6]:
chicago_trips = pd.get_dummies(chicago_trips)

In [7]:
chicago_train = chicago_trips[(chicago_trips['year']<2016) |
                             ((chicago_trips['year']==2016) & (chicago_trips['month']<=1))]

In [8]:
chicago_train.shape

(206977, 172)

In [9]:
train_target = chicago_train[['trip_seconds']]
train_predictors = chicago_train.drop(['trip_seconds'],axis=1)

In [10]:
chicago_test_val = chicago_trips[(chicago_trips['year']==2017) |
                             ((chicago_trips['year']==2016) & (chicago_trips['month']>1))]

In [11]:
chicago_test_val.shape

(74645, 172)

In [12]:
chicago_test_val_target = chicago_test_val[['trip_seconds']]
chicago_test_val_predictors = chicago_test_val.drop(['trip_seconds'],axis=1)

In [13]:
validation_predictors,test_predictors, validation_target, test_target = train_test_split(chicago_test_val_predictors, 
                                                                                         chicago_test_val_target, 
                                                                                         test_size=0.38, 
                                                                                         random_state=42)

In [14]:
del chicago_test_val, chicago_test_val_predictors, chicago_test_val_target, chicago_train, chicago_trips,

In [15]:
def rmse(target,predictors):
    return sqrt(mean_squared_error(target, predictors))

In [16]:
def rmsle(target,predictors):
    return np.sqrt(mean_squared_log_error( target, predictors ))

## DECISION TREE

In [17]:
%%time
dicc_dt={}
for depth in range(2,15):
    print('Depth: %d' % (depth))
    taxi_model_dt = DecisionTreeRegressor(max_depth=depth,random_state=13)
    
    taxi_model_dt.fit(train_predictors, train_target)
    
    train_predictions = taxi_model_dt.predict(train_predictors)
    val_predictions = taxi_model_dt.predict(validation_predictors)
    
    error_train = rmsle(train_target, train_predictions)
    error_val = rmsle(validation_target, val_predictions)
    difference = error_val - error_train
    
    dicc_dt[str(depth)]= [error_train,error_val,difference]
    print ('For a %d depth the RMSE_train is %.5f $, the RMSE_val is %.5f $ and the difference %.5f $'
               % (depth,error_train,error_val,difference)) 

Depth: 2
For a 2 depth the RMSE_train is 0.43030 $, the RMSE_val is 0.43720 $ and the difference 0.00690 $
Depth: 3
For a 3 depth the RMSE_train is 0.39827 $, the RMSE_val is 0.40472 $ and the difference 0.00646 $
Depth: 4
For a 4 depth the RMSE_train is 0.39051 $, the RMSE_val is 0.39723 $ and the difference 0.00673 $
Depth: 5
For a 5 depth the RMSE_train is 0.38134 $, the RMSE_val is 0.38831 $ and the difference 0.00696 $
Depth: 6
For a 6 depth the RMSE_train is 0.37565 $, the RMSE_val is 0.38264 $ and the difference 0.00699 $
Depth: 7
For a 7 depth the RMSE_train is 0.37022 $, the RMSE_val is 0.37736 $ and the difference 0.00714 $
Depth: 8
For a 8 depth the RMSE_train is 0.36542 $, the RMSE_val is 0.37371 $ and the difference 0.00829 $
Depth: 9
For a 9 depth the RMSE_train is 0.36028 $, the RMSE_val is 0.37249 $ and the difference 0.01220 $
Depth: 10
For a 10 depth the RMSE_train is 0.35512 $, the RMSE_val is 0.37090 $ and the difference 0.01579 $
Depth: 11
For a 11 depth the RMSE_t

In [18]:
min(dicc_dt.items(), key=lambda x: x[1][0])

('14', [0.32567075051477273, 0.37783998816706227, 0.052169237652289535])

In [19]:
min(dicc_dt.items(), key=lambda x: x[1][1])

('11', [0.34906083913451136, 0.368833850132751, 0.019773010998239637])

In [20]:
min(dicc_dt.items(), key=lambda x: x[1][2])

('3', [0.39826689065691134, 0.40472219022376466, 0.006455299566853312])

## RANDOM FOREST

In [21]:
%%time
dicc_rf={}
for depth in range(2,15):
    print('Depth: %d' % (depth))
    taxi_model_rf = RandomForestRegressor(max_depth=depth,
                                          n_estimators=100,
                                          n_jobs=-1,
                                          random_state=13)
    
    taxi_model_rf.fit(train_predictors, train_target)                                    
    
    train_predictions = taxi_model_rf.predict(train_predictors)
    val_predictions = taxi_model_rf.predict(validation_predictors)
    
    error_train = rmsle(train_target, train_predictions)
    error_val = rmsle(validation_target, val_predictions)
    difference = error_val - error_train
    
    dicc_rf[str(depth)]= [error_train,error_val,difference]
    print ('For a %d depth the RMSE_train is %.5f $, the RMSE_val is %.5f $ and the difference %.5f $'
               % (depth,error_train,error_val,difference))  
            

Depth: 2


  if __name__ == '__main__':


For a 2 depth the RMSE_train is 0.42826 $, the RMSE_val is 0.43550 $ and the difference 0.00723 $
Depth: 3


  if __name__ == '__main__':


For a 3 depth the RMSE_train is 0.39733 $, the RMSE_val is 0.40394 $ and the difference 0.00661 $
Depth: 4


  if __name__ == '__main__':


For a 4 depth the RMSE_train is 0.38909 $, the RMSE_val is 0.39618 $ and the difference 0.00709 $
Depth: 5


  if __name__ == '__main__':


For a 5 depth the RMSE_train is 0.37989 $, the RMSE_val is 0.38678 $ and the difference 0.00689 $
Depth: 6


  if __name__ == '__main__':


For a 6 depth the RMSE_train is 0.37321 $, the RMSE_val is 0.37997 $ and the difference 0.00676 $
Depth: 7


  if __name__ == '__main__':


For a 7 depth the RMSE_train is 0.36753 $, the RMSE_val is 0.37474 $ and the difference 0.00721 $
Depth: 8


  if __name__ == '__main__':


For a 8 depth the RMSE_train is 0.36174 $, the RMSE_val is 0.37048 $ and the difference 0.00874 $
Depth: 9


  if __name__ == '__main__':


For a 9 depth the RMSE_train is 0.35591 $, the RMSE_val is 0.36700 $ and the difference 0.01109 $
Depth: 10


  if __name__ == '__main__':


For a 10 depth the RMSE_train is 0.34959 $, the RMSE_val is 0.36371 $ and the difference 0.01412 $
Depth: 11


  if __name__ == '__main__':


For a 11 depth the RMSE_train is 0.34251 $, the RMSE_val is 0.36148 $ and the difference 0.01897 $
Depth: 12


  if __name__ == '__main__':


For a 12 depth the RMSE_train is 0.33449 $, the RMSE_val is 0.35989 $ and the difference 0.02539 $
Depth: 13


  if __name__ == '__main__':


For a 13 depth the RMSE_train is 0.32550 $, the RMSE_val is 0.35882 $ and the difference 0.03331 $
Depth: 14


  if __name__ == '__main__':


For a 14 depth the RMSE_train is 0.31542 $, the RMSE_val is 0.35803 $ and the difference 0.04261 $
CPU times: user 57min 54s, sys: 11.4 s, total: 58min 6s
Wall time: 7min 55s


In [22]:
min(dicc_rf.items(), key=lambda x: x[1][0])

('14', [0.3154158706637447, 0.35803061627937754, 0.04261474561563283])

In [23]:
min(dicc_rf.items(), key=lambda x: x[1][1])

('14', [0.3154158706637447, 0.35803061627937754, 0.04261474561563283])

In [24]:
min(dicc_rf.items(), key=lambda x: x[1][2])

('3', [0.39733329372861786, 0.4039430545809194, 0.006609760852301527])

## XGBOOST

In [33]:
%%time
dicc_xgb={}
estimator = 100
for depth in range(2,15):
    print('Depth: %d' % (depth))
    
    taxi_model_xgb = XGBRegressor(max_depth=depth,
                                  learning_rate=0.1,
                                  n_estimators=estimator,
                                  n_jobs=-1)
    
    taxi_model_xgb.fit(train_predictors,
                       train_target,
                       early_stopping_rounds=5,
                       eval_set=[(validation_predictors, validation_target)],
                       verbose=False)
    
    train_predictions = taxi_model_xgb.predict(train_predictors)
    val_predictions = taxi_model_xgb.predict(validation_predictors)
    
    error_train = rmse(train_target, train_predictions)
    error_val = rmse(validation_target, val_predictions)
    difference = error_val - error_train
    
    dicc_xgb[str(depth) + '_' + str(estimator)]= [error_train,error_val,difference]
    print ('For a %d depth the RMSE_train is %.5f $, the RMSE_val is %.5f $ and the difference %.5f $'
               % (depth,error_train,error_val,difference))

Depth: 2
For a 2 depth the RMSE_train is 322.05932 $, the RMSE_val is 328.69850 $ and the difference 6.63918 $
Depth: 3
For a 3 depth the RMSE_train is 308.76393 $, the RMSE_val is 314.74609 $ and the difference 5.98216 $
Depth: 4
For a 4 depth the RMSE_train is 297.76626 $, the RMSE_val is 304.15056 $ and the difference 6.38430 $
Depth: 5
For a 5 depth the RMSE_train is 285.88762 $, the RMSE_val is 294.87158 $ and the difference 8.98395 $
Depth: 6
For a 6 depth the RMSE_train is 276.29015 $, the RMSE_val is 290.26501 $ and the difference 13.97487 $
Depth: 7
For a 7 depth the RMSE_train is 266.21796 $, the RMSE_val is 286.26197 $ and the difference 20.04401 $
Depth: 8
For a 8 depth the RMSE_train is 253.67448 $, the RMSE_val is 283.70137 $ and the difference 30.02689 $
Depth: 9
For a 9 depth the RMSE_train is 243.82842 $, the RMSE_val is 282.90198 $ and the difference 39.07356 $
Depth: 10
For a 10 depth the RMSE_train is 230.73429 $, the RMSE_val is 283.28904 $ and the difference 52.55

KeyboardInterrupt: 

In [34]:
min(dicc_xgb.items(), key=lambda x: x[1][0])

('10_100', [230.73428568354035, 283.2890417830573, 52.554756099516965])

In [35]:
min(dicc_xgb.items(), key=lambda x: x[1][1])

('9_100', [243.82841769657472, 282.90197806800563, 39.073560371430915])

In [36]:
min(dicc_xgb.items(), key=lambda x: x[1][2])

('3_100', [308.7639300075196, 314.7460941573672, 5.98216414984762])

## ADABOOST

In [29]:
%%time
dicc_adab={}
estimator = 100
for depth in range(2,15):
    print('Depth: %d' % (depth))
    taxi_model_adab = AdaBoostRegressor(base_estimator = DecisionTreeRegressor(max_depth=depth),
                                    loss='exponential',
                                    n_estimators=estimator)
    
    taxi_model_adab.fit(train_predictors,train_target)
        
    train_predictions = taxi_model_adab.predict(train_predictors)
    val_predictions = taxi_model_adab.predict(validation_predictors)
        
    error_train = rmsle(train_target, train_predictions)
    error_val = rmsle(validation_target, val_predictions)
    difference = error_val - error_train
        
    dicc_adab[str(depth) + '_' + str(estimator)]= [error_train,error_val,difference]
        
    print ('For a %d depth the RMSE_train is %.5f $, the RMSE_val is %.5f $ and the difference %.5f $'
               % (depth,error_train,error_val,difference)) 

Depth: 2


  y = column_or_1d(y, warn=True)


For a 2 depth the RMSE_train is 0.99756 $, the RMSE_val is 1.00008 $ and the difference 0.00252 $
Depth: 3


  y = column_or_1d(y, warn=True)


For a 3 depth the RMSE_train is 1.10663 $, the RMSE_val is 1.10402 $ and the difference -0.00261 $
Depth: 4


  y = column_or_1d(y, warn=True)


For a 4 depth the RMSE_train is 1.10063 $, the RMSE_val is 1.09873 $ and the difference -0.00191 $
Depth: 5


  y = column_or_1d(y, warn=True)


For a 5 depth the RMSE_train is 1.06539 $, the RMSE_val is 1.06242 $ and the difference -0.00297 $
Depth: 6


  y = column_or_1d(y, warn=True)


For a 6 depth the RMSE_train is 1.01376 $, the RMSE_val is 1.01268 $ and the difference -0.00108 $
Depth: 7


  y = column_or_1d(y, warn=True)


For a 7 depth the RMSE_train is 0.92695 $, the RMSE_val is 0.92073 $ and the difference -0.00622 $
Depth: 8


  y = column_or_1d(y, warn=True)


For a 8 depth the RMSE_train is 0.81906 $, the RMSE_val is 0.80478 $ and the difference -0.01427 $
Depth: 9


  y = column_or_1d(y, warn=True)


For a 9 depth the RMSE_train is 0.74860 $, the RMSE_val is 0.75148 $ and the difference 0.00287 $
Depth: 10


  y = column_or_1d(y, warn=True)


For a 10 depth the RMSE_train is 0.65725 $, the RMSE_val is 0.65755 $ and the difference 0.00030 $
Depth: 11


  y = column_or_1d(y, warn=True)


For a 11 depth the RMSE_train is 0.56393 $, the RMSE_val is 0.57763 $ and the difference 0.01370 $
Depth: 12


  y = column_or_1d(y, warn=True)


For a 12 depth the RMSE_train is 0.50052 $, the RMSE_val is 0.51494 $ and the difference 0.01442 $
Depth: 13


  y = column_or_1d(y, warn=True)


For a 13 depth the RMSE_train is 0.46465 $, the RMSE_val is 0.48725 $ and the difference 0.02260 $
Depth: 14


  y = column_or_1d(y, warn=True)


For a 14 depth the RMSE_train is 0.42066 $, the RMSE_val is 0.45645 $ and the difference 0.03580 $
CPU times: user 2h 6min 41s, sys: 1min 37s, total: 2h 8min 19s
Wall time: 1h 56min 1s


In [30]:
min(dicc_adab.items(), key=lambda x: x[1][0])

('14_100', [0.42065662750929306, 0.4564548223036921, 0.03579819479439905])

In [31]:
min(dicc_adab.items(), key=lambda x: x[1][1])

('14_100', [0.42065662750929306, 0.4564548223036921, 0.03579819479439905])

In [32]:
min(dicc_adab.items(), key=lambda x: x[1][2])

('8_100', [0.819057266860833, 0.8047828076868712, -0.014274459173961862])