In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
pd.options.display.max_columns = None

import altair as alt
alt.renderers.enable("notebook")

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.metrics import mean_squared_error

In [2]:
# Load the taxi-trip sample used for modelling (path is relative to the notebook's working directory).
chicago_trips = pd.read_csv('../Data/taxi_model_sample001.csv')

In [3]:
# Preview five random rows; the seed keeps the displayed preview identical across re-runs.
chicago_trips.sample(5, random_state=42)

Unnamed: 0,trip_id,trip_seconds,pickup_community_area,dropoff_community_area,fare,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,humidity,pressure,temperature,weather_description,wind_direction,wind_speed,year,month,day,week_day,hour,minute,distance_miles,distance_ord,distance_mdw,taxi_id_ind,payment_type_ind
205282,2bfe22116bbe19d8a324a96bf89b213d5a568ddf,420,8,8,6.65,Credit Card,Taxi Affiliation Services,41.899156,-87.626211,41.893216,-87.637844,77.0,1021.0,288.64,haze,30.0,3.0,2015,10,5,2,21,15,0.903199,19.379112,10.844393,1161,1
220166,ccff669d7b0769ad5264ac35c00388ef4d5ae3c6,420,8,28,6.45,Credit Card,Taxi Affiliation Services,41.892508,-87.626215,41.879255,-87.642649,81.0,1030.0,279.648,sky is clear,5.0,6.0,2015,4,30,5,23,15,1.459363,19.382717,9.955301,295,1
250700,334fb349ef58af0b1738af2f624447ce230d2583,180,32,8,5.45,Cash,Dispatch Taxi Affiliation,41.884987,-87.620993,41.891972,-87.612945,69.0,1027.0,290.898333,light rain,160.0,6.0,2015,5,17,1,17,30,0.736529,20.663214,11.373879,904,0
190955,e7307e61466072cd85671f220f8a8682b10a2373,360,24,8,6.45,Cash,Choice Taxi Association,41.898306,-87.653614,41.905858,-87.630865,76.0,1032.0,265.467667,broken clouds,249.0,1.0,2015,2,22,1,1,30,1.657725,18.233088,10.334944,835,0
3696,130209cbec0ed5ae23482a93e6f45ff9a5d58aa2,1080,8,33,13.25,Credit Card,Dispatch Taxi Affiliation,41.898332,-87.620763,41.85935,-87.617358,77.0,1021.0,292.15,moderate rain,360.0,3.0,2016,9,14,4,9,15,2.702183,20.408993,10.628303,329,1


In [4]:
# Inspect column dtypes to see which columns are non-numeric and will need encoding.
chicago_trips.dtypes

trip_id                        object
trip_seconds                    int64
pickup_community_area           int64
dropoff_community_area          int64
fare                          float64
payment_type                   object
company                        object
pickup_centroid_latitude      float64
pickup_centroid_longitude     float64
dropoff_centroid_latitude     float64
dropoff_centroid_longitude    float64
humidity                      float64
pressure                      float64
temperature                   float64
weather_description            object
wind_direction                float64
wind_speed                    float64
year                            int64
month                           int64
day                             int64
week_day                        int64
hour                            int64
minute                          int64
distance_miles                float64
distance_ord                  float64
distance_mdw                  float64
taxi_id_ind 

In [5]:
# Discard the trip identifier and the trip duration so neither ends up among the model features.
chicago_trips = chicago_trips.drop(columns=['trip_id', 'trip_seconds'])

In [6]:
# One-hot encode the object/category-typed columns; numeric columns are passed through unchanged.
chicago_trips = pd.get_dummies(data=chicago_trips)

In [7]:
# Training partition (time-based): every row with year < 2016, plus rows of 2016 whose month is <= 1.
chicago_train = chicago_trips.loc[
    chicago_trips['year'].lt(2016)
    | (chicago_trips['year'].eq(2016) & chicago_trips['month'].le(1))
]

In [8]:
# Sanity check: (rows, columns) of the training partition.
chicago_train.shape

(206977, 172)

In [9]:
# Target: the fare column as a flat 1-D array. Predictors: every other column of the training partition.
train_target = chicago_train['fare'].values
train_predictors = chicago_train.drop(columns=['fare'])

In [10]:
# Held-out partition (later split into validation and test): rows of 2016 with month > 1, plus all of 2017.
# Rows from any other year fall into neither this partition nor the training one.
chicago_test_val = chicago_trips.loc[
    chicago_trips['year'].eq(2017)
    | (chicago_trips['year'].eq(2016) & chicago_trips['month'].gt(1))
]

In [11]:
# Sanity check: (rows, columns) of the held-out partition.
chicago_test_val.shape

(74645, 172)

In [12]:
# Same target/predictor separation as for the training data, applied to the held-out partition.
chicago_test_val_target = chicago_test_val['fare'].values
chicago_test_val_predictors = chicago_test_val.drop(columns=['fare'])

In [13]:
# Split the held-out rows into a validation subset (for model selection) and a test subset.
# test_size=0.38 sends 38% of the held-out rows to the test subset and the remaining 62% to validation;
# random_state pins the shuffle so the split is reproducible.
(validation_predictors, test_predictors,
 validation_target, test_target) = train_test_split(
    chicago_test_val_predictors,
    chicago_test_val_target,
    test_size=0.38,
    random_state=42,
)

In [14]:
# Drop the intermediate frames/arrays now that the train, validation and test splits exist.
del chicago_test_val
del chicago_test_val_predictors, chicago_test_val_target
del chicago_train, chicago_trips

In [15]:
def rmse(target, predictors):
    """Return the root-mean-squared error between two sets of values.

    Parameters
    ----------
    target : array-like
        Observed values.
    predictors : array-like
        Values to compare against ``target``. Despite the name, the callers
        in this notebook pass model *predictions* here.

    Returns
    -------
    float
        Square root of ``mean_squared_error(target, predictors)``.
    """
    mse = mean_squared_error(target, predictors)
    return sqrt(mse)

## DECISION TREE

In [16]:
%%time
dicc_dt={}
for depth in range(2,15):
    print('Depth: %d' % (depth))
    taxi_model_dt = DecisionTreeRegressor(max_depth=depth,random_state=13)
    
    taxi_model_dt.fit(train_predictors, train_target)
    
    train_predictions = taxi_model_dt.predict(train_predictors)
    val_predictions = taxi_model_dt.predict(validation_predictors)
    
    error_train = rmse(train_target, train_predictions)
    error_val = rmse(validation_target, val_predictions)
    difference = error_val - error_train
    
    dicc_dt[str(depth)]= [error_train,error_val,difference]
    print ('For a %d depth the RMSE_train is %.5f $, the RMSE_val is %.5f $ and the difference %.5f $'
               % (depth,error_train,error_val,difference)) 

Depth: 2
For a 2 depth the RMSE_train is 3.19190 $, the RMSE_val is 4.67036 $ and the difference 1.47847 $
Depth: 3
For a 3 depth the RMSE_train is 2.74364 $, the RMSE_val is 4.27453 $ and the difference 1.53089 $
Depth: 4
For a 4 depth the RMSE_train is 2.53374 $, the RMSE_val is 4.07055 $ and the difference 1.53682 $
Depth: 5
For a 5 depth the RMSE_train is 2.38114 $, the RMSE_val is 3.85131 $ and the difference 1.47017 $
Depth: 6
For a 6 depth the RMSE_train is 2.30764 $, the RMSE_val is 3.05415 $ and the difference 0.74651 $
Depth: 7
For a 7 depth the RMSE_train is 2.24247 $, the RMSE_val is 2.96382 $ and the difference 0.72135 $
Depth: 8
For a 8 depth the RMSE_train is 2.18080 $, the RMSE_val is 2.97718 $ and the difference 0.79639 $
Depth: 9
For a 9 depth the RMSE_train is 2.11989 $, the RMSE_val is 4.38145 $ and the difference 2.26156 $
Depth: 10
For a 10 depth the RMSE_train is 2.06228 $, the RMSE_val is 4.37851 $ and the difference 2.31622 $
Depth: 11
For a 11 depth the RMSE_t

In [17]:
# Decision-tree setting with the lowest training RMSE (element 0 of each stored list).
min(dicc_dt.items(), key=lambda entry: entry[1][0])

('14', [1.7984890911964497, 4.378592597095931, 2.5801035058994817])

In [18]:
# Decision-tree setting with the lowest validation RMSE (element 1 of each stored list).
min(dicc_dt.items(), key=lambda entry: entry[1][1])

('7', [2.242467594023435, 2.963820720290686, 0.721353126267251])

In [19]:
# Decision-tree setting with the smallest validation-minus-training gap (element 2 of each stored list).
min(dicc_dt.items(), key=lambda entry: entry[1][2])

('7', [2.242467594023435, 2.963820720290686, 0.721353126267251])

## RANDOM FOREST

In [19]:
%%time
dicc_rf={}
estimator = 100
for depth in range(2,15):
    print('Depth: %d' % (depth))
    taxi_model_rf = RandomForestRegressor(max_depth=depth,
                                          n_estimators=estimator,
                                          n_jobs=-1,
                                          random_state=13)
    
    taxi_model_rf.fit(train_predictors, train_target)                                    
    
    train_predictions = taxi_model_rf.predict(train_predictors)
    val_predictions = taxi_model_rf.predict(validation_predictors)
    
    error_train = rmse(train_target, train_predictions)
    error_val = rmse(validation_target, val_predictions)
    difference = error_val - error_train
    
    dicc_rf[str(depth)]= [error_train,error_val,difference]
    print ('For a %d depth and %d estimators the RMSE_train is %.5f $, the RMSE_val is %.5f $ and the difference %.5f $'
               % (depth,estimator,error_train,error_val,difference))  
            

Depth: 2
For a 2 depth and 100 estimators the RMSE_train is 3.17849 $, the RMSE_val is 4.66310 $ and the difference 1.48461 $
Depth: 3
For a 3 depth and 100 estimators the RMSE_train is 2.70733 $, the RMSE_val is 4.24974 $ and the difference 1.54241 $
Depth: 4
For a 4 depth and 100 estimators the RMSE_train is 2.48478 $, the RMSE_val is 4.04609 $ and the difference 1.56131 $
Depth: 5
For a 5 depth and 100 estimators the RMSE_train is 2.35051 $, the RMSE_val is 3.62166 $ and the difference 1.27115 $
Depth: 6
For a 6 depth and 100 estimators the RMSE_train is 2.26968 $, the RMSE_val is 3.03562 $ and the difference 0.76595 $
Depth: 7
For a 7 depth and 100 estimators the RMSE_train is 2.20020 $, the RMSE_val is 2.89855 $ and the difference 0.69835 $
Depth: 8
For a 8 depth and 100 estimators the RMSE_train is 2.13895 $, the RMSE_val is 2.76071 $ and the difference 0.62176 $
Depth: 9
For a 9 depth and 100 estimators the RMSE_train is 2.07821 $, the RMSE_val is 2.70698 $ and the difference 0.

In [20]:
# Random-forest setting with the lowest training RMSE (element 0 of each stored list).
min(dicc_rf.items(), key=lambda entry: entry[1][0])

('14', [1.7452094994588234, 2.6988454358189475, 0.9536359363601241])

In [21]:
# Random-forest setting with the lowest validation RMSE (element 1 of each stored list).
min(dicc_rf.items(), key=lambda entry: entry[1][1])

('10', [2.017711359586667, 2.6719590465333813, 0.6542476869467144])

In [22]:
# Random-forest setting with the smallest validation-minus-training gap (element 2 of each stored list).
min(dicc_rf.items(), key=lambda entry: entry[1][2])

('8', [2.1389487017489706, 2.760711324189831, 0.6217626224408606])

## XGBOOST

In [16]:
%%time
dicc_xgb={}
estimator = 100
for depth in range(2,10):
    print('Depth: %d' % (depth))
    
    taxi_model_xgb = XGBRegressor(max_depth=depth,
                                  learning_rate=0.05,
                                  n_estimators=estimator,
                                  n_jobs=-1)
    
    taxi_model_xgb.fit(train_predictors,
                       train_target,
                       early_stopping_rounds=5,
                       eval_set=[(validation_predictors, validation_target)],
                       verbose=False)
    
    train_predictions = taxi_model_xgb.predict(train_predictors)
    val_predictions = taxi_model_xgb.predict(validation_predictors)
    
    error_train = rmse(train_target, train_predictions)
    error_val = rmse(validation_target, val_predictions)
    difference = error_val - error_train
    
    dicc_xgb[str(depth) + '_' + str(estimator)]= [error_train,error_val,difference]
    print ('For a %d depth and %d estimatorsthe RMSE_train is %.5f $, the RMSE_val is %.5f $ and the difference %.5f $'
               % (depth,estimator, error_train,error_val,difference))
    

Depth: 2
For a 2 depth the RMSE_train is 2.31917 $, the RMSE_val is 2.83356 $ and the difference 0.51440 $
Depth: 3
For a 3 depth the RMSE_train is 2.21039 $, the RMSE_val is 2.60742 $ and the difference 0.39702 $
Depth: 4
For a 4 depth the RMSE_train is 2.13788 $, the RMSE_val is 2.50796 $ and the difference 0.37008 $
Depth: 5
For a 5 depth the RMSE_train is 2.07930 $, the RMSE_val is 2.48920 $ and the difference 0.40990 $
Depth: 6
For a 6 depth the RMSE_train is 2.04049 $, the RMSE_val is 2.52300 $ and the difference 0.48251 $
Depth: 7
For a 7 depth the RMSE_train is 2.00226 $, the RMSE_val is 2.54588 $ and the difference 0.54362 $
Depth: 8
For a 8 depth the RMSE_train is 1.91177 $, the RMSE_val is 2.55509 $ and the difference 0.64332 $
Depth: 9
For a 9 depth the RMSE_train is 1.87239 $, the RMSE_val is 2.69990 $ and the difference 0.82751 $
CPU times: user 24min 23s, sys: 12.4 s, total: 24min 35s
Wall time: 24min 16s


In [17]:
# XGBoost setting with the lowest training RMSE (element 0 of each stored list).
min(dicc_xgb.items(), key=lambda entry: entry[1][0])

('9_100', [1.8723851172763226, 2.699896880138007, 0.8275117628616846])

In [18]:
# XGBoost setting with the lowest validation RMSE (element 1 of each stored list).
min(dicc_xgb.items(), key=lambda entry: entry[1][1])

('5_100', [2.079298129977469, 2.489202763001754, 0.40990463302428504])

In [19]:
# XGBoost setting with the smallest validation-minus-training gap (element 2 of each stored list).
min(dicc_xgb.items(), key=lambda entry: entry[1][2])

('4_100', [2.1378810011360967, 2.507960139981517, 0.3700791388454201])

## BAGGING REGRESSOR

In [37]:
%%time
dicc_bg={}
estimator = 100
for depth in range(3,16):
    print('Depth: %d' % (depth))    
    
    taxi_model_bg= BaggingRegressor(base_estimator = DecisionTreeRegressor(max_depth=depth,
                                                                           random_state=13),
                                    n_estimators = 100,
                                    n_jobs= -1,
                                    random_state=13)
    
    taxi_model_bg.fit(train_predictors, train_target)

    train_predictions = taxi_model_bg.predict(train_predictors)
    val_predictions = taxi_model_bg.predict(validation_predictors)

    error_train = rmse(train_target, train_predictions)
    error_val = rmse(validation_target, val_predictions)
    difference = error_val - error_train
    
    dicc_bg[str(depth) + '_' + str(estimator)]= [error_train,error_val,difference]
    print ('For a %d depth and %d estimator the RMSE_train is %.5f $, the RMSE_val is %.5f $ and the difference %.5f $'
            % (depth,estimator,error_train,error_val,difference))
    
    

Depth: 11


  return column_or_1d(y, warn=True)


For a 11 depth and 100 estimator the RMSE_train is 1.95488 $, the RMSE_val is 2.67923 $ and the difference 0.72435 $
Depth: 12


  return column_or_1d(y, warn=True)


For a 12 depth and 100 estimator the RMSE_train is 1.89040 $, the RMSE_val is 2.68096 $ and the difference 0.79056 $
Depth: 13


  return column_or_1d(y, warn=True)


For a 13 depth and 100 estimator the RMSE_train is 1.82033 $, the RMSE_val is 2.67898 $ and the difference 0.85866 $
Depth: 14


  return column_or_1d(y, warn=True)


For a 14 depth and 100 estimator the RMSE_train is 1.74517 $, the RMSE_val is 2.69343 $ and the difference 0.94826 $
Depth: 15


  return column_or_1d(y, warn=True)


For a 15 depth and 100 estimator the RMSE_train is 1.66568 $, the RMSE_val is 2.68176 $ and the difference 1.01608 $
Depth: 16


  return column_or_1d(y, warn=True)


For a 16 depth and 100 estimator the RMSE_train is 1.58538 $, the RMSE_val is 2.68332 $ and the difference 1.09794 $
Depth: 17


  return column_or_1d(y, warn=True)


For a 17 depth and 100 estimator the RMSE_train is 1.50423 $, the RMSE_val is 2.69000 $ and the difference 1.18577 $
Depth: 18


  return column_or_1d(y, warn=True)


For a 18 depth and 100 estimator the RMSE_train is 1.42334 $, the RMSE_val is 2.68903 $ and the difference 1.26569 $
Depth: 19


  return column_or_1d(y, warn=True)


For a 19 depth and 100 estimator the RMSE_train is 1.34489 $, the RMSE_val is 2.69056 $ and the difference 1.34567 $
CPU times: user 15.4 s, sys: 12.6 s, total: 27.9 s
Wall time: 18min 12s


In [39]:
# Bagging setting with the lowest training RMSE (element 0 of each stored list).
min(dicc_bg.items(), key=lambda entry: entry[1][0])

('19_100', [1.3448893006630633, 2.6905579192108475, 1.3456686185477842])

In [40]:
# Bagging setting with the lowest validation RMSE (element 1 of each stored list).
min(dicc_bg.items(), key=lambda entry: entry[1][1])

('13_100', [1.8203261438320244, 2.678982604173621, 0.8586564603415965])

In [41]:
# Bagging setting with the smallest validation-minus-training gap (element 2 of each stored list).
min(dicc_bg.items(), key=lambda entry: entry[1][2])

('11_100', [1.954876857760605, 2.6792264323575044, 0.7243495745968993])

## XGBOOST: TUNING THE NUMBER OF ESTIMATORS AT DEPTH 5

In [None]:
%%time
dicc_xgb={}
estimator = 100
for depth in range(5,6):
    print('Depth: %d' % (depth))
    for estimator in range(140,147,4):
        taxi_model_xgb = XGBRegressor(max_depth=depth,
                                      learning_rate=0.05,
                                      n_estimators=estimator,
                                      n_jobs=-1)
    
        taxi_model_xgb.fit(train_predictors,
                           train_target,
                           early_stopping_rounds=5,
                           eval_set=[(validation_predictors, validation_target)],
                           verbose=False)
    
        train_predictions = taxi_model_xgb.predict(train_predictors)
        val_predictions = taxi_model_xgb.predict(validation_predictors)
    
        error_train = rmse(train_target, train_predictions)
        error_val = rmse(validation_target, val_predictions)
        difference = error_val - error_train
    
        dicc_xgb[str(depth) + '_' + str(estimator)]= [error_train,error_val,difference]
        print ('For a %d depth and %d estimators the RMSE_train is %.5f $, the RMSE_val is %.5f $ and the difference %.5f $'
               % (depth,estimator, error_train,error_val,difference))

Depth: 5
For a 5 depth and 140 estimators the RMSE_train is 2.11329 $, the RMSE_val is 2.51105 $ and the difference 0.39776 $


In [30]:
# XGBoost setting with the lowest training RMSE (element 0 of each stored list).
min(dicc_xgb.items(), key=lambda entry: entry[1][0])

('5_140', [2.0554481837217, 2.476160793077591, 0.42071260935589105])

In [31]:
# XGBoost setting with the lowest validation RMSE (element 1 of each stored list).
min(dicc_xgb.items(), key=lambda entry: entry[1][1])

('5_140', [2.0554481837217, 2.476160793077591, 0.42071260935589105])

In [32]:
# XGBoost setting with the smallest validation-minus-training gap (element 2 of each stored list).
min(dicc_xgb.items(), key=lambda entry: entry[1][2])

('5_80', [2.0978493835211767, 2.503664419638038, 0.40581503611686154])

## ADABOOST

In [18]:
%%time
dicc_adab={}
estimator = 100
for depth in range(2,16):
    print('Depth: %d' % (depth))
    taxi_model_adab = AdaBoostRegressor(base_estimator = DecisionTreeRegressor(max_depth=depth,
                                                                               random_state=13),
                                        learning_rate = 0.1,
                                        n_estimators = estimator,
                                        random_state = 13)

    taxi_model_adab.fit(train_predictors,train_target)
        
    train_predictions = taxi_model_adab.predict(train_predictors)
    val_predictions = taxi_model_adab.predict(validation_predictors)
        
    error_train = rmse(train_target, train_predictions)
    error_val = rmse(validation_target, val_predictions)
    difference = error_val - error_train
        
    dicc_adab[str(depth) + '_' + str(estimator)]= [error_train,error_val,difference]
        
    print ('For a %d depth and the RMSE_train is %.5f $, the RMSE_val is %.5f $ and the difference %.5f $'
               % (depth,estimator,error_train,error_val,difference)) 
    
    

Depth: 2


TypeError: not all arguments converted during string formatting

In [25]:
# AdaBoost setting with the lowest training RMSE (element 0 of each stored list).
min(dicc_adab.items(), key=lambda entry: entry[1][0])

('10_100', [1.9953856553407552, 2.704205006434017, 0.7088193510932617])

In [26]:
# AdaBoost setting with the lowest validation RMSE (element 1 of each stored list).
min(dicc_adab.items(), key=lambda entry: entry[1][1])

('10_100', [1.9953856553407552, 2.704205006434017, 0.7088193510932617])

In [27]:
# AdaBoost setting with the smallest validation-minus-training gap (element 2 of each stored list).
min(dicc_adab.items(), key=lambda entry: entry[1][2])

('9_100', [2.088268033258337, 2.7871465298112237, 0.6988784965528865])