In [1]:
import pandas as pd
import numpy as np
import re
import sklearn

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVC
from sklearn.cross_validation import KFold

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls



In [2]:
uber_trips_2015 = pd.read_csv('Datathon Materials/uber_trips_2015.csv')
uber_trips_2015.head()

Unnamed: 0,pickup_datetime,pickup_location_id,dispatch_base,affiliate_base
0,2015-05-17 09:47:00,141,B02617,B02617
1,2015-05-17 09:47:00,65,B02617,B02617
2,2015-05-17 09:47:00,100,B02617,B02617
3,2015-05-17 09:47:00,80,B02617,B02774
4,2015-05-17 09:47:00,90,B02617,B02617


In [47]:
uber_trips_2015.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14270479 entries, 0 to 14270478
Data columns (total 4 columns):
pickup_datetime       object
pickup_location_id    int64
dispatch_base         object
affiliate_base        object
dtypes: int64(1), object(3)
memory usage: 435.5+ MB


In [3]:
from datetime import datetime
def convert_datetime_date(x):
    [date,time] = x.split(' ')
    return date
def convert_datetime_time(x):
    [date,time] = x.split(' ')
    [hh,mm,ss] = time.split(':')
    return int(hh)


uber_trips_2015['date'] = uber_trips_2015['pickup_datetime'].apply(convert_datetime_date)
uber_trips_2015['hour'] = uber_trips_2015['pickup_datetime'].apply(convert_datetime_time)

uber_trips_2015['demand']=1

uber_trips_2015.head()


Unnamed: 0,pickup_datetime,pickup_location_id,dispatch_base,affiliate_base,date,hour,demand
0,2015-05-17 09:47:00,141,B02617,B02617,2015-05-17,9,1
1,2015-05-17 09:47:00,65,B02617,B02617,2015-05-17,9,1
2,2015-05-17 09:47:00,100,B02617,B02617,2015-05-17,9,1
3,2015-05-17 09:47:00,80,B02617,B02774,2015-05-17,9,1
4,2015-05-17 09:47:00,90,B02617,B02617,2015-05-17,9,1


In [5]:
dataset = uber_trips_2015.groupby(['date','hour','pickup_location_id'], as_index=False)['demand'].sum()
dataset.head()

Unnamed: 0,date,hour,pickup_location_id,demand
0,2015-01-01,0,3,2
1,2015-01-01,0,4,58
2,2015-01-01,0,7,54
3,2015-01-01,0,9,3
4,2015-01-01,0,11,6


In [6]:
def convert_datetime_isweekday(x):
    datetimeobj = datetime.strptime(x, "%Y-%m-%d")
    weekday = datetimeobj.isoweekday()
    if weekday<=5:
        isweekday = 1
    else:
        isweekday = 0
    return isweekday

dataset['isweekday'] = dataset['date'].apply(convert_datetime_isweekday)
dataset.head()

Unnamed: 0,date,hour,pickup_location_id,demand,isweekday
0,2015-01-01,0,3,2,1
1,2015-01-01,0,4,58,1
2,2015-01-01,0,7,54,1
3,2015-01-01,0,9,3,1
4,2015-01-01,0,11,6,1


In [9]:
demographics = pd.read_csv('Datathon Materials/demographics.csv')
demographics.head()

Unnamed: 0,nta_name,borough,nta_code,population,under_5_years,5-9_years,10-14_years,15-19_years,20-24_years,25-29_years,...,15000_to_24999,25000_to_34999,35000_to_49999,50000_to_74999,75000_to_99999,100000_to_149999,150000_to_199999,200000_or_more,median_income,mean_income
0,Allerton-Pelham Gardens,Bronx,BX31,28903,1679,1706,1763,2039,1964,1703,...,797,773,1160,1764,1155,1562,765,427,61638,78489
1,Annadale-Huguenot-Prince's Bay-Eltingville,Staten Island,SI01,27770,1397,1698,1817,1880,1720,1594,...,571,405,1008,1523,1346,2075,1086,1151,88288,109187
2,Arden Heights,Staten Island,SI48,25238,1507,1540,1596,1752,1614,1561,...,337,516,707,1421,1611,2021,1047,740,89570,101627
3,Astoria,Queens,QN70,78793,3480,3037,3060,3392,6630,11586,...,3673,2816,4725,6463,4557,4698,1627,1197,54882,70094
4,Auburndale,Queens,QN48,19996,917,966,1063,1168,1214,1307,...,445,632,690,1417,1060,1237,589,433,70772,84402


In [65]:
demographics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 33 columns):
nta_name            188 non-null object
borough             188 non-null object
nta_code            188 non-null object
population          188 non-null int64
under_5_years       188 non-null int64
5-9_years           188 non-null int64
10-14_years         188 non-null int64
15-19_years         188 non-null int64
20-24_years         188 non-null int64
25-29_years         188 non-null int64
30-34_years         188 non-null int64
35-39_years         188 non-null int64
40-44_years         188 non-null int64
45-49_years         188 non-null int64
50-54_years         188 non-null int64
55-59_years         188 non-null int64
60-64_years         188 non-null int64
over_65_years       188 non-null int64
median_age          188 non-null int64
people_per_acre     188 non-null float64
households          188 non-null int64
less_than_10,000    188 non-null int64
10000_to_14999      188 non-null

In [10]:
zone = pd.read_csv('Datathon Materials/zones.csv')
zone.head(20)

Unnamed: 0,location_id,borough,zone,service_zone,nta_code
0,1,EWR,Newark Airport,EWR,NJ01
1,2,Queens,Jamaica Bay,Boro Zone,QN61
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone,BX31
3,4,Manhattan,Alphabet City,Yellow Zone,MN22
4,5,Staten Island,Arden Heights,Boro Zone,SI48
5,6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone,SI14
6,7,Queens,Astoria,Boro Zone,QN70
7,8,Queens,Astoria Park,Boro Zone,QN70
8,9,Queens,Auburndale,Boro Zone,QN48
9,10,Queens,Baisley Park,Boro Zone,QN76


In [67]:
zone.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 5 columns):
location_id     263 non-null int64
borough         263 non-null object
zone            263 non-null object
service_zone    263 non-null object
nta_code        263 non-null object
dtypes: int64(1), object(4)
memory usage: 10.3+ KB


In [11]:
df = pd.merge(demographics,zone,how='left', on=['nta_code','borough'],  
      left_index=False, right_index=False, sort=True,  copy=True, indicator=False)
df.head()

Unnamed: 0,nta_name,borough,nta_code,population,under_5_years,5-9_years,10-14_years,15-19_years,20-24_years,25-29_years,...,50000_to_74999,75000_to_99999,100000_to_149999,150000_to_199999,200000_or_more,median_income,mean_income,location_id,zone,service_zone
0,Brooklyn Heights-Cobble Hill,Brooklyn,BK09,22887,1300,695,499,941,1628,2519,...,1225,1187,1791,1173,2776,105398,169555,33.0,Brooklyn Heights,Boro Zone
1,Brooklyn Heights-Cobble Hill,Brooklyn,BK09,22887,1300,695,499,941,1628,2519,...,1225,1187,1791,1173,2776,105398,169555,52.0,Cobble Hill,Boro Zone
2,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,Brooklyn,BK17,64518,3276,3184,3135,3608,4216,4161,...,3967,2869,3735,1151,1369,51314,71041,150.0,Manhattan Beach,Boro Zone
3,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,Brooklyn,BK17,64518,3276,3184,3135,3608,4216,4161,...,3967,2869,3735,1151,1369,51314,71041,210.0,Sheepshead Bay,Boro Zone
4,Brighton Beach,Brooklyn,BK19,35547,1780,1526,1514,1728,2221,2393,...,1728,1114,1228,614,665,30872,59875,29.0,Brighton Beach,Boro Zone


In [12]:
data_uber_2015 = pd.merge(dataset,df,how = 'inner', left_on='nta_code', right_on='location_id',  
      left_index=False, right_index=False, sort=True,  
      suffixes=('_x', '_y'), copy=True, indicator=False)
data_uber_2015.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 761859 entries, 0 to 761858
Data columns (total 41 columns):
date                  761859 non-null object
hour                  761859 non-null int64
pickup_location_id    761859 non-null int64
demand                761859 non-null int64
isweekday             761859 non-null int64
nta_name              761859 non-null object
borough               761859 non-null object
nta_code              761859 non-null object
population            761859 non-null int64
under_5_years         761859 non-null int64
5-9_years             761859 non-null int64
10-14_years           761859 non-null int64
15-19_years           761859 non-null int64
20-24_years           761859 non-null int64
25-29_years           761859 non-null int64
30-34_years           761859 non-null int64
35-39_years           761859 non-null int64
40-44_years           761859 non-null int64
45-49_years           761859 non-null int64
50-54_years           761859 non-null int64
55-59

In [13]:
data_uber_2015 = data_uber_2015.sort_values(['date','hour'])
data_uber_2015.head()

Unnamed: 0,date,hour,pickup_location_id,demand,isweekday,nta_name,borough,nta_code,population,under_5_years,...,50000_to_74999,75000_to_99999,100000_to_149999,150000_to_199999,200000_or_more,median_income,mean_income,location_id,zone,service_zone
8,2015-01-01,0,3,2,1,Allerton-Pelham Gardens,Bronx,BX31,28903,1679,...,1764,1155,1562,765,427,61638,78489,3.0,Allerton/Pelham Gardens,Boro Zone
2255,2015-01-01,0,4,58,1,East Village,Manhattan,MN22,44136,862,...,3160,3057,3201,2155,2478,72665,100130,4.0,Alphabet City,Yellow Zone
7166,2015-01-01,0,7,54,1,Astoria,Queens,QN70,78793,3480,...,6463,4557,4698,1627,1197,54882,70094,7.0,Astoria,Boro Zone
11830,2015-01-01,0,9,3,1,Auburndale,Queens,QN48,19996,917,...,1417,1060,1237,589,433,70772,84402,9.0,Auburndale,Boro Zone
16911,2015-01-01,0,11,6,1,Bath Beach,Brooklyn,BK27,29931,1766,...,2016,1176,1606,711,521,55193,76037,11.0,Bath Beach,Boro Zone


In [27]:
data_uber_2014 = pd.merge(dataset_2014,df,how = 'inner', on='nta_code',  
      left_index=False, right_index=False, sort=True,  
      suffixes=('_x', '_y'), copy=True, indicator=False)
data_uber_2014.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 551569 entries, 0 to 551568
Data columns (total 40 columns):
date                551569 non-null object
hour                551569 non-null int64
nta_code            551569 non-null object
demand              551569 non-null int64
isweekday           551569 non-null int64
nta_name            551569 non-null object
borough             551569 non-null object
population          551569 non-null int64
under_5_years       551569 non-null int64
5-9_years           551569 non-null int64
10-14_years         551569 non-null int64
15-19_years         551569 non-null int64
20-24_years         551569 non-null int64
25-29_years         551569 non-null int64
30-34_years         551569 non-null int64
35-39_years         551569 non-null int64
40-44_years         551569 non-null int64
45-49_years         551569 non-null int64
50-54_years         551569 non-null int64
55-59_years         551569 non-null int64
60-64_years         551569 non-null int64
ove

In [28]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
        

In [104]:
from sklearn.model_selection import train_test_split
label = data['demand']
feature = data.drop(['demand','pickup_location_id','nta_name','borough','nta_code','zone','service_zone','date'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.33, random_state=42)

ntrain = y_train.shape[0]
ntest = y_test.shape[0]

SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)



In [31]:
from sklearn.model_selection import train_test_split
label = data_uber_2014['demand']
feature = data_uber_2014.drop(['demand','nta_name','borough','location_id','nta_code','zone','service_zone','date'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.33, random_state=42)

ntrain = y_train.shape[0]
ntest = y_test.shape[0]

SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)
feature.head()


Unnamed: 0,hour,isweekday,population,under_5_years,5-9_years,10-14_years,15-19_years,20-24_years,25-29_years,30-34_years,...,15000_to_24999,25000_to_34999,35000_to_49999,50000_to_74999,75000_to_99999,100000_to_149999,150000_to_199999,200000_or_more,median_income,mean_income
0,4,1,22887,1300,695,499,941,1628,2519,2885,...,657,606,760,1225,1187,1791,1173,2776,105398,169555
1,4,1,22887,1300,695,499,941,1628,2519,2885,...,657,606,760,1225,1187,1791,1173,2776,105398,169555
2,5,1,22887,1300,695,499,941,1628,2519,2885,...,657,606,760,1225,1187,1791,1173,2776,105398,169555
3,5,1,22887,1300,695,499,941,1628,2519,2885,...,657,606,760,1225,1187,1791,1173,2776,105398,169555
4,6,1,22887,1300,695,499,941,1628,2519,2885,...,657,606,760,1225,1187,1791,1173,2776,105398,169555


In [109]:
from sklearn.grid_search import GridSearchCV
rf_grid = GridSearchCV(
  estimator = RandomForestRegressor(warm_start=True,max_features='sqrt'),
    param_grid = {
        "n_estimators": [200,400,600],
        "max_depth": [ 12, 14, 16, 18],
        "min_samples_leaf": [4,8], 
    },
    cv = kf,
    scoring = "r2"    
)

rf_grid.fit(X_train,y_train.values)
rf_params = rf_grid.best_params_ 
print(rf_params)

KeyboardInterrupt: 

In [32]:
rf_params = {
    'n_jobs': -1,
    'n_estimators': 400,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 18,
    'min_samples_leaf': 6,
    'max_features' : 'sqrt',
    'verbose': 0
}

rf = SklearnHelper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rf.train(X_train,y_train)
rf_pred = rf.predict(X_test)

In [33]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

rf_MAE = mean_absolute_error(y_test,rf_pred)
rf_MSE = mean_squared_error(y_test,rf_pred)
rf_R2 = r2_score(y_test,rf_pred)
print(rf_MAE,rf_MSE,rf_R2)

(7.3400036679197056, 371.65269165634265, 0.86949535509593567)


In [34]:
rf_feature = rf.feature_importances(X_train,y_train)


Warm-start fitting without increasing n_estimators does not fit new trees.



[ 0.37779355  0.06300484  0.01262171  0.04448611  0.0758526   0.07287776
  0.01206744  0.003831    0.00885585  0.00827921  0.00967736  0.01251869
  0.00900932  0.01117112  0.00511474  0.00445774  0.00770928  0.00525248
  0.00900422  0.01054114  0.00411583  0.00435968  0.00421865  0.00519506
  0.00548803  0.01794573  0.01209842  0.01899775  0.03039871  0.03488947
  0.04128799  0.05687852]


In [35]:
y_test[1:20]

228413     28
304170     16
496420      1
129506      6
19495       2
480375      2
380593     17
415024      3
388316      2
537758      1
327936     29
190552      7
351552     10
266941    104
328464     79
318877     63
247707      5
63553       1
293058     20
Name: demand, dtype: int64

In [36]:
rf_pred[1:20]

array([ 107.71176588,   16.04349016,    1.0748406 ,    5.33853319,
          2.14992972,    1.28547674,   25.73834952,    7.40339269,
          4.63586072,    1.80045695,   34.63804732,    2.81892699,
         12.42668471,  150.60329788,   50.88956836,   89.21191702,
          7.06881402,    1.92776326,   17.94497006])

In [37]:
rf_features = [ 0.37779355,  0.06300484,  0.01262171,  0.04448611,  0.0758526  , 0.07287776,
  0.01206744,  0.003831,    0.00885585,  0.00827921,  0.00967736,  0.01251869,
  0.00900932,  0.01117112,  0.00511474,  0.00445774,  0.00770928,  0.00525248,
  0.00900422,  0.01054114,  0.00411583,  0.00435968,  0.00421865,  0.00519506,
  0.00548803,  0.01794573,  0.01209842,  0.01899775,  0.03039871,  0.03488947,
  0.04128799,  0.05687852]
# Scatter plot 
trace = go.Scatter(
    y = rf_features,
    x = X_train.columns.values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
        color = rf_features,
        colorscale='Portland',
        showscale=True
    ),
    text = X_train.columns.values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Random Forest Feature Importance',
    hovermode= 'closest',

    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')