In [1]:
import pandas as pd
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sklearn.metrics as skm

import plotly.figure_factory as ff

import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go

pd.options.plotting.backend = "plotly"
pio.templates.default = 'plotly_dark'

from lazypredict.Supervised import LazyRegressor

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb


The sklearn.utils.testing module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.



In [2]:
init_time = time.time()
df = pd.read_csv('train_fare_data.csv')
fin_time = time.time()
print ('Time taken for read : ', fin_time-init_time)
df.shape

Time taken for read :  6.537950754165649


(4871473, 9)

In [3]:
df['passenger_count'] = df['passenger_count'][(df['passenger_count'] > 0) & (df['passenger_count'] <= 4)]

In [4]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
count,4871473.0,4871473.0,4871473.0,4871473.0,4871473.0,4422145.0,4871473.0
mean,11.29,-73.98,40.75,-73.97,40.75,1.33,3.31
std,9.45,0.03,0.03,0.03,0.03,0.68,3.56
min,0.01,-74.28,40.45,-74.28,40.45,1.0,0.0
25%,6.0,-73.99,40.74,-73.99,40.74,1.0,1.25
50%,8.5,-73.98,40.75,-73.98,40.75,1.0,2.15
75%,12.5,-73.97,40.77,-73.97,40.77,1.0,3.91
max,250.0,-73.65,40.92,-73.65,40.92,4.0,48.3


In [5]:
df.isnull().sum()

key                       0
fare_amount               0
pickup_datetime           0
pickup_longitude          0
pickup_latitude           0
dropoff_longitude         0
dropoff_latitude          0
passenger_count      449328
distance                  0
dtype: int64

In [6]:
df.dropna(inplace=True)

In [7]:
df.shape

(4422145, 9)

In [8]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [9]:
df['Year'] = df['pickup_datetime'].dt.year
df['Month'] = df['pickup_datetime'].dt.month
df['Date'] = df['pickup_datetime'].dt.day
df['Day of Week'] = df['pickup_datetime'].dt.dayofweek
df['Hour'] = df['pickup_datetime'].dt.hour

In [10]:
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,Year,Month,Date,Day of Week,Hour
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21+00:00,-73.84,40.72,-73.84,40.71,1.0,1.03,2009,6,15,0,17
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16+00:00,-74.02,40.71,-73.98,40.78,1.0,8.45,2010,1,5,1,16
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00+00:00,-73.98,40.76,-73.99,40.75,2.0,1.39,2011,8,18,3,0
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42+00:00,-73.99,40.73,-73.99,40.76,1.0,2.8,2012,4,21,5,4
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00+00:00,-73.97,40.77,-73.96,40.78,1.0,2.0,2010,3,9,1,7


In [11]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,Year,Month,Date,Day of Week,Hour
count,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0
mean,11.28,-73.98,40.75,-73.97,40.75,1.33,3.3,2011.74,6.27,15.71,3.04,13.51
std,9.45,0.03,0.03,0.03,0.03,0.68,3.55,1.87,3.44,8.69,1.95,6.52
min,0.01,-74.28,40.45,-74.28,40.45,1.0,0.0,2009.0,1.0,1.0,0.0,0.0
25%,6.0,-73.99,40.74,-73.99,40.74,1.0,1.26,2010.0,3.0,8.0,1.0,9.0
50%,8.5,-73.98,40.75,-73.98,40.75,1.0,2.15,2012.0,6.0,16.0,3.0,14.0
75%,12.5,-73.97,40.77,-73.97,40.77,1.0,3.9,2013.0,9.0,23.0,5.0,19.0
max,250.0,-73.65,40.92,-73.65,40.92,4.0,48.3,2015.0,12.0,31.0,6.0,23.0


In [12]:
df.columns

Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'distance', 'Year', 'Month', 'Date', 'Day of Week',
       'Hour'],
      dtype='object')

In [13]:
df['passenger_count'].value_counts()

1.00    3380918
2.00     722838
3.00     214600
4.00     103789
Name: passenger_count, dtype: int64

In [14]:
def jfk_dist(drop_lat, drop_lon):    
    pick_lat = 40.6413
    pick_lon = -73.7781
    
    R_earth = 6371 # Earth radius (in km)
    # Convert degrees to radians
    pick_lat, pick_lon, drop_lat, drop_lon = map(np.radians, [pick_lat, pick_lon,
                                                              drop_lat, drop_lon])
    # Compute distances along lat, lon dimensions
    dlat = drop_lat - pick_lat
    dlon = drop_lon - pick_lon
    
    # Compute haversine distance
    a = np.sin(dlat/2.0)**2 + np.cos(pick_lat) * np.cos(drop_lat) * np.sin(dlon/2.0)**2
    return 2 * R_earth * np.arcsin(np.sqrt(a))

def lga_dist(drop_lat, drop_lon):
    pick_lat = 40.7769
    pick_lon = -73.8740
    R_earth = 6371 # Earth radius (in km)
    # Convert degrees to radians
    pick_lat, pick_lon, drop_lat, drop_lon = map(np.radians, [pick_lat, pick_lon,
                                                              drop_lat, drop_lon])
    # Compute distances along lat, lon dimensions
    dlat = drop_lat - pick_lat
    dlon = drop_lon - pick_lon
    
    # Compute haversine distance
    a = np.sin(dlat/2.0)**2 + np.cos(pick_lat) * np.cos(drop_lat) * np.sin(dlon/2.0)**2
    return 2 * R_earth * np.arcsin(np.sqrt(a))

def ewr_dist(drop_lat, drop_lon):
    pick_lat = 40.6895
    pick_lon = -74.1745
    
    R_earth = 6371 # Earth radius (in km)
    # Convert degrees to radians
    pick_lat, pick_lon, drop_lat, drop_lon = map(np.radians, [pick_lat, pick_lon,
                                                              drop_lat, drop_lon])
    # Compute distances along lat, lon dimensions
    dlat = drop_lat - pick_lat
    dlon = drop_lon - pick_lon
    
    # Compute haversine distance
    a = np.sin(dlat/2.0)**2 + np.cos(pick_lat) * np.cos(drop_lat) * np.sin(dlon/2.0)**2
    return 2 * R_earth * np.arcsin(np.sqrt(a))

def nyc_dist(pick_lat, pick_lon):
    drop_lat = 40.7577
    drop_lon = -73.9782
    
    R_earth = 6371 # Earth radius (in km)
    # Convert degrees to radians
    pick_lat, pick_lon, drop_lat, drop_lon = map(np.radians, [pick_lat, pick_lon,
                                                              drop_lat, drop_lon])
    # Compute distances along lat, lon dimensions
    dlat = drop_lat - pick_lat
    dlon = drop_lon - pick_lon
    
    # Compute haversine distance
    a = np.sin(dlat/2.0)**2 + np.cos(pick_lat) * np.cos(drop_lat) * np.sin(dlon/2.0)**2
    return 2 * R_earth * np.arcsin(np.sqrt(a))

def nss_dist(pick_lat, pick_lon):
    drop_lat = 40.7091
    drop_lon = -74.0078
    
    R_earth = 6371 # Earth radius (in km)
    # Convert degrees to radians
    pick_lat, pick_lon, drop_lat, drop_lon = map(np.radians, [pick_lat, pick_lon,
                                                              drop_lat, drop_lon])
    # Compute distances along lat, lon dimensions
    dlat = drop_lat - pick_lat
    dlon = drop_lon - pick_lon
    
    # Compute haversine distance
    a = np.sin(dlat/2.0)**2 + np.cos(pick_lat) * np.cos(drop_lat) * np.sin(dlon/2.0)**2
    return 2 * R_earth * np.arcsin(np.sqrt(a))

def sfs_dist(pick_lat, pick_lon):
    drop_lat = 40.7187
    drop_lon = -73.9861
    
    R_earth = 6371 # Earth radius (in km)
    # Convert degrees to radians
    pick_lat, pick_lon, drop_lat, drop_lon = map(np.radians, [pick_lat, pick_lon,
                                                              drop_lat, drop_lon])
    # Compute distances along lat, lon dimensions
    dlat = drop_lat - pick_lat
    dlon = drop_lon - pick_lon
    
    # Compute haversine distance
    a = np.sin(dlat/2.0)**2 + np.cos(pick_lat) * np.cos(drop_lat) * np.sin(dlon/2.0)**2
    return 2 * R_earth * np.arcsin(np.sqrt(a))

In [15]:
init_time = time.time()
df['jfk'] = jfk_dist(df['dropoff_latitude'], df['dropoff_longitude'])
df['lga'] = lga_dist(df['dropoff_latitude'], df['dropoff_longitude'])
df['ewr'] = ewr_dist(df['dropoff_latitude'], df['dropoff_longitude'])
df['nyc'] = nyc_dist(df['pickup_latitude'], df['pickup_longitude'])
df['nss'] = nss_dist(df['pickup_latitude'], df['pickup_longitude'])
df['sfs'] = sfs_dist(df['pickup_latitude'], df['pickup_longitude'])
fin_time = time.time()
print ('Time taken for exec : ',fin_time-init_time)

Time taken for exec :  1.2361247539520264


In [16]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,Year,Month,Date,Day of Week,Hour,jfk,lga,ewr,nyc,nss,sfs
count,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0,4422145.0
mean,11.28,-73.98,40.75,-73.97,40.75,1.33,3.3,2011.74,6.27,15.71,3.04,13.51,20.89,9.65,18.44,2.99,5.91,4.64
std,9.45,0.03,0.03,0.03,0.03,0.68,3.55,1.87,3.44,8.69,1.95,6.52,2.72,2.6,3.26,3.01,3.37,3.06
min,0.01,-74.28,40.45,-74.28,40.45,1.0,0.0,2009.0,1.0,1.0,0.0,0.0,0.15,0.06,0.13,0.0,0.01,0.01
25%,6.0,-73.99,40.74,-73.99,40.74,1.0,1.26,2010.0,3.0,8.0,1.0,9.0,20.54,8.32,16.49,1.24,3.61,2.58
50%,8.5,-73.98,40.75,-73.98,40.75,1.0,2.15,2012.0,6.0,16.0,3.0,14.0,21.19,9.52,17.97,2.33,5.59,4.18
75%,12.5,-73.97,40.77,-73.97,40.77,1.0,3.9,2013.0,9.0,23.0,5.0,19.0,21.91,10.99,19.79,3.64,7.57,5.95
max,250.0,-73.65,40.92,-73.65,40.92,4.0,48.3,2015.0,12.0,31.0,6.0,23.0,50.14,48.25,49.79,40.12,38.31,37.59


# Linear Regression

In [17]:
x, y = df.drop(['fare_amount','key','pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'], axis = 1), df['fare_amount']

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 101)

In [19]:
lm = LinearRegression()

In [20]:
lm.fit(x_train,y_train)

LinearRegression()

In [21]:
pred = lm.predict(x_test)

In [22]:
print('RMSE = ',np.sqrt(skm.mean_squared_error(y_test,pred)))

RMSE =  4.564947553874314


# Linear Regression using one-hot encoding

In [23]:
values=['fare_amount', 'passenger_count','distance', 'Year', 'Month', 'Date', 'Day of Week','Hour']

In [24]:
ld = df[values]

In [25]:
cat_names = ['passenger_count']
for i in cat_names:
    temp = pd.get_dummies(df[i], prefix= i)
    ld = ld.join(temp)

In [26]:
ld.head()

Unnamed: 0,fare_amount,passenger_count,distance,Year,Month,Date,Day of Week,Hour,passenger_count_1.0,passenger_count_2.0,passenger_count_3.0,passenger_count_4.0
0,4.5,1.0,1.03,2009,6,15,0,17,1,0,0,0
1,16.9,1.0,8.45,2010,1,5,1,16,1,0,0,0
2,5.7,2.0,1.39,2011,8,18,3,0,0,1,0,0
3,7.7,1.0,2.8,2012,4,21,5,4,1,0,0,0
4,5.3,1.0,2.0,2010,3,9,1,7,1,0,0,0


In [27]:
nx, ny = ld.drop(['fare_amount'], axis = 1), ld['fare_amount']

In [28]:
nx_train, nx_test, ny_train, ny_test = train_test_split(nx, ny, test_size = 0.3, random_state = 101)

In [29]:
lm.fit(nx_train,ny_train)

LinearRegression()

In [30]:
npred = lm.predict(nx_test)

In [31]:
print('RMSE = ',np.sqrt(skm.mean_squared_error(ny_test,npred)))

RMSE =  4.621234262329999


# Decision tree

In [32]:
dt = DecisionTreeRegressor(max_depth = 6)

In [33]:
dt.fit(x_train, y_train)

DecisionTreeRegressor(max_depth=6)

In [34]:
dt_pred = dt.predict(x_test)

In [35]:
print('RMSE = ',np.sqrt(skm.mean_squared_error(y_test,dt_pred)))

RMSE =  4.087234775831742


# Random Forest

In [37]:
rf = RandomForestRegressor(n_estimators = 20, min_samples_leaf = 60)

In [38]:
rf

RandomForestRegressor(min_samples_leaf=60, n_estimators=20)

In [39]:
rf.fit(x_train, y_train)

RandomForestRegressor(min_samples_leaf=60, n_estimators=20)

In [40]:
rf_pred = rf.predict(x_test)

In [41]:
print('RMSE = ',np.sqrt(skm.mean_squared_error(y_test,rf_pred)))

RMSE =  3.5758632267867614


# XG Boost

In [42]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_test, label=y_test)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

xgb_params = {
    'min_child_weight': 1, 
    'learning_rate': 0.05, 
    'colsample_bytree': 0.7, 
    'max_depth': 10,
    'subsample': 0.7,
    'n_estimators': 5000,
    'n_jobs': -1, 
    'booster' : 'gbtree', 
    'silent': 1,
    'eval_metric': 'rmse'}

model = xgb.train(xgb_params, dtrain, 700, watchlist, early_stopping_rounds=100, maximize=False, verbose_eval=50)

Parameters: { n_estimators, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:13.73034	valid-rmse:13.72587
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[50]	train-rmse:3.65696	valid-rmse:3.75330
[100]	train-rmse:3.31427	valid-rmse:3.48601
[150]	train-rmse:3.21807	valid-rmse:3.44205
[200]	train-rmse:3.15378	valid-rmse:3.41885
[250]	train-rmse:3.10646	valid-rmse:3.40365
[300]	train-rmse:3.06102	valid-rmse:3.39025
[350]	train-rmse:3.02488	valid-rmse:3.38101
[400]	train-rmse:2.98996	valid-rmse:3.37334
[450]	train-rmse:2.96037	valid-rmse:3.36946
[500]	train-rmse:2.93302	valid-rmse:3.36517
[550]	train-rmse:2.90959	valid-rmse:3.36124
[600]	train-rmse:2.88668	valid-