In [1]:
import os
import numpy as np 
import pandas as pd 

In [2]:
train_data = pd.read_csv('D:/new-york-city-taxi-fare-prediction/train.csv', nrows = 2000000)

In [3]:
train_data.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [4]:
test_data = pd.read_csv('D:/new-york-city-taxi-fare-prediction/test.csv')

In [5]:
test_data.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


Dropped the NaN columns

In [6]:
train_data.isnull().sum()

key                   0
fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude    14
dropoff_latitude     14
passenger_count       0
dtype: int64

In [7]:
train_data = train_data.dropna()
train_data.isnull().sum()

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [8]:
(train_data==0).astype(int).sum()

key                      0
fare_amount             56
pickup_datetime          0
pickup_longitude     37686
pickup_latitude      37567
dropoff_longitude    37660
dropoff_latitude     37553
passenger_count       7109
dtype: int64

In [9]:
train_data = train_data.loc[~(train_data==0).any(axis =1)]

In [10]:
(train_data==0).astype(int).sum()

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [11]:
train_data.drop('key', axis = 1, inplace = True)
train_data.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


Extracted the dates separately and Dropped the pickup_datetime column

In [12]:
import datetime as dt

def date_time_features(data):
    data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
    data['year'] = data['pickup_datetime'].dt.year
    data['month'] = data['pickup_datetime'].dt.month
    data['weekday'] = data['pickup_datetime'].dt.day
    data['hour'] = data['pickup_datetime'].dt.hour
    data = data.drop('pickup_datetime', axis = 1, inplace = True)
    
    return data

In [13]:
date_time_features(train_data)
train_data.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,17
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,16
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,0
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,4
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,7


In [14]:
date_time_features(test_data)
test_data.head()

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour
0,2015-01-27 13:08:24.0000002,-73.97332,40.763805,-73.98143,40.743835,1,2015,1,27,13
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,2015,1,27,13
2,2011-10-08 11:53:44.0000002,-73.982524,40.75126,-73.979654,40.746139,1,2011,10,8,11
3,2012-12-01 21:12:12.0000002,-73.98116,40.767807,-73.990448,40.751635,1,2012,12,1,21
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,2012,12,1,21


In [15]:
def longitude_latitude_distance (dist):
    dist['Longitude_distance'] = np.radians(dist['pickup_longitude'] - dist['dropoff_longitude'])
    dist['Latitude_distance'] = np.radians(dist['pickup_latitude'] - dist['dropoff_latitude']) 
    dist['Distance Travelled/1000'] = ((dist['Longitude_distance']**2 + dist['Latitude_distance']**2)**0.5) *1000
    return dist

In [16]:
for i in [train_data, test_data]:
    longitude_latitude_distance(i)

In [17]:
train_data.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,Longitude_distance,Latitude_distance,Distance Travelled/1000
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,17,-4.7e-05,0.000158,0.164686
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,16,-0.000642,-0.001234,1.390952
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,0,0.000148,0.000187,0.238657
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,4,7.7e-05,-0.000435,0.442275
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,7,-0.0002,-0.000275,0.339807


In [18]:
test_data.head()

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,Longitude_distance,Latitude_distance,Distance Travelled/1000
0,2015-01-27 13:08:24.0000002,-73.97332,40.763805,-73.98143,40.743835,1,2015,1,27,13,0.000142,0.000349,0.376187
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,2015,1,27,13,0.00021,-0.000346,0.404563
2,2011-10-08 11:53:44.0000002,-73.982524,40.75126,-73.979654,40.746139,1,2011,10,8,11,-5e-05,8.9e-05,0.102458
3,2012-12-01 21:12:12.0000002,-73.98116,40.767807,-73.990448,40.751635,1,2012,12,1,21,0.000162,0.000282,0.325494
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,2012,12,1,21,0.000393,0.000791,0.883686


In [19]:
def cal_distance(x):
    R = 6371000   #Radius of Earth
    from_lat = np.radians(x['pickup_latitude'])
    to_lat = np.radians(x['dropoff_latitude'])
    from_long = np.radians(x['pickup_longitude'])
    to_long = np.radians(x['dropoff_longitude'])
    lat_diff = x['Latitude_distance']
    long_diff = x['Longitude_distance']
   
    a = np.sin(long_diff/2)**2 + np.cos(to_lat)*np.cos(from_lat)*np.sin(lat_diff/2)**2
    c = 2 * np.arctan2(a**0.5, (1-a)**0.5)
    x['Harvesine Distance/km'] = (R * c)/1000

In [20]:
for i in [train_data, test_data]:
    cal_distance(i)

In [21]:
train_data.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,Longitude_distance,Latitude_distance,Distance Travelled/1000,Harvesine Distance/km
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,17,-4.7e-05,0.000158,0.164686,0.819025
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,16,-0.000642,-0.001234,1.390952,7.224927
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,0,0.000148,0.000187,0.238657,1.30677
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,4,7.7e-05,-0.000435,0.442275,2.158908
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,7,-0.0002,-0.000275,0.339807,1.837912


In [22]:
test_data.head()

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,Longitude_distance,Latitude_distance,Distance Travelled/1000,Harvesine Distance/km
0,2015-01-27 13:08:24.0000002,-73.97332,40.763805,-73.98143,40.743835,1,2015,1,27,13,0.000142,0.000349,0.376187,1.908602
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,2015,1,27,13,0.00021,-0.000346,0.404563,2.139176
2,2011-10-08 11:53:44.0000002,-73.982524,40.75126,-73.979654,40.746139,1,2011,10,8,11,-5e-05,8.9e-05,0.102458,0.5366
3,2012-12-01 21:12:12.0000002,-73.98116,40.767807,-73.990448,40.751635,1,2012,12,1,21,0.000162,0.000282,0.325494,1.709361
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,2012,12,1,21,0.000393,0.000791,0.883686,4.566714


In [23]:
train_data.isnull().sum()

fare_amount                0
pickup_longitude           0
pickup_latitude            0
dropoff_longitude          0
dropoff_latitude           0
passenger_count            0
year                       0
month                      0
weekday                    0
hour                       0
Longitude_distance         0
Latitude_distance          0
Distance Travelled/1000    0
Harvesine Distance/km      6
dtype: int64

In [24]:
train_data = train_data.dropna()

In [25]:
test_data.isnull().sum()

key                        0
pickup_longitude           0
pickup_latitude            0
dropoff_longitude          0
dropoff_latitude           0
passenger_count            0
year                       0
month                      0
weekday                    0
hour                       0
Longitude_distance         0
Latitude_distance          0
Distance Travelled/1000    0
Harvesine Distance/km      0
dtype: int64

Feature Selection

In [26]:
training_data = train_data.drop(['pickup_longitude', 'dropoff_longitude','pickup_latitude','dropoff_latitude',
                    'Longitude_distance', 'Latitude_distance'], axis =1)

training_data.head()

Unnamed: 0,fare_amount,passenger_count,year,month,weekday,hour,Distance Travelled/1000,Harvesine Distance/km
0,4.5,1,2009,6,15,17,0.164686,0.819025
1,16.9,1,2010,1,5,16,1.390952,7.224927
2,5.7,2,2011,8,18,0,0.238657,1.30677
3,7.7,1,2012,4,21,4,0.442275,2.158908
4,5.3,1,2010,3,9,7,0.339807,1.837912


In [27]:
testing_data = test_data.drop(['pickup_longitude', 'dropoff_longitude','pickup_latitude','dropoff_latitude',
                    'Longitude_distance', 'Latitude_distance'], axis =1)

testing_data.head()

Unnamed: 0,key,passenger_count,year,month,weekday,hour,Distance Travelled/1000,Harvesine Distance/km
0,2015-01-27 13:08:24.0000002,1,2015,1,27,13,0.376187,1.908602
1,2015-01-27 13:08:24.0000003,1,2015,1,27,13,0.404563,2.139176
2,2011-10-08 11:53:44.0000002,1,2011,10,8,11,0.102458,0.5366
3,2012-12-01 21:12:12.0000002,1,2012,12,1,21,0.325494,1.709361
4,2012-12-01 21:12:12.0000003,1,2012,12,1,21,0.883686,4.566714


Modelling and Prediction

In [28]:
from sklearn.model_selection import train_test_split
features = [i for i in training_data.columns if i!= 'fare_amount']
X = training_data[features]
Y = training_data['fare_amount']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [29]:
testing_data.drop('key', axis = 1, inplace = True)
testing_data.head()

Unnamed: 0,passenger_count,year,month,weekday,hour,Distance Travelled/1000,Harvesine Distance/km
0,1,2015,1,27,13,0.376187,1.908602
1,1,2015,1,27,13,0.404563,2.139176
2,1,2011,10,8,11,0.102458,0.5366
3,1,2012,12,1,21,0.325494,1.709361
4,1,2012,12,1,21,0.883686,4.566714


Model - 1 : Linear Regression

In [30]:
from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression(fit_intercept=True, normalize=False)
linear_reg.fit(X_train, Y_train)
y_pred = linear_reg.predict(X_test)

In [31]:
prediction_1 = linear_reg.predict(testing_data)

Result_1 = pd.DataFrame({'key': test_data.key, 'fare_amount': prediction_1})
Result_1.to_csv('Submission_LinearRegression.csv', index = False)

In [32]:
Result_1.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,12.813115
1,2015-01-27 13:08:24.0000003,12.813833
2,2011-10-08 11:53:44.0000002,11.260782
3,2012-12-01 21:12:12.0000002,11.829465
4,2012-12-01 21:12:12.0000003,11.838289


Model - 2 : Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(n_estimators = 100, max_features = 5)
rf_reg.fit(X_train, Y_train)

In [35]:
y_pred = rf_reg.predict(X_test)

In [38]:
prediction_2 = rf_reg.predict(testing_data)

Result_2 = pd.DataFrame({'key': test_data.key, 'fare_amount': prediction_2})
Result_2.to_csv('Submission_RandomForest.csv', index = False)

In [39]:
Result_2.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,9.385
1,2015-01-27 13:08:24.0000003,10.555
2,2011-10-08 11:53:44.0000002,4.536
3,2012-12-01 21:12:12.0000002,7.747
4,2012-12-01 21:12:12.0000003,16.705


Model - 3 : XGBoost Regression

In [43]:
from xgboost import XGBRegressor
XGB_model = XGBRegressor(learning_rate=0.3, max_depth=6, n_estimators=100)
XGB_model.fit(X_train, Y_train)
y_predict = XGB_model.predict(X_test)

In [48]:
prediction_3 = XGB_model.predict(testing_data)

Result_3 = pd.DataFrame({'key': test_data.key, 'fare_amount': prediction_3})
Result_3.to_csv('Submission_XGBoost.csv', index = False)

In [49]:
Result_3.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.129586
1,2015-01-27 13:08:24.0000003,10.335686
2,2011-10-08 11:53:44.0000002,4.835765
3,2012-12-01 21:12:12.0000002,7.587176
4,2012-12-01 21:12:12.0000003,15.153477
