**OBJECTIVE** : 
             The objective of this project is to develop a machine learning model that can predict the fare amount of a taxi ride based on key          ride features such as pickup and dropoff location, date and time, and passenger count.

# Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score 
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore")

# Load Dataset

In [3]:
# Train Dataset
train = pd.read_csv("train_cab_fare.csv.zip")

# Test Dataset
test = pd.read_csv("test_cab_fare.csv")

# Basic Understanding

**Train Data**

In [4]:
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1.0
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2.0
3,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1.0
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1.0


In [5]:
train.shape

(16067, 7)

In [6]:
train.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,16067.0,16067.0,16067.0,16067.0,16012.0
mean,-72.462787,39.914725,-72.462328,39.897906,2.62507
std,10.578384,6.826587,10.575062,6.187087,60.844122
min,-74.438233,-74.006893,-74.429332,-74.006377,0.0
25%,-73.992156,40.734927,-73.991182,40.734651,1.0
50%,-73.981698,40.752603,-73.980172,40.753567,1.0
75%,-73.966838,40.767381,-73.963642,40.768014,2.0
max,40.766125,401.083332,40.802437,41.366138,5345.0


In [7]:
train.dtypes

fare_amount           object
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      float64
dtype: object

**Test Data**

In [8]:
test.shape

(9914, 6)

In [9]:
test.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.974722,40.751041,-73.973657,40.751743,1.671273
std,0.042774,0.033541,0.039072,0.035435,1.278747
min,-74.252193,40.573143,-74.263242,40.568973,1.0
25%,-73.992501,40.736125,-73.991247,40.735254,1.0
50%,-73.982326,40.753051,-73.980015,40.754065,1.0
75%,-73.968013,40.767113,-73.964059,40.768757,2.0
max,-72.986532,41.709555,-72.990963,41.696683,6.0


In [10]:
test.dtypes

pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

# Data Cleaning And Missing Value Analysis

**Clean Train data**

In [11]:
# Convert fare_amount from object to numeric
# Using errors=’coerce’. It will replace all non-numeric values with NaN

train["fare_amount"] = pd.to_numeric(train["fare_amount"],errors = "coerce")  

In [12]:
train.shape

(16067, 7)

In [13]:
train.dtypes

fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      float64
dtype: object

In [14]:
train.dropna(subset=["pickup_datetime"])

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.841610,40.712278,1.0
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.761270,-73.991242,40.750562,2.0
3,7.7,2012-04-21 04:30:42 UTC,-73.987130,40.733143,-73.991567,40.758092,1.0
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1.0
...,...,...,...,...,...,...,...
16062,6.5,2014-12-12 07:41:00 UTC,-74.008820,40.718757,-73.998865,40.719987,1.0
16063,16.1,2009-07-13 07:58:00 UTC,-73.981310,40.781695,-74.014392,40.715527,2.0
16064,8.5,2009-11-11 11:19:07 UTC,-73.972507,40.753417,-73.979577,40.765495,1.0
16065,8.1,2010-05-11 23:53:00 UTC,-73.957027,40.765945,-73.981983,40.779560,1.0


In [15]:
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'], errors='coerce', utc=True)

In [16]:
# Separately extract feature

train["Year"] = train["pickup_datetime"].dt.year
train["Month"] = train["pickup_datetime"].dt.month
train["Date"] = train["pickup_datetime"].dt.date

train["Day"] = train["pickup_datetime"].dt.day
train["Hour"] = train["pickup_datetime"].dt.hour
train["Minute"] = train["pickup_datetime"].dt.minute

In [17]:
# Re-check Train data type after conversion

train.dtypes

fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                  float64
Year                             float64
Month                            float64
Date                              object
Day                              float64
Hour                             float64
Minute                           float64
dtype: object

**Clean Test data**

In [18]:
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'], errors='coerce', utc=True)

In [19]:
# Separately extract feature

test["Year"] = test["pickup_datetime"].dt.year
test["Month"] = test["pickup_datetime"].dt.month
test["Date"] = test["pickup_datetime"].dt.date

test["Day"] = test["pickup_datetime"].dt.day
test["Hour"] = test["pickup_datetime"].dt.hour
test["Minute"] = test["pickup_datetime"].dt.minute

In [20]:
# Re-check Test data type after conversion

test.dtypes

pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
Year                               int32
Month                              int32
Date                              object
Day                                int32
Hour                               int32
Minute                             int32
dtype: object

**Pickup_Datetime**

In [21]:
# Removing pickup_datetime missing values rows

train = train.drop(train[train['pickup_datetime'].isnull()].index, axis=0)
print(train.shape)
print(train['pickup_datetime'].isnull().sum())

(16066, 13)
0


**Passenger_Count**

In [22]:
train['passenger_count'].describe()

count    16011.000000
mean         2.625171
std         60.846021
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max       5345.000000
Name: passenger_count, dtype: float64

In [23]:
# Removing the passenger count having count>6

train = train.drop(train[train["passenger_count"]> 6 ].index, axis=0)

In [24]:
# Removing the values with passenger count of 0

train = train.drop(train[train["passenger_count"] == 0 ].index, axis=0)

In [25]:
# Re-check

train["passenger_count"].describe()

count    15934.000000
mean         1.649581
std          1.265943
min          0.120000
25%          1.000000
50%          1.000000
75%          2.000000
max          6.000000
Name: passenger_count, dtype: float64

In [26]:
train["passenger_count"].sort_values(ascending=True)

8862     0.12
16048    1.00
22       1.00
21       1.00
20       1.00
         ... 
8076      NaN
8139      NaN
8259      NaN
8306      NaN
16066     NaN
Name: passenger_count, Length: 15989, dtype: float64

In [27]:
# Removing passenger_count missing values rows

train = train.drop(train[train['passenger_count'].isnull()].index, axis=0)
print(train.shape)
print(train['passenger_count'].isnull().sum())

(15934, 13)
0


In [28]:
# There is one passenger count value of 0.12 which is not possible. 
# Hence we will remove fractional passenger value

train = train.drop(train[train['passenger_count'] == 0.12].index, axis=0)

In [29]:
train.shape

(15933, 13)

**Fare_Amount**

In [30]:
# Finding decending order of fare_amount to get to know whether the outliers are present or not

train["fare_amount"].sort_values(ascending=False)

1015    54343.0
1072     4343.0
607       453.0
980       434.0
1335      180.0
         ...   
1712        NaN
2412        NaN
2458        NaN
8178        NaN
8226        NaN
Name: fare_amount, Length: 15933, dtype: float64

In [31]:
# Fare amount has a negative value, which doesn't make sense. 

Counter(train["fare_amount"] < 0)

Counter({False: 15930, True: 3})

In [32]:
# Remove those fields which has -ve values

train = train.drop(train[train['fare_amount'] < 0].index, axis=0)
print(train.shape)

(15930, 13)


In [33]:
# Make sure there is no negative values in the fare_amount variable column

train["fare_amount"].min()

0.0

In [34]:
# Remove the row where fare amount is zero

train = train.drop(train[train['fare_amount'] < 1].index,axis=0)
train.shape

(15928, 13)

In [35]:
# Fare amount variable in descending order

train["fare_amount"].sort_values(ascending=False)

1015    54343.0
1072     4343.0
607       453.0
980       434.0
1335      180.0
         ...   
1712        NaN
2412        NaN
2458        NaN
8178        NaN
8226        NaN
Name: fare_amount, Length: 15928, dtype: float64

In [36]:
# Removing the missing value rows

train = train.drop(train[train["fare_amount"].isnull()].index, axis=0)
train.shape

(15904, 13)

In [37]:
# Removing the outlier 

train = train.drop(train[train["fare_amount"] > 453].index, axis=0)
train.shape

(15902, 13)

In [38]:
# Fare_amount variable in descending order

train['fare_amount'].sort_values(ascending=False)

607     453.00
980     434.00
1335    180.00
1483    165.00
6630    128.83
         ...  
4367      2.50
6297      2.50
3558      2.50
6226      2.50
1427      1.14
Name: fare_amount, Length: 15902, dtype: float64

In [39]:
train['fare_amount'].describe()

count    15902.000000
mean        11.376356
std         10.814908
min          1.140000
25%          6.000000
50%          8.500000
75%         12.500000
max        453.000000
Name: fare_amount, dtype: float64

**Pickup and Dropoff Latitude/Longitude**

In [40]:
# Filter pickup latitude between 40.5 and 41.0
train = train[(train['pickup_latitude'] >= 40.5) & (train['pickup_latitude'] <= 41.0)]

# Filter pickup longitude between -74.3 and -73.5
train = train[(train['pickup_longitude'] >= -74.3) & (train['pickup_longitude'] <= -73.5)]


In [41]:
train['pickup_latitude'].sort_values(ascending=False)

3394     40.956018
4004     40.876747
12934    40.876620
11366    40.866375
4313     40.865160
           ...    
4800     40.614803
11964    40.609495
11859    40.604462
12940    40.572183
13736    40.571310
Name: pickup_latitude, Length: 15560, dtype: float64

In [42]:
train['pickup_longitude'].sort_values(ascending=False)

12915   -73.652179
13175   -73.688170
7908    -73.725993
4086    -73.776293
15366   -73.776343
           ...    
7635    -74.102257
1494    -74.105395
9090    -74.133710
10524   -74.181602
12940   -74.229138
Name: pickup_longitude, Length: 15560, dtype: float64

In [43]:
# Filter pickup latitude between 40.5 and 41.0
train = train[(train['dropoff_latitude'] >= 40.5) & (train['dropoff_latitude'] <= 41.0)]

# Filter pickup longitude between -74.3 and -73.5
train = train[(train['dropoff_longitude'] >= -74.3) & (train['dropoff_longitude'] <= -73.5)]


In [44]:
train['dropoff_latitude'].sort_values(ascending=False)

3394     40.954298
4118     40.921800
7769     40.911461
7021     40.905170
15207    40.904590
           ...    
8552     40.574729
12398    40.574652
4291     40.551710
10000    40.550097
13736    40.543870
Name: dropoff_latitude, Length: 15541, dtype: float64

In [45]:
train['dropoff_latitude'].sort_values(ascending=False)

3394     40.954298
4118     40.921800
7769     40.911461
7021     40.905170
15207    40.904590
           ...    
8552     40.574729
12398    40.574652
4291     40.551710
10000    40.550097
13736    40.543870
Name: dropoff_latitude, Length: 15541, dtype: float64

In [46]:
train.shape

(15541, 13)

In [47]:
# Check null value for Train data

train.isnull().sum()


# Check null value for Test data

test.isnull().sum()

pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
Year                 0
Month                0
Date                 0
Day                  0
Hour                 0
Minute               0
dtype: int64

**Calculate Distance**

In [48]:
# # To Calculate Distance we use Haversine Formula 

from math import radians, cos, sin, asin, sqrt
def haversine(a):
      lon1=a[0]
      lat1=a[1]
      lon2=a[2]
      lat2=a[3]
    
# Convert decimal degrees to radians 
      lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

# Haversine Formula 
      dlon = lon2 - lon1 
      dlat = lat2 - lat1 
      a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
      c =  2 * asin(sqrt(a))

# Radius of earth in kilometers is 6371
      km = 6371* c
      return km
    

In [49]:
# Distance (Train)

train["distance"] = train[["pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude"]].apply(haversine, axis=1)

In [50]:
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Year,Month,Date,Day,Hour,Minute,distance
0,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1.0,2009.0,6.0,2009-06-15,15.0,17.0,26.0,1.030764
1,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1.0,2010.0,1.0,2010-01-05,5.0,16.0,52.0,8.450134
2,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2.0,2011.0,8.0,2011-08-18,18.0,0.0,35.0,1.389525
3,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1.0,2012.0,4.0,2012-04-21,21.0,4.0,30.0,2.79927
4,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1.0,2010.0,3.0,2010-03-09,9.0,7.0,51.0,1.999157


In [51]:
# Distance (Test)

test["distance"] = test[["pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude"]].apply(haversine, axis=1)

In [52]:
test.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Year,Month,Date,Day,Hour,Minute,distance
0,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1,2015,1,2015-01-27,27,13,8,2.323259
1,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1,2015,1,2015-01-27,27,13,8,2.425353
2,2011-10-08 11:53:44+00:00,-73.982524,40.75126,-73.979654,40.746139,1,2011,10,2011-10-08,8,11,53,0.618628
3,2012-12-01 21:12:12+00:00,-73.98116,40.767807,-73.990448,40.751635,1,2012,12,2012-12-01,1,21,12,1.961033
4,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1,2012,12,2012-12-01,1,21,12,5.387301


In [53]:
# Finding decending order of distance to get to know whether the outliers are presented or not.

train['distance'].sort_values(ascending=False)

4118     32.602535
7021     29.478280
6677     26.369072
8105     24.690884
15023    24.125745
           ...    
4086      0.000000
5449      0.000000
5452      0.000000
11437     0.000000
15524     0.000000
Name: distance, Length: 15541, dtype: float64

In [54]:
Counter(train['distance'] == 0)

Counter({False: 15393, True: 148})

In [55]:
Counter(test['distance'] == 0)

Counter({False: 9829, True: 85})

In [56]:
# We will remove the rows whose distance value is zero

train = train.drop(train[train['distance'] == 0].index, axis=0)
train.shape

(15393, 14)

In [57]:
# We will remove the rows whose distance values is very high, which is more than 129kms.

train = train.drop(train[train['distance'] > 130 ].index, axis=0)
train.shape

(15393, 14)

In [58]:
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Year,Month,Date,Day,Hour,Minute,distance
0,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1.0,2009.0,6.0,2009-06-15,15.0,17.0,26.0,1.030764
1,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1.0,2010.0,1.0,2010-01-05,5.0,16.0,52.0,8.450134
2,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2.0,2011.0,8.0,2011-08-18,18.0,0.0,35.0,1.389525
3,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1.0,2012.0,4.0,2012-04-21,21.0,4.0,30.0,2.79927
4,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1.0,2010.0,3.0,2010-03-09,9.0,7.0,51.0,1.999157


In [59]:
# Drop (Train) Dataset Columns

drop1 = ['pickup_datetime', 'pickup_longitude', 'pickup_latitude','dropoff_longitude', 'dropoff_latitude', 'Minute']

train = train.drop(drop1, axis = 1, errors='ignore')

In [60]:
train.head()

Unnamed: 0,fare_amount,passenger_count,Year,Month,Date,Day,Hour,distance
0,4.5,1.0,2009.0,6.0,2009-06-15,15.0,17.0,1.030764
1,16.9,1.0,2010.0,1.0,2010-01-05,5.0,16.0,8.450134
2,5.7,2.0,2011.0,8.0,2011-08-18,18.0,0.0,1.389525
3,7.7,1.0,2012.0,4.0,2012-04-21,21.0,4.0,2.79927
4,5.3,1.0,2010.0,3.0,2010-03-09,9.0,7.0,1.999157


In [61]:
train['Date'] = pd.to_datetime(train['Date'])
train['Date'] = train['Date'].dt.strftime('%Y%m%d').astype(int)

In [62]:
train['passenger_count'] = train['passenger_count'].astype('int64')
train['Year'] = train['Year'].astype('int64')
train['Month'] = train['Month'].astype('int64')
train['Date'] = train['Date'].astype('int64')
train['Day'] = train['Day'].astype('int64')
train['Hour'] = train['Hour'].astype('int64')

In [63]:
train.dtypes

fare_amount        float64
passenger_count      int64
Year                 int64
Month                int64
Date                 int64
Day                  int64
Hour                 int64
distance           float64
dtype: object

In [64]:
# Remove (Test) Dataset Columns

drop2 = ['pickup_datetime', 'pickup_longitude', 'pickup_latitude','dropoff_longitude', 'dropoff_latitude', 'Minute']

test = test.drop(drop2, axis = 1)

In [65]:
test.head()

Unnamed: 0,passenger_count,Year,Month,Date,Day,Hour,distance
0,1,2015,1,2015-01-27,27,13,2.323259
1,1,2015,1,2015-01-27,27,13,2.425353
2,1,2011,10,2011-10-08,8,11,0.618628
3,1,2012,12,2012-12-01,1,21,1.961033
4,1,2012,12,2012-12-01,1,21,5.387301


In [66]:
test['Date'] = pd.to_datetime(test['Date'])
test['Date'] = test['Date'].dt.strftime('%Y%m%d').astype(int)

In [67]:
test['passenger_count'] = test['passenger_count'].astype('int64')
test['Year'] = test['Year'].astype('int64')
test['Month'] = test['Month'].astype('int64')
test['Date'] = test['Date'].astype('int64')
test['Day'] = test['Day'].astype('int64')
test['Hour'] = test['Hour'].astype('int64')

In [68]:
test.dtypes

passenger_count      int64
Year                 int64
Month                int64
Date                 int64
Day                  int64
Hour                 int64
distance           float64
dtype: object