## Taxifare prediction using RandomForest Model

In [2]:
!pip install haversine



In [3]:
import pandas as pd
import numpy as np
import haversine as hs
import os 

In [4]:
os.getcwd()

'C:\\Users\\ANAMIKA\\Downloads'

In [5]:
df = pd.read_csv('TaxiFare.csv')
df.head()

Unnamed: 0,unique_id,amount,date_time_of_pickup,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,26:21.0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,52:16.0,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,35:00.0,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,30:42.0,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,51:00.0,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   unique_id             50000 non-null  object 
 1   amount                50000 non-null  float64
 2   date_time_of_pickup   50000 non-null  object 
 3   longitude_of_pickup   50000 non-null  float64
 4   latitude_of_pickup    50000 non-null  float64
 5   longitude_of_dropoff  50000 non-null  float64
 6   latitude_of_dropoff   50000 non-null  float64
 7   no_of_passenger       50000 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 3.1+ MB


In [7]:
df.describe()

Unnamed: 0,amount,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,11.364171,-72.509756,39.933759,-72.504616,39.926251,1.66784
std,9.685557,10.39386,6.224857,10.40757,6.014737,1.289195
min,-5.0,-75.423848,-74.006893,-84.654241,-74.006377,0.0
25%,6.0,-73.992062,40.73488,-73.991152,40.734372,1.0
50%,8.5,-73.98184,40.752678,-73.980082,40.753372,1.0
75%,12.5,-73.967148,40.76736,-73.963584,40.768167,2.0
max,200.0,40.783472,401.083332,40.851027,43.41519,6.0


In [8]:
df[df.latitude_of_pickup > 180]
df.drop(5686, axis=0, inplace=True)

In [9]:
df.shape

(49999, 8)

In [10]:
def distance_calculator(plong, plat, dlong, dlat):
    dist = round(hs.haversine((plong, plat), (dlong, dlat)),2)
    return dist

In [11]:
df['distance'] = df.apply(lambda x: distance_calculator(x.longitude_of_pickup, x.latitude_of_pickup, \
                                                        x.longitude_of_dropoff,x.latitude_of_dropoff ), axis=1)

In [12]:
df.head()

Unnamed: 0,unique_id,amount,date_time_of_pickup,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger,distance
0,26:21.0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1,0.41
1,52:16.0,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,4.63
2,35:00.0,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2,1.0
3,30:42.0,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1,0.91
4,51:00.0,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,1.36


In [13]:
## Converting date_time_of_pickup to a date_time object so that we can parse the parts of the dates.
df['date_time_of_pickup'] = pd.to_datetime(df.date_time_of_pickup)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49999 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   unique_id             49999 non-null  object             
 1   amount                49999 non-null  float64            
 2   date_time_of_pickup   49999 non-null  datetime64[ns, UTC]
 3   longitude_of_pickup   49999 non-null  float64            
 4   latitude_of_pickup    49999 non-null  float64            
 5   longitude_of_dropoff  49999 non-null  float64            
 6   latitude_of_dropoff   49999 non-null  float64            
 7   no_of_passenger       49999 non-null  int64              
 8   distance              49999 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(6), int64(1), object(1)
memory usage: 3.8+ MB


In [15]:
# extract various date time components as seperate variables
df = df.assign(hour = df.date_time_of_pickup.dt.hour, 
                         day = df.date_time_of_pickup.dt.day,
                        month = df.date_time_of_pickup.dt.month, 
                        year = df.date_time_of_pickup.dt.year, 
                        dayofweek = df.date_time_of_pickup.dt.dayofweek)

In [16]:
df.head()

Unnamed: 0,unique_id,amount,date_time_of_pickup,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger,distance,hour,day,month,year,dayofweek
0,26:21.0,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1,0.41,17,15,6,2009,0
1,52:16.0,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1,4.63,16,5,1,2010,1
2,35:00.0,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2,1.0,0,18,8,2011,3
3,30:42.0,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1,0.91,4,21,4,2012,5
4,51:00.0,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1,1.36,7,9,3,2010,1


In [17]:
df.columns

Index(['unique_id', 'amount', 'date_time_of_pickup', 'longitude_of_pickup',
       'latitude_of_pickup', 'longitude_of_dropoff', 'latitude_of_dropoff',
       'no_of_passenger', 'distance', 'hour', 'day', 'month', 'year',
       'dayofweek'],
      dtype='object')

In [18]:
req_columns = ['amount','no_of_passenger', 'distance', 'hour', 'day', 'month', 'year','dayofweek']

df_new = df[req_columns]
df_new.shape

(49999, 8)

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [20]:
X = df_new.drop('amount',axis=1)
y = df['amount']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state = 1)
print(X_train.shape, X_test.shape,y_train.shape, y_test.shape)

(37499, 7) (12500, 7) (37499,) (12500,)


In [22]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_tr_pred = lr_model.predict(X_train)
lr_pred = lr_model.predict(X_test)

In [23]:
## Evaluating the model
from sklearn.metrics import mean_squared_error, r2_score

print('LR Model R2_score : ', r2_score(y_train, lr_tr_pred))
print('LR Model RMSE : ', np.sqrt(mean_squared_error(y_test, lr_pred)))

LR Model R2_score :  0.017202545423987248
LR Model RMSE :  9.702442296374347


In [24]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
rf_tr_pred = rf_model.predict(X_train)
rf_pred = rf_model.predict(X_test)

In [25]:
## Evaluating the model
print('RF Model R2_score : ', r2_score(y_test, rf_pred))
print('RF Model RMSE : ', np.sqrt(mean_squared_error(y_test, rf_pred)))

RF Model R2_score :  0.6630082508325282
RF Model RMSE :  5.68114925528723


In [26]:
df_fea_imp = pd.DataFrame(zip(X_train.columns, rf_model.feature_importances_), columns=['cols','imp'])
df_fea_imp.sort_values('imp', ascending=False)

Unnamed: 0,cols,imp
1,distance,0.725258
2,hour,0.066409
3,day,0.064213
4,month,0.046491
5,year,0.042581
6,dayofweek,0.036735
0,no_of_passenger,0.018312


In [27]:
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

In [28]:
## Evaluating the model
print('RF Model R2_score : ', r2_score(y_test, dt_pred))
print('RF Model RMSE : ', np.sqrt(mean_squared_error(y_test, dt_pred)))

RF Model R2_score :  0.26045468049800036
RF Model RMSE :  8.416062687979457


In [29]:
df.amount.mean()

11.364332686653714

In [None]:
hour_mean

In [32]:
weekday_mean=df_new.groupby('dayofweek')['amount'].mean()

In [38]:
#feature engineering

In [None]:
weekday_fare=df_new.loc[(df_new.dayofweek>=0)&(df_new.dayofweek<=4)].groupby('hour')['amount'].mean().round(2)
weekday_fare=df_new.loc[(df_new.dayofweek>=5)&(df_new.dayofweek<=6)].groupby('hour')['amount'].mean().round(2)

In [37]:
df_new.loc[(df_new.dayofweek>=0)&(df_new.dayofweek<=4)].head()

Unnamed: 0,amount,no_of_passenger,distance,hour,day,month,year,dayofweek,daytime,day_time
0,4.5,1,0.41,17,15,6,2009,0,NT,1.0
1,16.9,1,4.63,16,5,1,2010,1,NT,1.0
2,5.7,2,1.0,0,18,8,2011,3,NT,
4,5.3,1,1.36,7,9,3,2010,1,NT,1.0
5,12.1,1,3.23,9,6,1,2011,3,NT,1.0


In [39]:
x=weekday_fare.values
z=weekend_fare.values

NameError: name 'weekday_fare' is not defined

In [40]:
#set width of bar
barwidth=0.25
fig=plt.subplots(figsize=(12,8))



plt.bar(x=x,y=y,color='r',label='week day')
plt.bar(x=x,y=z,color)

SyntaxError: positional argument follows keyword argument (902952953.py, line 8)

In [35]:
#creating a new column 'daytime' to update

df_new.loc[:,'daytime']=np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.loc[:,'daytime']=np.nan


In [36]:
df_new.loc[(df_new.hour>5)& (df_new.hour<20),'day_time']='01'
df_new.daytime.fillna('NT',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.loc[(df_new.hour>5)& (df_new.hour<20),'day_time']='01'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.daytime.fillna('NT',inplace=True)


In [None]:
df_new=df_new.pd.get_dummies(df_new,drop_first=True)

In [44]:
df_new.head()

Unnamed: 0,amount,no_of_passenger,distance,hour,day,month,year,dayofweek,daytime,day_time
0,4.5,1,0.41,17,15,6,2009,0,NT,1.0
1,16.9,1,4.63,16,5,1,2010,1,NT,1.0
2,5.7,2,1.0,0,18,8,2011,3,NT,
3,7.7,1,0.91,4,21,4,2012,5,NT,
4,5.3,1,1.36,7,9,3,2010,1,NT,1.0


In [41]:
x=df_new.drop('amount',axis=1)
y=df_new['amount']

In [43]:
x.shape,y.shape

((49999, 9), (49999,))

In [None]:
rf_model=RandomForestRegressor()

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state = 1)
rf_model.fit(X_train,y_train)
rf_tr_pred=rf_model.predict(X_train)
rf_test_pred=rf_model.predict(X_test)

In [49]:
print('LR Model R2_score : ', r2_score(y_test, rf_tr_pred))
print('LR Model RMSE : ', np.sqrt(mean_squared_error(y_test, rf_test_pred)))

ValueError: Found input variables with inconsistent numbers of samples: [12500, 37499]