In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_csv('/content/drive/MyDrive/NYC Taxi Duration Trip Project/nyc_taxi_modelling.csv',)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,...,id_puzone,pickup_zone,id_dozone,dropoff_zone,distance_km,speed,pickup_date,pickup_month,pickup_year,pickup_hour
0,0,id1080784,2,2016-02-29 16:40:00,29/02/2016 16:47,1,-73.953918,40.778873,-73.963875,40.771164,...,59,Crotona Park,59,Crotona Park,1.2,10.8,29,2,2016,16
1,1,id0889885,1,2016-03-11 23:35:00,11/03/2016 23:53,2,-73.988312,40.731743,-73.994751,40.694931,...,59,Crotona Park,59,Crotona Park,4.12,13.5,11,3,2016,23
2,2,id0857912,2,2016-02-21 17:59:00,21/02/2016 18:26,2,-73.997314,40.721458,-73.948029,40.774918,...,59,Crotona Park,59,Crotona Park,7.25,15.96,21,2,2016,17
3,3,id3744273,2,2016-01-05 09:44:00,05/01/2016 10:03,6,-73.96167,40.75972,-73.956779,40.780628,...,59,Crotona Park,59,Crotona Park,2.36,7.44,5,1,2016,9
4,4,id0232939,1,2016-02-17 06:42:00,17/02/2016 06:56,1,-74.01712,40.708469,-73.988182,40.740631,...,59,Crotona Park,59,Crotona Park,4.33,18.37,17,2,2016,6


# Feature Engineering

### Columns Transformation

In [5]:
df.columns

Index(['Unnamed: 0', 'id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'id_puzone', 'pickup_zone', 'id_dozone',
       'dropoff_zone', 'distance_km', 'speed', 'pickup_date', 'pickup_month',
       'pickup_year', 'pickup_hour'],
      dtype='object')

In [6]:
col_list = ['Unnamed: 0', 'id', 'pickup_datetime', 'dropoff_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag', 'pickup_zone', 'id_dozone', 'dropoff_zone']
df.drop(columns=col_list, inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729322 entries, 0 to 729321
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   vendor_id        729322 non-null  int64  
 1   passenger_count  729322 non-null  int64  
 2   trip_duration    729322 non-null  int64  
 3   id_puzone        729322 non-null  int64  
 4   distance_km      729322 non-null  float64
 5   speed            729322 non-null  float64
 6   pickup_date      729322 non-null  int64  
 7   pickup_month     729322 non-null  int64  
 8   pickup_year      729322 non-null  int64  
 9   pickup_hour      729322 non-null  int64  
dtypes: float64(2), int64(8)
memory usage: 55.6 MB


In [8]:
df.head()

Unnamed: 0,vendor_id,passenger_count,trip_duration,id_puzone,distance_km,speed,pickup_date,pickup_month,pickup_year,pickup_hour
0,2,1,400,59,1.2,10.8,29,2,2016,16
1,1,2,1100,59,4.12,13.5,11,3,2016,23
2,2,2,1635,59,7.25,15.96,21,2,2016,17
3,2,6,1141,59,2.36,7.44,5,1,2016,9
4,1,1,848,59,4.33,18.37,17,2,2016,6


In [9]:
df = df[['vendor_id', 'passenger_count', 'id_puzone', 'pickup_date', 'pickup_month', 'pickup_year', 'pickup_hour', 'distance_km', 'speed', 'trip_duration']]

## Delete Outlier

**Passenger_Count**
for passenger number is 0 and greater than 5

In [10]:
df = df[~(df['passenger_count'] == 0) & (df['passenger_count'] <= 5)]

**Duration_Trip** for long time than 7200 sec

In [44]:
df = df[(df['trip_duration'] >= 300) & (df['trip_duration'] <= 7200)]

**Distance** more than 150 km and equal with 0

In [12]:
df = df[(df['distance_km'] >= 1) & (df['distance_km'] <= 150)]

**Speed** more than 120 km/hour

In [13]:
df = df[(df['speed'] >= 1) & (df['speed'] <= 120)]

In [50]:
df.describe()

Unnamed: 0,vendor_id,passenger_count,id_puzone,pickup_date,pickup_month,pickup_year,pickup_hour,distance_km,speed,trip_duration
count,548536.0,548536.0,548536.0,548536.0,548536.0,548536.0,548536.0,548536.0,548536.0,548536.0
mean,1.519342,1.519778,62.937454,15.538273,3.534012,2016.0,13.708247,4.169602,14.758715,986.846158
std,0.499626,1.058339,31.259224,8.696638,1.680512,0.0,6.441051,4.179051,7.513394,654.186687
min,1.0,1.0,4.0,1.0,1.0,2016.0,0.0,1.0,1.0,300.0
25%,1.0,1.0,59.0,8.0,2.0,2016.0,9.0,1.74,9.65,544.0
50%,2.0,1.0,59.0,15.0,4.0,2016.0,14.0,2.66,13.05,802.0
75%,2.0,2.0,59.0,23.0,5.0,2016.0,19.0,4.72,17.94,1214.0
max,2.0,5.0,254.0,31.0,6.0,2016.0,23.0,116.61,116.72,7153.0


In [45]:
df.shape

(548536, 10)

In [46]:
((729322-586691)/729322)*100

19.55665673049764

# Baseline Model Linear

In [47]:
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [48]:
X

array([[ 2.  ,  1.  , 59.  , ..., 16.  ,  1.2 , 10.8 ],
       [ 1.  ,  2.  , 59.  , ..., 23.  ,  4.12, 13.5 ],
       [ 2.  ,  2.  , 59.  , ..., 17.  ,  7.25, 15.96],
       ...,
       [ 2.  ,  1.  , 59.  , ..., 22.  ,  2.2 , 17.68],
       [ 1.  ,  1.  , 59.  , ..., 18.  ,  2.3 , 12.28],
       [ 2.  ,  4.  , 59.  , ..., 17.  ,  5.95, 17.51]])

In [49]:
y

array([ 400, 1100, 1635, ...,  449,  673, 1224])

In [51]:
# split train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [52]:
# Multiple Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [55]:
y_predict = lin_reg.predict(X_train)

In [56]:
# r2_score
r2_score(y_train, y_predict)

0.82611040948774

# Ridge and Lasso Regression

In [57]:
rid_reg = Ridge(alpha = 0.1)
rid_reg.fit(X_train, y_train)

In [58]:
y_rpredict = rid_reg.predict(X_train)

In [59]:
# r2_score
r2_score(y_train, y_rpredict)

0.8261104094877394

In [60]:
las_reg = Lasso(alpha = 0.1)
las_reg.fit(X_train, y_train)

In [61]:
y_lpredict = las_reg.predict(X_train)

In [62]:
# r2_score
r2_score(y_train, y_lpredict)

0.8261102959763996

# Decission Tree and Random Forest

In [63]:
from sklearn.tree import DecisionTreeRegressor
dc_reg = DecisionTreeRegressor(random_state=0)
dc_reg.fit(X_train, y_train)

In [64]:
y_dcpredict = dc_reg.predict(X_train)

In [65]:
# r2_score
r2_score(y_train, y_dcpredict)

0.9999999998533425

In [66]:
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(max_depth=4, random_state=0)
rf_reg.fit(X_train, y_train)

In [67]:
y_rfpredict = rf_reg.predict(X_train)

In [68]:
# r2_score
r2_score(y_train, y_rfpredict)

0.8453905584902665

Berdasarkan hasil training pada, nilai r kuadrat pada Decision Tree menjadi yang paling tinggi, disusul dengan Random Forest.
perlu mengecek apakah terjadi over fitting atau tidak.

In [69]:
y_train_dcpredict = dc_reg.predict(X_train)
y_test_dcpredict = dc_reg.predict(X_test)

In [70]:
# Training evaluation
print('RMSE for training data is {}'.format(np.sqrt(mean_squared_error(y_train, y_train_dcpredict))))
print('MAPE for training data is {}'.format(mean_absolute_percentage_error(y_train, y_train_dcpredict)))

RMSE for training data is 0.007916244178393473
MAPE for training data is 6.556111452930565e-08


In [71]:
# Test evaluation
print('RMSE for testing data is {}'.format(np.sqrt(mean_squared_error(y_test, y_test_dcpredict))))
print('MAPE for testing data is {}'.format(mean_absolute_percentage_error(y_test, y_test_dcpredict)))

RMSE for testing data is 15.320400000594585
MAPE for testing data is 0.0035490160653652925


In [72]:
y_train_rfpredict = rf_reg.predict(X_train)
y_test_rfpredict = rf_reg.predict(X_test)

In [73]:
# Training evaluation of RF Model
print('RMSE for training data is {}'.format(np.sqrt(mean_squared_error(y_train, y_train_rfpredict))))
print('MAPE for training data is {}'.format(mean_absolute_percentage_error(y_train, y_train_rfpredict)))

RMSE for training data is 257.03076529133034
MAPE for training data is 0.20870377625217662


In [74]:
# Test evaluation of RF Model
print('RMSE for testing data is {}'.format(np.sqrt(mean_squared_error(y_test, y_test_rfpredict))))
print('MAPE for testing data is {}'.format(mean_absolute_percentage_error(y_test, y_test_rfpredict)))

RMSE for testing data is 257.4930722582167
MAPE for testing data is 0.20869508021093547


Model Decision Tree mengalami overfitting, dimana hasil training model lebih baik dibandingkan testing model. Sedangkan Random Forest memiliki hasil yang memuaskan antara training dan testing.