# Predicting "Trip Distance" Using Random Forests

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
import xgboost as xgb 
# NOTE(review): this suppresses every warning category for the rest of the
# session, including pandas' SettingWithCopyWarning and library deprecation
# notices raised by later cells.
warnings.filterwarnings('ignore')

## Importing Pre-processed Data

In [25]:
# Load the pre-processed trip records from a local pickle file.
# NOTE(review): the path is absolute and machine-specific, and pickle files
# should only be loaded from trusted sources.
preprocessed_pickle_path = r'C:\Users\nishi\Desktop\EECS 731 - Data Science\Project\df_new.pkl'
df_new = pd.read_pickle(preprocessed_pickle_path)

In [26]:
# Preview the first rows of the loaded frame.
df_new.head()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,...,total_amount,payment_type,trip_type,trip_duration,month,day,total_custom,weekend_Trip,hour,airport
0,2017-12-31 23:55:09,2017-12-31 23:59:14,1,17,61,1,1.1,5.5,0.5,0.5,...,6.8,2,1.0,4.083333,12,6,6.8,1,23,0
1,2017-12-31 23:30:36,2017-12-31 23:37:20,1,61,49,1,1.1,6.5,0.5,0.5,...,7.8,2,1.0,6.733333,12,6,7.8,1,23,0
2,2017-12-31 23:02:26,2017-12-31 23:19:37,1,228,188,1,4.1,15.5,0.5,0.5,...,20.15,1,1.0,17.183333,12,6,20.15,1,23,0
3,2017-12-31 23:12:24,2017-12-31 23:16:55,1,228,26,1,0.8,5.0,0.5,0.5,...,6.3,2,1.0,4.516667,12,6,6.3,1,23,0
4,2017-12-31 23:53:11,2017-12-31 23:55:40,1,225,225,1,0.5,4.0,0.5,0.5,...,5.3,2,1.0,2.483333,12,6,5.3,1,23,0


## Shuffling data

In [27]:
# Shuffle all rows and rebuild a clean 0..n-1 index.
# A fixed random_state makes the shuffle -- and therefore the row indices,
# the train/test splits and every number reported below -- reproducible
# after a kernel restart.
df_new = df_new.sample(frac=1, random_state=42).reset_index(drop=True)

## Different Models with different features for accuracy comparison

In [28]:
# Build the frame used for the trip-distance model. The datetime columns are
# left out because the models cannot parse them.
distance_model_columns = [
    'airport',
    'RatecodeID',
    'payment_type',
    'DOLocationID',
    'trip_type',
    'hour',
    'PULocationID',
    'trip_distance',
]
df_1 = df_new[distance_model_columns]

## Splitting the data into Training and Testing

In [29]:
# scikit-learn supplies the helper used to split data into training and
# testing sets.
from sklearn.model_selection import train_test_split

# The target is trip_distance; every other column of df_1 is a feature.
target_column_1 = 'trip_distance'
labels_1 = df_1[[target_column_1]]
features_1 = df_1.drop(columns=[target_column_1])

In [30]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split
# repeatable for a given input frame.
(train_features_1,
 test_features_1,
 train_labels_1,
 test_labels_1) = train_test_split(
    features_1,
    labels_1,
    test_size=0.10,
    random_state=42,
)

In [31]:
# Preview the held-out feature rows (the index shows the original row positions).
test_features_1.head()

Unnamed: 0,airport,RatecodeID,payment_type,DOLocationID,trip_type,hour,PULocationID
1937401,0,1,2,188,1.0,19,49
868829,0,1,2,74,1.0,3,75
1992601,0,1,1,189,1.0,19,181
1952741,0,1,1,61,1.0,14,49
1832239,0,1,2,136,1.0,17,243


# Using Random Forests to Predict Trip Distance

In [32]:
%%time
from sklearn.ensemble import RandomForestRegressor
rf1 = RandomForestRegressor()
rf1.fit(train_features_1, train_labels_1.values.ravel())

Wall time: 48.6 s


# Predicting Trip Distance for Test cases

In [33]:
# Predict trip_distance for every row of the held-out feature set.
prediction_1 = rf1.predict(test_features_1)

In [34]:
# Work on an independent copy: a plain assignment would only alias
# test_features_1, so the result columns added afterwards would also appear
# in the feature frame that was used for prediction.
result_1 = test_features_1.copy()

## Creating Results

In [35]:
# Attach the true distance, the model's prediction and the absolute error
# between the two to the results frame, then preview it.
actual_distance_1 = test_labels_1[['trip_distance']]
result_1['actual_distance'] = actual_distance_1
result_1['predicted_distance'] = prediction_1

signed_error_1 = result_1['predicted_distance'] - result_1['actual_distance']
result_1['difference'] = abs(signed_error_1)

result_1.head()

Unnamed: 0,airport,RatecodeID,payment_type,DOLocationID,trip_type,hour,PULocationID,actual_distance,predicted_distance,difference
1937401,0,1,2,188,1.0,19,49,2.9,2.601276,0.298724
868829,0,1,2,74,1.0,3,75,1.1,1.078178,0.021822
1992601,0,1,1,189,1.0,19,181,0.4,1.102159,0.702159
1952741,0,1,1,61,1.0,14,49,1.9,1.476262,0.423738
1832239,0,1,2,136,1.0,17,243,2.9,2.17,0.73


## Trip Distance Accuracy

In [36]:
# Share of test trips whose predicted distance is within 1 unit of the actual
# distance. The mean of the boolean mask avoids materialising a filtered copy
# of the frame, and the '.2f' format rounds the percentage properly instead of
# truncating the float's string representation to five characters.
within_one_pct_1 = (result_1['difference'] < 1).mean() * 100
print("Model_1 Accuracy % : " + format(within_one_pct_1, '.2f'))

Model_1 Accuracy % : 88.11


# XGBoost Starts here
## This model will predict fare_amount, given the predicted trip distance and other available features

In [37]:
# Build the frame used for the fare model: trip attributes plus the
# fare_amount target.
fare_model_columns = [
    'trip_distance',
    'airport',
    'RatecodeID',
    'payment_type',
    'DOLocationID',
    'trip_type',
    'passenger_count',
    'PULocationID',
    'hour',
    'day',
    'fare_amount',
]
df_3 = df_new[fare_model_columns]

In [38]:
# The target is fare_amount; every other column of df_3 is a feature.
target_column_3 = 'fare_amount'
labels_3 = df_3[[target_column_3]]
features_3 = df_3.drop(columns=[target_column_3])

In [39]:
# Hold out 10% of the rows for testing, using the same test size and seed as
# the trip-distance split.
(train_features_3,
 test_features_3,
 train_labels_3,
 test_labels_3) = train_test_split(
    features_3,
    labels_3,
    test_size=0.10,
    random_state=42,
)

In [40]:
# Inspect the column dtypes of the fare-model training features.
train_features_3.dtypes

trip_distance      float64
airport              int32
RatecodeID           int64
payment_type         int64
DOLocationID         int64
trip_type          float64
passenger_count      int64
PULocationID         int64
hour                 int64
day                  int64
dtype: object

In [41]:
# Replace the true trip_distance in the fare test set with the distance
# predicted by the random forest, so the fare model is evaluated on the
# predicted distance.
# NOTE(review): the fare model is still trained on the actual trip_distance.
#
# Copy first so the assignment targets an independent frame rather than the
# slice returned by train_test_split.
test_features_3 = test_features_3.copy()

# Column assignment aligns on the index; a mismatch would silently fill the
# column with NaN, so fail loudly instead.
if not test_features_3.index.equals(result_1.index):
    raise ValueError(
        "test_features_3 and result_1 do not share the same row index; "
        "predicted distances cannot be aligned"
    )
test_features_3['trip_distance'] = result_1['predicted_distance']

In [42]:
# Preview the fare test features after trip_distance was overwritten above.
test_features_3.head()

Unnamed: 0,trip_distance,airport,RatecodeID,payment_type,DOLocationID,trip_type,passenger_count,PULocationID,hour,day
1937401,2.601276,0,1,2,188,1.0,1,49,19,0
868829,1.078178,0,1,2,74,1.0,1,75,3,5
1992601,1.102159,0,1,1,189,1.0,1,181,19,5
1952741,1.476262,0,1,1,61,1.0,2,49,14,2
1832239,2.17,0,1,2,136,1.0,3,243,17,6


In [43]:
# Wrap both splits in XGBoost's DMatrix container; only the training matrix
# carries the fare labels.
dtest_3 = xgb.DMatrix(data=test_features_3)
dtrain_3 = xgb.DMatrix(data=train_features_3, label=train_labels_3)

In [44]:
# Parameters for the XGBoost fare model.
params = {
    'max_depth': 7,
    # NOTE(review): eta=1 applies no shrinkage to each boosting step; left
    # unchanged here because lowering it is a tuning decision.
    'eta': 1,
    # 'verbosity': 0 is the current replacement for the deprecated 'silent': 1.
    'verbosity': 0,
    # 'reg:squarederror' is the current name of the deprecated 'reg:linear'
    # objective (same squared-error loss).
    'objective': 'reg:squarederror',
}
num_rounds = 50

# Train the booster for num_rounds boosting iterations on the training matrix.
rf3 = xgb.train(params, dtrain_3, num_boost_round=num_rounds)

In [45]:
# Predict fares for the held-out rows.
prediction_3 = rf3.predict(dtest_3)

# Build the results on an independent copy: a plain assignment would only
# alias test_features_3, so the columns added below would also be written
# into the feature frame.
result_3 = test_features_3.copy()

# Select the label as a Series (not a one-column DataFrame) so the column
# assignment is an ordinary index-aligned Series assignment.
result_3['actual_fare'] = test_labels_3['fare_amount']
result_3['predicted_fare'] = prediction_3

# Absolute error between predicted and actual fare for each trip.
result_3['difference'] = (result_3['predicted_fare'] - result_3['actual_fare']).abs()

In [46]:
# Share of test trips whose predicted fare is within 1 unit of the actual
# fare. The mean of the boolean mask avoids materialising a filtered copy of
# the frame, and the '.2f' format rounds the percentage properly instead of
# truncating the float's string representation to five characters.
within_one_pct_3 = (result_3['difference'] < 1).mean() * 100
print("Model_3 Accuracy % : " + format(within_one_pct_3, '.2f'))

Model_3 Accuracy % : 36.85


# Results