<a href="https://colab.research.google.com/github/bavindu/rideFareClassification/blob/master/ride_fair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import DistanceMetric
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint
from math import radians

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plot
%matplotlib inline
from matplotlib.pylab import rcParams
# Default size, in inches as (width, height), for every matplotlib figure drawn in this notebook.
rcParams['figure.figsize'] = 12, 4

In [None]:
!pip install PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from google.colab import files
from oauth2client.client import GoogleCredentials

Authenticate and create the PyDrive client

In [None]:
# Authenticate the Colab user first; the credentials picked up below come from this step.
auth.authenticate_user()
gauth = GoogleAuth()
# Give PyDrive the application-default credentials instead of running its own OAuth flow.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the following cells use to fetch the dataset files.
drive = GoogleDrive(gauth)

Download the train and test CSV files from Google Drive

In [None]:
# Create a handle for each dataset (looked up by its Drive file id), then save
# both into the working directory so pandas can read them as local CSV files.
train = drive.CreateFile({'id': "1bigFo-e_MOg6Fhng0yOnQZ0GGeShCMl-"})
test = drive.CreateFile({'id': "1rFfppUfFVqvi1mi7CpjWn7w9mNAi303R"})
train.GetContentFile('train.csv')
test.GetContentFile('test.csv')

In [None]:
# Read both CSVs, converting the two timestamp columns to datetimes while parsing
# so the `.dt` accessors used for feature engineering work later on.
trainData = pd.read_csv(
    'train.csv',
    parse_dates=['pickup_time', 'drop_time'],
)
df_test = pd.read_csv(
    'test.csv',
    parse_dates=['pickup_time', 'drop_time'],
)
# Quick sanity check of the (rows, columns) of each frame.
print(f"train {trainData.shape}")
print(f"test {df_test.shape}")

train (17176, 14)
test (8576, 13)


In [None]:
# Preprocessing set-up: one imputer per fill strategy, plus the columns each one handles.
simple_mean_imputer = SimpleImputer(strategy='mean')
simple_frequency_imputer = SimpleImputer(strategy='most_frequent')

# Columns whose gaps are filled with the column mean.
numerical_columns = [
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'pick_lat',
    'pick_lon',
    'drop_lat',
    'drop_lon',
    'fare',
]
# Column whose gaps are filled with its most frequent value.
mostFrequenceColumn = ['additional_fare']

In [None]:
# Learn the fill values from the training frame, then patch its gaps:
# column means for the numeric columns, the modal value for additional_fare.
simple_mean_imputer.fit(trainData[numerical_columns])
trainData[numerical_columns] = simple_mean_imputer.transform(trainData[numerical_columns])
simple_frequency_imputer.fit(trainData[mostFrequenceColumn])
trainData[mostFrequenceColumn] = simple_frequency_imputer.transform(trainData[mostFrequenceColumn])

In [None]:
# Day of the week of the drop-off (pandas convention: Monday=0 ... Sunday=6).
trainData['dayofweek'] = trainData['drop_time'].dt.dayofweek
# "Weekend" flag: 1 when dayofweek > 3, otherwise 0.
# NOTE(review): with Monday=0 this marks Friday, Saturday and Sunday - confirm
# that Friday is meant to count as weekend.
trainData['weekend'] = trainData['dayofweek'].gt(3).astype('int64')
# Calendar parts of the pickup and drop-off timestamps, added in one pass.
trainData = trainData.assign(
    pickup_hour=trainData['pickup_time'].dt.hour,
    pickup_day=trainData['pickup_time'].dt.day,
    pickup_month=trainData['pickup_time'].dt.month,
    pickup_year=trainData['pickup_time'].dt.year,
    drop_hour=trainData['drop_time'].dt.hour,
    drop_day=trainData['drop_time'].dt.day,
    drop_month=trainData['drop_time'].dt.month,
    drop_year=trainData['drop_time'].dt.year,
)

In [None]:
# calculate distance from pickup to drop
def haversine_vectorize(lon1, lat1, lon2, lat2, radius_km=6367):
    """Great-circle distance between two sets of points (haversine formula).

    The computation is element-wise, so each coordinate may be a scalar, a
    NumPy array or a pandas Series; normal NumPy broadcasting applies.

    Parameters
    ----------
    lon1, lat1 : scalar or array-like
        Longitude and latitude of the start point(s), in decimal degrees.
    lon2, lat2 : scalar or array-like
        Longitude and latitude of the end point(s), in decimal degrees.
    radius_km : float, default 6367
        Sphere radius used to turn the central angle into a distance. The
        default keeps the results of the original implementation (kilometres);
        pass 3958 to get miles instead.

    Returns
    -------
    scalar or array-like
        Distance(s) in the unit of ``radius_km``. NaN inputs yield NaN.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    delta_lon = lon2 - lon1
    delta_lat = lat2 - lat1

    haver_formula = (
        np.sin(delta_lat / 2.0) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon / 2.0) ** 2
    )
    # Mathematically the term lies in [0, 1], but rounding can push it slightly
    # above 1 for (near-)antipodal points, which would make arcsin return NaN.
    haver_formula = np.minimum(haver_formula, 1.0)

    central_angle = 2 * np.arcsin(np.sqrt(haver_formula))
    return radius_km * central_angle

In [None]:
# Great-circle distance between pickup and drop-off for every training row.
trainData['haversine_dist'] = haversine_vectorize(
    lon1=trainData['pick_lon'],
    lat1=trainData['pick_lat'],
    lon2=trainData['drop_lon'],
    lat2=trainData['drop_lat'],
)

In [None]:
# The gaps in the numeric and additional_fare columns were already filled by the
# imputation cell earlier in the notebook and nothing since then introduces
# NaNs, so the imputers are not re-fitted here; only the derived timing
# features are added.

# Sum of the two metered waiting columns (on-trip waiting plus waiting before pickup).
trainData['total_waiting_time'] = trainData['meter_waiting'] + trainData['meter_waiting_till_pickup']
# Trip duration with the on-trip waiting removed (presumably the time spent moving).
trainData['mobile_time'] = trainData['duration'] - trainData['meter_waiting']

In [None]:
#trainData['deference_of_pickupdrop'] = trainData['drop_time'] - trainData['pickup_time']
#trainData['deference_of_pickupdrop'] = trainData['deference_of_pickupdrop'].dt.seconds
#trainData['duration_defernence'] = trainData['duration'] - trainData['deference_of_pickupdrop']


In [None]:
# Columns fed to the model. The same list is reused to build the submission
# features, so train and test always share one column order.
# 'mobile_time' takes the slot that previously held a second, duplicated
# 'total_waiting_time' entry (the duplicate added no information, while
# 'mobile_time' was computed for both frames but never used).
considering_features = [
    'additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon',
    'fare',
    'dayofweek', 'pickup_day', 'pickup_hour', 'drop_day', 'drop_hour',
    'pickup_month', 'drop_month',
    'haversine_dist', 'total_waiting_time', 'mobile_time',
]
X = trainData[considering_features].values

In [None]:
# Standardise every feature column (zero mean, unit variance).
# Re-running this cell scales the already-scaled X again; rebuild X first.
X = StandardScaler().fit_transform(X.astype(float))

In [None]:
# Encode the target as integers: 'incorrect' -> 0, 'correct' -> 1.
# The result is assigned back explicitly: calling replace(..., inplace=True) on
# `trainData['label']` acts on a temporary Series under pandas Copy-on-Write and
# would leave the column as strings. Re-running this cell is harmless because
# already-encoded values are left untouched.
trainData['label'] = (
    trainData['label']
    .replace({'incorrect': 0, 'correct': 1})
    .astype(int)
)
y = trainData['label']

In [None]:
# Seed for the hold-out split so it can be reproduced.
RANDOM_SEED = 6
# Shuffled 80/20 split, stratified on the label so both parts keep the same
# class balance.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=y,
    shuffle=True,
)

Randomized hyperparameter search

In [None]:
# Search space for the random-forest randomized search.
# scipy's randint(low, high) samples integers from low up to high - 1.
params_RF_RS = {
    "max_depth": randint(3, 8),
    "n_estimators": [50, 100, 200, 300, 400, 500],
    # 'auto' is intentionally not listed: for classifiers it was only an alias
    # of 'sqrt', and recent scikit-learn releases reject it, which would make
    # every sampled candidate using it fail to fit.
    "max_features": ["sqrt", "log2"],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 10),
    "criterion": ["gini", "entropy"],
}

In [None]:
from pprint import pprint

# Base estimator. It is seeded so the forests built during the search are
# reproducible; seeding only the search would still leave each fit random.
rf_model = RandomForestClassifier(random_state=RANDOM_SEED)

# Randomized search: 100 sampled configurations x 5-fold cross-validation
# = 500 fits, followed by a refit of the best configuration on all of X_train.
clf = RandomizedSearchCV(rf_model, params_RF_RS, n_iter=100, cv=5, random_state=1)

# fit() returns the fitted search object, so `model` and `clf` are the same
# object; model.predict() delegates to the refitted best estimator.
model = clf.fit(X_train, Y_train)

# Show the winning set of hyperparameters.
pprint(model.best_estimator_.get_params())

Evaluation on the hold-out split

In [None]:
# Predict the hold-out rows and peek at the first five predicted labels.
yhat = model.predict(X_test)
yhat[:5]

array([1, 1, 1, 1, 1])

In [None]:
# Hold-out performance. f1_score uses its default positive class (label 1).
print(f"Test set Accuracy:  {metrics.accuracy_score(Y_test, yhat)}")
print(f"Test set F1 score:  {f1_score(Y_test, yhat)}")

Test set Accuracy:  0.9272409778812573
Test set F1 score:  0.9611318407960199


In [None]:
# Fill missing values in the submission data with the statistics learned from
# the TRAINING data. The imputers are only applied here (transform), not
# re-fitted, so train and test gaps are filled with the same values.
df_test[numerical_columns] = simple_mean_imputer.transform(df_test[numerical_columns])
df_test[mostFrequenceColumn] = simple_frequency_imputer.transform(df_test[mostFrequenceColumn])

# Day of the week of the drop-off (pandas convention: Monday=0 ... Sunday=6).
df_test['dayofweek'] = df_test['drop_time'].dt.dayofweek
# "Weekend" flag, same rule as for the training frame: 1 when dayofweek > 3.
# NOTE(review): with Monday=0 this marks Friday, Saturday and Sunday - confirm
# that Friday is meant to count as weekend.
df_test['weekend'] = df_test['dayofweek'].apply(lambda x: 1 if (x > 3) else 0)

# Calendar parts of the pickup timestamp.
df_test = df_test.assign(pickup_hour=df_test.pickup_time.dt.hour,
               pickup_day=df_test.pickup_time.dt.day,
               pickup_month=df_test.pickup_time.dt.month,
               pickup_year=df_test.pickup_time.dt.year)
# Calendar parts of the drop-off timestamp.
df_test = df_test.assign(drop_hour=df_test.drop_time.dt.hour,
               drop_day=df_test.drop_time.dt.day,
               drop_month=df_test.drop_time.dt.month,
               drop_year=df_test.drop_time.dt.year)

# Same derived features as for the training frame.
df_test['haversine_dist'] = haversine_vectorize(df_test['pick_lon'], df_test['pick_lat'], df_test['drop_lon'], df_test['drop_lat'])
df_test['total_waiting_time'] = df_test['meter_waiting'] + df_test['meter_waiting_till_pickup']
df_test['mobile_time'] = df_test['duration'] - df_test['meter_waiting']

In [None]:
# Scale the submission features with the mean and variance of the TRAINING
# features. The model was trained on data standardised that way; fitting a new
# scaler on the test set would put its features on a different scale.
# Note: this rebinds X, so the training matrix is no longer available under
# that name after this cell.
X = (
    StandardScaler()
    .fit(trainData[considering_features].values.astype(float))
    .transform(df_test[considering_features].values.astype(float))
)

In [None]:
# Start the submission frame from the trip ids, then predict a label for each
# row of the scaled submission features.
data = dict(tripid=df_test['tripid'].values)
df_res = pd.DataFrame(data)
yhat_ = model.predict(X)

In [None]:
# The two counts must match: one prediction per trip id.
print(len(yhat_), len(df_res), sep="\n")

8576
8576


In [None]:
# Attach the predicted labels to the trip ids.
df_res['prediction'] = yhat_
# Write the submission file without the pandas index column.
df_res.to_csv('output.csv', index=False)
# Trigger a browser download of the file (Colab helper).
files.download('output.csv')
# Cell output: how many rows were predicted for each class.
df_res['prediction'].value_counts()

1    8388
0     188
Name: prediction, dtype: int64