<a href="https://colab.research.google.com/github/bhanukad610/Fare-Classification/blob/master/Fare_Classification_with_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
sns.set_style('whitegrid')
import tensorflow as tf
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

SMALL_SIZE = 10
MEDIUM_SIZE = 12

plt.rc('font', size=SMALL_SIZE)
plt.rc('axes', titlesize=MEDIUM_SIZE)
plt.rc('axes', labelsize=MEDIUM_SIZE)
plt.rcParams['figure.dpi']=150

  import pandas.util.testing as tm


### getGeneralDataFromDateTimeObj

In [0]:
import datetime
from datetime import timedelta

def getGeneralDataFromDateTimeObj(time):
  datetimeFormat = '%m/%d/%Y %H:%M'
  dateTimeObj = datetime.datetime.strptime(time, datetimeFormat)
  return [dateTimeObj.month, dateTimeObj.day, dateTimeObj.hour, dateTimeObj.minute]

In [0]:
def generate_drop_time(df):
  df_drop_time_month = []
  df_drop_time_day = []
  df_drop_time_hour = []
  df_drop_time_minute = []

  for time in df['drop_time']:
    data = getGeneralDataFromDateTimeObj(time)
    df_drop_time_month.append(data[0])
    df_drop_time_day.append(data[1])
    df_drop_time_hour.append(data[2])
    df_drop_time_minute.append(data[3])

  df['drop_time_month'] = df_drop_time_month
  df['df_drop_time_day'] = df_drop_time_day
  df['df_drop_time_hour'] = df_drop_time_hour
  df['df_drop_time_minute'] = df_drop_time_minute

In [0]:
def generate_pickup_time(df):
  df_pickup_time_month = []
  df_pickup_time_day = []
  df_pickup_time_hour = []
  df_pickup_time_minute = []

  for time in df['pickup_time']:
    data = getGeneralDataFromDateTimeObj(time)
    df_pickup_time_month.append(data[0])
    df_pickup_time_day.append(data[1])
    df_pickup_time_hour.append(data[2])
    df_pickup_time_minute.append(data[3])

  df['df_pickup_time_month'] = df_pickup_time_month
  df['df_pickup_time_day'] = df_pickup_time_day
  df['df_pickup_time_hour'] = df_pickup_time_hour
  df['df_pickup_time_minute'] = df_pickup_time_minute

### Handle data and time

In [0]:
import datetime
from datetime import timedelta

#function to calculate duration in seconds
def calculateDuration(pickup_time, drop_time):
  datetimeFormat = '%m/%d/%Y %H:%M'
  diff = datetime.datetime.strptime(drop_time, datetimeFormat)\
   - datetime.datetime.strptime(pickup_time, datetimeFormat)
  return diff.seconds

df_duration = df['tripid']

def calculateDurationDf(pickup_time_frame, drop_time_frame):
  
  for i in range(len(pickup_time_frame)):

    str_pickup_time = str(pickup_time_frame[i])
    str_drop_time = str(drop_time_frame[i])

    duration = calculateDuration(str_pickup_time, str_drop_time)
    df_duration[i] = np.int64(duration)

### Functions

In [0]:
import numpy as np
from sklearn.impute import SimpleImputer

def naImputation(Features, column_names):
  # df.replace(np.NaN,np.NaN,inplace=True)
  imp=SimpleImputer(missing_values=np.NaN)
  idf=pd.DataFrame(imp.fit_transform(Features))
  idf.columns=Features.columns
  idf.index=Features.index

  for column_name in column_names:
    idf[column_name].isna().sum()

  
  return idf

In [0]:
from math import radians, sin, cos, acos

#function to calculte distance from lat and longs
def calculateDistance(pick_lat,pick_lon, drop_lat, drop_lon):
      pick_lat = radians(float(pick_lat))
      pick_lon = radians(float(pick_lon))
      drop_lat = radians(float(drop_lat))
      drop_lon = radians(float(drop_lon))

      if (pick_lon == drop_lon):
        distance = 0.0
        return round(distance, 2)
      else:
        distance = 6371.01 * acos(sin(pick_lat)*sin(drop_lat) + cos(pick_lat)*cos(drop_lat)*cos(pick_lon - drop_lon))

      # distance = 6371.01 * acos(sin(pick_lat)*sin(drop_lat) + cos(pick_lat)*cos(drop_lat)*cos(pick_lon - drop_lon))
      return round(distance, 2)

def calculateDistanceDf(pick_lat_frame,pick_lon_frame, drop_lat_frame, drop_lon_frame):
  distanceList = []
  for i in range(len(drop_lon_frame)):
    pick_lat = pick_lat_frame[i]
    pick_lon = pick_lon_frame[i]
    drop_lat = drop_lat_frame[i]
    drop_lon = drop_lon_frame[i]

    try:
      distance = calculateDistance(pick_lat,pick_lon, drop_lat, drop_lon)
    except:
      print("Error occured! , at", i)

    distanceList.append(distance)
  return distanceList

In [0]:
def preprocess(df, features):
  generate_pickup_time(df)
  generate_drop_time(df)

  df['distance'] = calculateDistanceDf(df['pick_lat'], df['pick_lon'], df['drop_lat'], df['drop_lon'])


  
  features += ['distance', 'df_pickup_time_month', 'df_pickup_time_day', 'df_pickup_time_hour', 'df_pickup_time_minute', 'drop_time_month', 'df_drop_time_day', 'df_drop_time_hour', 'df_drop_time_minute']
  Features = df[features]
  Features = naImputation(Features,features)
  print(features)
  
  return Features

In [0]:
from sklearn import preprocessing
def getX(Features):
    X= preprocessing.StandardScaler().fit(Features).transform(Features)
    return X

### Load train and test data

In [0]:
df = pd.read_csv('/content/drive/My Drive/Semester 7/ML/Project/Data/train.csv')
df['label'].replace(to_replace=['correct','incorrect'], value=[1,0],inplace=True)
features = ['additional_fare','duration', 'meter_waiting', 'meter_waiting_fare', 'meter_waiting_till_pickup', 'fare']

In [12]:
Features = preprocess(df, features)
X = getX(Features)
y = df['label'].values
Features.head()

['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare', 'meter_waiting_till_pickup', 'fare', 'distance', 'df_pickup_time_month', 'df_pickup_time_day', 'df_pickup_time_hour', 'df_pickup_time_minute', 'drop_time_month', 'df_drop_time_day', 'df_drop_time_hour', 'df_drop_time_minute']


Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,distance,df_pickup_time_month,df_pickup_time_day,df_pickup_time_hour,df_pickup_time_minute,drop_time_month,df_drop_time_day,df_drop_time_hour,df_drop_time_minute
0,10.5,834.0,56.0,0.0,64.0,270.32,5.09,11.0,1.0,0.0,20.0,11.0,1.0,0.0,34.0
1,10.5,791.0,47.0,0.0,134.0,197.85,3.17,11.0,1.0,0.0,56.0,11.0,1.0,1.0,9.0
2,10.5,1087.0,80.0,0.0,61.0,301.64,6.31,11.0,1.0,1.0,8.0,11.0,1.0,1.0,26.0
3,10.5,598.0,271.0,15.6638,68.0,82.3,0.86,11.0,1.0,2.0,27.0,11.0,1.0,2.0,37.0
4,13.719651,1702.858077,629.074231,32.057666,112.466832,358.39,8.15,11.0,1.0,3.0,34.0,11.0,1.0,3.0,51.0


In [0]:
df_test = pd.read_csv('/content/drive/My Drive/Semester 7/ML/Project/Data/test.csv')
features_test = ['additional_fare','duration', 'meter_waiting', 'meter_waiting_fare', 'meter_waiting_till_pickup', 'fare']

In [14]:
Features_for_test = preprocess(df_test, features_test)
X_pred = getX(Features_for_test)
Features_for_test.head()

['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare', 'meter_waiting_till_pickup', 'fare', 'distance', 'df_pickup_time_month', 'df_pickup_time_day', 'df_pickup_time_hour', 'df_pickup_time_minute', 'drop_time_month', 'df_drop_time_day', 'df_drop_time_hour', 'df_drop_time_minute']


Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,distance,df_pickup_time_month,df_pickup_time_day,df_pickup_time_hour,df_pickup_time_minute,drop_time_month,df_drop_time_day,df_drop_time_hour,df_drop_time_minute
0,10.5,924.0,42.0,2.4486,148.0,289.27,6.71,2.0,1.0,0.0,38.0,2.0,1.0,0.0,53.0
1,10.5,4249.0,20.0,0.0,91.0,1912.7,41.56,2.0,1.0,1.0,2.0,2.0,1.0,2.0,13.0
2,10.5,1552.0,255.0,2.6588,23.0,394.0,5.92,2.0,1.0,5.0,2.0,2.0,1.0,5.0,28.0
3,10.5,462.0,16.0,0.0,198.0,154.32,3.3,2.0,1.0,5.0,30.0,2.0,1.0,5.0,38.0
4,10.5,814.0,392.0,12.3692,69.0,147.47,2.59,2.0,1.0,7.0,0.0,2.0,1.0,7.0,14.0


In [0]:
import xgboost as xgb

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X)
print(f1_score(y, y_pred, average='weighted'))

0.9424577205123781


In [0]:
y_pred_for_test = xgb_model.predict(X_pred)

In [30]:
y_pred_for_test

array([1, 1, 1, ..., 1, 1, 1])

In [0]:
import csv

with open('/content/drive/My Drive/Semester 7/ML/Project/submission_xgb_model_with_time.csv', mode='w') as employee_file:
    employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    employee_writer.writerow(['tripid','prediction'])
    for i in range(len(y_pred_for_test)):
      employee_writer.writerow([df_test['tripid'][i],y_pred_for_test[i]])