# initialization

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook are located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import tensorflow as tf
from sklearn.model_selection import train_test_split

# explore data

In [None]:
# Load the raw training data, indexed by trip id, for a first look.
# Reads the copy on the mounted Drive (the same file the feature-engineering
# cell uses) so the cell works on a fresh kernel without a manually uploaded
# 'train.csv' in the working directory.
train_set = pd.read_csv('/content/drive/My Drive/Academic/Semester 7/CS4622 Machine Learning/Assignments/160278M Fare Classification/train.csv', index_col='tripid')

In [None]:
# Preview the first rows of the training data.
train_set.head()

In [None]:
# Preview the last rows of the training data.
train_set.tail()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_set.describe()

# distance

In [None]:
def distance(pick_lat, pick_lon, drop_lat, drop_lon, radius_km=6371):
  """Great-circle (haversine) distance between pickup and drop-off points.

  All arithmetic is element-wise, so the coordinates may be scalars,
  NumPy arrays or pandas Series.

  Args:
    pick_lat: pickup latitude in decimal degrees.
    pick_lon: pickup longitude in decimal degrees.
    drop_lat: drop-off latitude in decimal degrees.
    drop_lon: drop-off longitude in decimal degrees.
    radius_km: radius of the sphere in kilometres (default 6371).

  Returns:
    The distance in kilometres, shaped like the inputs. NaN coordinates
    produce NaN distances.
  """
  pick_lat, pick_lon, drop_lat, drop_lon = (
      np.radians(angle) for angle in (pick_lat, pick_lon, drop_lat, drop_lon))

  half_dlat = (drop_lat - pick_lat) / 2
  half_dlon = (drop_lon - pick_lon) / 2

  a = np.sin(half_dlat) ** 2 + np.cos(pick_lat) * np.cos(drop_lat) * np.sin(half_dlon) ** 2
  # Floating-point rounding can push `a` marginally outside [0, 1], which
  # would make one of the square roots below NaN; clamp it first.
  a = np.clip(a, 0.0, 1.0)
  c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

  # distance in km
  return radius_km * c

# prepare train set (cleaning and feature engineering)

In [None]:
# Timestamp layout of the pickup_time / drop_time columns.
# Named TIME_FORMAT so the notebook does not shadow the built-in format().
TIME_FORMAT = '%m/%d/%Y %H:%M'

# Bounds applied to pickup and drop-off coordinates.
# NOTE(review): these look like a bounding box around Sri Lanka - confirm.
LAT_MIN, LAT_MAX = 5.9, 9.8
LON_MIN, LON_MAX = 79.7, 81.8

train_set = pd.read_csv('/content/drive/My Drive/Academic/Semester 7/CS4622 Machine Learning/Assignments/160278M Fare Classification/train.csv', index_col='tripid')

# Parse both timestamp columns once; every time-derived feature reuses them.
pickup_dt = pd.to_datetime(train_set['pickup_time'], format=TIME_FORMAT)
drop_dt = pd.to_datetime(train_set['drop_time'], format=TIME_FORMAT)

# Fill missing durations with (drop - pickup) in seconds plus both waiting
# columns. The difference is taken between the parsed timestamps directly, so
# it does not depend on the time zone of the machine running the notebook.
# NOTE(review): adding the waiting times on top of the elapsed time is kept
# from the existing formula - confirm that this is the intended estimate.
elapsed_seconds = (drop_dt - pickup_dt).dt.total_seconds()
estimated_duration = elapsed_seconds + train_set['meter_waiting'] + train_set['meter_waiting_till_pickup']
train_set['duration'] = np.where(train_set['duration'].isna(), estimated_duration, train_set['duration'])

# Missing fares are marked with the sentinel -1.
train_set['fare'] = train_set['fare'].fillna(-1)

# Encode the target as 1 (correct) / 0 (incorrect). The result is assigned back
# to the frame: an in-place replace on `train_set.label` operates on an
# intermediate object and does not update the frame under pandas Copy-on-Write.
# Any value other than 'correct' / 'incorrect' becomes NaN.
train_set['label'] = train_set['label'].map({'correct': 1, 'incorrect': 0})

# Calendar / clock components of both timestamps (month, day, hour, minute).
for prefix, stamps in (('pickup', pickup_dt), ('drop', drop_dt)):
  train_set[prefix + '_month'] = stamps.dt.month.astype('int64')
  train_set[prefix + '_day'] = stamps.dt.day.astype('int64')
  train_set[prefix + '_hour'] = stamps.dt.hour.astype('int64')
  train_set[prefix + '_minute'] = stamps.dt.minute.astype('int64')

# Drop trips whose pickup or drop-off point lies outside the bounds.
# Rows with NaN coordinates compare False everywhere and are therefore kept.
outside_box = (
    (train_set['pick_lat'] < LAT_MIN) | (train_set['pick_lat'] > LAT_MAX)
    | (train_set['pick_lon'] < LON_MIN) | (train_set['pick_lon'] > LON_MAX)
    | (train_set['drop_lat'] < LAT_MIN) | (train_set['drop_lat'] > LAT_MAX)
    | (train_set['drop_lon'] < LON_MIN) | (train_set['drop_lon'] > LON_MAX)
)
train_set = train_set.drop(index=train_set.index[outside_box.to_numpy()])

train_set['distance'] = distance(train_set['pick_lat'], train_set['pick_lon'], train_set['drop_lat'], train_set['drop_lon'])

# Time actually spent moving: duration minus both waiting columns.
riding_time = train_set['duration'] - train_set['meter_waiting'] - train_set['meter_waiting_till_pickup']

# Distance per riding time, scaled by 3600; -1 when the riding time is not positive.
# NOTE(review): this is km/h only if the duration columns are in seconds - confirm.
train_set['average_speed'] = np.where(riding_time <= 0, -1, train_set['distance'] / riding_time * 3600)

# Fare excluding the waiting fare, per km; -1 when the distance is not positive.
train_set['unit_riding_fare'] = np.where(train_set['distance'] <= 0, -1, (train_set['fare'] - train_set['meter_waiting_fare']) / train_set['distance'])

# Waiting fare per unit of waiting time, scaled by 60; 0 when there was no waiting.
train_set['unit_waiting_fare'] = np.where(train_set['meter_waiting'] == 0, 0, train_set['meter_waiting_fare'] / train_set['meter_waiting'] * 60)

# The raw timestamp strings are no longer needed once the features exist.
train_set = train_set.drop(['pickup_time', 'drop_time'], axis=1)

# prepare test set (same cleaning and feature engineering as the train set)

In [None]:
# Timestamp layout of the pickup_time / drop_time columns.
# Named TIME_FORMAT so the notebook does not shadow the built-in format().
TIME_FORMAT = '%m/%d/%Y %H:%M'

# Bounds applied to pickup and drop-off coordinates.
# NOTE(review): these look like a bounding box around Sri Lanka - confirm.
LAT_MIN, LAT_MAX = 5.9, 9.8
LON_MIN, LON_MAX = 79.7, 81.8

test_set = pd.read_csv('/content/drive/My Drive/Academic/Semester 7/CS4622 Machine Learning/Assignments/160278M Fare Classification/test.csv', index_col='tripid')

# Parse both timestamp columns once; every time-derived feature reuses them.
pickup_dt = pd.to_datetime(test_set['pickup_time'], format=TIME_FORMAT)
drop_dt = pd.to_datetime(test_set['drop_time'], format=TIME_FORMAT)

# Fill missing durations with (drop - pickup) in seconds plus both waiting
# columns. The difference is taken between the parsed timestamps directly, so
# it does not depend on the time zone of the machine running the notebook.
# NOTE(review): adding the waiting times on top of the elapsed time is kept
# from the existing formula - confirm that this is the intended estimate.
elapsed_seconds = (drop_dt - pickup_dt).dt.total_seconds()
estimated_duration = elapsed_seconds + test_set['meter_waiting'] + test_set['meter_waiting_till_pickup']
test_set['duration'] = np.where(test_set['duration'].isna(), estimated_duration, test_set['duration'])

# Missing fares are marked with the sentinel -1.
test_set['fare'] = test_set['fare'].fillna(-1)

# Calendar / clock components of both timestamps (month, day, hour, minute).
for prefix, stamps in (('pickup', pickup_dt), ('drop', drop_dt)):
  test_set[prefix + '_month'] = stamps.dt.month.astype('int64')
  test_set[prefix + '_day'] = stamps.dt.day.astype('int64')
  test_set[prefix + '_hour'] = stamps.dt.hour.astype('int64')
  test_set[prefix + '_minute'] = stamps.dt.minute.astype('int64')

# Drop trips whose pickup or drop-off point lies outside the bounds.
# Rows with NaN coordinates compare False everywhere and are therefore kept.
# NOTE(review): rows removed here get no prediction, so their tripids will be
# missing from results.csv - confirm that filtering the test set is intended.
outside_box = (
    (test_set['pick_lat'] < LAT_MIN) | (test_set['pick_lat'] > LAT_MAX)
    | (test_set['pick_lon'] < LON_MIN) | (test_set['pick_lon'] > LON_MAX)
    | (test_set['drop_lat'] < LAT_MIN) | (test_set['drop_lat'] > LAT_MAX)
    | (test_set['drop_lon'] < LON_MIN) | (test_set['drop_lon'] > LON_MAX)
)
test_set = test_set.drop(index=test_set.index[outside_box.to_numpy()])

test_set['distance'] = distance(test_set['pick_lat'], test_set['pick_lon'], test_set['drop_lat'], test_set['drop_lon'])

# Time actually spent moving: duration minus both waiting columns.
riding_time = test_set['duration'] - test_set['meter_waiting'] - test_set['meter_waiting_till_pickup']

# Distance per riding time, scaled by 3600; -1 when the riding time is not positive.
# NOTE(review): this is km/h only if the duration columns are in seconds - confirm.
test_set['average_speed'] = np.where(riding_time <= 0, -1, test_set['distance'] / riding_time * 3600)

# Fare excluding the waiting fare, per km; -1 when the distance is not positive.
test_set['unit_riding_fare'] = np.where(test_set['distance'] <= 0, -1, (test_set['fare'] - test_set['meter_waiting_fare']) / test_set['distance'])

# Waiting fare per unit of waiting time, scaled by 60; 0 when there was no waiting.
test_set['unit_waiting_fare'] = np.where(test_set['meter_waiting'] == 0, 0, test_set['meter_waiting_fare'] / test_set['meter_waiting'] * 60)

# The raw timestamp strings are no longer needed once the features exist.
test_set = test_set.drop(['pickup_time', 'drop_time'], axis=1)

# catboost

In [None]:
# Target vector is the 'label' column; the feature matrix is every other column.
y = train_set['label']
x = train_set.drop(columns=['label'])

In [None]:
!pip install catboost
from catboost import CatBoostClassifier, Pool

# initialize data
model = CatBoostClassifier(iterations=100000)

# train the model
model.fit(x, y, verbose=False, plot=True)
model.get_all_params()

# save the model
model_path = '/content/drive/My Drive/'
model.save_model(model_path)

# make the prediction using the resulting model
preds_class = model.predict(test_set)
preds_proba = model.predict_proba(test_set)

In [None]:
# Write the submission file: a header plus one "tripid,prediction" line per
# test row. The context manager closes the file even if a write fails part-way.
with open('results.csv', 'w') as fo:
  fo.write('tripid,prediction\n')
  for trip_id, prediction in zip(test_set.index, preds_class):
    fo.write(str(trip_id) + ',' + str(prediction) + '\n')