### NYC Taxi Trip Duration Competition

- Kaggle score (RMSLE): lower is better
- The data covers January–June 2016

In [1]:
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot interface, not for the
# top-level matplotlib package (which has no plot()/subplots() functions).
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Import the data

In [2]:
# Load only the columns used in this notebook, selected by name rather than by
# position so the load does not depend on the column order of the CSV.
# The two timestamp columns are parsed to datetime64 while reading.
trips = pd.read_csv(
    'train.csv',
    usecols=[
        'id',
        'vendor_id',
        'pickup_datetime',
        'dropoff_datetime',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'trip_duration',
    ],
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)

In [3]:
# Preview the first rows to check the selected columns and parsed timestamps.
trips.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,-73.982155,40.767937,-73.96463,40.765602,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,-73.980415,40.738564,-73.999481,40.731152,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,-73.979027,40.763939,-74.005333,40.710087,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,-74.01004,40.719971,-74.012268,40.706718,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,-73.973053,40.793209,-73.972923,40.78252,435


### Extract date features

In [19]:
# Calendar features derived from the pickup timestamp.
pickup_dt = trips['pickup_datetime'].dt
trips['day'] = pickup_dt.day          # day of the month
trips['hour'] = pickup_dt.hour        # hour of the day, 0-23
trips['month'] = pickup_dt.month
trips['weekday'] = pickup_dt.weekday  # pandas convention: 0 = Monday ... 6 = Sunday

# The per-day models further down filter on a `dayofweek` column that holds
# English day names. Derive it here from the integer weekday so the notebook
# runs on a fresh kernel instead of relying on a column left over from an
# earlier session.
WEEKDAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
trips['dayofweek'] = trips['weekday'].map(dict(enumerate(WEEKDAY_NAMES)))

### Remove Outliers

In [5]:
# Keep an independent copy of every trip whose duration exceeds 7000
# (trip_duration matches dropoff - pickup in seconds in the preview above).
is_outlier = trips['trip_duration'].gt(7000)
outliers = trips.loc[is_outlier].copy()

In [6]:
# Remove the outlier rows from the working frame by their index labels.
trips = trips.drop(labels=outliers.index, axis=0)

### Models

In [20]:
from sklearn.cluster import MiniBatchKMeans as KM

In [None]:
# Mini-batch k-means configured for 1000 clusters with mini-batches of 1000 samples.
# NOTE(review): this estimator is never fitted in the visible notebook, and
# `clf` is rebound inside the KNN training loop further down.
clf = KM(
    batch_size=1000,
    n_clusters=1000,
)

In [None]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 20% of the trips for validation. A fixed random_state makes the
# split, and every score computed from it, reproducible across kernel restarts.
# The feature frames keep every column of `trips` (including `id` and
# `trip_duration`); later cells pick the model inputs by name and reuse
# `id` / `trip_duration` when scoring.
X_train, X_test, y_train, y_test = train_test_split(
    trips,
    trips['trip_duration'],
    test_size=0.20,
    random_state=42,
)
# X_train - 80% of the rows (all columns)
# y_train - trip_duration for the training rows
# X_test  - remaining 20% of the rows (all columns)
# y_test  - trip_duration for the held-out rows

In [9]:
# Bucketing keys (day name, hour of day) and the coordinate columns used as
# model inputs by the training and prediction loops.
days = [
    'Sunday',
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
]
hours = list(range(24))
coordinates = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]

In [10]:
from sklearn.neighbors import KNeighborsRegressor as KNR

In [11]:
# Registry of fitted models: clfs[day][hour] -> estimator. Every day gets its
# own empty inner dict.
clfs = dict((day, dict()) for day in days)

In [12]:
# Fit one distance-weighted 25-nearest-neighbour regressor per (day, hour)
# bucket, using only the coordinate columns as inputs. Requires X_train to
# carry `hour` and `dayofweek` (day-name) columns.
for day in days:
    same_day = X_train['dayofweek'] == day
    for hour in hours:
        rows = (X_train['hour'] == hour) & same_day
        X, y = X_train.loc[rows, coordinates], y_train.loc[rows]
        clf = KNR(weights='distance', n_neighbors=25)
        clf.fit(X, y)
        clfs[day][hour] = clf

In [13]:
# Predict the held-out trips bucket by bucket with the matching (day, hour)
# model. The per-bucket frames are collected in a list and concatenated once:
# DataFrame.append inside a loop re-copies the accumulated frame on every
# iteration and no longer exists in pandas >= 2.0.
bucket_predictions = []
for day in days:
    same_day = X_test['dayofweek'] == day
    for hour in hours:
        rows = (X_test['hour'] == hour) & same_day
        if not rows.any():
            # No held-out trips in this bucket; predict() rejects empty input.
            continue
        X_test_1 = X_test.loc[rows, coordinates]
        id_test = X_test.loc[rows, 'id']
        y_test_1 = clfs[day][hour].predict(X_test_1)
        bucket_predictions.append(
            pd.DataFrame(
                {'id': id_test, 'trip_duration': y_test_1},
                columns=['id', 'trip_duration'],
            )
        )

if bucket_predictions:
    submission = pd.concat(bucket_predictions)
else:
    # Nothing to predict: keep the expected schema so later cells still run.
    submission = pd.DataFrame(columns=['id', 'trip_duration'])

In [14]:
# Attach the true durations from the hold-out frame to the predictions by
# trip id. The overlapping `trip_duration` columns receive pandas' default
# _x (predicted) / _y (actual) suffixes.
actual_durations = X_test[['id', 'trip_duration']]
submission = submission.merge(actual_durations, how='outer', on='id')

In [15]:
# Short column names expected by score(): p = predicted, a = actual.
scored_columns = ['id', 'p', 'a']
submission.columns = scored_columns

In [16]:
def score(submission):
    """Return the root mean squared logarithmic error (RMSLE) of the predictions.

    Parameters
    ----------
    submission : pandas.DataFrame
        Must contain a column ``'p'`` (predicted value) and a column ``'a'``
        (actual value), aligned row by row.

    Returns
    -------
    float
        ``sqrt(mean((log1p(p) - log1p(a)) ** 2))``. Rows where either value is
        missing are excluded from both the sum and the count, so an unmatched
        row cannot deflate the error. NaN when no row has both values.
    """
    # Cast explicitly: frames assembled from empty seeds can carry object dtype,
    # which numpy ufuncs cannot process.
    predicted = submission['p'].astype(float)
    actual = submission['a'].astype(float)
    # log1p(x) == log(x + 1), but stays accurate for values close to zero.
    log_error = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt((log_error ** 2).mean())

In [17]:
# Evaluate the held-out predictions against the actual durations; lower is better.
score(submission)

0.43080492953292804