From: https://github.com/stephenleo/nyc-taxi
1. Change C8 to cut training and testing dataset by 1/2

# Predicting Destination location
- Attempting to predict the destination location (geohashed dropoff latitude, longitude) from the available data such as pickup time, pickup location
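- Originally written for an Anaconda Python 2 environment; the run recorded below is under Python 3 (the `FileNotFoundError` shown in the output exists only in Python 3)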

In [3]:
!pip install pygeohash



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pygeohash as gh
%matplotlib inline

from sklearn import ensemble
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Load the taxi trip records; the path is relative to the notebook's working directory
df = pd.read_csv('data/data.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/data.csv'

In [5]:
# Drop outliers: keep trips whose trip_distance lies in [2, 50] and whose
# fare_amount lies in [3, 300]. Series.between is inclusive on both ends.
df_filtered = df[
    df['trip_distance'].between(2, 50)
    & df['fare_amount'].between(3, 300)
].copy()

### Create new features as mentioned in [02_Fare Prediction.ipynb](https://github.com/stephenleo87/nyc-taxi/blob/master/02_Fare%20Prediction.ipynb)

In [6]:
# Parse both timestamp columns so the .dt accessors below are available
df_filtered['pickup_datetime'] = pd.to_datetime(df_filtered['pickup_datetime'])
df_filtered['dropoff_datetime'] = pd.to_datetime(df_filtered['dropoff_datetime'])

# Trip duration in minutes (seconds / 60)
df_filtered['trip_duration'] = (df_filtered['dropoff_datetime']-df_filtered['pickup_datetime']).dt.total_seconds()/60

# Calendar features, all derived from the pickup timestamp
df_filtered['year'] = df_filtered['pickup_datetime'].dt.year
df_filtered['day'] = df_filtered['pickup_datetime'].dt.day
df_filtered['month'] = df_filtered['pickup_datetime'].dt.month
df_filtered['day_of_week'] = df_filtered['pickup_datetime'].dt.day_name()
df_filtered['hour_of_day'] = df_filtered['pickup_datetime'].dt.hour

# Signed coordinate offsets: pickup minus dropoff
df_filtered['lat_dif'] = df_filtered['pickup_latitude'] - df_filtered['dropoff_latitude']
df_filtered['lon_dif'] = df_filtered['pickup_longitude'] - df_filtered['dropoff_longitude']

# Geohash the pickup and dropoff points at precision 5.
# Iterating the two coordinate columns directly avoids DataFrame.apply(axis=1),
# which constructs a Series object for every row.
df_filtered['pickup_geohash'] = [
    gh.encode(lat, lon, precision=5)
    for lat, lon in zip(df_filtered['pickup_latitude'], df_filtered['pickup_longitude'])
]
df_filtered['dropoff_geohash'] = [
    gh.encode(lat, lon, precision=5)
    for lat, lon in zip(df_filtered['dropoff_latitude'], df_filtered['dropoff_longitude'])
]

df_filtered.head()

Unnamed: 0.1,Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,...,trip_duration,year,day,month,day_of_week,hour_of_day,lat_dif,lon_dif,pickup_geohash,dropoff_geohash
0,0,2,2015-07-18 11:25:58,2015-07-18 11:43:47,1,7.21,-73.862762,40.769028,1.0,N,...,17.816667,2015,18,7,Saturday,11,0.046444,0.086441,dr5rz,dr5rt
1,1,1,2015-03-15 12:50:01,2015-03-15 13:23:35,1,10.8,-73.870926,40.773727,,N,...,33.566667,2015,15,3,Sunday,12,0.008034,0.117302,dr5rz,dr5ru
2,2,2,2015-04-30 12:25:44,2015-04-30 13:03:51,1,4.28,-73.97818,40.762341,,N,...,38.116667,2015,30,4,Thursday,12,0.051552,0.030731,dr5ru,dr5re
3,3,2,2015-05-28 08:47:56,2015-05-28 09:26:08,1,18.47,-73.776711,40.645302,,N,...,38.2,2015,28,5,Thursday,8,-0.20755,0.066711,dr5x1,dr72r
4,4,1,2015-06-20 19:36:17,2015-06-20 20:10:49,1,15.5,-73.777054,40.644947,,Y,...,34.533333,2015,20,6,Saturday,19,-0.080074,0.169746,dr5x1,dr5rt


In [7]:
# Remove rows with a null latitude or longitude difference
# (lat_dif / lon_dif are null whenever one of the coordinates they were computed from is null)
df_filtered = df_filtered.dropna(subset=['lat_dif', 'lon_dif']).copy()

# Intuition
- Intuitively, the dropoff location could be random
- However, for a given day of the week, hour of the day, pickup location, and number of passengers, could there be a correlation with the dropoff location?

In [12]:
# Define some helper functions
def train_test_split(X, y, train_year=2015, test_year=2016, fraction=0.5):
    """Split X and y into a training set and a testing set by year.

    Rows whose 'year' column equals ``train_year`` form the training set and
    rows whose 'year' equals ``test_year`` form the testing set. The 'year'
    column is dropped from every returned object. Each set is then cut down
    to its leading ``fraction`` of rows (row count truncated to an integer).

    With the defaults, 2015 data is used for training, 2016 data for testing,
    and each set is cut in half.

    X: DataFrame of features; must contain a 'year' column.
    y: DataFrame holding the response column plus a 'year' column.
    train_year: value of 'year' selecting the training rows.
    test_year: value of 'year' selecting the testing rows.
    fraction: share of each set to keep, in the interval (0, 1].

    Returns X_train, y_train, X_test, y_test, where the X parts are
    DataFrames and the y parts are flattened 1-D arrays.
    Raises ValueError if fraction is outside (0, 1].
    """
    if not 0 < fraction <= 1:
        raise ValueError("fraction must be in the interval (0, 1]")

    def _leading_part(rows):
        # Keep the first `fraction` of the rows; int() truncates toward zero,
        # so fraction=0.5 behaves exactly like len(rows) // 2.
        return rows[:int(len(rows) * fraction)]

    def _select_year(year):
        features = X.loc[X['year'] == year].drop('year', axis=1)
        # .values.ravel() flattens the single remaining response column to 1-D
        labels = y.loc[y['year'] == year].drop('year', axis=1).values.ravel()
        return _leading_part(features), _leading_part(labels)

    X_train, y_train = _select_year(train_year)
    X_test, y_test = _select_year(test_year)
    return X_train, y_train, X_test, y_test

def model_results(X_train, y_train, X_test, y_test, model, model_name):
    """Report the accuracy of an already-fitted model on both data splits.

    Predictions are made for X_train and X_test with ``model.predict`` and
    scored against y_train and y_test with ``accuracy_score``. The two
    results are printed under headers that include ``model_name``.
    Nothing is returned.
    """
    # Predict on both splits before anything is printed
    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)

    train_header = "----{} Training Data results (2015 data set)----"
    print(train_header.format(model_name))
    train_accuracy = accuracy_score(y_train, predicted_train)
    print("Accuracy: {:.2f}\n".format(train_accuracy))

    test_header = "----{} Test Data results (2016 data set)----"
    print(test_header.format(model_name))
    test_accuracy = accuracy_score(y_test, predicted_test)
    print("Accuracy: {:.2f}\n\n".format(test_accuracy))
    
def model_comparison(X_train, y_train, X_test, y_test):
    """Fit several classic classifiers and print each one's accuracy.

    Every enabled classifier is created with its default settings, fitted on
    X_train / y_train, and handed to ``model_results``, which prints its
    accuracy on the training and testing sets. Nothing is returned.
    """
    # (display name, estimator class) pairs, processed in this order.
    # Gradient Boosting and the neural net are currently disabled
    # (commented out); uncomment an entry to include it in the comparison.
    candidates = (
        ('Decision Tree', DecisionTreeClassifier),
        ('Random Forest', ensemble.RandomForestClassifier),
        # ('Gradient Boosting', ensemble.GradientBoostingClassifier),
        # ('Neural Net', MLPClassifier),
    )

    for label, estimator_cls in candidates:
        # Instantiate inside the loop so each model is built right before it is fitted
        classifier = estimator_cls()
        classifier.fit(X_train, y_train)
        model_results(X_train, y_train, X_test, y_test, classifier, label)

In [13]:
# Response (y): the dropoff geohash. 'year' is kept alongside it because
# train_test_split filters on that column.
y = df_filtered.loc[:, ['dropoff_geohash', 'year']]

# Features (X)
X = df_filtered.loc[:, ['passenger_count', 'year', 'month', 'day_of_week', 'hour_of_day', 'pickup_geohash']]

# One-hot encode the listed columns; passenger_count and year are passed through unchanged
X_encoded = pd.get_dummies(
    data=X,
    columns=['month', 'day_of_week', 'hour_of_day', 'pickup_geohash'],
)

In [14]:
# Split Training and Testing sets
# NOTE: train_test_split is the helper defined earlier in this notebook
# (not sklearn.model_selection.train_test_split); it splits on the 'year' column.
X_train, y_train, X_test, y_test = train_test_split(X_encoded, y)

In [15]:
# Fit each enabled classifier and print its accuracy on the training and testing sets
model_comparison(X_train, y_train, X_test, y_test)

----Decision Tree Training Data results (2015 data set)----
Accuracy: 0.67

----Decision Tree Test Data results (2016 data set)----
Accuracy: 0.22


----Random Forest Training Data results (2015 data set)----
Accuracy: 0.67

----Random Forest Test Data results (2016 data set)----
Accuracy: 0.25




# Conclusion
- All models have low predictive power, with the best model (Random Forest) reaching only ~25% accuracy on the test data
- Abandon this approach