In [None]:
import pandas as pd
import numpy as np
from shapely.geometry import mapping, Polygon, shape
import json
import collections
from tqdm import tqdm
import datetime
from sklearn.ensemble import RandomForestRegressor

# A predictive model to maximize Taxi Driver revenue in NYC

## Loading Trip Data
For the sake of attepting this challenge within the given amount of time, we only loaded data for gree taxi trips in 2014. However, all the code below extends without loss in generalization to data in 2015 and also to yelow taxi trips

In [None]:
# load the raw data and transform the data type of 'pickup_datetime'
trips_green = pd.read_csv('green_trips.csv.gz')
trips_green.pickup_datetime = pd.to_datetime(trips_green.pickup_datetime)

In [10]:
# filter out the 2015 data, so that the data set becomes manageable in size
filter_2014 = trips_green.pickup_datetime < datetime.date(2015,1,1)
trips_green_2014 = trips_green.loc[filter_2014,:]
trips_green_2014.pickup_datetime = pd.to_datetime(trips_green_2014.pickup_datetime)
trips_green_2014.shape

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


(1608944, 9)

## Create a mapping between pickup/dropoff coordinates and NTAs

In [None]:
# just some utility functions for the data wrangling
def get_nta(lat, lon, nta_shapefiles):
    """
    given latitude, longitude and a dictionary of shapefiles,
    return the corresponding NTA
    """
    point = shape({'coordinates': [lat, lon], 'type':'Point'})
    
    for nta_code, nta_data in nta_shapefiles.items():
        if nta_data['shapefile'].contains(point):
            return nta_code
    return None

def datetime_to_tod(_datetime):
    """
    Transform a given datetime into a time-of-day bin on a scale between 0 and 143
    i.e. every one of 24 hours is sliced into 6 bins
    """
    return int(_datetime.hour)*6 + int(_datetime.minute / 10)

In [None]:
# load geo-fences for NYC NTAs as a shapefile. The shapefile is available on 
# https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-nynta.page
with open('nta.json') as json_file:
    ntas_raw = json.load(json_file)

In [None]:
# build a dictionary that contains a shapefile and meta information for every NTA
nta_shapefiles = collections.defaultdict(dict)
for nta_district in ntas_raw['features']:
    nta_code = nta_district['properties']['NTACode']
    borough_name = nta_district['properties']['BoroName']
    borough_code = nta_district['properties']['BoroCode']
    geometry = nta_district['geometry']
    shapefile = shape(geometry)
    nta_shapefiles[nta_code] = {'borough_name': borough_name,
                               'borough_code': borough_code,
                               'shapefile':shapefile}

In [13]:
pickup_ntas = []
dropoff_ntas = []
pickup_tods = []
dropoff_dows = []

# iterate through the entire dataset to determine NTAs for pickup and dropoff locations,
# as well as the time-of-day and day-of-week equivalent to the pickup time
for _, trip_data in tqdm(trips_green_2014.iterrows()):
    pickup_lat = trip_data.pickup_longitude
    pickup_lon = trip_data.pickup_latitude
    dropoff_lat = trip_data.dropoff_longitude
    dropoff_lon = trip_data.dropoff_latitude
    pickup_datetime = trip_data.pickup_datetime
    pickup_tod = datetime_to_tod(pickup_datetime)
    pickup_dow = pickup_datetime.weekday()
    
    pickup_tods.append(pickup_tod)
    dropoff_dows.append(pickup_dow)    
    
    pickup_ntas.append(get_nta(pickup_lat, pickup_lon, nta_shapefiles))
    dropoff_ntas.append(get_nta(dropoff_lat, dropoff_lon, nta_shapefiles))
    
trips_green_2014['pickup_nta'] = pickup_ntas
trips_green_2014['dropoff_nta'] = dropoff_ntas
trips_green_2014['pickup_tod'] = pickup_tods
trips_green_2014['pickup_dow'] = dropoff_dows
trips_green_2014.to_csv('data/gree_trips_2014.csv')

1608944it [53:01, 505.75it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-vie

## Clean up the data and set apart a test set
Some trips that cound not be matched to an NTA have to be removed from the data set. After that, data from the most recent 30 days is set apart to test the final model.

In [None]:
# remove null values
nan_filter = pd.isnull(trips_green_2014).sum(axis=1) == 0
trips_green_2014_fare = trips_green_2014.loc[nan_filter, :]
trips_green_2014_fare = pd.get_dummies(trips_green_2014_fare.loc[:,['pickup_nta']])
nta_labels = [i[2] for i in trips_green_2014_fare.columns.str.split('_') if len(i) == 3]
trips_green_2014_fare['pickup_tod'] = trips_green_2014.pickup_tod
trips_green_2014_fare['pickup_dow'] = trips_green_2014.pickup_dow
trips_green_2014_fare['fare'] = trips_green_2014.total_amount
trips_green_2014_fare['pickup_nta'] = trips_green_2014.loc[nan_filter,:].pickup_nta
trips_green_2014_fare['pickup_datetime'] = trips_green_2014.loc[nan_filter,:].pickup_datetime

In [None]:
# set apart a test data set
train_filter = trips_green_2014.pickup_datetime <= trips_green_2014.pickup_datetime.max() - datetime.timedelta(days=30)
Y_fares_train = np.array(trips_green_2014_fare.loc[train_filter,:].fare).reshape(-1,1)
X_fares_train = np.array(trips_green_2014_fare.loc[train_filter, ~trips_green_2014_fare.columns.isin(['fare','pickup_nta','pickup_datetime'])])
trip_table_test = trips_green_2014_fare.loc[~train_filter, :]

## Build an train a Random Forrest Regression model

In [None]:
m_fare = RandomForestRegressor(n_estimators = 10)

In [23]:
m_fare.fit(X_fares_train, Y_fares_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

## Simulate the model on the test data

In [39]:
ride_choices = []

for iter in range(100):
    # pick a random dateime and location
    trip_data = trip_table_test.iloc[np.random.randint(0, trip_table_test.shape[0],1)[0],:]
    next_trip_filter = (trip_table_test.pickup_datetime > 
                        trip_data.pickup_datetime + datetime.timedelta(seconds=20)) &\
                       (trip_table_test.pickup_nta == trip_data.pickup_nta) &\
                       (trip_table_test.pickup_datetime <= 
                        trip_data.pickup_datetime + datetime.timedelta(minutes=10))

    while sum(next_trip_filter) == 0:
        trip_data = trip_table_test.iloc[np.random.randint(0, trip_table_test.shape[0],1)[0],:]
        next_trip_filter = (trip_table_test.pickup_datetime > 
                        trip_data.pickup_datetime + datetime.timedelta(seconds=20)) &\
                       (trip_table_test.pickup_nta == trip_data.pickup_nta) &\
                       (trip_table_test.pickup_datetime <= 
                        trip_data.pickup_datetime + datetime.timedelta(minutes=10))


    tod = datetime_to_tod(pd.to_datetime(trip_data.pickup_datetime))
    dow = pd.to_datetime(trip_data.pickup_datetime).weekday()
    nta = trip_data.pickup_nta
    fare = trip_data.fare

    X_test = np.zeros(len(nta_labels))
    X_test[nta_labels.index(nta)]
    X_test = np.concatenate((list(X_test), [tod, dow])).reshape(1,-1)
    expected_fare = m_fare.predict(X_test)[0]

    max_fare = trip_table_test.loc[next_trip_filter,'fare'].max()

    print('Offered fare: {}\nExpected fare: {}\nMax fare: {}'.format(fare, 
                                                                     expected_fare, 
                                                                     max_fare))
    
    if expected_fare > fare: # if we make a decision based on the model
        if max_fare >= fare:
            ride_choices.append(1)
        else:
            ride_choices.append(0)

Offered fare: 7.7
Expected fare: 14.804374613758336
Max fare: 10.0
Offered fare: 13.5
Expected fare: 13.386580808080808
Max fare: 16.0
Offered fare: 8.0
Expected fare: 14.637808403361344
Max fare: 7.38
Offered fare: 6.5
Expected fare: 14.470107964980937
Max fare: 13.1
Offered fare: 16.0
Expected fare: 14.282000000000002
Max fare: 13.7
Offered fare: 9.0
Expected fare: 11.259907142857141
Max fare: 40.33
Offered fare: 6.0
Expected fare: 12.49248888888889
Max fare: 23.5
Offered fare: 11.0
Expected fare: 13.074313961988304
Max fare: 7.0
Offered fare: 13.1
Expected fare: 13.233095238095236
Max fare: 57.83
Offered fare: 5.9
Expected fare: 12.352875213675215
Max fare: 33.0
Offered fare: 7.5
Expected fare: 13.632751515151517
Max fare: 28.5
Offered fare: 12.0
Expected fare: 11.241211038961039
Max fare: 19.7
Offered fare: 5.5
Expected fare: 12.24625
Max fare: 26.0
Offered fare: 18.5
Expected fare: 13.049913725490194
Max fare: 21.0
Offered fare: 10.2
Expected fare: 11.9525
Max fare: 19.1
Offered f

In [41]:
print('Percentage of right choices: {}'.format(np.mean(ride_choices)))

Percentage of right choices: 0.8360655737704918
