In [63]:
import pandas as pd
import numpy as np
import geojsonio
from shapely.geometry import mapping, Polygon, shape
import json
import collections
from tqdm import tqdm
import datetime

In [26]:
with open('data/nta.json') as json_file:
    ntas_raw = json.load(json_file)

## Loading Data

### Demographis and weather

In [21]:
demographics = pd.read_csv('data/demographics.csv')

In [None]:
weather = pd.read_csv('data/weather.csv')

### Green trips

In [None]:
# load the raw data
trips_green = pd.read_csv('data/green_trips.csv.gz')

In [70]:
# filter, so that the data becomes manageable in size
filter_2014 = pd.to_datetime(trips_green.pickup_datetime) <= datetime.date(2014,8,1)
trips_green_2014 = trips_green.loc[filter_2014,:]
trips_green_2014.shape

(1068602, 9)

### Yellow trips

In [None]:
# load the raw data
trips_yellow = pd.read_csv('data/yellow_trips.csv.gz')

In [None]:
# filter, so that the data becomes manageable in size
filter_2014 = pd.to_datetime(trips_yellow.pickup_datetime) <= datetime.date(2014,8,1)
trips_yellow_2014 = trips_yellow.loc[filter_2014,:]
trips_yellow_2014.shape

## Creating a NTA - Trip Mapping

In [41]:
# building a dictionary that contains a shapefile and meta information for every nta
nta_shapefiles = collections.defaultdict(dict)
for nta_district in ntas_raw['features']:
    nta_code = nta_district['properties']['NTACode']
    borough_name = nta_district['properties']['BoroName']
    borough_code = nta_district['properties']['BoroCode']
    geometry = nta_district['geometry']
    shapefile = shape(geometry)
    nta_shapefiles[nta_code] = {'borough_name': borough_name,
                               'borough_code': borough_code,
                               'shapefile':shapefile}

In [51]:
def get_nta(lat, lon, nta_shapefiles):
    point = shape({'coordinates': [lat, lon], 'type':'Point'})
    
    for nta_code, nta_data in nta_shapefiles.items():
        if nta_data['shapefile'].contains(point):
            return nta_code
    return None

In [None]:
pickup_ntas = []
dropoff_ntas = []

for _, trip_data in tqdm(trips_green_2014.iterrows()):
    pickup_lat = trip_data.pickup_longitude
    pickup_lon = trip_data.pickup_latitude
    dropoff_lat = trip_data.dropoff_longitude
    dropoff_lon = trip_data.dropoff_latitude
    
    pickup_ntas.append(get_nta(pickup_lat, pickup_lon, nta_shapefiles))
    dropoff_ntas.append(get_nta(dropoff_lat, dropoff_lon, nta_shapefiles))
    
trips_green_2014['pickup_nta'] = pickup_ntas
trips_green_2014['dropoff_nta'] = dropoff_ntas


0it [00:00, ?it/s][A
1it [00:01,  1.26s/it][A
17it [00:01,  1.13it/s][A
39it [00:01,  1.61it/s][A
58it [00:01,  2.29it/s][A
78it [00:01,  3.26it/s][A
97it [00:01,  4.62it/s][A
126it [00:01,  6.55it/s][A
161it [00:01,  9.29it/s][A
189it [00:02, 13.08it/s][A
215it [00:02, 18.26it/s][A
245it [00:02, 25.40it/s][A
272it [00:02, 34.77it/s][A
304it [00:02, 47.46it/s][A
332it [00:02, 60.18it/s][A
357it [00:02, 74.92it/s][A
380it [00:02, 93.70it/s][A
416it [00:03, 120.18it/s][A
450it [00:03, 148.61it/s][A
479it [00:03, 171.96it/s][A
507it [00:03, 176.38it/s][A
533it [00:03, 189.60it/s][A
560it [00:03, 203.20it/s][A
585it [00:03, 210.97it/s][A
615it [00:03, 231.58it/s][A
642it [00:03, 241.77it/s][A
672it [00:04, 256.41it/s][A
702it [00:04, 261.75it/s][A
732it [00:04, 267.95it/s][A
760it [00:04, 264.60it/s][A
790it [00:04, 273.97it/s][A
821it [00:04, 281.92it/s][A
853it [00:04, 290.54it/s][A
883it [00:04, 288.46it/s][A
916it [00:04, 298.28it/s][A
947it [00:04, 

In [72]:
trips_green_2014['pickup_nta'] = pickup_ntas
trips_green_2014['dropoff_nta'] = dropoff_ntas

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [74]:
trips_green_2014.to_csv('data/green_trips_2014.csv')

## Add pickup TOD and DOW column to trip

In [75]:
def datetime_to_tod(_datetime):
    return int(_datetime.hour)*6 + int(_datetime.minute / 10)

In [76]:
trips_green_2014.pickup_datetime = pd.to_datetime(trips_green_2014.pickup_datetime)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [77]:
pickup_tods = []
dropoff_dows = []


for _, trip_data in tqdm(trips_green_2014.iterrows()):
    pickup_datetime = trip_data.pickup_datetime
    pickup_tod = datetime_to_tod(pickup_datetime)
    pickup_dow = pickup_datetime.weekday()
    
    pickup_tods.append(pickup_tod)
    dropoff_dows.append(pickup_dow)


0it [00:00, ?it/s][A
1it [00:09,  9.07s/it][A
656it [00:09,  6.35s/it][A
1304it [00:09,  4.45s/it][A
1925it [00:09,  3.11s/it][A
2580it [00:09,  2.18s/it][A
3161it [00:09,  1.53s/it][A
3772it [00:09,  1.07s/it][A
4347it [00:09,  1.34it/s][A
4901it [00:09,  1.91it/s][A
5476it [00:09,  2.73it/s][A
6105it [00:10,  3.90it/s][A
6679it [00:10,  5.57it/s][A
7233it [00:10,  7.95it/s][A
7746it [00:10, 11.35it/s][A
8223it [00:10, 16.19it/s][A
8697it [00:10, 23.10it/s][A
9170it [00:10, 32.93it/s][A
9718it [00:10, 46.92it/s][A
10347it [00:10, 66.81it/s][A
10981it [00:11, 95.02it/s][A
11587it [00:11, 134.83it/s][A
12164it [00:11, 190.53it/s][A
12727it [00:11, 265.68it/s][A
13344it [00:11, 372.66it/s][A
13943it [00:11, 518.53it/s][A
14494it [00:11, 710.35it/s][A
15060it [00:11, 962.95it/s][A
15609it [00:11, 1275.35it/s][A
16160it [00:12, 1657.44it/s][A
16764it [00:12, 2118.43it/s][A
17356it [00:12, 2623.51it/s][A
17987it [00:12, 3180.66it/s][A
18608it [00:12, 3725.8

In [78]:
trips_green_2014['pickup_tod'] = pickup_tods
trips_green_2014['pickup_dow'] = dropoff_dows

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [79]:
trips_green_2014.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,total_amount,pickup_nta,dropoff_nta,pickup_tod,pickup_dow
3,2014-04-27 02:27:04,2014-04-27 02:39:02,-73.949501,40.713997,-73.987785,40.718582,2,3.68,17.3,BK73,MN27,14,6
4,2014-05-26 18:32:19,2014-05-26 18:44:13,-73.944092,40.672195,-73.977325,40.664013,1,2.4,11.5,BK61,BK99,111,0
9,2014-05-30 05:53:15,2014-05-30 06:00:00,-73.95237,40.789875,-73.949173,40.781448,1,1.2,7.5,MN33,MN32,35,4
19,2014-06-23 07:43:32,2014-06-23 07:50:25,-73.930832,40.848251,-73.944366,40.830429,3,1.5,8.0,MN35,MN04,46,0
25,2014-07-24 15:24:44,2014-07-24 15:29:48,-73.954041,40.730293,-73.945892,40.730549,1,0.59,5.5,BK76,BK76,92,3


In [80]:
trips_green_2014.to_csv('data/green_trips_2014.csv')

## Bulding the training data sets

In [241]:
#trips_green_2014_fare = pd.get_dummies(trips_green_2014.loc[:,['pickup_nta','dropoff_nta']], 
#                                                            prefix=['p','d'])
nan_filter = pd.isnull(trips_green_2014).sum(axis=1) == 0
trips_green_2014_fare = trips_green_2014.loc[nan_filter, :]
trips_green_2014_fare = pd.get_dummies(trips_green_2014_fare.loc[:,['pickup_nta']])
nta_labels = [i[2] for i in trips_green_2014_fare.columns.str.split('_') if len(i) == 3]
trips_green_2014_fare['pickup_tod'] = trips_green_2014.pickup_tod
trips_green_2014_fare['pickup_dow'] = trips_green_2014.pickup_dow
trips_green_2014_fare['fare'] = trips_green_2014.total_amount

In [247]:
train_filter = trips_green_2014.pickup_datetime <= trips_green_2014.pickup_datetime.max() - datetime.timedelta(days=30)
Y_fares_train = np.array(trips_green_2014_fare.loc[train_filter,:].fare).reshape(-1,1)
X_fares_train = np.array(trips_green_2014_fare.loc[train_filter, ~trips_green_2014_fare.columns.isin(['fare'])])
trip_table_test = trips_green_2014_fare.loc[~train_filter, :]

## Building the Models

In [243]:
from sklearn.ensemble import RandomForestRegressor

In [244]:
m_fare = RandomForestRegressor(n_estimators = 10)

In [245]:
m_fare.fit(X_fares_train, Y_fares_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [246]:
m_fare.score(X_fares_test, Y_fares_test)

-0.07255100826649374

In [235]:
# pick a random dateime and location

trip_data = trip_table_test.iloc[np.random.randint(0, trip_table_test.shape[0],1)[0],:]

tod = datetime_to_tod(pd.to_datetime(trip_data.pickup_datetime))
dow = pd.to_datetime(trip_data.pickup_datetime).weekday()
nta = trip_data.pickup_nta
fare = trip_data.total_amount

X_test = np.zeros(len(nta_labels))
X_test[nta_labels.index(nta)]
X_test = np.concatenate((list(X_test), [tod, dow])).reshape(1,-1)
expected_fare = m_fare.predict(X_test)

In [186]:
def simulate_taxi_life(start_datetime, trip_table, 
                       current_depth=0, total_fares=0, max_depth=10, model=None):
    
    if current_depth >= max_depth:
        print('Done\nEnd time: {}\nTotal fares: {}'.format(next_trip_dropoff_datetime, 
                                                           total_fares))
        return next_trip_dropoff_datetime, total_fares
    
    trip_filter = (trip_table.pickup_datetime >= start_datetime)
    trip_subset = trip_table.loc[trip_filter,:].sort_values(by='pickup_datetime', 
                                                            ascending=False)
    start_nta = trip_subset.iloc[0,:].pickup_nta
    next_trip_dropoff_datetime = trip_subset.iloc[0,:].dropoff_datetime
    next_trip_fare = trip_subset.iloc[0,:].total_amount
    next_trip_dropoff_nta = trip_subset.iloc[0,:].dropoff_nta

    if model:
        X_test = pd.Series(np.zeros(len(nta_shapefiles.keys())), 
                           index=nta_shapefiles.keys())
        X_test[start_nta] = 1
        X_test = np.array([*list(X_test), tod, dow]).reshape(1,-1)
        expected_fare = m_fare.predict(X_test)
        if expected_fare > next_trip_fare * 1.1:
            # wait
            return simulate_taxi_life(start_datetime = start_datetime + datetime.timedelta(seconds=20), 
                                      trip_table = trip_table, 
                                      current_depth = current_depth, 
                                      total_fares = total_fares, 
                                      max_depth = max_depth,
                                      model = model)
        
    print('Current NTA: {}\nNext NTA: {}\nTrip fares: {}'.format(start_nta, 
                                                                 next_trip_dropoff_nta,
                                                                 next_trip_fare))
    
    return simulate_taxi_life(start_datetime = next_trip_dropoff_datetime,
                              trip_table =trip_table, 
                              current_depth = current_depth + 1, 
                              total_fares = total_fares + next_trip_fare, 
                              max_depth = max_depth, 
                              model=model)

In [192]:
simulate_taxi_life(random_date, trip_tbl)

Current NTA: MN06
Next NTA: MN17
Trip fares: 20.0


IndexError: single positional indexer is out-of-bounds