In [63]:
import pandas as pd
import numpy as np
import geojsonio
from shapely.geometry import mapping, Polygon, shape
import json
import collections
from tqdm import tqdm
import datetime

In [26]:
# Parse the NTA boundary file; later cells read its 'features' list.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the result
# does not depend on the platform's default locale encoding.
with open('data/nta.json', encoding='utf-8') as json_file:
    ntas_raw = json.load(json_file)

## Loading Data

### Demographics and weather

In [21]:
# Read the demographics table from its CSV file into a DataFrame.
demographics = pd.read_csv('data/demographics.csv')

In [None]:
# Read the weather table from its CSV file into a DataFrame.
weather = pd.read_csv('data/weather.csv')

### Green trips

In [None]:
# Load the raw green-trip records from the gzip-compressed CSV
# (read_csv infers the compression from the '.gz' suffix by default).
trips_green = pd.read_csv('data/green_trips.csv.gz')

In [70]:
# Filter, so that the data becomes manageable in size: keep only trips whose
# pickup time is not later than 2014-08-01 00:00.
# The cutoff is a pd.Timestamp because pandas >= 1.0 raises
# "TypeError: Invalid comparison between dtype=datetime64[ns] and date"
# when a datetime64 column is compared with a plain datetime.date.
filter_2014 = pd.to_datetime(trips_green.pickup_datetime) <= pd.Timestamp(2014, 8, 1)
trips_green_2014 = trips_green.loc[filter_2014, :]
trips_green_2014.shape

(1068602, 9)

### Yellow trips

In [None]:
# Load the raw yellow-trip records from the gzip-compressed CSV
# (read_csv infers the compression from the '.gz' suffix by default).
trips_yellow = pd.read_csv('data/yellow_trips.csv.gz')

In [None]:
# Filter, so that the data becomes manageable in size: keep only trips whose
# pickup time is not later than 2014-08-01 00:00.
# The cutoff is a pd.Timestamp because pandas >= 1.0 raises
# "TypeError: Invalid comparison between dtype=datetime64[ns] and date"
# when a datetime64 column is compared with a plain datetime.date.
filter_2014 = pd.to_datetime(trips_yellow.pickup_datetime) <= pd.Timestamp(2014, 8, 1)
trips_yellow_2014 = trips_yellow.loc[filter_2014, :]
trips_yellow_2014.shape

## Creating an NTA - Trip Mapping

In [41]:
# Index the NTA features by their NTA code. Each entry keeps the borough
# name, the borough code and the geometry object built by shape().
nta_shapefiles = collections.defaultdict(dict)
for nta_district in ntas_raw['features']:
    properties = nta_district['properties']
    nta_shapefiles[properties['NTACode']] = {
        'borough_name': properties['BoroName'],
        'borough_code': properties['BoroCode'],
        'shapefile': shape(nta_district['geometry']),
    }

In [51]:
def get_nta(lat, lon, nta_shapefiles):
    """Return the code of the first NTA whose geometry contains the point.

    The point is built with ``lat`` as its first coordinate and ``lon`` as
    its second, in exactly that order.
    NOTE(review): GeoJSON positions are ordered (longitude, latitude), so
    despite the parameter names the first argument presumably has to be the
    longitude for the lookup to match the NTA polygons - confirm against
    the boundary data.

    Parameters
    ----------
    lat, lon : number
        First and second coordinate of the point to locate.
    nta_shapefiles : dict
        Maps an NTA code to a dict whose ``'shapefile'`` entry provides a
        ``contains(point)`` method.

    Returns
    -------
    The first NTA code (in the dict's iteration order) whose geometry
    contains the point, or ``None`` when no geometry contains it.
    """
    location = shape({'type': 'Point', 'coordinates': [lat, lon]})
    matching_codes = (
        code
        for code, info in nta_shapefiles.items()
        if info['shapefile'].contains(location)
    )
    return next(matching_codes, None)

In [None]:
# Look up the NTA of every pickup and drop-off point of the filtered green trips.
pickup_ntas = []
dropoff_ntas = []

# Iterating the four coordinate columns directly avoids the per-row Series
# that iterrows() builds; total= lets tqdm show real progress.
trip_coordinates = zip(trips_green_2014.pickup_longitude,
                       trips_green_2014.pickup_latitude,
                       trips_green_2014.dropoff_longitude,
                       trips_green_2014.dropoff_latitude)

for pickup_lon, pickup_lat, dropoff_lon, dropoff_lat in tqdm(trip_coordinates,
                                                             total=len(trips_green_2014)):
    # Longitude goes first: get_nta uses its first argument as the first
    # coordinate of the point it builds (its parameter names say otherwise).
    pickup_ntas.append(get_nta(pickup_lon, pickup_lat, nta_shapefiles))
    dropoff_ntas.append(get_nta(dropoff_lon, dropoff_lat, nta_shapefiles))

# The lists hold one entry per *filtered* trip, so they are attached through the
# filtered frame's index. Assigning the bare lists to trips_green raises a
# length-mismatch error whenever the filter dropped rows. Trips outside the
# filter end up with NaN. Assumes the index labels are unique (true for the
# default index created by read_csv).
trips_green['pickup_nta'] = pd.Series(pickup_ntas, index=trips_green_2014.index, dtype=object)
trips_green['dropoff_nta'] = pd.Series(dropoff_ntas, index=trips_green_2014.index, dtype=object)


0it [00:00, ?it/s][A
1it [00:01,  1.26s/it][A
17it [00:01,  1.13it/s][A
39it [00:01,  1.61it/s][A
58it [00:01,  2.29it/s][A
78it [00:01,  3.26it/s][A
97it [00:01,  4.62it/s][A
126it [00:01,  6.55it/s][A
161it [00:01,  9.29it/s][A
189it [00:02, 13.08it/s][A
215it [00:02, 18.26it/s][A
245it [00:02, 25.40it/s][A
272it [00:02, 34.77it/s][A
304it [00:02, 47.46it/s][A
332it [00:02, 60.18it/s][A
357it [00:02, 74.92it/s][A
380it [00:02, 93.70it/s][A
416it [00:03, 120.18it/s][A
450it [00:03, 148.61it/s][A
479it [00:03, 171.96it/s][A
507it [00:03, 176.38it/s][A
533it [00:03, 189.60it/s][A
560it [00:03, 203.20it/s][A
585it [00:03, 210.97it/s][A
615it [00:03, 231.58it/s][A
642it [00:03, 241.77it/s][A
672it [00:04, 256.41it/s][A
702it [00:04, 261.75it/s][A
732it [00:04, 267.95it/s][A
760it [00:04, 264.60it/s][A
790it [00:04, 273.97it/s][A
821it [00:04, 281.92it/s][A
853it [00:04, 290.54it/s][A
883it [00:04, 288.46it/s][A
916it [00:04, 298.28it/s][A
947it [00:04, 