In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import re
from shapely.geometry import Point, LineString

### 1. Getting number of dropoffs per location ID

In [2]:
#Empty lists for monthly pickups and dropoffs
monthly_dropoffs = []
monthly_pickups = []

In [3]:
#Function to prepare monthly pickups and dropoffs
def get_last_call_trips(month_num:int, col:str, df_list:list) -> list:
    '''Extracts trips between 2AM and 3AM on Saturdays and Sundays
    month_num = number between 1 and 12
    col = PULocationID or DOLocationID'''
    
    if month_num < 10:
        fname = 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-0' + str(month_num) + '.csv'
    else:
        fname = 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-' + str(month_num) + '.csv'
        
    print(f"Loading dataset {month_num}/12")
    #load full taxi trip dataset for a month
    full = pd.read_csv(fname, usecols=['tpep_pickup_datetime', 'PULocationID', 'DOLocationID'],
                      parse_dates=['tpep_pickup_datetime'],
                      infer_datetime_format=True)
    
    #set index to pickup datetime
    full.set_index('tpep_pickup_datetime', inplace=True)
    
    #isolate trips that are between 2am and 3am
    month = full.between_time('02:00', '03:00').copy()
    del(full) #delete full dataset to save memory
    
    #isolate weekend trips
    month['day'] = month.index.dayofweek
    month.drop(month[(month['day'] != 5) & (month['day'] != 6)].index, inplace=True)
    
    #find number of pickups or dropoffs for each LocationID
    trips = pd.DataFrame(month.groupby(col).size())
    
    #Add column for month
    trips['month'] = month_num
    
    #add to months list
    df_list.append(trips)
    
    #delete month dataset
    del(month)

In [4]:
#Extract dropoffs for each month
for i in range(1,13):
    get_last_call_trips(i, 'DOLocationID', monthly_dropoffs)

Loading dataset 1/12


MemoryError: 

In [None]:
for month in monthly_dropoffs:
    mdf = pd.DataFrame(month)
    mdf['LocationID'] = mdf.index
    mdf.rename(columns={0: 'Dropoffs'}, inplace=True)

In [None]:
#Concatenate list of monthly dfs into one df
df = pd.concat(monthly_dropoffs, ignore_index=True)

In [None]:
location_dropoffs = df.pivot_table(values='Dropoffs',
              index='LocationID',
              columns='month')

In [None]:
location_dropoffs.fillna(0, inplace=True)

In [None]:
location_dropoffs.drop(index=[264, 265], inplace=True)

In [None]:
taxi_zones = gpd.read_file('taxi_zones/taxi_zones.shp') #downloaded from https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip
taxi_zones.set_index('LocationID', inplace=True)

#Find centroids of taxi zones (+ coordinates)
centroids = taxi_zones['geometry'].centroid

In [None]:
#Add coordinates of pickup zone centroid to trip df
location_dropoffs = pd.merge(location_dropoffs, centroids.to_frame(), how='left', left_index=True, right_index=True)
location_dropoffs.rename(columns={0: 'geometry'}, inplace=True)

#Drop 10 records that don't have DO location IDs
location_dropoffs.drop(location_dropoffs[location_dropoffs['geometry'].isna()].index, inplace=True) 

#Convert df to geo-df
gdf = gpd.GeoDataFrame(location_dropoffs,
                geometry='geometry',
                crs='epsg:2263')

#Reproject to Web Mercator
gdf_web_merc = gdf.to_crs('epsg:3857')

### 2. Getting number of pickups per location ID

In [5]:
#Extract dropoffs for each month
for i in range(1,13):
    get_last_call_trips(i, 'PULocationID', monthly_pickups)
    
for month in monthly_dropoffs:
    mdf = pd.DataFrame(month)
    mdf['LocationID'] = mdf.index
    mdf.rename(columns={0: 'Dropoffs'}, inplace=True)

Loading dataset 1/12
Loading dataset 2/12
Loading dataset 3/12
Loading dataset 4/12
Loading dataset 5/12
Loading dataset 6/12
Loading dataset 7/12
Loading dataset 8/12
Loading dataset 9/12
Loading dataset 10/12
Loading dataset 11/12
Loading dataset 12/12


In [None]:
#Load income data for New York state Census tracts
income_tract = pd.read_csv('ACSST5Y2019.S1903_2021-03-02T112951/ACSST5Y2019.S1903_data_with_overlays_2021-03-02T112740.csv',
                          skiprows=[1], usecols=['GEO_ID', 'NAME', 'S1903_C03_001E'])

#Select only Census tracts in NYC
income_nyc = income_tract.loc[
    income_tract['NAME'].str.contains(r'New York County|Kings County|Bronx County|Richmond County|Queens County')
].copy()
del(income_tract) #save memory by deleting full NY dataset

#Drop non-residential Census tracts
income_nyc.drop(index=income_nyc[income_nyc['S1903_C03_001E'] == '-'].index, inplace=True)

#Convert income column to int
income_nyc['S1903_C03_001E'] = income_nyc['S1903_C03_001E'].str.replace('+','').str.replace(',','').astype('int64')

#Load shapefile for census tracts
tracts_shp = gpd.read_file('https://opendata.arcgis.com/datasets/7bba09631bd740f49ba0442f9603fa38_0.geojson')

#Income dataset and tract shapefile have different codes for each tract, but the codes used in the shapefile can be extracted from
#the GEO_ID column in the income dataset
def get_boroCT(row):
    if re.search('Richmond County|New York County', row['NAME']):
        return row['GEO_ID'][-7:]
    else:
        CT_code = row['GEO_ID'][-6:]
        if 'Bronx County' in row['NAME']:
            return '2' + CT_code
        elif 'Queens County' in row['NAME']:
            return '4' + CT_code
        elif 'Kings County' in row['NAME']:
            return '3' + CT_code
        
income_nyc['BoroCT2010'] = income_nyc.apply(get_boroCT, axis=1)

#Join income data to shapefile
tracts_income_geo = pd.merge(tracts_shp, income_nyc, on='BoroCT2010', how='left')

#Calculate area of each taxi zone
taxi_zones['zone_area'] = taxi_zones.area

#Add new column to taxi zone df with LocationID, bc it will be lost in the intersection
taxi_zones['zone_ID'] = taxi_zones.index.values

#Reproject Census tracts to projection of taxi zones
tracts_income_geo = tracts_income_geo.to_crs('epsg:2263')

#Find intersecting polygons between Census tracts and taxi zones
tract_zone_inter = gpd.overlay(tracts_income_geo, taxi_zones, how='intersection')

#Drop polygons w same geometry
tract_zone_inter.drop_duplicates(subset=['geometry'] ,inplace=True)

#Find area of each polygon
tract_zone_inter['polygon_area'] = tract_zone_inter['geometry'].area

#Divide each polygon's area by the area of the taxi zone, then multiply this proportion by the tract's median income
tract_zone_inter['pc_zone'] = tract_zone_inter.apply(lambda x: (x['polygon_area']/x['zone_area'])*x['S1903_C03_001E'], axis=1)

#Sum income fractions for each taxi zone
zone_income = tract_zone_inter.groupby('zone_ID').agg({'pc_zone': 'sum'})

#Using zone_income, add income of drop-off taxi zones to flow dataset
dropoffs_w_income = pd.merge(gdf_web_merc, zone_income, left_index=True, right_index=True, how='left').rename(columns={'pc_zone': 'DO_income'})

In [None]:
dropoffs_w_income

In [None]:
dropoffs_w_income.loc[dropoffs_w_income['DO_income'] < 20000, 'DO_income'] = np.nan

In [None]:
monthcols = {
    1: 'January',
    2: 'February',
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
    12: 'December'
}

dropoffs_w_income.rename(columns=monthcols, inplace=True)

In [None]:
for col in list(months.values()):
    dropoffs_w_income[col] = dropoffs_w_income[col].astype('int')

In [None]:
dropoffs_w_income.to_file('dropoffs_w_income.shp', driver='ESRI Shapefile')

In [None]:
dropoffs_w_income

In [None]:
dropoffs_w_income.describe()

In [None]:
dropoffs_w_income.plot(column='February',
                      scheme='quantiles',
                      legend=True)