This notebook retrieves and wrangles [MTA turnstile data](http://web.mta.info/developers/turnstile.html). The data is parsed by station, date, and time. Zip code data is then added from a CSV generated by the `zip_codes` notebook. Finally, seaborn visualizations of our analyses are included inline.

We used this data to identify stations and times for street team deployment.

In [1]:
import pandas as pd
import numpy as np

from dateutil.parser import parse
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Global seaborn look for every figure in this notebook.
# (The former bare `sns.despine()` call was dropped: no axes exist yet at this
# point, so it only created and displayed an empty figure.)
sns.set_style('white')
sns.set_palette('husl')

<matplotlib.figure.Figure at 0x106f0d4d0>

In [2]:
# Zip code lookup (CSV produced by the zip_codes notebook).
zip_codes_raw = 'zips.csv'

# MTA turnstile data for all of May 2016 and the first half of June 2016.
link_prefix = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_"
dates = [160507, 160514, 160521, 160528, 160604, 160611]
files = [link_prefix + str(stamp) + ".txt" for stamp in dates]

# Columns to keep, compared after stripping whitespace: the raw EXITS header is
# padded with trailing spaces, so an exact-literal match is fragile.
wanted_columns = ['C/A', 'UNIT', 'SCP', 'STATION', 'DATE', 'TIME', 'ENTRIES', 'EXITS']


def dateparse(x):
    """Parse one MM/DD/YYYY date string (kept so the name stays available)."""
    return datetime.strptime(x, '%m/%d/%Y')


def read_turnstile_file(path):
    """Read one turnstile file, keeping only the wanted columns.

    Column names are returned exactly as they appear in the file (including
    any padding); DATE is converted to datetime64 using MM/DD/YYYY.
    """
    raw = pd.read_csv(path)
    keep = [col for col in raw.columns if col.strip() in wanted_columns]
    frame = raw[keep].copy()
    # Vectorised conversion; avoids the deprecated `date_parser` argument.
    frame['DATE'] = pd.to_datetime(frame['DATE'], format='%m/%d/%Y')
    return frame


dfs = [read_turnstile_file(f) for f in files]
# ignore_index gives the combined frame unique row labels instead of six
# overlapping 0..n ranges.
df = pd.concat(dfs, ignore_index=True)

In [3]:
# Short snake_case names for the raw MTA headers. Headers are looked up after
# stripping whitespace, so the space-padded EXITS header is matched no matter
# how many trailing spaces it carries; unknown headers are left unchanged.
column_names = {
    'C/A': 'area',
    'UNIT': 'unit',
    'SCP': 'scp',
    'STATION': 'station',
    'DATE': 'date',
    'TIME': 'time',
    'ENTRIES': 'entries',
    'EXITS': 'exits',
}
df = df.rename(columns=lambda raw: column_names.get(raw.strip(), raw))

In [None]:
# Integer day of week for each reading (Monday=0 ... Sunday=6).
df['weekday'] = df['date'].dt.weekday

In [None]:
# Turn the HH:MM:SS strings into timestamps in one vectorised step.
# dateutil's parse() fills in the *current* date for a bare time string, which
# made these values change with the day the notebook was run; a fixed anchor
# date keeps them reproducible. Only the time-of-day part is meaningful.
# NOTE(review): 2017-01-23 is kept as the anchor so existing date-qualified
# comparisons against parsed_time continue to match.
time_anchor = pd.Timestamp('2017-01-23')
df['parsed_time'] = pd.to_timedelta(df['time']) + time_anchor

In [None]:
# Order readings chronologically within each station / area / unit / scp.
reading_order = ['station', 'area', 'unit', 'scp', 'date', 'time']
df = df.sort_values(by=reading_order)

In [None]:
# Change in the cumulative counters since the previous reading.
# The diff is taken per turnstile (station/area/unit/scp) so the first reading
# of one turnstile is never subtracted from the last reading of another.
# Relies on the frame already being sorted by date and time within each turnstile.
# NOTE(review): the columns are named "hourly", but the value is the change
# between consecutive readings, whatever their spacing - confirm the cadence.
hourly = ['hourly_entries', 'hourly_exits']
turnstile_key = ['station', 'area', 'unit', 'scp']

per_turnstile = df.groupby(turnstile_key, sort=False)
df['hourly_entries'] = per_turnstile['entries'].diff()
df['hourly_exits'] = per_turnstile['exits'].diff()

# The first reading of each turnstile has no predecessor; count it as 0.
df[hourly] = df[hourly].fillna(0)

In [None]:
# Load the zip code lookup table from the CSV named in the config cell.
zip_codes = pd.read_csv(filepath_or_buffer=zip_codes_raw)

In [None]:
# Attach the zip code columns to every reading by station name.
# Default (inner) join: readings whose station is absent from zip_codes are dropped.
df = pd.merge(df, zip_codes, on='station')

In [None]:
# Replace every negative hourly entry/exit count with its absolute value
# (non-negative values are unchanged by abs()).
df[hourly] = df[hourly].abs()

In [None]:
# Drop weekends: keep weekday codes 0-4 (Monday-Friday).
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice of the unfiltered data.
df = df[df['weekday'].isin([0, 1, 2, 3, 4])].copy()

In [None]:
# Summary statistics for entries, including the 1st and 99.9th percentiles.
entry_percentiles = [0.01, 0.999]
df['hourly_entries'].describe(percentiles=entry_percentiles)

In [None]:
# Summary statistics for exits, including the 1st and 99.9th percentiles.
exit_percentiles = [0.01, 0.999]
df['hourly_exits'].describe(percentiles=exit_percentiles)

In [None]:
# Total traffic per reading: row-wise sum of the entry and exit counts.
df['traffic'] = df[hourly].sum(axis='columns')

In [1]:
# Some turnstiles show extreme values, which may be explained by mechanical changes.
# We assume a single turnstile reading cannot plausibly exceed 10,000, so rows
# whose traffic (entries + exits) is above that are flagged as not good.
# For scale: 1 entry per second would be 3,600 entries per hour.

df['good'] = df['traffic'].le(10000)

NameError: name 'df' is not defined

In [None]:
# Zip codes with median income > $200k
# and % commuting to work by public transportation >= 70%.
potential_census_zips = [11201, 11217, 11238]

In [None]:
# Readings from the target zip codes that passed the plausibility flag.
in_target_zip = df['zip_code'].isin(potential_census_zips)
potential_stations = df[in_target_zip & df['good']]

In [None]:
# Spot check: the five highest-traffic readings at PROSPECT PARK.
prospect_park = potential_stations[potential_stations['station'] == 'PROSPECT PARK']
prospect_park.sort_values('traffic', ascending=False).head()

In [None]:
# Distinct station names among the candidate readings, in order of first appearance.
stations = potential_stations['station'].unique().tolist()
stations

In [None]:
# Total traffic per station, busiest first.
# Only `traffic` is aggregated: summing every column would also try to sum the
# datetime and string columns, which current pandas refuses to do.
total_by_station = (
    potential_stations
    .groupby('station', as_index=False)['traffic']
    .sum()
    .sort_values('traffic', ascending=False)
)

In [None]:
# Average per week: one turnstile file was loaded per entry in `dates`, so the
# divisor follows the data instead of a hard-coded 6.
n_weeks = len(dates)
total_by_station['weekly_traffic'] = total_by_station['traffic'] / n_weeks

In [None]:
# Bar chart of average weekly weekday traffic per station.
# `catplot` is the current name of seaborn's former `factorplot`; x and y are
# passed by keyword because positional data arguments are deprecated.
weekdays = sns.catplot(x='station', y='weekly_traffic', data=total_by_station,
                       kind='bar', aspect=2)
plt.xticks(rotation=45)
plt.xlabel('Station')
plt.ylabel('Traffic')
plt.title("Total Weekday Traffic Per Week")

# The figures/ directory must already exist.
weekdays.savefig('figures/weekday_totals.png')

In [None]:
# Stations selected for the time-of-day breakdown.
top_stations_lst = ['ATL AV-BARCLAY',
                    'JAY ST-METROTEC',
                    'BOROUGH HALL',
                    'DEKALB AV']

# 1. Sum traffic per station and reading time (over all turnstiles and days).
# 2. Average those sums inside 4-hour bins, separately for each station.
# Only `traffic` is aggregated, so string/datetime columns never reach sum() or
# mean(), which current pandas rejects for those dtypes.
top_stations = (
    potential_stations[potential_stations['station'].isin(top_stations_lst)]
    .groupby(['station', 'parsed_time'], as_index=False)['traffic']
    .sum()
    .set_index('parsed_time')
    .groupby('station')['traffic']
    .resample('4H')
    .mean()
    .reset_index()   # back to flat columns: station, parsed_time, traffic
)

In [None]:
# Label each 4-hour bin by the period its traffic was accumulated over.
# The bin's starting hour is used instead of full timestamps, so the labels no
# longer depend on which calendar date is attached to parsed_time.
# A bin is labelled with the four hours *before* its start, as in the original
# mapping (e.g. the bin starting at 04:00 is '12am-4am').
bin_labels = {
    0: '8pm-12am',
    4: '12am-4am',
    8: '4am-8am',
    12: '8am-12pm',
    16: '12pm-4pm',
    20: '4pm-8pm',
}
top_stations['label'] = top_stations['parsed_time'].dt.hour.map(bin_labels)

In [None]:
# Display order of the time-of-day bins along the x axis.
order = ['12am-4am', '4am-8am', '8am-12pm', '12pm-4pm', '4pm-8pm', '8pm-12am']

# Grouped bars: traffic per time-of-day bin, one colour per station.
# `catplot` is the current name of seaborn's former `factorplot`; x and y are
# passed by keyword because positional data arguments are deprecated.
flow_bar = sns.catplot(x='label', y='traffic', hue='station', kind='bar',
                       data=top_stations, aspect=2, order=order)
plt.xticks(rotation=45)
plt.xlabel('Time of Day')
plt.ylabel('Traffic')
plt.title("Traffic flow")

# The figures/ directory must already exist.
flow_bar.savefig('figures/traffic_flow_bar.png')

In [None]:
# Total traffic per top station and weekday.
# Only `traffic` is aggregated: summing every column would also try to sum the
# datetime and string columns, which current pandas refuses to do.
stations_by_weekday = (
    potential_stations[potential_stations['station'].isin(top_stations_lst)]
    .groupby(['station', 'weekday'], as_index=False)['traffic']
    .sum()
)
stations_by_weekday

In [None]:
# Human-readable day names for the weekday codes (0=Monday ... 4=Friday).
# Codes outside the mapping get a missing label.
weekday_names = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
}
stations_by_weekday['label'] = stations_by_weekday['weekday'].map(weekday_names)

In [None]:
# Grouped bars: traffic per weekday, one colour per station.
# `catplot` is the current name of seaborn's former `factorplot`; x and y are
# passed by keyword because positional data arguments are deprecated.
weekday_bar = sns.catplot(x='label', y='traffic', hue='station', kind='bar',
                          data=stations_by_weekday, aspect=2)
plt.xticks(rotation=45)
plt.xlabel('Day of Week')
plt.ylabel('Traffic')
plt.title("Traffic flow by Weekday")

# The figures/ directory must already exist.
weekday_bar.savefig('figures/weekday_bar.png');