In [2]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl

In [66]:
def get_data(week_nums, url_template="http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"):
    """Load one CSV per week identifier and stack them into a single DataFrame.

    Parameters
    ----------
    week_nums : iterable
        Week identifiers; each one is substituted into ``url_template`` with
        ``str.format`` to build the location of that week's file.
    url_template : str, optional
        Format string with a single ``{}`` placeholder. Defaults to the MTA
        turnstile URL this notebook has always used; pass a local path
        template to read from a mirror or cached copy instead.

    Returns
    -------
    pandas.DataFrame
        The weekly frames concatenated in the order given. Each file keeps its
        own row index, so index labels repeat across weeks.

    Raises
    ------
    ValueError
        If ``week_nums`` is empty.
    """
    # Materialise once so generators work and emptiness can be checked up front.
    week_nums = list(week_nums)
    if not week_nums:
        raise ValueError("week_nums must contain at least one week identifier")
    frames = [pd.read_csv(url_template.format(week_num)) for week_num in week_nums]
    return pd.concat(frames)
        
# Week identifiers handed to get_data (they look like YYMMDD dates).
# week_nums holds the 2017 weeks; it is defined here but only list2018 is loaded below.
week_nums = [
    170325, 170401, 170408, 170415, 170422, 170429, 170506,
    170513, 170520, 170527, 170603, 170610, 170617, 170624,
]
list2018 = [
    180324, 180331, 180407, 180414, 180421, 180428, 180505,
    180512, 180519, 180526, 180602, 180609, 180616, 180623,
]

turnstilesDF = get_data(list2018)
stationsDF = pd.read_csv('Stations.csv')

# Remove leading/trailing whitespace from the turnstile column headers
turnstilesDF.columns = list(map(str.strip, turnstilesDF.columns))

In [67]:


# Give the stations table a STATION column so both tables share a column name
stationsDF.rename(columns={'Stop Name': 'STATION'}, inplace=True)

# Turnstile side: map each distinct station name to a copy without spaces
uniqueStations = turnstilesDF.STATION.unique()
uniqueStationsEdit = [name.replace(' ', '') for name in uniqueStations]
#uniqueStationsEdit = [station.replace('GRANDCENTRAL-42ST', 'GRDCNTRL-42ST') for station in uniqueStationsEdit]
uniqueStationsDF = pd.DataFrame(
    {'STATION': uniqueStations, 'EDITED_STATIONS': uniqueStationsEdit}
)
newTurnstilesDF = turnstilesDF.merge(uniqueStationsDF, on='STATION')

# Stations side: keep Borough == 'M' rows and build the same kind of key
# (upper-cased, spaces removed). uniqueStationsEdit keeps this Manhattan
# list after the cell finishes and is read again by inManhattan.
mStationsDF = stationsDF.loc[stationsDF.Borough == 'M']
uniqueStations = mStationsDF.STATION.unique()
uniqueStationsEdit = [name.upper().replace(' ', '') for name in uniqueStations]
uniqueStationsDF = pd.DataFrame(
    {'STATION': uniqueStations, 'EDITED_STATIONS': uniqueStationsEdit}
)
newStationsDF = mStationsDF.merge(uniqueStationsDF, on='STATION')

# Join turnstile rows to Manhattan station rows on the normalised name
totalDF = newTurnstilesDF.merge(newStationsDF, on='EDITED_STATIONS')
totalDF

Unnamed: 0,C/A,UNIT,SCP,STATION_x,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,...,Complex ID,GTFS Stop ID,Division,Line,STATION_y,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/17/2018,00:00:00,REGULAR,6552626,...,613,629,IRT,Lexington Av,59 St,M,4 5 6,Subway,40.762526,-73.967967
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/17/2018,04:00:00,REGULAR,6552626,...,613,629,IRT,Lexington Av,59 St,M,4 5 6,Subway,40.762526,-73.967967
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/17/2018,08:00:00,REGULAR,6552626,...,613,629,IRT,Lexington Av,59 St,M,4 5 6,Subway,40.762526,-73.967967
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/17/2018,12:00:00,REGULAR,6552626,...,613,629,IRT,Lexington Av,59 St,M,4 5 6,Subway,40.762526,-73.967967
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/17/2018,16:00:00,REGULAR,6552626,...,613,629,IRT,Lexington Av,59 St,M,4 5 6,Subway,40.762526,-73.967967
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/17/2018,20:00:00,REGULAR,6552626,...,613,629,IRT,Lexington Av,59 St,M,4 5 6,Subway,40.762526,-73.967967
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/18/2018,00:00:00,REGULAR,6552626,...,613,629,IRT,Lexington Av,59 St,M,4 5 6,Subway,40.762526,-73.967967
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/18/2018,04:00:00,REGULAR,6552626,...,613,629,IRT,Lexington Av,59 St,M,4 5 6,Subway,40.762526,-73.967967
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/18/2018,08:00:00,REGULAR,6552626,...,613,629,IRT,Lexington Av,59 St,M,4 5 6,Subway,40.762526,-73.967967
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/18/2018,12:00:00,REGULAR,6552626,...,613,629,IRT,Lexington Av,59 St,M,4 5 6,Subway,40.762526,-73.967967


In [68]:
# create DATE_TIME column
turnstilesDF['DATE_TIME'] = pd.to_datetime(turnstilesDF.DATE + ' ' + turnstilesDF.TIME,
                                           format="%m/%d/%Y %H:%M:%S")


def _count_readings_per_timestamp(readings, counter):
    """Return the five (turnstile, DATE_TIME) groups with the most `counter` readings.

    A count above 1 in the `counter` column means the same turnstile reported
    more than one row for the same timestamp.
    """
    return (readings
            .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])[counter]
            .count()
            .reset_index()
            .sort_values(counter, ascending=False)
            .head())


def _first_reading_with_previous(readings, counter):
    """Return the first `counter` reading per turnstile per DATE plus the prior row's values.

    The result has one row per (C/A, UNIT, SCP, STATION, DATE) with columns
    `counter`, PREV_DATE and PREV_<counter>. The PREV_* values come from the
    preceding row of the same turnstile; rows with no preceding row are dropped.
    """
    turnstile = ["C/A", "UNIT", "SCP", "STATION"]
    daily = readings.groupby(turnstile + ["DATE"])[counter].first().reset_index()
    # Select the columns with a list: tuple selection (["DATE", "ENTRIES"] without
    # the inner brackets) is deprecated and raises on pandas >= 2.0.
    daily[["PREV_DATE", "PREV_" + counter]] = daily.groupby(turnstile)[["DATE", counter]].shift(1)
    daily.dropna(subset=["PREV_DATE"], axis=0, inplace=True)
    return daily


# check for duplicate entries / exits (kept in variables so they can be inspected;
# as bare mid-cell expressions the results were computed and then discarded)
duplicateEntriesCheck = _count_readings_per_timestamp(turnstilesDF, "ENTRIES")
duplicateExitsCheck = _count_readings_per_timestamp(turnstilesDF, "EXITS")

# get daily entry totals
turnstilesEntriesDF = _first_reading_with_previous(turnstilesDF, "ENTRIES")

# get daily exit totals
turnstilesExitsDF = _first_reading_with_previous(turnstilesDF, "EXITS")

#combine entries and exits to get total (labeled traffic)
EntriesAndExitsDF = pd.merge(turnstilesEntriesDF, turnstilesExitsDF,
                             on=["C/A", "UNIT", "SCP", "STATION", "DATE"])
# NOTE(review): TRAFFIC is the sum of the PREV_* counters, i.e. the readings of the
# row before DATE. The following cell differences TRAFFIC against its own one-row
# lag, so DAILY_TRAFFIC for a DATE appears to describe activity two rows earlier.
# Confirm whether ENTRIES + EXITS was intended here.
EntriesAndExitsDF['TRAFFIC'] = EntriesAndExitsDF['PREV_ENTRIES'] + EntriesAndExitsDF['PREV_EXITS']
EntriesAndExitsDF


Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,PREV_DATE_x,PREV_ENTRIES,EXITS,PREV_DATE_y,PREV_EXITS,TRAFFIC
0,A002,R051,02-00-00,59 ST,03/18/2018,6552626,03/17/2018,6552626.0,2219140,03/17/2018,2219139.0,8771765.0
1,A002,R051,02-00-00,59 ST,03/19/2018,6552628,03/18/2018,6552626.0,2219143,03/18/2018,2219140.0,8771766.0
2,A002,R051,02-00-00,59 ST,03/20/2018,6554065,03/19/2018,6552628.0,2219665,03/19/2018,2219143.0,8771771.0
3,A002,R051,02-00-00,59 ST,03/21/2018,6555430,03/20/2018,6554065.0,2220160,03/20/2018,2219665.0,8773730.0
4,A002,R051,02-00-00,59 ST,03/22/2018,6556303,03/21/2018,6555430.0,2220535,03/21/2018,2220160.0,8775590.0
5,A002,R051,02-00-00,59 ST,03/23/2018,6557744,03/22/2018,6556303.0,2221032,03/22/2018,2220535.0,8776838.0
6,A002,R051,02-00-00,59 ST,03/24/2018,6559322,03/23/2018,6557744.0,2221550,03/23/2018,2221032.0,8778776.0
7,A002,R051,02-00-00,59 ST,03/25/2018,6559565,03/24/2018,6559322.0,2221604,03/24/2018,2221550.0,8780872.0
8,A002,R051,02-00-00,59 ST,03/26/2018,6559579,03/25/2018,6559565.0,2221614,03/25/2018,2221604.0,8781169.0
9,A002,R051,02-00-00,59 ST,03/27/2018,6560941,03/26/2018,6559579.0,2222111,03/26/2018,2221614.0,8781193.0


In [69]:
# Keep one TRAFFIC value (the first) per turnstile per DATE
EntriesAndExitsDF = (
    EntriesAndExitsDF
    .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"])["TRAFFIC"]
    .first()
    .reset_index()
)

# TRAFFIC of the preceding row of the same turnstile
EntriesAndExitsDF["PREV_TRAFFIC"] = (
    EntriesAndExitsDF
    .groupby(["C/A", "UNIT", "SCP", "STATION"])["TRAFFIC"]
    .shift(1)
)

# Rows without a preceding TRAFFIC value cannot be differenced, so drop them
EntriesAndExitsDF.dropna(subset=["PREV_TRAFFIC"], axis=0, inplace=True)

#function to determine daily traffic
def get_daily_counts(row, max_counter):
    """Turn a row's TRAFFIC / PREV_TRAFFIC pair into a capped daily count.

    Parameters
    ----------
    row : mapping
        Must provide numeric "TRAFFIC" and "PREV_TRAFFIC" values.
    max_counter : number
        Largest value accepted as a daily count.

    Returns
    -------
    number
        The absolute difference of the two values when it does not exceed
        ``max_counter``; otherwise the smaller of the two values, or 0 if
        that is still above ``max_counter``.
    """
    current, previous = row["TRAFFIC"], row["PREV_TRAFFIC"]

    # A negative difference is treated as a counter running in reverse.
    delta = abs(current - previous)
    if not delta > max_counter:
        return delta

    # Difference is implausibly large: use the smaller reading as a proxy.
    proxy = min(current, previous)
    if proxy > max_counter:
        # Even the proxy is too big, so report no traffic.
        return 0
    return proxy

# Largest day-over-day change accepted for a single turnstile. A difference above
# this limit suggests the counter was reset; get_daily_counts then falls back to
# the smaller of the two readings and, if that is still above the limit, to zero
# (different counters have different cycle limits).
MAX_DAILY_COUNTER = 100000

EntriesAndExitsDF["DAILY_TRAFFIC"] = EntriesAndExitsDF.apply(
    get_daily_counts, axis=1, max_counter=MAX_DAILY_COUNTER
)

EntriesAndExitsDF







Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,TRAFFIC,PREV_TRAFFIC,DAILY_TRAFFIC
1,A002,R051,02-00-00,59 ST,03/19/2018,8771766.0,8771765.0,1.0
2,A002,R051,02-00-00,59 ST,03/20/2018,8771771.0,8771766.0,5.0
3,A002,R051,02-00-00,59 ST,03/21/2018,8773730.0,8771771.0,1959.0
4,A002,R051,02-00-00,59 ST,03/22/2018,8775590.0,8773730.0,1860.0
5,A002,R051,02-00-00,59 ST,03/23/2018,8776838.0,8775590.0,1248.0
6,A002,R051,02-00-00,59 ST,03/24/2018,8778776.0,8776838.0,1938.0
7,A002,R051,02-00-00,59 ST,03/25/2018,8780872.0,8778776.0,2096.0
8,A002,R051,02-00-00,59 ST,03/26/2018,8781169.0,8780872.0,297.0
9,A002,R051,02-00-00,59 ST,03/27/2018,8781193.0,8781169.0,24.0
10,A002,R051,02-00-00,59 ST,03/28/2018,8783052.0,8781193.0,1859.0


In [70]:
# Daily traffic per C/A, UNIT, STATION and DATE, busiest first.
# The result is bound to a name (as a bare mid-cell expression it was computed and
# discarded), and numeric_only=True keeps string columns such as SCP out of the sum
# (on pandas >= 2.0 they would otherwise be concatenated).
stationDailyTrafficDF = (
    EntriesAndExitsDF
    .groupby(['C/A', 'UNIT', 'STATION', 'DATE'])
    .sum(numeric_only=True)
    .sort_values('DAILY_TRAFFIC', ascending=False)
)

# add edited station names to daily traffic DF
trafficStations = EntriesAndExitsDF.STATION.unique()
#trafficStations = [station.replace('GRD CNTRL', 'GRAND CENTRAL') for station in trafficStations]
# Upper-case and strip spaces so the key has the same form as uniqueStationsEdit
trafficStationsEdit = [station.upper().replace(' ', '') for station in trafficStations]
trafficStationsDF = pd.DataFrame({'STATION': trafficStations, 'EDITED_STATION': trafficStationsEdit})
newTrafficStationsDF = pd.merge(EntriesAndExitsDF, trafficStationsDF, on='STATION')

# function that creates column that states if station is in Manhattan
def inManhattan(row):
    """Return 'Y' when the row's EDITED_STATION is listed in uniqueStationsEdit, else 'N'.

    uniqueStationsEdit is a module-level collection looked up at call time.
    """
    return 'Y' if row['EDITED_STATION'] in uniqueStationsEdit else 'N'


#create new daily traffic DF with only Manhattan stations
# Vectorised membership test: gives the same 'Y'/'N' labels as calling inManhattan
# on every row, without a Python-level call per row.
newTrafficStationsDF['Manhattan'] = (
    newTrafficStationsDF['EDITED_STATION']
    .isin(uniqueStationsEdit)
    .map({True: 'Y', False: 'N'})
)
TrafficManhattanDF2016 = newTrafficStationsDF.loc[newTrafficStationsDF.Manhattan == 'Y']

# (A groupby/sum/sort expression used to sit here; its result was never assigned or
# displayed, so it has been removed.)

# Stations selected explicitly by their turnstile-feed name; a later cell appends
# them to the Manhattan frame.
grandCentralDF = newTrafficStationsDF.loc[newTrafficStationsDF.STATION == 'GRD CNTRL-42 ST']
pennDF = newTrafficStationsDF.loc[newTrafficStationsDF.STATION == '34 ST-PENN STA']
pennDF

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,TRAFFIC,PREV_TRAFFIC,DAILY_TRAFFIC,EDITED_STATION,Manhattan
197063,N067,R012,00-00-00,34 ST-PENN STA,03/19/2018,3932634.0,3931797.0,837.0,34ST-PENNSTA,N
197064,N067,R012,00-00-00,34 ST-PENN STA,03/20/2018,3933269.0,3932634.0,635.0,34ST-PENNSTA,N
197065,N067,R012,00-00-00,34 ST-PENN STA,03/21/2018,3935971.0,3933269.0,2702.0,34ST-PENNSTA,N
197066,N067,R012,00-00-00,34 ST-PENN STA,03/22/2018,3938736.0,3935971.0,2765.0,34ST-PENNSTA,N
197067,N067,R012,00-00-00,34 ST-PENN STA,03/23/2018,3939956.0,3938736.0,1220.0,34ST-PENNSTA,N
197068,N067,R012,00-00-00,34 ST-PENN STA,03/24/2018,3942200.0,3939956.0,2244.0,34ST-PENNSTA,N
197069,N067,R012,00-00-00,34 ST-PENN STA,03/25/2018,3945048.0,3942200.0,2848.0,34ST-PENNSTA,N
197070,N067,R012,00-00-00,34 ST-PENN STA,03/26/2018,3946727.0,3945048.0,1679.0,34ST-PENNSTA,N
197071,N067,R012,00-00-00,34 ST-PENN STA,03/27/2018,3948088.0,3946727.0,1361.0,34ST-PENNSTA,N
197072,N067,R012,00-00-00,34 ST-PENN STA,03/28/2018,3951011.0,3948088.0,2923.0,34ST-PENNSTA,N


In [71]:
# final DF
# drop_duplicates() makes this cell safe to re-run: without it every execution
# appends the Grand Central and Penn Station rows to the frame again.
TrafficManhattanDF2016 = pd.concat([TrafficManhattanDF2016, grandCentralDF, pennDF]).drop_duplicates()

# numeric_only=True keeps the summary to the numeric columns; on pandas >= 2.0 the
# string columns (SCP, EDITED_STATION, Manhattan) would otherwise be concatenated.
(TrafficManhattanDF2016
 .groupby(['C/A', 'UNIT', 'STATION', 'DATE'])
 .sum(numeric_only=True)
 .sort_values(['DAILY_TRAFFIC'], ascending=False))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAFFIC,PREV_TRAFFIC,DAILY_TRAFFIC
C/A,UNIT,STATION,DATE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
R238,R046,GRD CNTRL-42 ST,06/16/2018,280628384.0,280511788.0,116596.0
R238,R046,GRD CNTRL-42 ST,06/02/2018,279265620.0,279150921.0,114699.0
R238,R046,GRD CNTRL-42 ST,06/15/2018,280511788.0,280397090.0,114698.0
R238,R046,GRD CNTRL-42 ST,03/31/2018,274361836.0,274247658.0,114178.0
R238,R046,GRD CNTRL-42 ST,06/22/2018,281196021.0,281081956.0,114065.0
R238,R046,GRD CNTRL-42 ST,06/08/2018,279830294.0,279716405.0,113889.0
R238,R046,GRD CNTRL-42 ST,03/30/2018,274247658.0,274134315.0,113343.0
R238,R046,GRD CNTRL-42 ST,06/14/2018,280397090.0,280283895.0,113195.0
R238,R046,GRD CNTRL-42 ST,06/09/2018,279943479.0,279830294.0,113185.0
R238,R046,GRD CNTRL-42 ST,03/29/2018,274134315.0,274021194.0,113121.0


In [72]:
# Serialise the final Manhattan traffic frame to disk with pickle
with open("NewTrafficManhattanDF", mode='wb') as picklefile:
    pkl.dump(TrafficManhattanDF2016, picklefile)