In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read in data

Important links:  
[NYC Turnstile data](http://web.mta.info/developers/turnstile.html)  
[NYC Station Data](http://web.mta.info/developers/data/nyct/subway/Stations.csv)

In [None]:
turnstilesDF = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_180602.txt')
stationsDF = pd.read_csv('http://web.mta.info/developers/data/nyct/subway/Stations.csv')


Adjust column names as needed so that turnstile data plays nice with the station data.

In [3]:
# in case trying to merge data sets by station
turnstilesDF.columns = [column.strip() for column in turnstilesDF.columns]
stationsDF.rename(columns = {'Stop Name':'STATION'}, inplace=True)
uniqueStations = turnstilesDF.STATION.unique()
uniqueStationsEdit = [station.replace(' ','') for station in uniqueStations]
uniqueStationsDF = {'STATION':uniqueStations, 'EDITED_STATIONS':uniqueStationsEdit}
uniqueStationsDF = pd.DataFrame(uniqueStationsDF)
newTurnstilesDF = pd.merge(turnstilesDF, uniqueStationsDF, on='STATION')

In [None]:
# create DATE_TIME column
turnstilesDF['DATE_TIME'] = pd.to_datetime(turnstilesDF.DATE + ' ' + turnstilesDF.TIME, format = "%m/%d/%Y %H:%M:%S")
turnstilesDF.DATE_TIME

In [9]:
# check for duplicate entries
(turnstilesDF
 .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])
 .ENTRIES.count()
 .reset_index()
 .sort_values("ENTRIES", ascending=False)).head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,ENTRIES
0,A002,R051,02-00-00,59 ST,2018-05-26 00:00:00,1
129043,R134,R272,01-06-00,28 ST,2018-05-26 09:00:00,1
131085,R143,R032,02-00-03,TIMES SQ-42 ST,2018-05-28 04:00:00,1
131086,R143,R032,02-00-03,TIMES SQ-42 ST,2018-05-28 08:00:00,1
131087,R143,R032,02-00-03,TIMES SQ-42 ST,2018-05-28 12:00:00,1


In [8]:
#check for duplicate exits
(turnstilesDF
 .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])
 .EXITS.count()
 .reset_index()
 .sort_values("EXITS", ascending=False)).head()

# get daily entry totals
turnstilesEntriesDF = turnstilesDF.groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"])\
.ENTRIES.first().reset_index()


turnstilesEntriesDF[["PREV_DATE", "PREV_ENTRIES"]] = (turnstilesEntriesDF
                                                       .groupby(["C/A", "UNIT", "SCP", "STATION"])["DATE", "ENTRIES"]
                                                       .transform(lambda grp: grp.shift(1)))

turnstilesEntriesDF.dropna(subset=["PREV_DATE"], axis=0, inplace=True)

# get daily exit totals
turnstilesExitsDF = turnstilesDF.groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"])\
.EXITS.first().reset_index()


turnstilesExitsDF[["PREV_DATE", "PREV_EXITS"]] = (turnstilesExitsDF
                                                       .groupby(["C/A", "UNIT", "SCP", "STATION"])["DATE", "EXITS"]
                                                       .transform(lambda grp: grp.shift(1)))

turnstilesExitsDF.dropna(subset=["PREV_DATE"], axis=0, inplace=True)

#combine entries and exits to get total (labeled traffic)
EntriesAndExitsDF = pd.merge(turnstilesEntriesDF, turnstilesExitsDF, on=["C/A", "UNIT", "SCP", "STATION", "DATE"])
EntriesAndExitsDF['TRAFFIC'] = EntriesAndExitsDF['PREV_ENTRIES'] + EntriesAndExitsDF['PREV_EXITS']
EntriesAndExitsDF.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,PREV_DATE_x,PREV_ENTRIES,EXITS,PREV_DATE_y,PREV_EXITS,TRAFFIC
0,A002,R051,02-00-00,59 ST,05/27/2018,6634496,05/26/2018,6633898.0,2248662,05/26/2018,2248498.0,8882396.0
1,A002,R051,02-00-00,59 ST,05/28/2018,6634957,05/27/2018,6634496.0,2248795,05/27/2018,2248662.0,8883158.0
2,A002,R051,02-00-00,59 ST,05/29/2018,6635511,05/28/2018,6634957.0,2248955,05/28/2018,2248795.0,8883752.0
3,A002,R051,02-00-00,59 ST,05/30/2018,6636819,05/29/2018,6635511.0,2249505,05/29/2018,2248955.0,8884466.0
4,A002,R051,02-00-00,59 ST,05/31/2018,6638275,05/30/2018,6636819.0,2250098,05/30/2018,2249505.0,8886324.0


In [19]:
#function to determine daily traffic
def get_daily_counts(row, max_counter):
    counter = row["TRAFFIC"] - row["PREV_TRAFFIC"]
    if counter < 0:
        # Maybe counter is reversed?
        counter = -counter
    if counter > max_counter:
        # print(row["TRAFFIC"], row["PREV_TRAFFIC"]) --> if you want to see the specific case
        counter = min(row["TRAFFIC"], row["PREV_TRAFFIC"])
        # if current entries is bad, use yesterday's count as proxy
    if counter > max_counter:
        # Check it again to make sure we are not giving a counter that's too big
        return 0
    return counter

# If counter is > 1Million, then the counter might have been reset.  
# Just set it to zero as different counters have different cycle limits

In [20]:
# get daily traffic totals
EntriesAndExitsDF = EntriesAndExitsDF.groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"])\
.TRAFFIC.first().reset_index()

EntriesAndExitsDF["PREV_TRAFFIC"] = (EntriesAndExitsDF
                                    .groupby(["C/A", "UNIT", "SCP", "STATION"])["TRAFFIC"]
                                    .transform(lambda grp: grp.shift(1)))

# lose data without traffic on prior day
EntriesAndExitsDF.dropna(subset=["PREV_TRAFFIC"], axis=0, inplace=True)



# If counter is > 1Million, then the counter might have been reset.  
# Just set it to zero as different counters have different cycle limits

EntriesAndExitsDF["DAILY_TRAFFIC"] = EntriesAndExitsDF.apply(get_daily_counts, axis=1, max_counter=1000000)

#(EntriesAndExitsDF[EntriesAndExitsDF['STATION'] == '59 ST' & EntriesAndExitsDF['DATE']=='05/29/2018'].groupby('DATE')).head(15)
#EntriesAndExitsDF[(EntriesAndExitsDF['STATION'] == '59 ST')\
#                   & (EntriesAndExitsDF['DATE']=='05/29/2018')].head(15)
#EntriesAndExitsDF['DATE'].describe()

ValueError: No objects to concatenate