In [20]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [70]:
# Load the weekly turnstile readings and normalise their header names by
# stripping any whitespace around each column label.
turnstilesDF = pd.read_csv('turnstile_180602.txt')
turnstilesDF.columns = turnstilesDF.columns.str.strip()

# Station reference table; 'Stop Name' is renamed to 'STATION' so that it shares
# a key name with the turnstile data in case the two are merged by station.
stationsDF = pd.read_csv('Stations.csv').rename(columns={'Stop Name': 'STATION'})

# Lookup table pairing every distinct station name with a space-free variant.
uniqueStations = turnstilesDF.STATION.unique()
uniqueStationsEdit = [name.replace(' ', '') for name in uniqueStations]
uniqueStationsDF = pd.DataFrame(
    {'STATION': uniqueStations, 'EDITED_STATIONS': uniqueStationsEdit}
)

# Attach the space-free station name to every turnstile reading.
newTurnstilesDF = turnstilesDF.merge(uniqueStationsDF, on='STATION')

In [77]:
# Build one timestamp per reading from the separate DATE and TIME text columns.
turnstilesDF['DATE_TIME'] = pd.to_datetime(
    turnstilesDF.DATE + ' ' + turnstilesDF.TIME,
    format="%m/%d/%Y %H:%M:%S",
)

# Duplicate checks: count the non-null readings recorded for each turnstile at
# each timestamp and keep the five largest counts. Any count above 1 means the
# same turnstile reported more than once for the same DATE_TIME.
# The results are bound to names because only the last expression of a cell is
# displayed; as bare expressions in the middle of the cell they were computed
# and then silently thrown away. Inspect them in a cell of their own.
readingKey = ["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"]

# check for duplicate entries
duplicateEntriesCheck = (
    turnstilesDF
    .groupby(readingKey)
    .ENTRIES.count()
    .reset_index()
    .sort_values("ENTRIES", ascending=False)
    .head()
)

# check for duplicate exits
duplicateExitsCheck = (
    turnstilesDF
    .groupby(readingKey)
    .EXITS.count()
    .reset_index()
    .sort_values("EXITS", ascending=False)
    .head()
)

def _daily_first_reading(readings, counter):
    """Return one row per turnstile and DATE with that day's first counter value.

    Parameters
    ----------
    readings : pandas.DataFrame
        Turnstile readings holding the columns "C/A", "UNIT", "SCP", "STATION",
        "DATE" and the column named by ``counter``.
    counter : str
        Name of the cumulative counter column to summarise ("ENTRIES" or "EXITS").

    Returns
    -------
    pandas.DataFrame
        Columns: the four turnstile keys, "DATE", ``counter``, "PREV_DATE" and
        "PREV_<counter>". The PREV_* columns hold the values of the preceding
        row of the same turnstile; rows that have no preceding row (the first
        DATE of each turnstile) are dropped.
    """
    turnstileKeys = ["C/A", "UNIT", "SCP", "STATION"]

    # first() keeps the first non-null value of each group in row order.
    # NOTE(review): assumes each turnstile's rows are listed chronologically
    # within a day - confirm against the input file.
    daily = (
        readings
        .groupby(turnstileKeys + ["DATE"])[counter]
        .first()
        .reset_index()
    )

    # Columns are selected with a list: selecting several columns with a bare
    # tuple (grouped["DATE", "ENTRIES"]) is deprecated in pandas 1.x and raises
    # in pandas 2.x. The built-in grouped shift replaces
    # transform(lambda grp: grp.shift(1)).
    # NOTE(review): groupby sorts DATE as MM/DD/YYYY text, which matches
    # chronological order only while all dates fall in the same year.
    previous = daily.groupby(turnstileKeys)[["DATE", counter]].shift(1)
    daily["PREV_DATE"] = previous["DATE"]
    daily["PREV_" + counter] = previous[counter]

    return daily.dropna(subset=["PREV_DATE"])


# get daily entry totals
turnstilesEntriesDF = _daily_first_reading(turnstilesDF, "ENTRIES")

# get daily exit totals
turnstilesExitsDF = _daily_first_reading(turnstilesDF, "EXITS")

# Combine the per-day entry and exit tables, matching rows on turnstile + DATE.
# Both inputs carry a PREV_DATE column, so the merge keeps them apart as
# PREV_DATE_x (entries side) and PREV_DATE_y (exits side).
entryExitKeys = ["C/A", "UNIT", "SCP", "STATION", "DATE"]
EntriesAndExitsDF = turnstilesEntriesDF.merge(turnstilesExitsDF, on=entryExitKeys)

# TRAFFIC is the sum of the two PREV_* counters, i.e. the values carried over
# from the preceding row of each turnstile, while the row remains keyed by DATE.
# NOTE(review): confirm that using PREV_ENTRIES/PREV_EXITS rather than
# ENTRIES/EXITS is intended - it offsets TRAFFIC by one row relative to DATE.
EntriesAndExitsDF['TRAFFIC'] = EntriesAndExitsDF['PREV_ENTRIES'].add(
    EntriesAndExitsDF['PREV_EXITS']
)
EntriesAndExitsDF


Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,PREV_DATE_x,PREV_ENTRIES,EXITS,PREV_DATE_y,PREV_EXITS,TRAFFIC
0,A002,R051,02-00-00,59 ST,05/27/2018,6634496,05/26/2018,6633898.0,2248662,05/26/2018,2248498.0,8882396.0
1,A002,R051,02-00-00,59 ST,05/28/2018,6634957,05/27/2018,6634496.0,2248795,05/27/2018,2248662.0,8883158.0
2,A002,R051,02-00-00,59 ST,05/29/2018,6635511,05/28/2018,6634957.0,2248955,05/28/2018,2248795.0,8883752.0
3,A002,R051,02-00-00,59 ST,05/30/2018,6636819,05/29/2018,6635511.0,2249505,05/29/2018,2248955.0,8884466.0
4,A002,R051,02-00-00,59 ST,05/31/2018,6638275,05/30/2018,6636819.0,2250098,05/30/2018,2249505.0,8886324.0
5,A002,R051,02-00-00,59 ST,06/01/2018,6639800,05/31/2018,6638275.0,2250657,05/31/2018,2250098.0,8888373.0
6,A002,R051,02-00-01,59 ST,05/27/2018,5944491,05/26/2018,5943956.0,1329914,05/26/2018,1329823.0,7273779.0
7,A002,R051,02-00-01,59 ST,05/28/2018,5944869,05/27/2018,5944491.0,1329990,05/27/2018,1329914.0,7274405.0
8,A002,R051,02-00-01,59 ST,05/29/2018,5945375,05/28/2018,5944869.0,1330095,05/28/2018,1329990.0,7274859.0
9,A002,R051,02-00-01,59 ST,05/30/2018,5946598,05/29/2018,5945375.0,1330357,05/29/2018,1330095.0,7275470.0


In [78]:
# Reduce the combined table to one TRAFFIC value per turnstile and DATE
# (the first non-null value of each group), sorted by the grouping keys.
trafficKeys = ["C/A", "UNIT", "SCP", "STATION"]
EntriesAndExitsDF = (
    EntriesAndExitsDF
    .groupby(trafficKeys + ["DATE"])["TRAFFIC"]
    .first()
    .reset_index()
)

# PREV_TRAFFIC is the TRAFFIC value of the preceding row of the same turnstile.
EntriesAndExitsDF["PREV_TRAFFIC"] = (
    EntriesAndExitsDF.groupby(trafficKeys)["TRAFFIC"].shift(1)
)

# Rows without a preceding value (the first row of each turnstile) cannot
# yield a difference, so they are removed.
EntriesAndExitsDF.dropna(subset=["PREV_TRAFFIC"], inplace=True)

#function to determine daily traffic
def get_daily_counts(row, max_counter):
    """Derive a bounded daily count from two consecutive cumulative readings.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys "TRAFFIC" (current reading) and "PREV_TRAFFIC"
        (preceding reading).
    max_counter : number
        Largest difference accepted as a valid daily count.

    Returns
    -------
    number
        * the absolute difference of the two readings, unless it exceeds
          ``max_counter``;
        * otherwise the smaller of the two readings, unless that also exceeds
          ``max_counter``;
        * otherwise 0.
    """
    current, previous = row["TRAFFIC"], row["PREV_TRAFFIC"]

    # A negative difference is treated as a counter running in reverse.
    delta = abs(current - previous)

    # Written as "not >" so that a NaN difference is passed through unchanged.
    if not (delta > max_counter):
        return delta

    # The difference is implausibly large: fall back to the smaller reading.
    fallback = min(current, previous)
    return 0 if fallback > max_counter else fallback

# A day-over-day difference above 1 million probably means the counter was
# reset. get_daily_counts then falls back to the smaller of the two readings
# and returns zero if that is still above the limit, because different
# counters have different cycle limits.
EntriesAndExitsDF["DAILY_TRAFFIC"] = EntriesAndExitsDF.apply(
    lambda row: get_daily_counts(row, max_counter=1000000),
    axis=1,
)

EntriesAndExitsDF







Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,TRAFFIC,PREV_TRAFFIC,DAILY_TRAFFIC
1,A002,R051,02-00-00,59 ST,05/28/2018,8883158.0,8882396.0,762.0
2,A002,R051,02-00-00,59 ST,05/29/2018,8883752.0,8883158.0,594.0
3,A002,R051,02-00-00,59 ST,05/30/2018,8884466.0,8883752.0,714.0
4,A002,R051,02-00-00,59 ST,05/31/2018,8886324.0,8884466.0,1858.0
5,A002,R051,02-00-00,59 ST,06/01/2018,8888373.0,8886324.0,2049.0
7,A002,R051,02-00-01,59 ST,05/28/2018,7274405.0,7273779.0,626.0
8,A002,R051,02-00-01,59 ST,05/29/2018,7274859.0,7274405.0,454.0
9,A002,R051,02-00-01,59 ST,05/30/2018,7275470.0,7274859.0,611.0
10,A002,R051,02-00-01,59 ST,05/31/2018,7276955.0,7275470.0,1485.0
11,A002,R051,02-00-01,59 ST,06/01/2018,7278370.0,7276955.0,1415.0


Index(['C/A', 'UNIT', 'SCP', 'STATION', 'DATE', 'ENTRIES'], dtype='object')