In [58]:
import pandas as pd

def clean(week_nums):
    dfs = []
    for week_num in week_nums:
        url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
        file_url = url.format(week_num)
        names = ['c_a', 'unit', 'scp', 'station', 'linename', 'division', 
                 'date', 'time', 'desc', 'entries', 'exits']
        df = pd.read_csv(file_url, names=names, parse_dates=[['date','time']], 
                         keep_date_col=True, skiprows=1)
        df['date'] = pd.to_datetime(df['date'])
        df['day_of_week'] = df['date_time'].dt.day_name()
        df = df[(~df['c_a'].str.contains('PTH') & 
                 ~df['desc'].str.contains('RECOVR') & 
                 df.time.astype(str).str.contains('00:00'))]
        df = df[['station', 'unit', 'c_a', 'scp', 'date_time', 'date', 'day_of_week', 'time', 
                'desc', 'entries', 'exits']]
        dfs.append(df)
        return pd.concat(dfs)

week_nums = [200627, 200620, 200613]

In [77]:
turnstiles_df = clean(week_nums)

In [67]:
turnstiles_df.date

0        2020-06-20
1        2020-06-20
2        2020-06-20
3        2020-06-20
4        2020-06-20
            ...    
206666   2020-06-26
206667   2020-06-26
206668   2020-06-26
206669   2020-06-26
206670   2020-06-26
Name: date, Length: 189525, dtype: datetime64[ns]

In [78]:
turnstiles_block = (turnstiles_df
                        .groupby(["c_a", "unit", "scp", "station", "date_time", "exits"],as_index=False).entries.first())
turnstiles_block[["prev_datetime", "prev_entries", "prev_exits"]] = (turnstiles_block
                                                       .groupby(["c_a", "unit", "scp", "station"])["date_time", "entries", "exits"]
                                                       .apply(lambda grp: grp.shift(1)))
turnstiles_block.dropna(subset=["prev_datetime"], axis=0, inplace=True)
def get_counts_entry(row, max_counter):
    counter = row["entries"] - row["prev_entries"]
    if counter < 0:
        # Maybe counter is reversed?
        counter = -counter
    if counter > max_counter:
        # Maybe counter was reset to 0? 
        print(row["entries"], row["prev_entries"])
        counter = min(row["entries"], row["prev_entries"])
    if counter > max_counter:
        # Check it again to make sure we're not still giving a counter that's too big
        return 0
    return counter
def get_counts_exit(row, max_counter):
    counter = row["exits"] - row["prev_exits"]
    if counter < 0:
        # Maybe counter is reversed?
        counter = -counter
    if counter > max_counter:
        # Maybe counter was reset to 0? 
        print(row["exits"], row["prev_exits"])
        counter = min(row["exits"], row["prev_exits"])
    if counter > max_counter:
        # Check it again to make sure we're not still giving a counter that's too big
        return 0
    return counter

  after removing the cwd from sys.path.


In [79]:
turnstiles_block['delta_entries'] = turnstiles_block.apply(get_counts_entry, axis=1, max_counter=1e6)

43 8360152.0
12 2393140.0
654322392 2808057.0
2808205 654322392.0
15 6666024.0
1703967 17387249.0


In [80]:
turnstiles_block['delta_exits'] = turnstiles_block.apply(get_counts_exit, axis=1, max_counter=1e6)
turnstiles_block

22 10927677.0
1 1578147.0
67101704 1833174.0
1833360 67101704.0
5 3597248.0


Unnamed: 0,c_a,unit,scp,station,date_time,exits,entries,prev_datetime,prev_entries,prev_exits,delta_entries,delta_exits
1,A002,R051,02-00-00,59 ST,2020-06-20 04:00:00,2522559,7424220,2020-06-20 00:00:00,7424218.0,2522558.0,2.0,1.0
2,A002,R051,02-00-00,59 ST,2020-06-20 08:00:00,2522572,7424231,2020-06-20 04:00:00,7424220.0,2522559.0,11.0,13.0
3,A002,R051,02-00-00,59 ST,2020-06-20 12:00:00,2522590,7424265,2020-06-20 08:00:00,7424231.0,2522572.0,34.0,18.0
4,A002,R051,02-00-00,59 ST,2020-06-20 16:00:00,2522604,7424340,2020-06-20 12:00:00,7424265.0,2522590.0,75.0,14.0
5,A002,R051,02-00-00,59 ST,2020-06-20 20:00:00,2522612,7424415,2020-06-20 16:00:00,7424340.0,2522604.0,75.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...
189520,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2020-06-26 05:00:00,514,5554,2020-06-26 01:00:00,5554.0,514.0,0.0,0.0
189521,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2020-06-26 09:00:00,514,5554,2020-06-26 05:00:00,5554.0,514.0,0.0,0.0
189522,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2020-06-26 13:00:00,514,5554,2020-06-26 09:00:00,5554.0,514.0,0.0,0.0
189523,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2020-06-26 17:00:00,514,5554,2020-06-26 13:00:00,5554.0,514.0,0.0,0.0


In [83]:
mf = turnstiles_block.groupby(['unit', 'station', 'date_time',]).agg({'delta_exits': 'sum'})
mf[:50]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,delta_exits
unit,station,date_time,Unnamed: 3_level_1
R001,SOUTH FERRY,2020-06-20 05:00:00,37.0
R001,SOUTH FERRY,2020-06-20 09:00:00,153.0
R001,SOUTH FERRY,2020-06-20 13:00:00,171.0
R001,SOUTH FERRY,2020-06-20 17:00:00,359.0
R001,SOUTH FERRY,2020-06-20 21:00:00,240.0
R001,SOUTH FERRY,2020-06-21 01:00:00,199.0
R001,SOUTH FERRY,2020-06-21 05:00:00,49.0
R001,SOUTH FERRY,2020-06-21 09:00:00,104.0
R001,SOUTH FERRY,2020-06-21 13:00:00,210.0
R001,SOUTH FERRY,2020-06-21 17:00:00,277.0
