In [1]:
import pandas as pd

def clean(file_path, out_file):
    """Read a raw turnstile CSV, tidy it, and pickle the tidy frame.

    Steps performed:
      * replace the file's header row with short snake_case column names
      * parse the 'date' column into datetimes
      * add a 'day_of_week' column holding the weekday name of each date
      * keep only the columns needed downstream, in a fixed order

    Parameters
    ----------
    file_path : path or URL accepted by ``pd.read_csv``.
    out_file : destination path handed to ``DataFrame.to_pickle``.

    Returns
    -------
    None. The cleaned frame is only written to ``out_file``.
    """
    raw_columns = ['c_a', 'unit', 'scp', 'station', 'linename', 'division',
                   'date', 'time', 'desc', 'entries', 'exits']
    kept_columns = ['station', 'unit', 'c_a', 'scp', 'date', 'day_of_week',
                    'time', 'entries', 'exits']

    # skiprows=1 discards the file's own header line; `names` supplies ours.
    raw = pd.read_csv(file_path, names=raw_columns, skiprows=1)

    parsed_dates = pd.to_datetime(raw['date'])
    tidy = raw.assign(date=parsed_dates,
                      day_of_week=parsed_dates.dt.day_name())

    # Selecting kept_columns both drops the unused columns and reorders the rest.
    tidy[kept_columns].to_pickle(out_file)

In [2]:
# Fetch the turnstile_200627 export over HTTP, run it through clean(), and
# pickle the cleaned frame to the local file '200627'. Requires network access.
clean('http://web.mta.info/developers/data/nyct/turnstile/turnstile_200627.txt', '200627')

In [3]:
# Reload the cleaned frame written by clean() and preview its first 43 rows.
# NOTE(review): 43 looks chosen to peek one row past a 42-row window — confirm.
df = pd.read_pickle('200627')
df.head(43)

Unnamed: 0,station,unit,c_a,scp,date,day_of_week,time,entries,exits
0,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,00:00:00,7424218,2522558
1,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,04:00:00,7424220,2522559
2,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,08:00:00,7424231,2522572
3,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,12:00:00,7424265,2522590
4,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,16:00:00,7424340,2522604
5,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,20:00:00,7424415,2522612
6,59 ST,R051,A002,02-00-00,2020-06-21,Sunday,00:00:00,7424441,2522622
7,59 ST,R051,A002,02-00-00,2020-06-21,Sunday,04:00:00,7424443,2522623
8,59 ST,R051,A002,02-00-00,2020-06-21,Sunday,08:00:00,7424457,2522628
9,59 ST,R051,A002,02-00-00,2020-06-21,Sunday,12:00:00,7424471,2522645


In [4]:
# Build one frame of reading-to-reading differences per physical turnstile.
#
# A turnstile is identified by the combination (c_a, unit, scp, station).
# Cutting the frame into fixed 42-row windows is only correct when every
# turnstile contributes exactly 42 rows; the frame's length (206671) is not a
# multiple of 42, so such windows drift across turnstile boundaries and diff()
# ends up subtracting counters that belong to different devices.
#
# Grouping by the identifying columns keeps each diff inside one turnstile.
# The first row of every turnstile is NaN because it has no predecessor.
# sort=False keeps groups in order of first appearance, and each piece keeps
# its original index labels, so the pieces can be concatenated and aligned
# back onto df by index.
# NOTE(review): assumes rows within a turnstile are already in chronological
# order, as they appear in the preview — confirm or sort by date/time first.
turnstile_keys = ['c_a', 'unit', 'scp', 'station']
dfs = [
    readings[['entries', 'exits']].diff()
    for _, readings in df.groupby(turnstile_keys, sort=False)
]

In [5]:
# Stack the diff frames collected in dfs into a single frame. pd.concat keeps
# each piece's row labels, so deltas can later be aligned with df by index.
deltas = pd.concat(dfs)
deltas

Unnamed: 0,entries,exits
0,,
1,2.0,1.0
2,11.0,13.0
3,34.0,18.0
4,75.0,14.0
...,...,...
206666,0.0,0.0
206667,0.0,0.0
206668,0.0,0.0
206669,0.0,0.0


In [6]:
# Sanity check: deltas should have the same number of rows as df.
deltas.shape, df.shape

((206671, 2), (206671, 9))

In [7]:
# Attach the computed differences to df as new columns. Column assignment
# aligns on the index, so each delta lands on the row it was computed for.
df['delta_entries'] = deltas['entries']
df['delta_exits'] = deltas['exits']
df

Unnamed: 0,station,unit,c_a,scp,date,day_of_week,time,entries,exits,delta_entries,delta_exits
0,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,00:00:00,7424218,2522558,,
1,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,04:00:00,7424220,2522559,2.0,1.0
2,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,08:00:00,7424231,2522572,11.0,13.0
3,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,12:00:00,7424265,2522590,34.0,18.0
4,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,16:00:00,7424340,2522604,75.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...
206666,RIT-ROOSEVELT,R469,TRAM2,00-05-01,2020-06-26,Friday,05:00:00,5554,514,0.0,0.0
206667,RIT-ROOSEVELT,R469,TRAM2,00-05-01,2020-06-26,Friday,09:00:00,5554,514,0.0,0.0
206668,RIT-ROOSEVELT,R469,TRAM2,00-05-01,2020-06-26,Friday,13:00:00,5554,514,0.0,0.0
206669,RIT-ROOSEVELT,R469,TRAM2,00-05-01,2020-06-26,Friday,17:00:00,5554,514,0.0,0.0


In [8]:
# Persist the frame, including the new delta columns, to 'Delta Dataframe'.
df.to_pickle('Delta Dataframe')