In [92]:
import pandas as pd

def clean(file_path, out_file):
    '''
    Data cleaning pipeline:
    Renaming columns and rearranging column orders
    Converting 'date' column into a datetime object
    Adding 'day_of_the_week' classifier
    Removing unneccessary columns
    '''
    names = ['c_a', 'unit', 'scp', 'station', 'linename', 'division', 
             'date', 'time', 'desc', 'entries', 'exits']
    df = pd.read_csv(file_path, names=names, skiprows=1)
    df['date'] = pd.to_datetime(df['date'])
    df['day_of_week'] = df['date'].dt.day_name()
    df = df[['station', 'unit', 'c_a', 'scp', 'date', 'day_of_week', 'time', 'entries', 'exits']]
    #return df
    return df.to_pickle(out_file)

In [93]:
clean('http://web.mta.info/developers/data/nyct/turnstile/turnstile_200627.txt', '200627')

In [94]:
df = pd.read_pickle('200627')
df

Unnamed: 0,station,unit,c_a,scp,date,day_of_week,time,entries,exits
0,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,00:00:00,7424218,2522558
1,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,04:00:00,7424220,2522559
2,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,08:00:00,7424231,2522572
3,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,12:00:00,7424265,2522590
4,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,16:00:00,7424340,2522604
...,...,...,...,...,...,...,...,...,...
206666,RIT-ROOSEVELT,R469,TRAM2,00-05-01,2020-06-26,Friday,05:00:00,5554,514
206667,RIT-ROOSEVELT,R469,TRAM2,00-05-01,2020-06-26,Friday,09:00:00,5554,514
206668,RIT-ROOSEVELT,R469,TRAM2,00-05-01,2020-06-26,Friday,13:00:00,5554,514
206669,RIT-ROOSEVELT,R469,TRAM2,00-05-01,2020-06-26,Friday,17:00:00,5554,514


In [95]:
#I only want data  from one day
df1 = df[df.date=='2020-06-20']

In [96]:
df2 = df1[df.scp=='02-00-00']
df2

  df2 = df1[df.scp=='02-00-00']


Unnamed: 0,station,unit,c_a,scp,date,day_of_week,time,entries,exits
0,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,00:00:00,7424218,2522558
1,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,04:00:00,7424220,2522559
2,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,08:00:00,7424231,2522572
3,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,12:00:00,7424265,2522590
4,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,16:00:00,7424340,2522604
...,...,...,...,...,...,...,...,...,...
203860,FLATBUSH AV-B.C,R110,R647,02-00-00,2020-06-20,Saturday,05:00:00,8912630,3093077
203861,FLATBUSH AV-B.C,R110,R647,02-00-00,2020-06-20,Saturday,09:00:00,8912646,3093093
203862,FLATBUSH AV-B.C,R110,R647,02-00-00,2020-06-20,Saturday,13:00:00,8912677,3093128
203863,FLATBUSH AV-B.C,R110,R647,02-00-00,2020-06-20,Saturday,17:00:00,8912719,3093170


In [97]:
df3 = df2[df.unit == 'R051']
df3

  df3 = df2[df.unit == 'R051']


Unnamed: 0,station,unit,c_a,scp,date,day_of_week,time,entries,exits
0,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,00:00:00,7424218,2522558
1,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,04:00:00,7424220,2522559
2,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,08:00:00,7424231,2522572
3,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,12:00:00,7424265,2522590
4,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,16:00:00,7424340,2522604
5,59 ST,R051,A002,02-00-00,2020-06-20,Saturday,20:00:00,7424415,2522612


In [112]:
df_pivot = pd.pivot_table(df, index=['station', 'unit', 'c_a', 'scp', 'date', 'day_of_week', 'time'], values=['entries', 'exits'])
df_pivot.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,entries,exits
station,unit,c_a,scp,date,day_of_week,time,Unnamed: 7_level_1,Unnamed: 8_level_1
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,00:00:00,15455553,17299983
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,04:00:00,15455553,17299994
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,08:00:00,15455553,17300070
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,12:00:00,15455553,17300203
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,16:00:00,15455553,17300341
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,20:00:00,15455553,17300434
1 AV,R248,H007,00-00-00,2020-06-21,Sunday,00:00:00,15455553,17300504
1 AV,R248,H007,00-00-00,2020-06-21,Sunday,04:00:00,15455553,17300514
1 AV,R248,H007,00-00-00,2020-06-21,Sunday,08:00:00,15455553,17300565
1 AV,R248,H007,00-00-00,2020-06-21,Sunday,12:00:00,15455553,17300634


In [109]:
df_pivot[:6].diff()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,entries,exits
station,unit,c_a,scp,date,day_of_week,time,Unnamed: 7_level_1,Unnamed: 8_level_1
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,00:00:00,,
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,04:00:00,0.0,11.0
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,08:00:00,0.0,76.0
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,12:00:00,0.0,133.0
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,16:00:00,0.0,138.0
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,20:00:00,0.0,93.0


In [106]:
df_pivot[-6:].diff()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,entries,exits
station,unit,c_a,scp,date,day_of_week,time,Unnamed: 7_level_1,Unnamed: 8_level_1
59 ST,R051,A002,02-00-00,2020-06-20,Saturday,00:00:00,,
59 ST,R051,A002,02-00-00,2020-06-20,Saturday,04:00:00,2.0,1.0
59 ST,R051,A002,02-00-00,2020-06-20,Saturday,08:00:00,11.0,13.0
59 ST,R051,A002,02-00-00,2020-06-20,Saturday,12:00:00,34.0,18.0
59 ST,R051,A002,02-00-00,2020-06-20,Saturday,16:00:00,75.0,14.0
59 ST,R051,A002,02-00-00,2020-06-20,Saturday,20:00:00,75.0,8.0


In [101]:
df_pivot = pd.pivot_table(df, index=['station', 'unit', 'c_a', 'scp', 'date', 'day_of_week', 'time'], values=['entries', 'exits'])
df_pivot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,entries,exits
station,unit,c_a,scp,date,day_of_week,time,Unnamed: 7_level_1,Unnamed: 8_level_1
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,00:00:00,15455553,17299983
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,04:00:00,15455553,17299994
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,08:00:00,15455553,17300070
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,12:00:00,15455553,17300203
1 AV,R248,H007,00-00-00,2020-06-20,Saturday,16:00:00,15455553,17300341


In [74]:
df['day_of_week'] = pd.to_datetime(df['date']).dt.day_name()

0         Saturday
1         Saturday
2         Saturday
3         Saturday
4         Saturday
            ...   
206666      Friday
206667      Friday
206668      Friday
206669      Friday
206670      Friday
Name: date, Length: 206671, dtype: object

In [76]:
df3[:6].groupby(['station', 'unit', 'c_a', 'date', 'day_of_week', 'time', 'scp'])['entries'].diff()

31952   NaN
31953   NaN
31954   NaN
31955   NaN
31956   NaN
31957   NaN
Name: entries, dtype: float64