In [55]:
import pandas as pd
import numpy as np

In [60]:
def readTurnstileData(link):
    """
    This function reads in data from an online MTA Turnstile dataset into a DataFrame
    ---
    input: link to dataset
    output: DataFrame
    """
    cols = ['control_area','unit','scp','station','line_name','division','date','time',
        'desc','entries','exits']
    
    turnstile_data = pd.read_csv(link, header = 0, names = cols)
    return turnstile_data

def formatDateTime(df):
    """
    This function converts the date and time into DateTime format in a single column
    and deletes the unformatted date and time columns
    
    Note: only run once per DataFrame, will result in error otherwise
    ---
    input: DataFrame
    output: DataFrame
    """
    
    #convert date and time to DateTime format in a single column
    df['concat_date_time'] = df['date'] + ' ' + df['time']
    df['date_time'] = pd.to_datetime(df.concat_date_time,format = '%m/%d/%Y %H:%M:%S')
    
    #delete unformatted date and time columns
    del df['concat_date_time']
    del df['date']
    del df['time']
    return df

def read_and_format_turnstile_data(link):
    """
    This function reads in Turnstile data from online and returns a DataFrame with 
    with the date and time information converted to a single DateTime column
    ---
    input: link
    output: DataFrame
    """
    df = readTurnstileData(link)
    df1 = formatDateTime(df)
    return df1

In [61]:
# Read in one of the turnstile datasets
turnstile_data_link_1 = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190921.txt'
turnstile_data_1 = read_and_format_turnstile_data(turnstile_data_link_1)

In [62]:
turnstile_data_1.head(50)
# entries and exits are cumulative, subtract previous value to get the number for that 4 hour period


Unnamed: 0,control_area,unit,scp,station,line_name,division,desc,entries,exits,date_time
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198818,2438323,2019-09-14 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198834,2438325,2019-09-14 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198847,2438354,2019-09-14 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198929,2438428,2019-09-14 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7199125,2438483,2019-09-14 16:00:00
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7199405,2438527,2019-09-14 20:00:00
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7199547,2438545,2019-09-15 00:00:00
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7199567,2438547,2019-09-15 04:00:00
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7199579,2438564,2019-09-15 08:00:00
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7199648,2438630,2019-09-15 12:00:00


In [51]:
station_groupby = turnstile_data.groupby('station')

In [54]:
station_entries = station_groupby.entries.sum().sort_values(ascending = True)
station_entries

station
NEWARK HM HE            1073019
PATH WTC 2              7402804
NEWARK HW BMEBE        16359580
9TH STREET             18546632
ORCHARD BEACH          20787564
                       ...     
DEKALB AV          226731273077
23 ST              237675376663
TIMES SQ-42 ST     244851205878
125 ST             282278333960
42 ST-PORT AUTH    315905669087
Name: entries, Length: 378, dtype: int64

In [None]:
turnstile_groupby = turnstile_data.groupby(['station','scp'])

In [None]:
turnstile_data_1 = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190921.txt'