# Finding the highest-traffic stations, days, and hours for January to March 2019, using the same steps as the January to March 2021 analysis
#### All steps without annotations are the same as those for January to March 2021

In [4]:
import pandas as pd

In [5]:
def get_data(week_nums):
    """Read one turnstile file per week and stack them into a single DataFrame.

    week_nums: iterable of week identifiers; each one is substituted into
        the turnstile file URL template below.

    Returns the row-wise concatenation of the weekly frames, in the order
    the weeks were given. pd.concat raises ValueError if no weeks are given.
    """
    # "{}" is the placeholder for the identifier of the requested week.
    url_template = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    weekly_frames = [pd.read_csv(url_template.format(week)) for week in week_nums]
    return pd.concat(weekly_frames)
        
# Thirteen weekly file identifiers (YYMMDD as integers), one every 7 days,
# running from 190105 through 190330.
week_nums = [
    int(stamp)
    for stamp in pd.date_range("2019-01-05", periods=13, freq="7D").strftime("%y%m%d")
]
turnstiles_df = get_data(week_nums)  # fetch every week and combine into one frame

In [6]:
# Trim leading/trailing whitespace from every column label.
turnstiles_df.columns = list(map(str.strip, turnstiles_df.columns))

In [7]:
import datetime
# Build the combined timestamp first, while DATE and TIME are still text;
# only afterwards is each of those two columns parsed on its own.
turnstiles_df["DATE_TIME"] = pd.to_datetime(
    turnstiles_df["DATE"] + " " + turnstiles_df["TIME"],
    format="%m/%d/%Y %H:%M:%S",
)
turnstiles_df["DATE"] = pd.to_datetime(turnstiles_df["DATE"], format="%m/%d/%Y")
turnstiles_df["TIME"] = pd.to_datetime(turnstiles_df["TIME"], format="%H:%M:%S")

In [8]:
# A reading is identified by its turnstile columns plus its timestamp.
_reading_key = ["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"]
# Sort in descending key order, then keep only the first row for each key.
turnstiles_df.sort_values(by=_reading_key, ascending=False, inplace=True)
turnstiles_df.drop_duplicates(subset=_reading_key, keep="first", inplace=True)

In [9]:
# Remove the DESC column; errors="ignore" makes this a no-op if it is absent.
turnstiles_df = turnstiles_df.drop(columns=["DESC"], errors="ignore")

In [10]:
# Combined counter: the ENTRIES reading plus the EXITS reading of each row.
turnstiles_df["ENTRIES_EXITS"] = turnstiles_df["ENTRIES"].add(turnstiles_df["EXITS"])

In [11]:
# The separate counters are no longer needed once ENTRIES_EXITS exists.
turnstiles_df = turnstiles_df.drop(columns=["ENTRIES", "EXITS"], errors="ignore")

## 1. Finding the highest traffic stations

In [12]:
# One row per turnstile per day: first() takes the first ENTRIES_EXITS value
# of each group, following the frame's current row order.
turnstiles_daily = turnstiles_df.groupby(
    ["C/A", "UNIT", "SCP", "STATION", "DATE"], as_index=False
)["ENTRIES_EXITS"].first()

In [13]:
# Within each turnstile, shift DATE and ENTRIES_EXITS down one row so every
# row also carries the values of the row before it.
_previous_daily = (
    turnstiles_daily
    .groupby(["C/A", "UNIT", "SCP", "STATION"])[["DATE", "ENTRIES_EXITS"]]
    .shift(1)
)
turnstiles_daily[["PREV_DATE", "PREV_ENTRIES_EXITS"]] = _previous_daily
# The first row of each turnstile has no predecessor (PREV_DATE is missing); drop it.
turnstiles_daily.dropna(subset=["PREV_DATE"], inplace=True)

In [14]:
def get_daily_counts(row, max_counter):
    """Return the traffic between a row's previous and current counter reading.

    row: mapping-like object with "ENTRIES_EXITS" and "PREV_ENTRIES_EXITS".
    max_counter: largest difference that is still accepted as plausible.

    The absolute difference is returned when it does not exceed max_counter.
    Otherwise the smaller of the two readings is used instead, and if that
    is still above max_counter the result is 0.
    """
    current = row["ENTRIES_EXITS"]
    previous = row["PREV_ENTRIES_EXITS"]

    # A counter running in reverse yields a negative difference; flip its sign.
    delta = current - previous
    delta = -delta if delta < 0 else delta

    if delta > max_counter:
        # Maybe the counter was reset, so fall back to the smaller reading.
        fallback = min(current, previous)
        # Still too large: treat the interval as having no usable count.
        return 0 if fallback > max_counter else fallback

    return delta

# Evaluate get_daily_counts on every row, with a plausibility cap of 1,000,000.
turnstiles_daily["DAILY_ENTRIES_EXITS"] = turnstiles_daily.apply(
    get_daily_counts,
    axis=1,
    args=(1000000,),
)

In [15]:
# Add up all turnstiles of a station to get one total per station per day.
station_daily2 = (
    turnstiles_daily
    .groupby(["STATION", "DATE"])[["DAILY_ENTRIES_EXITS"]]
    .sum()
    .reset_index()
    .copy()
)
station_daily2.head()

Unnamed: 0,STATION,DATE,DAILY_ENTRIES_EXITS
0,1 AV,2018-12-30,26764.0
1,1 AV,2018-12-31,33032.0
2,1 AV,2019-01-01,22086.0
3,1 AV,2019-01-02,39534.0
4,1 AV,2019-01-03,42397.0


In [16]:
# Show the five station-days with the largest totals.
station_daily2.sort_values(by="DAILY_ENTRIES_EXITS", ascending=False).head(5)

Unnamed: 0,STATION,DATE,DAILY_ENTRIES_EXITS
5548,34 ST-PENN STA,2019-02-28,1733948.0
21085,GREENPOINT AV,2019-03-24,1560361.0
5547,34 ST-PENN STA,2019-02-27,1507269.0
16578,CITY / BUS,2019-03-13,1035606.0
4220,23 ST,2019-03-22,1017340.0


In [17]:
# Total traffic per station over the whole period, largest first.
station_totals2 = (
    station_daily2
    .groupby("STATION")["DAILY_ENTRIES_EXITS"]
    .sum()
    .reset_index()
    .sort_values(by="DAILY_ENTRIES_EXITS", ascending=False)
    .copy()
)

station_totals2.head(10)

Unnamed: 0,STATION,DAILY_ENTRIES_EXITS
61,34 ST-PENN STA,28681654.0
233,GRD CNTRL-42 ST,21079465.0
59,34 ST-HERALD SQ,17486235.0
46,23 ST,15806222.0
14,14 ST-UNION SQ,15613188.0
352,TIMES SQ-42 ST,14267743.0
68,42 ST-PORT AUTH,13485695.0
110,86 ST,13241202.0
226,FULTON ST,12718086.0
314,PATH NEW WTC,12277033.0


In [18]:
# Mean daily traffic per station, largest first.
station_average2 = (
    station_daily2
    .groupby("STATION")["DAILY_ENTRIES_EXITS"]
    .mean()
    .reset_index()
    .sort_values(by="DAILY_ENTRIES_EXITS", ascending=False)
    .copy()
)

# Keep the ten stations with the highest daily average.
station_average_highest2 = station_average2.head(10).copy()
station_average_highest2

Unnamed: 0,STATION,DAILY_ENTRIES_EXITS
61,34 ST-PENN STA,318685.044444
233,GRD CNTRL-42 ST,234216.277778
59,34 ST-HERALD SQ,194291.5
46,23 ST,175624.688889
14,14 ST-UNION SQ,173479.866667
352,TIMES SQ-42 ST,158530.477778
68,42 ST-PORT AUTH,149841.055556
110,86 ST,147124.466667
226,FULTON ST,141312.066667
314,PATH NEW WTC,136411.477778


In [19]:
import pickle
# Save the top-ten averages to disk.
station_average_highest2.to_pickle(path='station_average_highest2.pickle')

## 2. Finding the highest traffic days of the week (for the highest traffic stations)

In [20]:
# Keep only the daily rows of the selected high-traffic stations. The first
# ten names are the top ten by average daily traffic found above; the last
# three are extra stations (presumably kept to match the other period's
# analysis -- TODO confirm).
highest_stations2 = station_daily2.loc[
    station_daily2["STATION"].isin([
        "34 ST-PENN STA",
        "GRD CNTRL-42 ST",
        "34 ST-HERALD SQ",
        "23 ST",
        "14 ST-UNION SQ",
        "TIMES SQ-42 ST",
        "42 ST-PORT AUTH",
        "86 ST",
        "FULTON ST",
        "PATH NEW WTC",
        "125 ST",
        "59 ST COLUMBUS",
        "59 ST",
    ])
].copy()
highest_stations2.head()

Unnamed: 0,STATION,DATE,DAILY_ENTRIES_EXITS
810,125 ST,2018-12-30,80187.0
811,125 ST,2018-12-31,97475.0
812,125 ST,2019-01-01,63715.0
813,125 ST,2019-01-02,119172.0
814,125 ST,2019-01-03,129947.0


In [21]:
# Weekday number of each row (Monday=0 ... Sunday=6), computed from this
# frame's own DATE column so it does not depend on index alignment with
# another frame.
highest_stations2['DAY_OF_WEEK_NUM'] = pd.to_datetime(highest_stations2['DATE']).dt.dayofweek
highest_stations2.head()

Unnamed: 0,STATION,DATE,DAILY_ENTRIES_EXITS,DAY_OF_WEEK_NUM
810,125 ST,2018-12-30,80187.0,6
811,125 ST,2018-12-31,97475.0,0
812,125 ST,2019-01-01,63715.0,1
813,125 ST,2019-01-02,119172.0,2
814,125 ST,2019-01-03,129947.0,3


In [22]:
import pickle
# Save the per-day rows of the selected stations to disk.
highest_stations2.to_pickle(path='highest_stations2.pickle')

In [23]:
# Mean daily traffic for each weekday number, busiest weekday first.
day_of_week_avg2 = (
    highest_stations2
    .groupby("DAY_OF_WEEK_NUM")["DAILY_ENTRIES_EXITS"]
    .mean()
    .reset_index()
    .sort_values(by="DAILY_ENTRIES_EXITS", ascending=False)
)

day_of_week_avg2.head(10)

Unnamed: 0,DAY_OF_WEEK_NUM,DAILY_ENTRIES_EXITS
2,2,207435.852071
3,3,207227.994083
4,4,202868.272189
1,1,183521.751479
0,0,170469.118343
5,5,101690.346154
6,6,88892.786982


In [24]:
import pickle
# Save the weekday averages to disk.
day_of_week_avg2.to_pickle(path='day_of_week_avg2.pickle')

## 3. Finding the highest traffic hours of the day

In [25]:
# One row per turnstile per reading timestamp, carrying the first
# ENTRIES_EXITS value of each group.
turnstiles_time = turnstiles_df.groupby(
    ["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"], as_index=False
)["ENTRIES_EXITS"].first()

In [26]:
# Within each turnstile, shift DATE_TIME and ENTRIES_EXITS down one row so
# every row also carries the values of the reading before it.
_previous_reading = (
    turnstiles_time
    .groupby(["C/A", "UNIT", "SCP", "STATION"])[["DATE_TIME", "ENTRIES_EXITS"]]
    .shift(1)
)
turnstiles_time[["PREV_DATE_TIME", "PREV_ENTRIES_EXITS"]] = _previous_reading

In [27]:
# Rows with no previous reading (missing PREV_DATE_TIME) cannot yield a difference.
turnstiles_time.dropna(subset=["PREV_DATE_TIME"], inplace=True)

In [28]:
def get_time_counts(row, max_counter):
    """Return the traffic between a row's previous and current counter reading.

    row: mapping-like object with "ENTRIES_EXITS" and "PREV_ENTRIES_EXITS".
    max_counter: largest difference that is still accepted as plausible.

    The absolute difference is returned when it does not exceed max_counter.
    Otherwise the smaller of the two readings is used instead, and if that
    is still above max_counter the result is 0.
    """
    latest, earlier = row["ENTRIES_EXITS"], row["PREV_ENTRIES_EXITS"]

    change = latest - earlier
    if change < 0:
        # A "reverse" counter counts downwards; use the magnitude.
        change = -change

    if change > max_counter:
        # Maybe the counter was reset, so try the smaller of the two readings.
        change = min(latest, earlier)
        if change > max_counter:
            # Still too large to be believable: count nothing.
            return 0

    return change

# Evaluate get_time_counts on every row, with a plausibility cap of 100,000.
turnstiles_time["TIME_ENTRIES_EXITS"] = turnstiles_time.apply(
    get_time_counts,
    axis=1,
    args=(100000,),
)

turnstiles_time.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,ENTRIES_EXITS,PREV_DATE_TIME,PREV_ENTRIES_EXITS,TIME_ENTRIES_EXITS
1,A002,R051,02-00-00,59 ST,2018-12-29 07:00:00,9225235,2018-12-29 03:00:00,9225207.0,28.0
2,A002,R051,02-00-00,59 ST,2018-12-29 11:00:00,9225402,2018-12-29 07:00:00,9225235.0,167.0
3,A002,R051,02-00-00,59 ST,2018-12-29 15:00:00,9225706,2018-12-29 11:00:00,9225402.0,304.0
4,A002,R051,02-00-00,59 ST,2018-12-29 19:00:00,9226139,2018-12-29 15:00:00,9225706.0,433.0
5,A002,R051,02-00-00,59 ST,2018-12-29 23:00:00,9226405,2018-12-29 19:00:00,9226139.0,266.0


In [29]:
# Add up all turnstiles of a station to get one total per station per timestamp.
station_time2 = (
    turnstiles_time
    .groupby(["STATION", "DATE_TIME"])[["TIME_ENTRIES_EXITS"]]
    .sum()
    .reset_index()
    .copy()
)

In [30]:
# Split each reading timestamp into its weekday number and its hour of day.
_reading_time = pd.to_datetime(station_time2['DATE_TIME'])
station_time2['DAY_NUM'] = _reading_time.dt.dayofweek
station_time2['HOUR'] = _reading_time.dt.hour
station_time2.head()

Unnamed: 0,STATION,DATE_TIME,TIME_ENTRIES_EXITS,DAY_NUM,HOUR
0,1 AV,2018-12-29 07:00:00,672.0,5,7
1,1 AV,2018-12-29 11:00:00,3278.0,5,11
2,1 AV,2018-12-29 15:00:00,7066.0,5,15
3,1 AV,2018-12-29 19:00:00,9225.0,5,19
4,1 AV,2018-12-29 23:00:00,7260.0,5,23


In [31]:
# Keep only the per-timestamp rows of the selected high-traffic stations.
# reset_index() renumbers the rows and keeps the old labels in an "index" column.
highest_stations_time2 = station_time2.loc[
    station_time2["STATION"].isin([
        "34 ST-PENN STA",
        "34 ST-HERALD SQ",
        "GRD CNTRL-42 ST",
        "TIMES SQ-42 ST",
        "125 ST",
        "86 ST",
        "14 ST-UNION SQ",
        "42 ST-PORT AUTH",
        "59 ST COLUMBUS",
        "59 ST",
        "23 ST",
        "FULTON ST",
        "PATH NEW WTC",
    ])
].reset_index().copy()

In [47]:
# Mean traffic per reading for each hour of the day, listed in hour order.
highest_stations_time_avg2 = (
    highest_stations_time2
    .groupby(["HOUR"])[['TIME_ENTRIES_EXITS']]
    .mean()
    .reset_index()
    .copy()
)
highest_stations_time_avg2.sort_values(by='HOUR', ascending=True, inplace=True)
highest_stations_time_avg2.head()

Unnamed: 0,HOUR,TIME_ENTRIES_EXITS
0,0,4242.737304
1,1,1607.293277
2,2,366.341255
3,3,1620.708926
4,4,649.627108


In [45]:
# Clean-up of shifted helper columns. The hourly-average frame is built with
# only HOUR and TIME_ENTRIES_EXITS, so these columns are normally absent;
# errors="ignore" keeps the cell from raising KeyError in that case.
highest_stations_time_avg2.drop(
    columns=["SHIFTED_ENTRIES_EXITS", "SHIFTED TIME"],
    inplace=True,
    errors="ignore",
)

In [33]:
import pickle
# Save the hourly averages to disk.
highest_stations_time_avg2.to_pickle(path='highest_stations_time_avg2.pickle')