# Exploratory data analysis

#### Importing packages 

In [1]:
import pandas as pd
import numpy as np
import pickle

#### Loading the pickled turnstiles dataframe

In [2]:
# Load the prepared turnstile readings from disk.
# NOTE(review): presumably written by an earlier data-preparation step; unpickling runs
# arbitrary code, so only load pickle files from a trusted source.
turnstiles_df = pd.read_pickle("turnstiles_df.pickle")
turnstiles_df.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DATE_TIME,ENTRIES_EXITS
209469,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2021-03-26,21:00:00,2021-03-26 21:00:00,6098
209468,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2021-03-26,17:00:00,2021-03-26 17:00:00,6098
209467,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2021-03-26,13:00:00,2021-03-26 13:00:00,6098
209466,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2021-03-26,09:00:00,2021-03-26 09:00:00,6098
209465,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2021-03-26,05:00:00,2021-03-26 05:00:00,6098


## 1. Finding the station with the highest entries/exits foot traffic

### 1.1 Calculating daily entries/exits traffic for each turnstile first


Finding the end-of-day (maximum) entries/exits reading for each turnstile  
(The maximum entries/exits value of a day occurs at the last reading of that day. Because the rows are listed in descending order of time, the last reading of the day sits at the top of each group, so we can select it with the groupby `.first()` method.)

In [3]:
# One row per turnstile per day, keeping the first ENTRIES_EXITS value found in each group.
# This relies on the row order of turnstiles_df (latest reading first within a day).
turnstiles_daily = (
    turnstiles_df
    .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"], as_index=False)["ENTRIES_EXITS"]
    .first()
)
turnstiles_daily.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES_EXITS
0,A002,R051,02-00-00,59 ST,2020-12-26,10066556
1,A002,R051,02-00-00,59 ST,2020-12-27,10066986
2,A002,R051,02-00-00,59 ST,2020-12-28,10067929
3,A002,R051,02-00-00,59 ST,2020-12-29,10068802
4,A002,R051,02-00-00,59 ST,2020-12-30,10069652


Calculating daily turnstile entries/exits traffic

In [4]:
# Within each turnstile, shift DATE and ENTRIES_EXITS down one row so every row
# also carries the previous row's date and counter value.
turnstiles_daily[["PREV_DATE", "PREV_ENTRIES_EXITS"]] = (
    turnstiles_daily
    .groupby(["C/A", "UNIT", "SCP", "STATION"])[["DATE", "ENTRIES_EXITS"]]
    .shift(periods=1)
)
turnstiles_daily.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES_EXITS,PREV_DATE,PREV_ENTRIES_EXITS
0,A002,R051,02-00-00,59 ST,2020-12-26,10066556,NaT,
1,A002,R051,02-00-00,59 ST,2020-12-27,10066986,2020-12-26,10066556.0
2,A002,R051,02-00-00,59 ST,2020-12-28,10067929,2020-12-27,10066986.0
3,A002,R051,02-00-00,59 ST,2020-12-29,10068802,2020-12-28,10067929.0
4,A002,R051,02-00-00,59 ST,2020-12-30,10069652,2020-12-29,10068802.0


Dropping the first row of each turnstile, which has no previous reading to compare against

In [5]:
# Rows without a PREV_DATE have nothing to diff against, so remove them in place.
turnstiles_daily.dropna(axis="index", subset=["PREV_DATE"], inplace=True)
turnstiles_daily.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES_EXITS,PREV_DATE,PREV_ENTRIES_EXITS
1,A002,R051,02-00-00,59 ST,2020-12-27,10066986,2020-12-26,10066556.0
2,A002,R051,02-00-00,59 ST,2020-12-28,10067929,2020-12-27,10066986.0
3,A002,R051,02-00-00,59 ST,2020-12-29,10068802,2020-12-28,10067929.0
4,A002,R051,02-00-00,59 ST,2020-12-30,10069652,2020-12-29,10068802.0
5,A002,R051,02-00-00,59 ST,2020-12-31,10070231,2020-12-30,10069652.0


Checking to see if daily entries/exits are positive and not too large

In [6]:
turnstiles_daily["ENTRIES_EXITS"].sub(turnstiles_daily["PREV_ENTRIES_EXITS"]).describe()

count    4.488750e+05
mean    -2.130679e+03
std      8.983102e+06
min     -2.731588e+09
25%      1.310000e+02
50%      3.950000e+02
75%      7.800000e+02
max      3.539499e+09
dtype: float64

Checking to see how many rows have entries/exits smaller than their previous entries/exits

In [7]:
turnstiles_daily.loc[turnstiles_daily["ENTRIES_EXITS"].lt(turnstiles_daily["PREV_ENTRIES_EXITS"])].shape

(4110, 8)

#### Note:
- Some daily differences are negative (the combined entries/exits value is smaller than the previous day's value)
- Some daily differences are far too large to be real foot traffic

#### Creating a daily entries/exits traffic column taking into consideration reverse entries/exits and entries/exits that seem too big

(Setting max counter as 1,000,000)

In [8]:
def get_daily_counts(row, max_counter):
    """Return a cleaned daily traffic count for one turnstile/day row.

    Parameters
    ----------
    row : mapping-like
        Must provide ``"ENTRIES_EXITS"`` and ``"PREV_ENTRIES_EXITS"``.
    max_counter : number
        Largest change accepted as plausible.

    Returns
    -------
    number
        The absolute change between the two readings when it does not exceed
        ``max_counter``; otherwise the smaller of the two readings; or ``0``
        when that smaller reading also exceeds ``max_counter``.
    """
    current = row["ENTRIES_EXITS"]
    previous = row["PREV_ENTRIES_EXITS"]

    # A negative change is treated as a counter running in reverse: keep its magnitude.
    change = current - previous
    if change < 0:
        change = -change

    # Written as "not >" so a NaN change (comparison is False) is returned unchanged.
    if not change > max_counter:
        return change

    # Implausibly large jump: possibly a counter reset, so fall back to the smaller reading.
    print(f'entries: {row["ENTRIES_EXITS"]} <-- {row["PREV_ENTRIES_EXITS"]}')
    fallback = min(current, previous)

    # If even the smaller reading is too large, give up and count nothing.
    return 0 if fallback > max_counter else fallback

# Apply the cleaning rule row by row; changes above 1,000,000 are treated as implausible.
turnstiles_daily["DAILY_ENTRIES_EXITS"] = turnstiles_daily.apply(
    get_daily_counts,
    axis="columns",
    max_counter=1_000_000,
)

turnstiles_daily.head(5)

entries: 326 <-- 15510851.0
entries: 98330 <-- 35933562.0
entries: 301 <-- 41538834.0
entries: 252 <-- 6821752.0
entries: 393220 <-- 202882759.0
entries: 2685 <-- 1648400.0
entries: 243 <-- 3889973.0
entries: 89 <-- 5153759.0
entries: 197 <-- 2024782.0
entries: 8273099 <-- 5447183.0
entries: 1638448 <-- 8273831.0
entries: 136 <-- 1641837.0
entries: 766 <-- 1288384.0
entries: 327907 <-- 3435059.0
entries: 16860616 <-- 180544.0
entries: 721483908 <-- 2840043.0
entries: 262388 <-- 3737209.0
entries: 108 <-- 2669927.0
entries: 16712067 <-- 8670197.0
entries: 279 <-- 16712991.0
entries: 673 <-- 4184933.0
entries: 241 <-- 202655055.0
entries: 198 <-- 17997783.0
entries: 3379 <-- 29079019.0
entries: 112 <-- 14331544.0
entries: 258 <-- 1052823.0
entries: 2293806 <-- 10907224.0
entries: 43 <-- 3368330.0
entries: 1377460 <-- 2549981.0
entries: 1261 <-- 1379132.0
entries: 517 <-- 2731588401.0
entries: 57 <-- 1306536.0
entries: 2 <-- 83886189.0
entries: 229 <-- 8272936.0
entries: 1638695 <-- 14401

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES_EXITS,PREV_DATE,PREV_ENTRIES_EXITS,DAILY_ENTRIES_EXITS
1,A002,R051,02-00-00,59 ST,2020-12-27,10066986,2020-12-26,10066556.0,430.0
2,A002,R051,02-00-00,59 ST,2020-12-28,10067929,2020-12-27,10066986.0,943.0
3,A002,R051,02-00-00,59 ST,2020-12-29,10068802,2020-12-28,10067929.0,873.0
4,A002,R051,02-00-00,59 ST,2020-12-30,10069652,2020-12-29,10068802.0,850.0
5,A002,R051,02-00-00,59 ST,2020-12-31,10070231,2020-12-30,10069652.0,579.0


Checking to see if the daily entries/exits traffic makes sense

In [9]:
turnstiles_daily["DAILY_ENTRIES_EXITS"].describe()

count    448875.000000
mean        615.024557
std        5265.107219
min           0.000000
25%         138.000000
50%         400.000000
75%         785.000000
max      891655.000000
Name: DAILY_ENTRIES_EXITS, dtype: float64

### 1.2 Calculating daily entries/exits traffic for each station

In [10]:
# Sum the per-turnstile daily counts into one total per station per day.
station_daily = (
    turnstiles_daily
    .groupby(["STATION", "DATE"])[["DAILY_ENTRIES_EXITS"]]
    .sum()
    .reset_index()
    .copy()
)
station_daily.head(5)

Unnamed: 0,STATION,DATE,DAILY_ENTRIES_EXITS
0,1 AV,2020-12-27,6748.0
1,1 AV,2020-12-28,12405.0
2,1 AV,2020-12-29,13486.0
3,1 AV,2020-12-30,13509.0
4,1 AV,2020-12-31,11429.0


In [11]:
station_daily.sort_values(by="DAILY_ENTRIES_EXITS", ascending=False).head(5)

Unnamed: 0,STATION,DATE,DAILY_ENTRIES_EXITS
31709,TIMES SQ-42 ST,2021-02-05,1113904.0
5393,34 ST-HERALD SQ,2021-03-25,908551.0
8580,7 AV,2021-01-31,898062.0
23084,JFK JAMAICA CT1,2021-02-17,834591.0
21048,GRD CNTRL-42 ST,2021-03-23,778514.0


### 1.2.1 Ans: Calculating total entries/exits traffic for each station to find the station with the highest traffic

In [12]:
# Total traffic per station over the whole period, busiest first.
station_totals = (
    station_daily
    .groupby("STATION")["DAILY_ENTRIES_EXITS"]
    .sum()
    .reset_index()
    .sort_values(by="DAILY_ENTRIES_EXITS", ascending=False)
    .copy()
)

station_totals.head(5)

Unnamed: 0,STATION,DAILY_ENTRIES_EXITS
61,34 ST-PENN STA,6218194.0
59,34 ST-HERALD SQ,5375856.0
233,GRD CNTRL-42 ST,4646933.0
352,TIMES SQ-42 ST,4613925.0
9,125 ST,4442009.0


### 1.2.2 Ans: Calculating average entries/exits traffic per day for each station to find the stations with the highest traffic

In [13]:
# Mean daily traffic per station, busiest first.
station_average = (
    station_daily
    .groupby("STATION")["DAILY_ENTRIES_EXITS"]
    .mean()
    .reset_index()
    .sort_values(by="DAILY_ENTRIES_EXITS", ascending=False)
    .copy()
)

# Keep the ten stations with the highest average as their own frame.
station_average_highest = station_average.head(n=10).copy()
station_average_highest

Unnamed: 0,STATION,DAILY_ENTRIES_EXITS
61,34 ST-PENN STA,69091.044444
59,34 ST-HERALD SQ,59731.733333
233,GRD CNTRL-42 ST,51632.588889
352,TIMES SQ-42 ST,51265.833333
9,125 ST,49355.655556
110,86 ST,48684.777778
68,42 ST-PORT AUTH,42630.077778
14,14 ST-UNION SQ,42573.811111
86,59 ST COLUMBUS,40999.722222
85,59 ST,40159.788889


 ## 1.3 Ranking Stations by Traffic

In [14]:
# Summary statistics of the per-station average daily traffic (one row per station).
station_average.describe()

Unnamed: 0,DAILY_ENTRIES_EXITS
count,378.0
mean,8116.206598
std,9583.192128
min,132.1
25%,2683.05
50%,5064.861111
75%,9335.516667
max,69091.044444


In [15]:
# Split stations into three equal-sized groups by average daily traffic
# (lowest third -> "Low", middle third -> "Medium", highest third -> "High").
station_average["TRAFFIC_LEVEL"] = pd.qcut(
    station_average["DAILY_ENTRIES_EXITS"],
    q=3,
    labels=["Low", "Medium", "High"],
)

# Fixed seed so the spot-check below shows the same rows on every run.
station_average.sample(5, random_state=42)

Unnamed: 0,STATION,DAILY_ENTRIES_EXITS,TRAFFIC_LEVEL
376,YORK ST,3429.322222,Low
23,167 ST,11616.655556,High
153,BEVERLEY ROAD,2188.533333,Low
253,JAMAICA VAN WK,3632.466667,Medium
41,21 ST,1277.588889,Low


 ## 2. Finding the days of the week with the highest entries/exits foot traffic*
 * for the stations with the highest traffic

 #### Creating a new dataframe with only the highest traffic stations

In [16]:
# Stations kept for the day-of-week analysis.
# NOTE(review): the first ten names match the top-10 average-traffic table above;
# "23 ST", "FULTON ST" and "PATH NEW WTC" are not in that table - confirm why they are included.
highest_station_names = [
    "34 ST-PENN STA",
    "34 ST-HERALD SQ",
    "GRD CNTRL-42 ST",
    "TIMES SQ-42 ST",
    "125 ST",
    "86 ST",
    "14 ST-UNION SQ",
    "42 ST-PORT AUTH",
    "59 ST COLUMBUS",
    "59 ST",
    "23 ST",
    "FULTON ST",
    "PATH NEW WTC",
]

# A single membership test replaces the chain of equality comparisons; .copy() keeps
# the result independent of station_daily so columns can be added to it safely.
highest_stations = station_daily.loc[station_daily["STATION"].isin(highest_station_names)].copy()
highest_stations.head()


Unnamed: 0,STATION,DATE,DAILY_ENTRIES_EXITS
808,125 ST,2020-12-27,28787.0
809,125 ST,2020-12-28,48154.0
810,125 ST,2020-12-29,50170.0
811,125 ST,2020-12-30,51662.0
812,125 ST,2020-12-31,46992.0


In [17]:
# Day of week (Monday=0 ... Sunday=6), derived from this frame's own DATE column
# rather than from the full station_daily frame.
highest_stations['DAY_OF_WEEK_NUM'] = pd.to_datetime(highest_stations['DATE']).dt.dayofweek
highest_stations.head()

Unnamed: 0,STATION,DATE,DAILY_ENTRIES_EXITS,DAY_OF_WEEK_NUM
808,125 ST,2020-12-27,28787.0,6
809,125 ST,2020-12-28,48154.0,0
810,125 ST,2020-12-29,50170.0,1
811,125 ST,2020-12-30,51662.0,2
812,125 ST,2020-12-31,46992.0,3


#### Pickling HIGHEST_STATIONS for visualization

In [18]:
# DataFrame.to_pickle is a pandas method, so no extra import is needed in this cell.
highest_stations.to_pickle('highest_stations.pickle')

In [19]:
highest_stations.sort_values(by="DAILY_ENTRIES_EXITS", ascending=False)

Unnamed: 0,STATION,DATE,DAILY_ENTRIES_EXITS,DAY_OF_WEEK_NUM
31709,TIMES SQ-42 ST,2021-02-05,1113904.0,4
5393,34 ST-HERALD SQ,2021-03-25,908551.0,3
21048,GRD CNTRL-42 ST,2021-03-23,778514.0,1
31738,TIMES SQ-42 ST,2021-03-06,701727.0,5
7820,59 ST COLUMBUS,2021-03-22,674316.0,0
...,...,...,...,...
28286,PATH NEW WTC,2021-01-31,8341.0,6
28258,PATH NEW WTC,2021-01-03,7172.0,6
28293,PATH NEW WTC,2021-02-07,7049.0,6
28287,PATH NEW WTC,2021-02-01,4032.0,0


### 2. Ans:

In [20]:
# Mean station-day traffic for each day of the week, busiest day first.
day_of_week_avg = (
    highest_stations
    .groupby("DAY_OF_WEEK_NUM")["DAILY_ENTRIES_EXITS"]
    .mean()
    .reset_index()
    .sort_values(by="DAILY_ENTRIES_EXITS", ascending=False)
)

day_of_week_avg.head(10)

Unnamed: 0,DAY_OF_WEEK_NUM,DAILY_ENTRIES_EXITS
4,4,63840.059172
3,3,54758.159763
1,1,50875.526627
2,2,49642.360947
0,0,48425.147929
5,5,35500.211538
6,6,23767.171598


#### Pickling day_of_week_avg for visualization

In [21]:
# DataFrame.to_pickle is a pandas method, so no extra import is needed in this cell.
day_of_week_avg.to_pickle('day_of_week_avg.pickle')

## 3. Finding the hours of the day with the highest entries/exits foot traffic 

#### Same(ish) steps as above for finding "turnstiles_daily", except grouping by DATE_TIME instead of DATE, then adding day-of-week and hour columns

In [3]:
# One row per turnstile per reading timestamp, keeping the first ENTRIES_EXITS value in each group.
turnstiles_time = (
    turnstiles_df
    .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"], as_index=False)["ENTRIES_EXITS"]
    .first()
)
turnstiles_time.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,ENTRIES_EXITS
0,A002,R051,02-00-00,59 ST,2020-12-26 03:00:00,10066078
1,A002,R051,02-00-00,59 ST,2020-12-26 07:00:00,10066092
2,A002,R051,02-00-00,59 ST,2020-12-26 11:00:00,10066167
3,A002,R051,02-00-00,59 ST,2020-12-26 15:00:00,10066277
4,A002,R051,02-00-00,59 ST,2020-12-26 19:00:00,10066456


In [4]:
# Within each turnstile, shift DATE_TIME and ENTRIES_EXITS down one row so every row
# also carries the previous reading's timestamp and counter value.
turnstiles_time[["PREV_DATE_TIME", "PREV_ENTRIES_EXITS"]] = (
    turnstiles_time
    .groupby(["C/A", "UNIT", "SCP", "STATION"])[["DATE_TIME", "ENTRIES_EXITS"]]
    .shift(periods=1)
)
turnstiles_time.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,ENTRIES_EXITS,PREV_DATE_TIME,PREV_ENTRIES_EXITS
0,A002,R051,02-00-00,59 ST,2020-12-26 03:00:00,10066078,NaT,
1,A002,R051,02-00-00,59 ST,2020-12-26 07:00:00,10066092,2020-12-26 03:00:00,10066078.0
2,A002,R051,02-00-00,59 ST,2020-12-26 11:00:00,10066167,2020-12-26 07:00:00,10066092.0
3,A002,R051,02-00-00,59 ST,2020-12-26 15:00:00,10066277,2020-12-26 11:00:00,10066167.0
4,A002,R051,02-00-00,59 ST,2020-12-26 19:00:00,10066456,2020-12-26 15:00:00,10066277.0


In [6]:
turnstiles_time.dropna(axis="index", subset=["PREV_DATE_TIME"], inplace=True)

In [7]:
def get_time_counts(row, max_counter):
    """Return a cleaned traffic count between two consecutive readings of a turnstile.

    Parameters
    ----------
    row : mapping-like
        Must provide ``"ENTRIES_EXITS"`` and ``"PREV_ENTRIES_EXITS"``.
    max_counter : number
        Largest change accepted as plausible.

    Returns
    -------
    number
        The absolute change between the two readings when it does not exceed
        ``max_counter``; otherwise the smaller of the two readings; or ``0``
        when that smaller reading also exceeds ``max_counter``.
    """
    latest = row["ENTRIES_EXITS"]
    earlier = row["PREV_ENTRIES_EXITS"]

    counter = latest - earlier
    if counter < 0:
        # Counter appears to run in reverse: use the magnitude of the change.
        counter = -counter

    if counter > max_counter:
        # Possibly a counter reset: fall back to the smaller of the two readings.
        counter = min(latest, earlier)
        if counter > max_counter:
            # Still implausible, so count nothing for this interval.
            return 0

    return counter

# Apply the cleaning rule row by row; changes above 100,000 between readings are treated as implausible.
turnstiles_time["TIME_ENTRIES_EXITS"] = turnstiles_time.apply(
    get_time_counts,
    axis="columns",
    max_counter=100_000,
)

turnstiles_time.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,ENTRIES_EXITS,PREV_DATE_TIME,PREV_ENTRIES_EXITS,TIME_ENTRIES_EXITS
1,A002,R051,02-00-00,59 ST,2020-12-26 07:00:00,10066092,2020-12-26 03:00:00,10066078.0,14.0
2,A002,R051,02-00-00,59 ST,2020-12-26 11:00:00,10066167,2020-12-26 07:00:00,10066092.0,75.0
3,A002,R051,02-00-00,59 ST,2020-12-26 15:00:00,10066277,2020-12-26 11:00:00,10066167.0,110.0
4,A002,R051,02-00-00,59 ST,2020-12-26 19:00:00,10066456,2020-12-26 15:00:00,10066277.0,179.0
5,A002,R051,02-00-00,59 ST,2020-12-26 23:00:00,10066556,2020-12-26 19:00:00,10066456.0,100.0


Checking to see if the timeblock entries/exits traffic makes sense

In [8]:
turnstiles_time["TIME_ENTRIES_EXITS"].describe()

count    2.714713e+06
mean     9.301527e+01
std      2.841851e+02
min      0.000000e+00
25%      7.000000e+00
50%      4.500000e+01
75%      1.280000e+02
max      9.800300e+04
Name: TIME_ENTRIES_EXITS, dtype: float64

#### Same steps as above for finding "station_daily"

In [9]:
# Sum the per-turnstile interval counts into one total per station per reading timestamp.
station_time = (
    turnstiles_time
    .groupby(["STATION", "DATE_TIME"])[["TIME_ENTRIES_EXITS"]]
    .sum()
    .reset_index()
    .copy()
)
station_time.head(5)

Unnamed: 0,STATION,DATE_TIME,TIME_ENTRIES_EXITS
0,1 AV,2020-12-26 07:00:00,265.0
1,1 AV,2020-12-26 11:00:00,1213.0
2,1 AV,2020-12-26 15:00:00,1926.0
3,1 AV,2020-12-26 19:00:00,2434.0
4,1 AV,2020-12-26 23:00:00,1300.0


In [10]:
# Parse DATE_TIME once and derive both calendar fields through the index-aligned
# .dt accessor (day of week: Monday=0 ... Sunday=6; hour: 0-23).
station_timestamps = pd.to_datetime(station_time['DATE_TIME'])
station_time['DAY_NUM'] = station_timestamps.dt.dayofweek
station_time['HOUR'] = station_timestamps.dt.hour
station_time.head()

Unnamed: 0,STATION,DATE_TIME,TIME_ENTRIES_EXITS,DAY_NUM,HOUR
0,1 AV,2020-12-26 07:00:00,265.0,5,7
1,1 AV,2020-12-26 11:00:00,1213.0,5,11
2,1 AV,2020-12-26 15:00:00,1926.0,5,15
3,1 AV,2020-12-26 19:00:00,2434.0,5,19
4,1 AV,2020-12-26 23:00:00,1300.0,5,23


In [11]:
station_time.sort_values(by="TIME_ENTRIES_EXITS", ascending=False).head(5)

Unnamed: 0,STATION,DATE_TIME,TIME_ENTRIES_EXITS,DAY_NUM,HOUR
57390,57 ST-7 AV,2021-01-28 11:00:00,100928.0,3,11
126621,CITY / BUS,2021-02-03 10:39:37,97850.0,2,10
126632,CITY / BUS,2021-02-03 12:26:10,97812.0,2,12
176214,GROVE STREET,2021-01-13 07:00:52,95592.0,2,7
196055,HARRISON,2021-02-26 10:35:26,93985.0,4,10


In [12]:
station_time.groupby(["HOUR"])[["TIME_ENTRIES_EXITS"]].agg("median")

Unnamed: 0_level_0,TIME_ENTRIES_EXITS
HOUR,Unnamed: 1_level_1
0,233.0
1,23.0
2,8.0
3,73.0
4,30.0
5,6.0
6,9.0
7,161.0
8,319.0
9,82.0


#### Same steps as for "highest_stations" above

In [13]:
# Stations kept for the hour-of-day analysis (same names as used for the day-of-week analysis).
highest_station_names = [
    "34 ST-PENN STA",
    "34 ST-HERALD SQ",
    "GRD CNTRL-42 ST",
    "TIMES SQ-42 ST",
    "125 ST",
    "86 ST",
    "14 ST-UNION SQ",
    "42 ST-PORT AUTH",
    "59 ST COLUMBUS",
    "59 ST",
    "23 ST",
    "FULTON ST",
    "PATH NEW WTC",
]

# A single membership test replaces the chain of equality comparisons.
# reset_index() keeps the original row labels in an "index" column, as before.
highest_stations_time = (
    station_time
    .loc[station_time["STATION"].isin(highest_station_names)]
    .reset_index()
    .copy()
)
highest_stations_time.head()

Unnamed: 0,index,STATION,DATE_TIME,TIME_ENTRIES_EXITS,DAY_NUM,HOUR
0,5454,125 ST,2020-12-26 04:00:00,409.0,5,4
1,5455,125 ST,2020-12-26 07:00:00,600.0,5,7
2,5456,125 ST,2020-12-26 08:00:00,1546.0,5,8
3,5457,125 ST,2020-12-26 11:00:00,2889.0,5,11
4,5458,125 ST,2020-12-26 12:00:00,3179.0,5,12


In [14]:
highest_stations_time.sort_values(by="TIME_ENTRIES_EXITS", ascending=False)

Unnamed: 0,index,STATION,DATE_TIME,TIME_ENTRIES_EXITS,DAY_NUM,HOUR
15195,307006,PATH NEW WTC,2021-01-13 11:46:28,93055.0,2,11
19233,311044,PATH NEW WTC,2021-01-29 07:55:09,67837.0,4,7
28278,320089,PATH NEW WTC,2021-03-05 12:05:11,50153.0,4,12
17518,309329,PATH NEW WTC,2021-01-22 13:54:25,43298.0,4,13
2609,35153,23 ST,2021-03-15 08:00:00,32500.0,0,8
...,...,...,...,...,...,...
19586,311397,PATH NEW WTC,2021-01-30 17:03:21,0.0,5,17
15880,307691,PATH NEW WTC,2021-01-16 04:36:10,0.0,5,4
19559,311370,PATH NEW WTC,2021-01-30 14:42:06,0.0,5,14
33055,324866,PATH NEW WTC,2021-03-24 02:44:25,0.0,2,2


In [15]:
highest_stations_time.sort_values(by="HOUR", ascending=True).head(5)

Unnamed: 0,index,STATION,DATE_TIME,TIME_ENTRIES_EXITS,DAY_NUM,HOUR
8218,168195,FULTON ST,2021-01-23 00:00:00,2090.0,5,0
29966,321777,PATH NEW WTC,2021-03-12 00:58:16,50.0,4,0
29967,321778,PATH NEW WTC,2021-03-12 00:58:31,5.0,4,0
19414,311225,PATH NEW WTC,2021-01-30 00:54:54,77.0,5,0
19413,311224,PATH NEW WTC,2021-01-30 00:51:28,25.0,5,0


In [16]:
# Mean traffic per reading for each hour of the day across the selected stations,
# ordered from hour 0 upward.
highest_stations_time_avg = (
    highest_stations_time
    .groupby(["HOUR"])[["TIME_ENTRIES_EXITS"]]
    .mean()
    .reset_index()
    .copy()
    .sort_values(by="HOUR", ascending=True)
)

#### Pickling "highest_stations_time_avg" for visualization

In [17]:
# DataFrame.to_pickle is a pandas method, so no extra import is needed in this cell.
highest_stations_time_avg.to_pickle('highest_stations_time_avg.pickle')