# TTC Subway Ridership

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd
import os
import seaborn as sns

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Use matplotlib's 'ggplot' style sheet for the figures in this notebook
plt.style.use('ggplot')
# Apply seaborn's "notebook" plotting context
sns.set_context("notebook")
import warnings
# NOTE(review): with no category or module filter this suppresses every warning
# raised after this point, including deprecation and data-quality warnings from
# pandas — consider narrowing the filter to the specific warnings being hidden.
warnings.filterwarnings("ignore")

### 1. Import and Combine Data

In [4]:
# Collect every yearly TTC subway ridership CSV found in the data folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the processing order (and therefore which year is read first by later
# cells) reproducible across machines and file systems.
filenames = sorted(
    file
    for file in os.listdir("TTC_ridership")
    if 'ttc-subway-ridership' in file and file.endswith('.csv')
)
filenames

['ttc-subway-ridership-2014.csv',
 'ttc-subway-ridership-2015.csv',
 'ttc-subway-ridership-2016.csv',
 'ttc-subway-ridership-2018.csv',
 'ttc-subway-ridership-2022.csv']

In [5]:
# Import the TTC subway station name list.
# The path is built with os.path.join so it resolves on every operating system;
# a hard-coded backslash separator only works on Windows, and a backslash
# followed by "T" inside a normal string literal is an invalid escape sequence.
ttc_stations = pd.read_csv(os.path.join("TTC_ridership", "TTC_stations.csv"))
ttc_stations

Unnamed: 0,Station Name,Line
0,Finch,1
1,North York Centre,1
2,Sheppard-Yonge (1 YONGE-UNIVERSITY),1
3,York Mills,1
4,Lawrence,1
...,...,...
75,Lawrence East,3
76,Ellesmere,3
77,Midland,3
78,Scarborough Centre,3


In [6]:
# Split the station list into one DataFrame per subway line ("Line" values 1 to 4)
line1, line2, line3, line4 = (
    ttc_stations[ttc_stations["Line"] == line_number]
    for line_number in (1, 2, 3, 4)
)

In [16]:
# Build one wide table: one row per station, one ridership column per year.
# Only column 0 of each yearly file is used; its rows alternate between a
# station name (even positions) and that station's ridership (odd positions).
yearly_frames = []

# Iterate in sorted order so the earliest year is always the base table of the
# left joins below, regardless of the order in which the files were listed.
for filename in sorted(filenames):
    # Read the yearly ridership data (the files have no header row)
    ridership_yearly_data = pd.read_csv(os.path.join("TTC_ridership", filename), header=None)
    # Get the year from the file name: the last four characters before the first "."
    year = filename.split(".")[0][-4:]

    values = ridership_yearly_data[0]
    # Every station name must be followed by a ridership row; fail with a clear
    # message instead of an opaque KeyError / length mismatch further down.
    if len(values) % 2 != 0:
        raise ValueError(
            f"{filename}: expected alternating station/ridership rows, "
            f"but found an odd number of rows ({len(values)})"
        )

    # Separate the names (even positions) from the ridership counts (odd positions)
    name = values.iloc[0::2].tolist()
    ridership = values.iloc[1::2].tolist()
    yearly_frames.append(pd.DataFrame({"Station Name": name, str(year): ridership}))

if not yearly_frames:
    raise FileNotFoundError("No TTC subway ridership CSV files were found to combine")

# Start from the first year and add one column per following year.
# NOTE(review): how="left" keeps only stations present in the first year's file;
# stations that appear only in later files are dropped — confirm this is intended.
ttc_subway_ridership = yearly_frames[0]
for yearly_frame in yearly_frames[1:]:
    ttc_subway_ridership = ttc_subway_ridership.merge(yearly_frame, how="left", on="Station Name")

ttc_subway_ridership.head()
    

Unnamed: 0,Station Name,2014,2015,2016,2018,2022
0,Bloor-Yonge (1 YONGE-UNIVERSITY),216190,216190,204630,204630,155186
1,Bathurst,35510,36460,29320,26900,26234
2,College,47940,47790,47600,44370,42883
3,Bay,31050,30860,27090,32690,24260
4,Davisville,23040,25330,24300,25990,13973


In [17]:
# Attach each station's subway line, then key the table by station name
stations_with_line = ttc_subway_ridership.merge(ttc_stations, on="Station Name", how="left")
ttc_subway_ridership = stations_with_line.set_index("Station Name")
ttc_subway_ridership.head()

Unnamed: 0_level_0,2014,2015,2016,2018,2022,Line
Station Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bloor-Yonge (1 YONGE-UNIVERSITY),216190,216190,204630,204630,155186,1
Bathurst,35510,36460,29320,26900,26234,2
College,47940,47790,47600,44370,42883,1
Bay,31050,30860,27090,32690,24260,2
Davisville,23040,25330,24300,25990,13973,1


In [25]:
# Turn every column into nullable integers, one step at a time:
# 1) pick the columns, 2) drop thousands separators, 3) parse as numbers
#    (unparseable values become missing), 4) cast to the nullable Int64 dtype.
numeric_columns = ttc_subway_ridership.columns.difference(['Station Name'])
without_commas = ttc_subway_ridership[numeric_columns].replace(',', '', regex=True)
parsed_numbers = without_commas.apply(pd.to_numeric, errors='coerce')
ttc_subway_ridership[numeric_columns] = parsed_numbers.astype('Int64')
ttc_subway_ridership

Unnamed: 0_level_0,2014,2015,2016,2018,2022,Line
Station Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bloor-Yonge (1 YONGE-UNIVERSITY),216190,216190,204630,204630,155186,1
Bathurst,35510,36460,29320,26900,26234,2
College,47940,47790,47600,44370,42883,1
Bay,31050,30860,27090,32690,24260,2
Davisville,23040,25330,24300,25990,13973,1
...,...,...,...,...,...,...
McCowan,3930,4620,3710,3860,2331,3
Leslie,6460,6600,6200,5990,5920,4
Midland,2460,2410,2370,2440,1293,3
Sheppard-Yonge (4 SHEPPARD),44590,45750,45750,45750,34532,4


### 2. Find the busiest stations 

In [32]:
# Find the busiest station for every year column.
# Select the year columns by name (everything except "Line") instead of by a
# fixed position range, so adding or removing a yearly file cannot silently
# leave a year out or pull the "Line" column into the comparison.
year_columns = ttc_subway_ridership.columns.difference(["Line"], sort=False)
yearly_ridership = ttc_subway_ridership[year_columns]

# Highest ridership per year, and the index label of the row that holds it
max_values = yearly_ridership.max()
max_indices = yearly_ridership.idxmax()

# Combine results into a DataFrame
result_df = pd.DataFrame({
    'Column': max_values.index,
    'MaxIndex': max_indices.values,
    'MaxValue': max_values.values
})

print("Index and value of maximum for each column:")
print(result_df)

Index and value of maximum for each column:
  Column                          MaxIndex  MaxValue
0   2014  Bloor-Yonge (1 YONGE-UNIVERSITY)    216190
1   2015  Bloor-Yonge (1 YONGE-UNIVERSITY)    216190
2   2016  Bloor-Yonge (1 YONGE-UNIVERSITY)    204630
3   2018  Bloor-Yonge (1 YONGE-UNIVERSITY)    204630
4   2022  Bloor-Yonge (1 YONGE-UNIVERSITY)    155186


### 3. Find the total volume of each line

In [33]:
# Sum each column over the stations that share the same "Line" value
line_totals = ttc_subway_ridership.groupby("Line").sum()
line_totals

Unnamed: 0_level_0,2014,2015,2016,2018,2022
Line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1492510,1472850,1440680,1457900,1299146
2,1071210,1021270,1006110,1055280,722242
3,77630,77140,70610,70170,32914
4,95360,98150,95550,100310,78963


These per-line totals can be used to normalize the graphs of the TTC delay data.

In [35]:
# Save the ridership table (indexed by station name) as a CSV file.
# The path is built with os.path.join so it works on every operating system;
# a backslash followed by "T" in a normal string literal is an invalid escape
# sequence and the backslash is only a path separator on Windows.
output_dir = "Output"
# Create the output folder when it is missing so the export cannot fail on a fresh checkout
os.makedirs(output_dir, exist_ok=True)
ttc_subway_ridership.to_csv(os.path.join(output_dir, "TTC_subway_ridership_by_lines.csv"))