# Identifying potential bus routes - combined analysis with geospatial and passenger volume data

In [2]:
import pandas as pd
import numpy as np

In [72]:
# Inputs: per-service stop sequences exceeding the low-ridership threshold,
# bus/MRT overlap scores, and the category of each bus service.
# NOTE(review): the ridership CSV appears to carry a multi-row header (a 3-level
# MultiIndex is assigned in the next cell); with the default single header row the
# extra header rows would be read as data - confirm whether header=[0, 1, 2] is intended.
ridership_df = pd.read_csv('data/cleaned/exceeding_stop_sequences_with_trips.csv')
geospatial_analysis_df = pd.read_csv('data/cleaned/BusMRTOverlap.csv')
bus_category_df = pd.read_csv('data/cleaned/bus_route_trips.csv')

In [73]:
# Give the ridership table a clean three-level header: month / direction / metric.
# 'ServiceNo' and 'Threshold' are identifier columns, so their lower two levels stay blank.
months = ['2024-07', '2024-08', '2024-09']
directions = ['Direction_1', 'Direction_2']
metrics = ['%_Exceed', 'StopSeqs']

id_columns = [('ServiceNo', '', ''), ('Threshold', '', '')]
metric_columns = [
    (month, direction, metric)
    for month in months
    for direction in directions
    for metric in metrics
]
ridership_df.columns = pd.MultiIndex.from_tuples(id_columns + metric_columns)

# Rows without a service number are not real records (e.g. rows that were
# accidentally read in as data), so discard them and renumber the index.
has_service_no = ridership_df[('ServiceNo', '', '')].notna()
ridership_df = ridership_df.loc[has_service_no].reset_index(drop=True)

For this analysis we will only consider Trunk services, excluding all other types of bus routes.

In [74]:
# Restrict both tables to services categorised as TRUNK
trunk_services = bus_category_df.loc[bus_category_df["Category"] == "TRUNK"]
trunk_service_nos = trunk_services["ServiceNo"]

is_trunk_ridership = ridership_df[('ServiceNo', '', '')].isin(trunk_service_nos)
ridership_df = ridership_df.loc[is_trunk_ridership].reset_index(drop=True)

is_trunk_overlap = geospatial_analysis_df["Bus_ServiceNo"].isin(trunk_service_nos)
geospatial_analysis_df = geospatial_analysis_df.loc[is_trunk_overlap].reset_index(drop=True)

In [75]:
# Preview the trunk-only ridership table with its three-level column header
ridership_df.head()

Unnamed: 0_level_0,ServiceNo,Threshold,2024-07,2024-07,2024-07,2024-07,2024-08,2024-08,2024-08,2024-08,2024-09,2024-09,2024-09,2024-09
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Direction_1,Direction_1,Direction_2,Direction_2,Direction_1,Direction_1,Direction_2,Direction_2,Direction_1,Direction_1,Direction_2,Direction_2
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,%_Exceed,StopSeqs,%_Exceed,StopSeqs,%_Exceed,StopSeqs,%_Exceed,StopSeqs,%_Exceed,StopSeqs,%_Exceed,StopSeqs
0,10,8.0,9.46,"[4, 20, 36, 42, 55, 62, 67]",18.92,"[10, 13, 15, 21, 34, 36, 37, 41, 47, 48, 53, 5...",12.16,"[2, 4, 20, 30, 36, 42, 55, 62, 67]",20.27,"[10, 13, 15, 21, 34, 36, 37, 39, 41, 47, 48, 5...",9.46,"[2, 4, 20, 42, 55, 62, 67]",20.27,"[10, 13, 15, 21, 34, 36, 37, 39, 41, 47, 48, 5..."
1,100,8.0,13.79,"[2, 6, 13, 14, 16, 39, 44, 52]",13.79,"[5, 6, 27, 29, 33, 43, 47, 51]",13.79,"[2, 6, 13, 14, 16, 39, 44, 52]",15.52,"[5, 6, 27, 29, 33, 36, 43, 47, 51]",15.52,"[2, 6, 13, 14, 16, 39, 44, 51, 52]",15.52,"[5, 6, 27, 29, 33, 36, 43, 47, 51]"
2,100A,8.0,61.54,"[2, 5, 6, 7, 9, 10, 12, 13]",0.0,[],61.54,"[2, 5, 6, 7, 9, 10, 12, 13]",0.0,[],61.54,"[2, 5, 6, 7, 9, 10, 12, 13]",0.0,[]
3,101,8.0,10.64,"[7, 12, 26, 40, 42]",0.0,[],12.77,"[7, 12, 26, 37, 40, 42]",0.0,[],12.77,"[7, 12, 26, 37, 40, 42]",0.0,[]
4,102,8.0,20.0,"[3, 10, 15, 20, 28, 34, 37, 42, 44, 47, 52, 53]",0.0,[],20.0,"[3, 10, 15, 20, 28, 34, 37, 42, 44, 47, 52, 53]",0.0,[],20.0,"[3, 10, 15, 20, 28, 34, 37, 42, 44, 47, 52, 53]",0.0,[]


In [76]:
# Preview the trunk-only bus/MRT overlap scores
geospatial_analysis_df.head()

Unnamed: 0,MRT_Line,Bus_ServiceNo,Bus_Route_Length_m,Overlap_Length_m,Coverage_Percentage,Consecutive_Coverage_Percentage,Max_Consecutive_Segments,Weighted_Average_Angle,Weighted_Average_Score
0,Bukit Panjang LRT,973A,1249.539533,1249.539533,16.666667,100.0,2.0,30.7,52.806667
1,Jurong Region,179,7984.510855,7984.510855,9.333333,100.0,8.0,39.8,51.693333
2,Jurong Region,98A,2960.207397,2960.207397,6.666667,100.0,3.0,43.8,51.426667
3,Punggol LRT,84W,5231.193672,5231.193672,55.0,50.0,3.0,44.6,50.92
4,Jurong Region,154A,2451.852352,2451.852352,6.666667,100.0,3.0,40.1,50.686667


## 1. Limit the scope from geospatial analysis
We will consider new lines that are introduced recently or will be introduced in the future:
- Thomson East Coast (June 2024)
- Jurong Region Line (~2027)
- Cross Island Line (~2032)  

Within this scope, we identify the top 20 buses that have a high degree of overlap with the new MRT lines, based on a weighted scoring system.  
A high `Weighted_Average_Score` means that a bus route has many stops close to an MRT line and a general route trajectory similar to that MRT line.

In [77]:
# New / upcoming MRT lines that define the scope of this analysis
NEW_MRT_LINES = ["Jurong Region", "Cross Island", "Thomson East Coast"]
# Number of highest-scoring overlap rows to shortlist
TOP_N_OVERLAP = 20

mrt_line_names = geospatial_analysis_df['MRT_Line']
# na=False treats a missing line name as "not in scope" instead of propagating NaN into the mask
is_new_mrt_line = (
    ~mrt_line_names.str.contains("LRT", na=False)
    & (mrt_line_names.str.contains("Jurong Region", na=False)
       | mrt_line_names.str.contains("Cross Island", na=False)
       | mrt_line_names.str.contains("Thomson East Coast", na=False))
)

# Rank explicitly by score rather than relying on the row order of the CSV.
# mergesort is stable, so rows with equal scores keep their original relative order.
identified_buses_df = (
    geospatial_analysis_df[is_new_mrt_line]
    .sort_values(by='Weighted_Average_Score', ascending=False, kind='mergesort')
    [:TOP_N_OVERLAP]
)
identified_buses_df

Unnamed: 0,MRT_Line,Bus_ServiceNo,Bus_Route_Length_m,Overlap_Length_m,Coverage_Percentage,Consecutive_Coverage_Percentage,Max_Consecutive_Segments,Weighted_Average_Angle,Weighted_Average_Score
1,Jurong Region,179,7984.510855,7984.510855,9.333333,100.0,8.0,39.8,51.693333
2,Jurong Region,98A,2960.207397,2960.207397,6.666667,100.0,3.0,43.8,51.426667
4,Jurong Region,154A,2451.852352,2451.852352,6.666667,100.0,3.0,40.1,50.686667
5,Jurong Region,975A,1555.25883,1292.371822,1.333333,100.0,2.0,47.8,50.093333
9,Jurong Region,181M,5683.547588,5683.547588,6.666667,83.333333,5.0,33.4,42.68
34,Cross Island,112A,3991.852192,2315.355008,3.030303,50.0,2.0,40.0,29.212121
35,Jurong Region,198A,4147.384256,4147.384256,6.666667,40.0,2.0,50.3,28.726667
36,Cross Island,114,3755.500572,3079.033696,3.030303,50.0,2.0,37.5,28.712121
37,Cross Island,83T,5486.421431,4719.286876,6.060606,50.0,3.0,30.0,28.424242
38,Jurong Region,98M,15371.17417,15023.1655,10.666667,37.5,6.0,44.2,28.106667


In [78]:
# Distinct service numbers among the shortlisted overlap rows
identified_bus_routes = identified_buses_df['Bus_ServiceNo'].unique()
identified_bus_routes

array(['179', '98A', '154A', '975A', '181M', '112A', '198A', '114', '83T',
       '98M', '249A', '160A', '154B', '199', '99', '179A', '158A', '185',
       '49', '181'], dtype=object)

## 2. Further narrow down the list using ridership data
We want to supplement our choices with ridership data - for the routes identified via geospatial analysis, we re-rank them from lowest to highest ridership.  

In our ridership analysis, for each bus route, we counted the number of stops that have low passenger volume for the majority of the day (more details in ridership analysis).  
This allows us to infer which bus routes have low ridership.  
A high `%_Exceed` means that a large share of the stops on the bus route experience low passenger volume for the majority of the day (threshold = 8 hrs).

After re-ranking, we take the top 10 bus routes with low ridership.

In [89]:
# Number of lowest-ridership routes to keep after re-ranking
TOP_N_LOW_RIDERSHIP = 10
# Ranking metric: share of Direction_1 stops with low passenger volume in 2024-09.
# Only Direction_1 is used for the ranking; Direction_2 is carried along for reference.
EXCEED_COL = '2024-09|Direction_1|%_Exceed'

# Filter ridership data to only include identified bus routes.
# .copy() makes an independent frame, so the edits below never write into a
# slice of ridership_df (avoids SettingWithCopyWarning).
is_identified_route = ridership_df[('ServiceNo', '', '')].isin(identified_bus_routes)
filtered_ridership_df = ridership_df[is_identified_route].copy()

# Flatten the three-level header into 'month|direction|metric' names (blank levels are skipped)
filtered_ridership_df.columns = ['|'.join(filter(None, col)).strip() for col in filtered_ridership_df.columns.values]
selected_columns = [
    'ServiceNo',
    'Threshold',
    EXCEED_COL,
    '2024-09|Direction_1|StopSeqs',
    '2024-09|Direction_2|%_Exceed',
    '2024-09|Direction_2|StopSeqs',
]
filtered_ridership_df = filtered_ridership_df[selected_columns].copy()

# Sort by decreasing %_Exceed (cast first so the sort is numeric) and keep the top routes
filtered_ridership_df[EXCEED_COL] = filtered_ridership_df[EXCEED_COL].astype(float)
filtered_ridership_df = filtered_ridership_df.sort_values(by=EXCEED_COL, ascending=False)
filtered_ridership_df = filtered_ridership_df[:TOP_N_LOW_RIDERSHIP]
filtered_ridership_df

Unnamed: 0,ServiceNo,Threshold,2024-09|Direction_1|%_Exceed,2024-09|Direction_1|StopSeqs,2024-09|Direction_2|%_Exceed,2024-09|Direction_2|StopSeqs
287,83T,8.0,93.75,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...",0.0,[]
157,198A,8.0,92.31,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]",0.0,[]
132,181M,8.0,88.89,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 1...",0.0,[]
388,98A,8.0,81.82,"[2, 4, 5, 6, 7, 8, 9, 10, 11]",0.0,[]
85,154B,8.0,75.0,"[2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 19, 20...",0.0,[]
20,112A,8.0,75.0,"[2, 3, 4, 7, 8, 9, 10, 11, 12]",0.0,[]
90,158A,8.0,74.07,"[2, 3, 4, 5, 6, 7, 8, 9, 13, 15, 16, 17, 20, 2...",0.0,[]
376,975A,8.0,71.43,"[2, 3, 5, 6, 7]",0.0,[]
84,154A,8.0,55.56,"[2, 5, 6, 8, 9]",0.0,[]
390,98M,8.0,39.58,"[3, 5, 6, 8, 9, 11, 13, 14, 15, 27, 33, 35, 36...",0.0,[]


Note: It turns out that most of the bus routes identified by geospatial analysis happen to service only 1 direction.

In [91]:
# Attach the geospatial overlap metrics to each shortlisted low-ridership route,
# discarding the Direction_2 columns, the duplicate key and the threshold.
unused_columns = [
    '2024-09|Direction_2|%_Exceed',
    '2024-09|Direction_2|StopSeqs',
    'Bus_ServiceNo',
    'Threshold',
]
merged_df = (
    filtered_ridership_df
    .merge(identified_buses_df, left_on='ServiceNo', right_on='Bus_ServiceNo')
    .drop(columns=unused_columns)
)

# Reorder columns: identifiers and headline scores first, supporting metrics after
display_columns = [
    'ServiceNo',
    'MRT_Line',
    'Weighted_Average_Score',
    'Bus_Route_Length_m',
    'Overlap_Length_m',
    '2024-09|Direction_1|%_Exceed',
    '2024-09|Direction_1|StopSeqs',
    'Coverage_Percentage',
    'Consecutive_Coverage_Percentage',
    'Max_Consecutive_Segments',
    'Weighted_Average_Angle',
]
merged_df = merged_df[display_columns]
merged_df

Unnamed: 0,ServiceNo,MRT_Line,Weighted_Average_Score,Bus_Route_Length_m,Overlap_Length_m,2024-09|Direction_1|%_Exceed,2024-09|Direction_1|StopSeqs,Coverage_Percentage,Consecutive_Coverage_Percentage,Max_Consecutive_Segments,Weighted_Average_Angle
0,83T,Cross Island,28.424242,5486.421431,4719.286876,93.75,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...",6.060606,50.0,3.0,30.0
1,198A,Jurong Region,28.726667,4147.384256,4147.384256,92.31,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]",6.666667,40.0,2.0,50.3
2,181M,Jurong Region,42.68,5683.547588,5683.547588,88.89,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 1...",6.666667,83.333333,5.0,33.4
3,98A,Jurong Region,51.426667,2960.207397,2960.207397,81.82,"[2, 4, 5, 6, 7, 8, 9, 10, 11]",6.666667,100.0,3.0,43.8
4,154B,Jurong Region,27.428571,13376.866259,7814.496218,75.0,"[2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 19, 20...",12.0,28.571429,4.0,56.0
5,112A,Cross Island,29.212121,3991.852192,2315.355008,75.0,"[2, 3, 4, 7, 8, 9, 10, 11, 12]",3.030303,50.0,2.0,40.0
6,158A,Thomson East Coast,24.965263,9716.0712,4650.733102,74.07,"[2, 3, 4, 5, 6, 7, 8, 9, 13, 15, 16, 17, 20, 2...",5.263158,40.0,4.0,34.3
7,975A,Jurong Region,50.093333,1555.25883,1292.371822,71.43,"[2, 3, 5, 6, 7]",1.333333,100.0,2.0,47.8
8,154A,Jurong Region,50.686667,2451.852352,2451.852352,55.56,"[2, 5, 6, 8, 9]",6.666667,100.0,3.0,40.1
9,98M,Jurong Region,28.106667,15371.17417,15023.1655,39.58,"[3, 5, 6, 8, 9, 11, 13, 14, 15, 27, 33, 35, 36...",10.666667,37.5,6.0,44.2


## 3. Plot the bus routes and stations for visual inspection
To choose the routes for our final recommendations, we plot these routes along with the relevant MRT lines for visual inspection and a sanity check.  

From the previous table, it looks like bus 98A might be a good candidate - it has a high weighted average score and a high number of stops with low ridership. It also has high consecutive coverage by MRT stations.

In [94]:
import geopandas as gpd

# Bus stop locations, the stop sequence of every bus service, and rail station
# geometries - the layers used for the map overlays in this section
bus_stops_df = gpd.read_file('data/cleaned/BusStops.geojson')
bus_routes_df = pd.read_json('data/cleaned/BusRoutes.json', lines=True)
rail_stations_df = gpd.read_file('data/cleaned/RailStationsMerged.geojson')

In [97]:
# Preview the bus stop layer (stop code, description and geometry)
bus_stops_df.head()

Unnamed: 0,BUS_STOP_N,BUS_ROOF_N,LOC_DESC,geometry
0,65059,B12,ST ANNE'S CH,POINT (103.9013 1.39303)
1,16171,B06,YUSOF ISHAK HSE,POINT (103.77437 1.29892)
2,61101,NIL,BLK 120,POINT (103.8637 1.33564)
3,1239,B01,SULTAN PLAZA,POINT (103.86165 1.30285)
4,17269,B01,BLK 730,POINT (103.76264 1.30492)


In [98]:
# Preview the per-service stop sequences (one row per service, direction and stop)
bus_routes_df.head()

Unnamed: 0,ServiceNo,Operator,Direction,StopSequence,BusStopCode,Distance,WD_FirstBus,WD_LastBus,SAT_FirstBus,SAT_LastBus,SUN_FirstBus,SUN_LastBus
0,10,SBST,1,1,75009,0.0,500,2300,500,2300,500,2300
1,10,SBST,1,2,76059,0.6,502,2302,502,2302,502,2302
2,10,SBST,1,3,76069,1.1,504,2304,504,2304,503,2304
3,10,SBST,1,4,96289,2.3,508,2308,508,2309,507,2308
4,10,SBST,1,5,96109,2.7,509,2310,509,2311,508,2309


In [99]:
# Preview the rail station layer (station code, line and geometry)
rail_stations_df.head()

Unnamed: 0,Name,Description,StationType,StationName,StationCode,StationLine,geometry
0,kml_105,<center><table><tr><th colspan='2' align='cent...,MRT,Admiralty,NS10,North-South,"POLYGON Z ((103.80013 1.44004 0, 103.80003 1.4..."
1,kml_154,<center><table><tr><th colspan='2' align='cent...,MRT,Aljunied,EW9,East-West,"POLYGON Z ((103.88373 1.31643 0, 103.88374 1.3..."
2,kml_1,<center><table><tr><th colspan='2' align='cent...,MRT,Ang Mo Kio,NS16,North-South,"POLYGON Z ((103.84988 1.36925 0, 103.84976 1.3..."
3,kml_163,<center><table><tr><th colspan='2' align='cent...,MRT,Aviation Park,CR2,Cross Island,"POLYGON Z ((104.0021 1.37095 0, 104.00237 1.37..."
4,kml_243,<center><table><tr><th colspan='2' align='cent...,MRT,Bahar Junction,JS7,Jurong Region,"POLYGON Z ((103.70444 1.34699 0, 103.70445 1.3..."


In [165]:
# List the line names present in StationLine - these are the values to filter stations by when plotting
rail_stations_df.StationLine.unique()

array(['North-South', 'East-West', 'Cross Island', 'Jurong Region',
       'Sengkang LRT', 'Bukit Panjang LRT', 'Circle', 'Downtown',
       'Thomson East Coast', 'North-East', 'Punggol LRT'], dtype=object)

In [145]:
# Station-code prefixes and the marker colour used for each, checked in order.
_LINE_COLOR_BY_PREFIX = (
    (('NS',), 'lightred'),
    (('EW', 'CG'), 'green'),
    (('NE',), 'purple'),
    (('CC',), 'orange'),
    (('DT',), 'blue'),
    (('TE',), 'darkred'),
    (('J',), 'lightgreen'),
)

def get_line_color(stn_code):
    """Return the marker colour name for a station code.

    The colour is chosen from the code's prefix (e.g. 'NS10' -> 'lightred').
    Missing codes (None / NaN) and codes with an unlisted prefix return 'gray'.
    """
    if pd.isna(stn_code):
        return 'gray'  # no station code to classify

    for prefixes, color in _LINE_COLOR_BY_PREFIX:
        if stn_code.startswith(prefixes):
            return color

    return 'gray'  # prefix not in the table

In [179]:
import folium

# Map centre as [latitude, longitude]
PLOT_LOCATION = [1.3521, 103.8198]

def plot_bus_route_and_mrt_lines(bus_route: str, mrt_lines: list[str]):
    """Plot the stops of one bus service together with the stations of the given rail lines.

    Reads the notebook-level frames `bus_routes_df`, `bus_stops_df` and
    `rail_stations_df`. Bus stops are drawn as black markers; stations are
    coloured by `get_line_color` of their station code. Stations without a
    usable geometry are reported on stdout and skipped.

    Args:
        bus_route: Value matched against `bus_routes_df['ServiceNo']`.
        mrt_lines: Values matched against `rail_stations_df['StationLine']`.

    Returns:
        The folium.Map holding all markers.
    """
    chosen_bus_stops = bus_routes_df[bus_routes_df['ServiceNo'] == bus_route]['BusStopCode']
    chosen_bus_stops_geo = bus_stops_df[bus_stops_df['BUS_STOP_N'].isin(chosen_bus_stops)]
    mrt_stations_geo = rail_stations_df[rail_stations_df['StationLine'].isin(mrt_lines)]

    m = folium.Map(location=PLOT_LOCATION, zoom_start=12)

    print(f"Number of bus stops: {len(chosen_bus_stops_geo)}")
    for _, stop in chosen_bus_stops_geo.iterrows():
        folium.Marker(
            [stop['geometry'].y, stop['geometry'].x],
            tooltip=f"Name: {stop['LOC_DESC']}",
            icon=folium.Icon(color='black')
        ).add_to(m)

    print(f"Number of MRT stations: {len(mrt_stations_geo)}")
    for _, station in mrt_stations_geo.iterrows():
        geometry = station['geometry']
        if geometry is None or geometry.is_empty:
            # Skip the marker entirely: there is no position to draw, and falling
            # through would reuse the previous station's location (or fail on the
            # first iteration, where no location exists yet).
            print(f"Station {station['StationCode']} has no geometry")
            continue

        # Points are used as-is; any other geometry is reduced to its centroid.
        point = geometry if geometry.geom_type == 'Point' else geometry.centroid
        folium.Marker(
            [point.y, point.x],
            tooltip=f"Station: {station['StationCode']} {station['StationName']}",
            icon=folium.Icon(color=get_line_color(station['StationCode']))
        ).add_to(m)

    return m

### Bus 98A
This one looks very suspect.

In [180]:
# 98A against Jurong Region stations (its matched line), with East-West and North-South stations alongside
plot_bus_route_and_mrt_lines("98A", ["Jurong Region", "East-West", "North-South"])

Number of bus stops: 11
Number of MRT stations: 105


### Bus 98M
This one looks less suspect, but I think the northern part is still not very suitable - it is blocked by the PIE (Pan Island Expressway).

In [190]:
# 98M against Jurong Region stations (its matched line), with East-West and North-South stations alongside
plot_bus_route_and_mrt_lines("98M", ["Jurong Region", "East-West", "North-South"])

Number of bus stops: 47
Number of MRT stations: 105


### Bus 181M
I think this is a good candidate; I used to live in the area, and the result matches my experience.

In [181]:
# 181M against Jurong Region stations (its matched line), with East-West stations alongside
plot_bus_route_and_mrt_lines("181M", ["Jurong Region", "East-West"])

Number of bus stops: 17
Number of MRT stations: 66


### Bus 975A
Seems viable 

In [182]:
# 975A against Jurong Region stations (its matched line), with East-West and North-South stations alongside
plot_bus_route_and_mrt_lines("975A", ["Jurong Region", "East-West", "North-South"])

Number of bus stops: 7
Number of MRT stations: 105


### Bus 83T
This one doesn't seem right: the geospatial data says it is close to the Cross Island line, but it isn't. It is close to the Punggol LRT, though.

In [184]:
# Look up every overlap record for 83T in the trunk-only geospatial table
geospatial_analysis_df[geospatial_analysis_df['Bus_ServiceNo'] == '83T']

Unnamed: 0,MRT_Line,Bus_ServiceNo,Bus_Route_Length_m,Overlap_Length_m,Coverage_Percentage,Consecutive_Coverage_Percentage,Max_Consecutive_Segments,Weighted_Average_Angle,Weighted_Average_Score
37,Cross Island,83T,5486.421431,4719.286876,6.060606,50.0,3.0,30.0,28.424242


In [183]:
# 83T against Cross Island stations (its matched line), with North-East and Punggol LRT stations alongside
plot_bus_route_and_mrt_lines("83T", ["Cross Island", "North-East", "Punggol LRT"])

Number of bus stops: 16
Number of MRT stations: 55
Station PW1 has no geometry


### Bus 154A
This has the same issue as 98A and 98M

In [186]:
# 154A against Jurong Region stations (its matched line), with East-West stations alongside
plot_bus_route_and_mrt_lines("154A", ["East-West", "Jurong Region"])

Number of bus stops: 9
Number of MRT stations: 66


### Bus 158A
I think a case can be made for this one on the basis of low ridership.

In [193]:
# Pull 158A's row from the merged shortlist to review its score and %_Exceed
merged_df[merged_df['ServiceNo'] == '158A']

Unnamed: 0,ServiceNo,MRT_Line,Weighted_Average_Score,Bus_Route_Length_m,Overlap_Length_m,2024-09|Direction_1|%_Exceed,2024-09|Direction_1|StopSeqs,Coverage_Percentage,Consecutive_Coverage_Percentage,Max_Consecutive_Segments,Weighted_Average_Angle
6,158A,Thomson East Coast,24.965263,9716.0712,4650.733102,74.07,"[2, 3, 4, 5, 6, 7, 8, 9, 13, 15, 16, 17, 20, 2...",5.263158,40.0,4.0,34.3


In [189]:
# 158A against Thomson East Coast stations (its matched line), with East-West and North-East stations alongside
plot_bus_route_and_mrt_lines("158A", ["Thomson East Coast", "East-West", "North-East"])

Number of bus stops: 27
Number of MRT stations: 108


### Bus 112A
I'm not sure about this one

In [192]:
# 112A against Cross Island stations (its matched line), with North-East and Sengkang LRT stations alongside
plot_bus_route_and_mrt_lines("112A", ["Cross Island", "North-East", "Sengkang LRT"])

Number of bus stops: 12
Number of MRT stations: 54
