## Attempting to cluster bus stops to the nearest train station using hierarchical clustering.

In [9]:
import pandas as pd
import geopandas as gpd
import numpy as np
import torch
from sklearn.cluster import AgglomerativeClustering
from shapely.geometry import Point

In [3]:
# Read Train data
RailStations = gpd.read_file('../data/cleaned/RailStationsMerged.geojson')
RailStations.head()

Unnamed: 0,Name,Description,StationType,StationName,StationCode,StationLine,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,MRT,Ang Mo Kio,NS16,North-South,"POLYGON Z ((103.84988 1.36925 0, 103.84976 1.3..."
1,kml_2,<center><table><tr><th colspan='2' align='cent...,MRT,Buangkok,NE15,North-East,"POLYGON Z ((103.89304 1.38166 0, 103.89283 1.3..."
2,kml_3,<center><table><tr><th colspan='2' align='cent...,LRT,Bakau,SE3,Sengkang LRT,"POLYGON Z ((103.90538 1.38786 0, 103.90529 1.3..."
3,kml_4,<center><table><tr><th colspan='2' align='cent...,LRT,Riviera,PE4,Punggol LRT,"POLYGON Z ((103.916 1.39444 0, 103.91634 1.394..."
4,kml_5,<center><table><tr><th colspan='2' align='cent...,LRT,Fernvale,SW5,Sengkang LRT,"POLYGON Z ((103.8765 1.39148 0, 103.87648 1.39..."


In [4]:
# Read Bus data
BusStops = gpd.read_file('../data/cleaned/BusStops.geojson')
BusStops.head()

Unnamed: 0,BUS_STOP_N,BUS_ROOF_N,LOC_DESC,geometry
0,65059,B12,ST ANNE'S CH,POINT (103.9013 1.39303)
1,16171,B06,YUSOF ISHAK HSE,POINT (103.77437 1.29892)
2,61101,NIL,BLK 120,POINT (103.8637 1.33564)
3,1239,B01,SULTAN PLAZA,POINT (103.86165 1.30285)
4,17269,B01,BLK 730,POINT (103.76264 1.30492)


In [5]:
# First we need to reproject both GeoDataFrames
rail_stations = RailStations.to_crs(epsg=3857)
bus_stops = BusStops.to_crs(epsg=3857)
print(rail_stations.head())
print(bus_stops.head())

    Name                                        Description StationType  \
0  kml_1  <center><table><tr><th colspan='2' align='cent...         MRT   
1  kml_2  <center><table><tr><th colspan='2' align='cent...         MRT   
2  kml_3  <center><table><tr><th colspan='2' align='cent...         LRT   
3  kml_4  <center><table><tr><th colspan='2' align='cent...         LRT   
4  kml_5  <center><table><tr><th colspan='2' align='cent...         LRT   

  StationName StationCode   StationLine  \
0  Ang Mo Kio        NS16   North-South   
1    Buangkok        NE15    North-East   
2       Bakau         SE3  Sengkang LRT   
3     Riviera         PE4   Punggol LRT   
4    Fernvale         SW5  Sengkang LRT   

                                            geometry  
0  POLYGON Z ((11560516.161 152438.322 0, 1156050...  
1  POLYGON Z ((11565320.289 153820.46 0, 11565296...  
2  POLYGON Z ((11566694.327 154511.376 0, 1156668...  
3  POLYGON Z ((11567875.886 155243.313 0, 1156791...  
4  POLYGON Z ((

In [19]:
# Extract coordinates from the 'geometry' column for both datasets
bus_stops['coords'] = bus_stops['geometry'].apply(lambda x: (x.x, x.y))
rail_stations['coords'] = rail_stations['geometry'].apply(lambda poly: poly.centroid.coords[0])

# Convert coordinates to DataFrames for easier clustering
bus_stop_coords = pd.DataFrame(bus_stops['coords'].tolist(), columns=['x', 'y'])
rail_station_coords = pd.DataFrame(rail_stations['coords'].tolist(), columns=['x', 'y'])

# Combine all coordinates (bus stops + train stations) for clustering
all_coords = pd.concat([bus_stop_coords, rail_station_coords], ignore_index=True)

# Perform Hierarchical Agglomerative Clustering
# n_clusters = number of train stations (each bus stop will cluster with a nearby station)
clustering = AgglomerativeClustering(n_clusters=len(rail_stations), linkage='single')
labels = clustering.fit_predict(all_coords)

# Assign cluster labels to the original bus stops
bus_stops['nearest_station_cluster'] = labels[:len(bus_stops)]

# Map each bus stop to its nearest train station using cluster labels
clustered_stops = bus_stops.merge(
    rail_stations.assign(cluster=range(len(rail_stations))),
    left_on='nearest_station_cluster', right_on='cluster', suffixes=('_bus', '_station')
)

# Select relevant columns for the final DataFrame
result = clustered_stops[['BUS_STOP_N', 'LOC_DESC', 'StationName', 'StationCode', 'StationLine']]

# Display the result
print(result.head())

   BUS_STOP_N         LOC_DESC StationName StationCode   StationLine
0       65059     ST ANNE'S CH        Defu         nan  Unknown Line
1       16171  YUSOF ISHAK HSE   Chinatown         NE4    North-East
2       61101          BLK 120        Defu         nan  Unknown Line
3        1239     SULTAN PLAZA        Defu         nan  Unknown Line
4       17269          BLK 730   Chinatown         NE4    North-East


In [21]:
# Step 2: Perform Hierarchical Clustering on Train Stations
x_distance_threshold = 0.005  # Adjust this threshold as needed
clustering = AgglomerativeClustering(
    n_clusters=None,  # Let the clustering determine the number of clusters
    distance_threshold=x_distance_threshold,  # Set the threshold for clustering
    linkage='single'
)

# Fit the model and assign cluster labels to train stations
rail_stations['station_cluster'] = clustering.fit_predict(rail_station_coords)

# Display clusters of train stations
print("Clusters of Train Stations:")
print(rail_stations[['StationName', 'station_cluster']].sort_values(by='station_cluster'))

Clusters of Train Stations:
         StationName  station_cluster
88         Woodlands                0
89         Woodlands                0
22       Jurong East                1
23       Jurong East                1
5            Punggol                2
..               ...              ...
83              Hume              216
47         Boon Keng              217
118        Bugis Mrt              218
130      Farrer Park              219
126  Punggol Central              220

[253 rows x 2 columns]
