In [2]:
import geopandas as gpd
import folium
import pandas as pd
import matplotlib.pyplot as plt
import re

### Clean Planning Area

In [None]:
PlanningArea: gpd.GeoDataFrame = gpd.read_file('raw/MasterPlan2019PlanningAreaBoundary.geojson')
PlanningArea.head()

Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.93208 1.30555 0, 103.93208 1.3..."
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.72042 1.32824 0, 103.72003 1.3..."
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.76408 1.37001 0, 103.76444 1.3..."
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.82361 1.26018 0, 103.82362 1.2..."
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.77445 1.39029 0, 103.77499 1.3..."


In [5]:
# def extract_fri
html = PlanningArea["Description"][0]
def extract_region_and_planning_area(html: str) -> (str, str):
    values = re.findall(r"<td>(.*?)</td>", html)

    return {
        "Region": values[0],
        "Planning Area": values[3]
    }

extract_region_and_planning_area(html)

{'Region': 'BEDOK', 'Planning Area': 'EAST REGION'}

In [6]:
PlanningArea["Region"] = PlanningArea["Description"].apply(lambda x: extract_region_and_planning_area(x)["Region"])
PlanningArea["Planning Area"] = PlanningArea["Description"].apply(lambda x: extract_region_and_planning_area(x)["Planning Area"])
PlanningArea.head()

Unnamed: 0,Name,Description,geometry,Region,Planning Area
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.93208 1.30555 0, 103.93208 1.3...",BEDOK,EAST REGION
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.72042 1.32824 0, 103.72003 1.3...",BOON LAY,WEST REGION
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.76408 1.37001 0, 103.76444 1.3...",BUKIT BATOK,WEST REGION
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.82361 1.26018 0, 103.82362 1.2...",BUKIT MERAH,CENTRAL REGION
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.77445 1.39029 0, 103.77499 1.3...",BUKIT PANJANG,WEST REGION


In [7]:
PlanningArea.to_file('cleaned/PlanningArea.geojson', driver='GeoJSON')

### Clean BusStops

In [9]:
# Load the shapefile into a GeoDataFrame
BusStops: gpd.GeoDataFrame = gpd.read_file('raw/BusStopLocation_Jul2024/BusStop.shp')

# Ensure the GeoDataFrame is in the correct CRS (coordinate reference system)
# The .prj file indicates that the data is in SVY21, which is EPSG:3414
BusStops: gpd.GeoDataFrame = BusStops.to_crs(epsg=4326)  # Convert to WGS 84 (EPSG:4326) for Folium
BusStops.head()

Unnamed: 0,BUS_STOP_N,BUS_ROOF_N,LOC_DESC,geometry
0,65059,B12,ST ANNE'S CH,POINT (103.9013 1.39303)
1,16171,B06,YUSOF ISHAK HSE,POINT (103.77437 1.29892)
2,61101,NIL,BLK 120,POINT (103.8637 1.33564)
3,1239,B01,SULTAN PLAZA,POINT (103.86165 1.30285)
4,17269,B01,BLK 730,POINT (103.76264 1.30492)


In [10]:
BusStops.to_file('cleaned/BusStops.geojson', driver='GeoJSON')

### Clean RailStations and RailLines

Commented out below was using the mrt station data from LTA itself, which had many issues. Use the URA data from data.gov instead

In [121]:
# def fix_invalid_geometries(gdf):
#     gdf['geometry'] = gdf['geometry'].apply(lambda geom: geom.buffer(0) if not geom.is_valid else geom)
#     return gdf

# import pyogrio

# # dataset has a problem and this solves it
# pyogrio.set_gdal_config_options({"OGR_GEOMETRY_ACCEPT_UNCLOSED_RING": "OFF"})

# Stations: gpd.GeoDataFrame = gpd.read_file('../data/TrainStation_Jul2024/RapidTransitSystemStation.shp')
# Stations = fix_invalid_geometries(Stations)
# Stations.to_crs(epsg=4326, inplace=True)

# # getting the area of the stations reveals that some a unusaly big
# Stations['area'] = Stations['geometry'].to_crs(epsg=3857).area

# # get the fake stations
# Stations["is_mrt_or_lrt"] = Stations['STN_NAM_DE'].str.contains(r"MRT STATION|LRT STATION", regex=True, case=False, na=False)
# Stations.where(Stations["is_mrt_or_lrt"] == False).dropna(subset=["is_mrt_or_lrt"])[["STN_NAM_DE", "area"]]

# Stations = Stations.where(Stations["is_mrt_or_lrt"] == True).dropna(subset=["is_mrt_or_lrt"])

In [6]:
RailStations = gpd.read_file('raw/MasterPlan2019RailStationLayer.geojson')
RailStations.head()

Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.84988 1.36925 0, 103.84976 1.3..."
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.89304 1.38166 0, 103.89283 1.3..."
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.90538 1.38786 0, 103.90529 1.3..."
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.916 1.39444 0, 103.91634 1.394..."
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.8765 1.39148 0, 103.87648 1.39..."


In [7]:
RailLines = gpd.read_file('raw/MasterPlan2019RailLineLayer.geojson')
RailLines.head()

Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.7365 1.35301 0, 103.73649 1...."
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.73708 1.35164 0, 103.73768 1..."
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.71328 1.35252 0, 103.71327 1..."
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.7132 1.35265 0, 103.71318 1...."
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.70767 1.34888 0, 103.70767 1..."


In [20]:
import re

html = RailStations['Description'][0]
# todo: extract info from description
def extract_data_from_description_stations(description: str):

    values = re.findall(r"<td>(.*?)</td>", description)
    
    return {
        "StationType": values[1],
        "StationName": values[2],
    }

extract_data_from_description_stations(html)

{'StationType': 'MRT', 'StationName': 'ANG MO KIO INTERCHANGE'}

In [21]:
RailStations_new = RailStations.apply(lambda x: pd.Series(extract_data_from_description_stations(x['Description'])), axis=1)
RailStations = pd.concat([RailStations, RailStations_new], axis=1) 

In [22]:
RailStations.to_file('cleaned/RailStations.geojson', driver='GeoJSON')

In [25]:
def extract_grd_level_and_rail_type(description: str):
    values = re.findall(r"<td>(.*?)</td>", description)
    
    return {
        "GroundLevel": values[0],
        "RailType": values[1],
    }

In [26]:
RailLines_new = RailLines.apply(lambda x: pd.Series(extract_grd_level_and_rail_type(x['Description'])), axis=1)
RailLines = pd.concat([RailLines, RailLines_new], axis=1)
RailLines.head()

Unnamed: 0,Name,Description,geometry,GroundLevel,RailType
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.7365 1.35301 0, 103.73649 1....",ABOVEGROUND,MRT
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.73708 1.35164 0, 103.73768 1...",ABOVEGROUND,MRT
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.71328 1.35252 0, 103.71327 1...",ABOVEGROUND,MRT
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.7132 1.35265 0, 103.71318 1....",ABOVEGROUND,MRT
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.70767 1.34888 0, 103.70767 1...",ABOVEGROUND,MRT


In [27]:
RailLines.to_file('cleaned/RailLines.geojson', driver='GeoJSON')

In [None]:
# def get_train_service_lines(stn_no):
#     mapping = {
#         'NS': 'North-South',
#         'EW': 'East-West',
#         'CG': 'East-West',
#         'NE': 'North-East',
#         'CC': 'Circle',
#         'CE': 'Circle',
#         'DT': 'Downtown',
#         'TE': 'Thomson East Coast',
#         'BP': 'Bukit Panjang LRT',
#         'ST': 'Sengkang LRT', # STC is Sengkang LRT
#         'SE': 'Sengkang LRT',
#         'SW': 'Sengkang LRT',
#         'PW': 'Punggol LRT',
#         'PE': 'Punggol LRT',
#         'PT': 'Punggol LRT',  # PTC is Punggol LRT
#     }
#     line = mapping.get(stn_no[:2], 'Unknown')
#     if line == "Unknown":
#         print(f"Unknown train service line for station number: {stn_no}")
#     return line

# Stations['STN_LINE'] = Stations['STN_NO'].map(get_train_service_lines)
# Stations.head()

### Clean BusRoutes

In [5]:
BusRoutes = pd.read_json('raw/BusRoutes.json')
BusRoutes.head()
BusRoutes.to_json('cleaned/BusRoutes.json', orient='records', lines=True)