In [19]:
import geopandas as gpd
import pandas as pd
import re
import json

### Clean Planning Area

In [None]:
PlanningArea: gpd.GeoDataFrame = gpd.read_file('raw/MasterPlan2019PlanningAreaBoundary.geojson')
PlanningArea.head()

Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.93208 1.30555 0, 103.93208 1.3..."
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.72042 1.32824 0, 103.72003 1.3..."
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.76408 1.37001 0, 103.76444 1.3..."
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.82361 1.26018 0, 103.82362 1.2..."
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.77445 1.39029 0, 103.77499 1.3..."


In [5]:
# def extract_fri
html = PlanningArea["Description"][0]
def extract_region_and_planning_area(html: str) -> (str, str):
    values = re.findall(r"<td>(.*?)</td>", html)

    return {
        "Region": values[0],
        "Planning Area": values[3]
    }

extract_region_and_planning_area(html)

{'Region': 'BEDOK', 'Planning Area': 'EAST REGION'}

In [6]:
PlanningArea["Region"] = PlanningArea["Description"].apply(lambda x: extract_region_and_planning_area(x)["Region"])
PlanningArea["Planning Area"] = PlanningArea["Description"].apply(lambda x: extract_region_and_planning_area(x)["Planning Area"])
PlanningArea.head()

Unnamed: 0,Name,Description,geometry,Region,Planning Area
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.93208 1.30555 0, 103.93208 1.3...",BEDOK,EAST REGION
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.72042 1.32824 0, 103.72003 1.3...",BOON LAY,WEST REGION
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.76408 1.37001 0, 103.76444 1.3...",BUKIT BATOK,WEST REGION
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.82361 1.26018 0, 103.82362 1.2...",BUKIT MERAH,CENTRAL REGION
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.77445 1.39029 0, 103.77499 1.3...",BUKIT PANJANG,WEST REGION


In [7]:
PlanningArea.to_file('cleaned/PlanningArea.geojson', driver='GeoJSON')

### Clean BusStops

In [52]:
# Load the shapefile into a GeoDataFrame
BusStops: gpd.GeoDataFrame = gpd.read_file('raw/BusStopLocation_Jul2024/BusStop.shp')

# Ensure the GeoDataFrame is in the correct CRS (coordinate reference system)
# The .prj file indicates that the data is in SVY21, which is EPSG:3414
BusStops: gpd.GeoDataFrame = BusStops.to_crs(epsg=4326)  # Convert to WGS 84 (EPSG:4326) for Folium
BusStops['BUS_STOP_N'] = BusStops['BUS_STOP_N'].fillna(-1)
BusStops['BUS_STOP_N'] = BusStops['BUS_STOP_N'].replace("UNK", -1)
BusStops['BUS_STOP_N'] = BusStops['BUS_STOP_N'].astype(int)
BusStops.head()

Unnamed: 0,BUS_STOP_N,BUS_ROOF_N,LOC_DESC,geometry
0,65059,B12,ST ANNE'S CH,POINT (103.9013 1.39303)
1,16171,B06,YUSOF ISHAK HSE,POINT (103.77437 1.29892)
2,61101,NIL,BLK 120,POINT (103.8637 1.33564)
3,1239,B01,SULTAN PLAZA,POINT (103.86165 1.30285)
4,17269,B01,BLK 730,POINT (103.76264 1.30492)


In [53]:
BusStops.to_file('cleaned/BusStops.geojson', driver='GeoJSON')
dtypes = BusStops.dtypes.apply(lambda x: x.name).to_dict()
with open('cleaned/dtypes/BusStops.json', 'w') as f:
    json.dump(dtypes, f)

### Clean RailStations and RailLines

Commented out below was using the mrt station data from LTA itself, which had many issues. Use the URA data from data.gov instead

In [121]:
# def fix_invalid_geometries(gdf):
#     gdf['geometry'] = gdf['geometry'].apply(lambda geom: geom.buffer(0) if not geom.is_valid else geom)
#     return gdf

# import pyogrio

# # dataset has a problem and this solves it
# pyogrio.set_gdal_config_options({"OGR_GEOMETRY_ACCEPT_UNCLOSED_RING": "OFF"})

# Stations: gpd.GeoDataFrame = gpd.read_file('../data/TrainStation_Jul2024/RapidTransitSystemStation.shp')
# Stations = fix_invalid_geometries(Stations)
# Stations.to_crs(epsg=4326, inplace=True)

# # getting the area of the stations reveals that some a unusaly big
# Stations['area'] = Stations['geometry'].to_crs(epsg=3857).area

# # get the fake stations
# Stations["is_mrt_or_lrt"] = Stations['STN_NAM_DE'].str.contains(r"MRT STATION|LRT STATION", regex=True, case=False, na=False)
# Stations.where(Stations["is_mrt_or_lrt"] == False).dropna(subset=["is_mrt_or_lrt"])[["STN_NAM_DE", "area"]]

# Stations = Stations.where(Stations["is_mrt_or_lrt"] == True).dropna(subset=["is_mrt_or_lrt"])

In [4]:
RailStations = gpd.read_file('raw/MasterPlan2019RailStationLayer.geojson')
RailStations.head()

Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.84988 1.36925 0, 103.84976 1.3..."
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.89304 1.38166 0, 103.89283 1.3..."
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.90538 1.38786 0, 103.90529 1.3..."
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.916 1.39444 0, 103.91634 1.394..."
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.8765 1.39148 0, 103.87648 1.39..."


In [5]:
RailLines = gpd.read_file('raw/MasterPlan2019RailLineLayer.geojson')
RailLines.head()

Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.7365 1.35301 0, 103.73649 1...."
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.73708 1.35164 0, 103.73768 1..."
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.71328 1.35252 0, 103.71327 1..."
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.7132 1.35265 0, 103.71318 1...."
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.70767 1.34888 0, 103.70767 1..."


In [6]:
import re

html = RailStations['Description'][0]
# todo: extract info from description
def extract_data_from_description_stations(description: str):

    values = re.findall(r"<td>(.*?)</td>", description)
    
    return {
        "StationType": values[1],
        "StationName": values[2],
    }

extract_data_from_description_stations(html)

{'StationType': 'MRT', 'StationName': 'ANG MO KIO INTERCHANGE'}

In [7]:
RailStations_new = RailStations.apply(lambda x: pd.Series(extract_data_from_description_stations(x['Description'])), axis=1)
RailStations = pd.concat([RailStations, RailStations_new], axis=1) 

In [8]:
RailStations.to_file('cleaned/RailStations.geojson', driver='GeoJSON')

In [9]:
def extract_grd_level_and_rail_type(description: str):
    values = re.findall(r"<td>(.*?)</td>", description)
    
    return {
        "GroundLevel": values[0],
        "RailType": values[1],
    }

In [10]:
RailLines_new = RailLines.apply(lambda x: pd.Series(extract_grd_level_and_rail_type(x['Description'])), axis=1)
RailLines = pd.concat([RailLines, RailLines_new], axis=1)
RailLines.head()

Unnamed: 0,Name,Description,geometry,GroundLevel,RailType
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.7365 1.35301 0, 103.73649 1....",ABOVEGROUND,MRT
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.73708 1.35164 0, 103.73768 1...",ABOVEGROUND,MRT
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.71328 1.35252 0, 103.71327 1...",ABOVEGROUND,MRT
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.7132 1.35265 0, 103.71318 1....",ABOVEGROUND,MRT
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"LINESTRING Z (103.70767 1.34888 0, 103.70767 1...",ABOVEGROUND,MRT


In [11]:
RailLines.to_file('cleaned/RailLines.geojson', driver='GeoJSON')

In [None]:
# def get_train_service_lines(stn_no):
#     mapping = {
#         'NS': 'North-South',
#         'EW': 'East-West',
#         'CG': 'East-West',
#         'NE': 'North-East',
#         'CC': 'Circle',
#         'CE': 'Circle',
#         'DT': 'Downtown',
#         'TE': 'Thomson East Coast',
#         'BP': 'Bukit Panjang LRT',
#         'ST': 'Sengkang LRT', # STC is Sengkang LRT
#         'SE': 'Sengkang LRT',
#         'SW': 'Sengkang LRT',
#         'PW': 'Punggol LRT',
#         'PE': 'Punggol LRT',
#         'PT': 'Punggol LRT',  # PTC is Punggol LRT
#     }
#     line = mapping.get(stn_no[:2], 'Unknown')
#     if line == "Unknown":
#         print(f"Unknown train service line for station number: {stn_no}")
#     return line

# Stations['STN_LINE'] = Stations['STN_NO'].map(get_train_service_lines)
# Stations.head()

### Clean BusRoutes

In [55]:
BusRoutes = pd.read_json('raw/BusRoutes.json')
BusRoutes['BusStopCode'] = BusRoutes['BusStopCode'].astype(int)
BusRoutes.head()
BusRoutes.to_json('cleaned/BusRoutes.json', orient='records', lines=True)
dtypes = BusRoutes.dtypes.apply(lambda x: x.name).to_dict()
with open('cleaned/dtypes/BusRoutes.json', 'w') as f:
    json.dump(dtypes, f)

In [56]:
pd.read_json('cleaned/BusRoutes.json', lines=True).head()

Unnamed: 0,ServiceNo,Operator,Direction,StopSequence,BusStopCode,Distance,WD_FirstBus,WD_LastBus,SAT_FirstBus,SAT_LastBus,SUN_FirstBus,SUN_LastBus
0,10,SBST,1,1,75009,0.0,500,2300,500,2300,500,2300
1,10,SBST,1,2,76059,0.6,502,2302,502,2302,502,2302
2,10,SBST,1,3,76069,1.1,504,2304,504,2304,503,2304
3,10,SBST,1,4,96289,2.3,508,2308,508,2309,507,2308
4,10,SBST,1,5,96109,2.7,509,2310,509,2311,508,2309


### Clean Train Station

In [12]:
excel_file_path = '../data/Train Station Codes and Chinese Names.xls'
train_station_data = pd.read_excel(excel_file_path)

# Drop the Chinese names column
train_station_data = train_station_data.drop(columns=['mrt_station_chinese', 'mrt_line_chinese'])

# Normalize the station names in both datasets to lowercase for joining
train_station_data['StationName'] = train_station_data['mrt_station_english'].str.upper()

# Merge station codes if a station has multiple entries (e.g., NS17/CC15)
train_station_data = train_station_data.groupby('StationName')['stn_code'].apply(lambda x: '/'.join(x)).reset_index()

# Normalize the station names to lowercase for both DataFrames for easier merging
train_station_data['StationName'] = train_station_data['StationName'].str.lower().replace(" station", "").str.replace(" interchange", "").str.strip()
train_station_data.head()

Unnamed: 0,StationName,stn_code
0,admiralty,NS10
1,aljunied,EW9
2,ang mo kio,NS16
3,bakau,SE3
4,bangkit,BP9


In [28]:
def get_train_service_lines(stn_code):
    mapping = {
        'NS': 'North-South',
        'EW': 'East-West',
        'CG': 'East-West',
        'NE': 'North-East',
        'CC': 'Circle',
        'CE': 'Circle',
        'DT': 'Downtown',
        'TE': 'Thomson East Coast',
        'BP': 'Bukit Panjang LRT',
        'ST': 'Sengkang LRT', # STC is Sengkang LRT
        'SE': 'Sengkang LRT',
        'SW': 'Sengkang LRT',
        'PW': 'Punggol LRT',
        'PE': 'Punggol LRT',
        'PT': 'Punggol LRT',  # PTC is Punggol LRT
    }
    # Split the station code if it contains multiple lines (e.g., NS1/EW24)
    lines = stn_code.split('/')
    
    # Look up the full name for each line prefix and collect the results
    full_lines = [mapping.get(line[:2], 'Unknown Line') for line in lines]
    
    # Return the list of full train service lines (you can return a list or a string)
    return full_lines

In [39]:
# Load the GeoDataFrame
RailStations = gpd.read_file('cleaned/RailStations.geojson')
RailStations['StationName'] = RailStations['Description'].apply(lambda desc: re.sub(r"\b(STATION|INTERCHANGE)\b", "", re.search(r'<th>NAME</th>\s*<td>(.*?)</td>', desc).group(1)).strip().lower())
# Remove duplicates based on the 'StationName'
RailStations = RailStations.drop_duplicates(subset=['StationName'], keep='first')

# Merge the train_station_data with the GeoDataFrame on the StationName column
merged_data = pd.merge(RailStations, train_station_data, on='StationName', how='left')

# Capitalize the first letter of each StationName
merged_data['StationName'] = merged_data['StationName'].str.title()

# Rename the 'stn_code' column to 'StationCode'
merged_data.rename(columns={'stn_code': 'StationCode'}, inplace=True)

# Convert StationCode to string type
merged_data['StationCode'] = merged_data['StationCode'].astype(str)

# Add StationLine and ensure we get lines as lists
merged_data['StationLine'] = merged_data['StationCode'].apply(get_train_service_lines)

# Combine StationCode and StationLine into tuples and then explode together
merged_data['StationLineCode'] = merged_data.apply(lambda row: list(zip(row['StationCode'].split('/'), row['StationLine'])), axis=1)

# Explode the combined StationLineCode column to separate the codes and lines while keeping them in sync
expanded_data = merged_data.explode('StationLineCode')

# Now split the tuples back into separate columns
expanded_data['StationCode'] = expanded_data['StationLineCode'].apply(lambda x: x[0])
expanded_data['StationLine'] = expanded_data['StationLineCode'].apply(lambda x: x[1])

# Drop the intermediate column 'StationLineCode'
expanded_data.drop(columns=['StationLineCode'], inplace=True)

# Reset the index after exploding
expanded_data.reset_index(drop=True, inplace=True)

# Output the expanded dataset
expanded_data.head(10)

Unnamed: 0,Name,Description,StationType,StationName,geometry,StationCode,StationLine
0,kml_1,<center><table><tr><th colspan='2' align='cent...,MRT,Ang Mo Kio,"POLYGON Z ((103.84988 1.36925 0, 103.84976 1.3...",NS16,North-South
1,kml_2,<center><table><tr><th colspan='2' align='cent...,MRT,Buangkok,"POLYGON Z ((103.89304 1.38166 0, 103.89283 1.3...",NE15,North-East
2,kml_3,<center><table><tr><th colspan='2' align='cent...,LRT,Bakau,"POLYGON Z ((103.90538 1.38786 0, 103.90529 1.3...",SE3,Sengkang LRT
3,kml_4,<center><table><tr><th colspan='2' align='cent...,LRT,Riviera,"POLYGON Z ((103.916 1.39444 0, 103.91634 1.394...",PE4,Punggol LRT
4,kml_5,<center><table><tr><th colspan='2' align='cent...,LRT,Fernvale,"POLYGON Z ((103.8765 1.39148 0, 103.87648 1.39...",SW5,Sengkang LRT
5,kml_6,<center><table><tr><th colspan='2' align='cent...,MRT,Punggol,"POLYGON Z ((103.90156 1.40386 0, 103.90172 1.4...",NE17,North-East
6,kml_6,<center><table><tr><th colspan='2' align='cent...,MRT,Punggol,"POLYGON Z ((103.90156 1.40386 0, 103.90172 1.4...",PTC,Punggol LRT
7,kml_7,<center><table><tr><th colspan='2' align='cent...,MRT,Bendemeer,"POLYGON Z ((103.86336 1.31408 0, 103.86347 1.3...",DT23,Downtown
8,kml_8,<center><table><tr><th colspan='2' align='cent...,LRT,Kupang,"POLYGON Z ((103.88164 1.3981 0, 103.8811 1.398...",SW3,Sengkang LRT
9,kml_9,<center><table><tr><th colspan='2' align='cent...,MRT,Jalan Besar,"POLYGON Z ((103.85524 1.30475 0, 103.85523 1.3...",DT22,Downtown


In [40]:
# Save the merged data to a GeoJSON file
expanded_data.to_file('cleaned/RailStationsMerged.geojson', driver='GeoJSON')

In [41]:
expanded_data["StationLine"].sort_values().unique().tolist()

['Bukit Panjang LRT',
 'Circle',
 'Downtown',
 'East-West',
 'North-East',
 'North-South',
 'Punggol LRT',
 'Sengkang LRT',
 'Thomson East Coast',
 'Unknown Line']

## Further cleaning of RailStationsMerged.geojson
Follows from the version saved in previous cell. Data to be replaced is instead stored in RailStationsMergedOld.geojson. 

In [22]:
import geopandas as gpd
import pandas as pd

to_clean = gpd.read_file('cleaned/RailStationsMerged.geojson')
to_clean.head()

Unnamed: 0,Name,Description,StationType,StationName,StationCode,StationLine,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,MRT,Ang Mo Kio,NS16,North-South,"POLYGON Z ((103.84988 1.36925 0, 103.84976 1.3..."
1,kml_2,<center><table><tr><th colspan='2' align='cent...,MRT,Buangkok,NE15,North-East,"POLYGON Z ((103.89304 1.38166 0, 103.89283 1.3..."
2,kml_3,<center><table><tr><th colspan='2' align='cent...,LRT,Bakau,SE3,Sengkang LRT,"POLYGON Z ((103.90538 1.38786 0, 103.90529 1.3..."
3,kml_4,<center><table><tr><th colspan='2' align='cent...,LRT,Riviera,PE4,Punggol LRT,"POLYGON Z ((103.916 1.39444 0, 103.91634 1.394..."
4,kml_5,<center><table><tr><th colspan='2' align='cent...,LRT,Fernvale,SW5,Sengkang LRT,"POLYGON Z ((103.8765 1.39148 0, 103.87648 1.39..."


In [23]:
to_clean[(to_clean["StationLine"] == "Unknown Line") | (to_clean["StationCode"] == "nan")]

Unnamed: 0,Name,Description,StationType,StationName,StationCode,StationLine,geometry
18,kml_16,<center><table><tr><th colspan='2' align='cent...,MRT,Nanyang Crescent,,Unknown Line,"POLYGON Z ((103.68174 1.34911 0, 103.68174 1.3..."
19,kml_17,<center><table><tr><th colspan='2' align='cent...,MRT,Peng Kang Hill,,Unknown Line,"POLYGON Z ((103.67821 1.34443 0, 103.67832 1.3..."
20,kml_18,<center><table><tr><th colspan='2' align='cent...,MRT,Bukit Batok West,,Unknown Line,"POLYGON Z ((103.73888 1.34598 0, 103.73895 1.3..."
21,kml_19,<center><table><tr><th colspan='2' align='cent...,MRT,Toh Guan,,Unknown Line,"POLYGON Z ((103.74279 1.34046 0, 103.7428 1.34..."
24,kml_22,<center><table><tr><th colspan='2' align='cent...,MRT,Jurong Town Hall,,Unknown Line,"POLYGON Z ((103.74522 1.32679 0, 103.74522 1.3..."
25,kml_23,<center><table><tr><th colspan='2' align='cent...,MRT,Founders' Memorial,,Unknown Line,"POLYGON Z ((103.8688 1.29171 0, 103.86881 1.29..."
30,kml_28,<center><table><tr><th colspan='2' align='cent...,MRT,Punggol Coast,,Unknown Line,"POLYGON Z ((103.91087 1.41534 0, 103.91092 1.4..."
31,kml_29,<center><table><tr><th colspan='2' align='cent...,MRT,Xilin,,Unknown Line,"POLYGON Z ((103.96481 1.32952 0, 103.96488 1.3..."
55,kml_53,<center><table><tr><th colspan='2' align='cent...,MRT,Tengah Plantation,,Unknown Line,"POLYGON Z ((103.73309 1.35784 0, 103.73317 1.3..."
56,kml_54,<center><table><tr><th colspan='2' align='cent...,MRT,Tavistock,,Unknown Line,"POLYGON Z ((103.8617 1.37017 0, 103.86207 1.37..."


In [4]:
# export StationName	StationCode	StationLine columns into a csv file
to_clean[["StationName", "StationCode", "StationLine"]].to_csv('temp/RailStationsMerged.csv', index=False)

In [24]:
to_clean.shape

(253, 7)

In [43]:
# Manually defined DataFrame with updated values
manual_updates = pd.read_csv('temp/RailStationsMergedManualUpdate.csv')

# outer join the two dataframes on StationName
merged = pd.merge(to_clean, manual_updates, on='StationName', how='outer')
# Remove rows where StationCode_y is NaN
merged_cleaned = merged.dropna(subset=['StationCode_y'])

# Display the rows where StationLine_x is "Unknown Line" or StationCode_x is "nan"
merged_cleaned[(merged_cleaned["StationLine_x"] == "Unknown Line") | (merged_cleaned["StationCode_x"] == "nan")].head()

Unnamed: 0,Name,Description,StationType,StationName,StationCode_x,StationLine_x,geometry,StationCode_y,StationLine_y
5,kml_163,<center><table><tr><th colspan='2' align='cent...,MRT,Aviation Park,,Unknown Line,"POLYGON Z ((104.0021 1.37095 0, 104.00237 1.37...",CR2,Cross Island
6,kml_243,<center><table><tr><th colspan='2' align='cent...,MRT,Bahar Junction,,Unknown Line,"POLYGON Z ((103.70444 1.34699 0, 103.70445 1.3...",JS7,Jurong Region
19,kml_64,<center><table><tr><th colspan='2' align='cent...,MRT,Bedok South,,Unknown Line,"POLYGON Z ((103.95033 1.31814 0, 103.95037 1.3...",TE30,Thomson East Coast
42,kml_18,<center><table><tr><th colspan='2' align='cent...,MRT,Bukit Batok West,,Unknown Line,"POLYGON Z ((103.73888 1.34598 0, 103.73895 1.3...",JE3,Jurong Region
43,kml_100,<center><table><tr><th colspan='2' align='cent...,MRT,Bukit Brown,,Unknown Line,"POLYGON Z ((103.83052 1.33357 0, 103.83052 1.3...",CC18,Circle


In [44]:
# Replace the StationCode_x and StationLine_x with StationCode_y and StationLine_y using .loc
merged_cleaned.loc[:, 'StationCode'] = merged_cleaned['StationCode_y']
merged_cleaned.loc[:, 'StationLine'] = merged_cleaned['StationLine_y']

# Drop the columns with suffixes _x and _y using .loc
merged_cleaned.drop(columns=['StationCode_x', 'StationLine_x', 'StationCode_y', 'StationLine_y'], inplace=True)

# Save the cleaned data to a GeoJSON file
merged_cleaned.to_file('cleaned/RailStationsMerged.geojson', driver='GeoJSON')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_cleaned.drop(columns=['StationCode_x', 'StationLine_x', 'StationCode_y', 'StationLine_y'], inplace=True)


In [46]:
check_cleaned = gpd.read_file('cleaned/RailStationsMerged.geojson')

In [48]:
check_cleaned[(check_cleaned["StationLine"] == "Unknown Line") | (check_cleaned["StationCode"] == "nan") | (check_cleaned["StationCode"].isna())]

Unnamed: 0,Name,Description,StationType,StationName,StationCode,StationLine,geometry
