In [None]:

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np

# Ward polygons (the file name suggests population was attached upstream).
wards = gpd.read_file("../data/processed/wards_with_population.geojson")

# GTFS tables. The paths are listed in the same order as the names they are
# unpacked into, so each CSV is read exactly once, top to bottom.
stops_df, stop_times, trips, routes, fare_rules, fare_attributes = (
    pd.read_csv(gtfs_csv)
    for gtfs_csv in (
        "../data/kaggle/dmrc_gtfs_dataset/stops.csv",
        "../data/kaggle/dmrc_gtfs_dataset/stop_times.csv",
        "../data/kaggle/dmrc_gtfs_dataset/trips.csv",
        "../data/kaggle/dmrc_gtfs_dataset/routes.csv",
        "../data/kaggle/dmrc_gtfs_dataset/fare_rules.csv",
        "../data/kaggle/dmrc_gtfs_dataset/fare_attributes.csv",
    )
)

# Build one point per stop from its lon/lat columns (declared as EPSG:4326),
# then reproject both stops and wards to EPSG:32643 so the two layers share
# a single coordinate reference system.
stop_points = gpd.points_from_xy(stops_df["stop_lon"], stops_df["stop_lat"])
stops_gdf = gpd.GeoDataFrame(
    stops_df, geometry=stop_points, crs="EPSG:4326"
).to_crs(epsg=32643)
wards = wards.to_crs(epsg=32643)

# Attach ward attributes to every stop whose point lies within a ward polygon.
# The inner join discards stops that fall in no ward.
stops_in_wards = gpd.sjoin(stops_gdf, wards, predicate="within", how="inner")

# Number of joined stops per ward.
stop_counts = (
    stops_in_wards.groupby("Ward_Name")
    .size()
    .rename("stop_count")
    .reset_index()
)

# Remove any existing stop_count-like columns first so the merge below cannot
# produce suffixed duplicates.
stale_stop_cols = [name for name in wards.columns if "stop_count" in name]
wards = wards.drop(columns=stale_stop_cols, errors="ignore")
wards = wards.merge(stop_counts, how="left", on="Ward_Name")

# Wards with no stops come back as NaN from the left merge; treat them as 0.
wards["stop_count"] = wards["stop_count"].fillna(0)
wards["stop_density"] = wards["stop_count"].div(wards["area_km2"])

# Resolve every stop_times row to the route of its trip.
stop_routes = pd.merge(
    stop_times, trips[["trip_id", "route_id"]], how="left", on="trip_id"
)

# Distinct (stop, route) pairs.
stop_route_map = stop_routes[["stop_id", "route_id"]].drop_duplicates()

# Keep only pairs whose stop was matched to a ward, and discard pairs whose
# trip had no route.
stops_in_wards_simple = stops_in_wards[["stop_id", "Ward_Name"]]
stop_route_ward = pd.merge(
    stop_route_map, stops_in_wards_simple, how="inner", on="stop_id"
).dropna(subset=["route_id"])

# Number of distinct routes touching each ward.
route_counts = (
    stop_route_ward.groupby("Ward_Name")["route_id"]
    .nunique()
    .rename("route_count")
    .reset_index()
)

# Remove any existing route_count-like columns before merging the fresh one.
stale_route_cols = [name for name in wards.columns if "route_count" in name]
wards = wards.drop(columns=stale_route_cols, errors="ignore")
wards = wards.merge(route_counts, how="left", on="Ward_Name")

# Wards with no routes come back as NaN from the left merge; treat them as 0.
wards["route_count"] = wards["route_count"].fillna(0)
wards["route_density"] = wards["route_count"].div(wards["area_km2"])

# Overlap score: the mean number of routes per stop within each ward.
# stop_route_ward is built from de-duplicated (stop, route) pairs, so the size
# of each (ward, stop) group is the number of routes serving that stop.
overlap = stop_route_ward.groupby(["Ward_Name", "stop_id"]).size().reset_index(name="route_per_stop")
overlap_score = overlap.groupby("Ward_Name")["route_per_stop"].mean().reset_index(name="overlap_score")

# Drop a pre-existing overlap_score column (or merge-suffixed variants such as
# overlap_score_x) before merging. Otherwise pandas would suffix both columns
# and the lookup of "overlap_score" below would raise KeyError. The match is
# kept narrow so that border_overlap_score is not touched here.
wards = wards.drop(
    columns=[
        col
        for col in wards.columns
        if str(col) == "overlap_score" or str(col).startswith("overlap_score_")
    ],
    errors="ignore",
)
wards = wards.merge(overlap_score, on="Ward_Name", how="left")

# Wards without any stop/route pair get NaN from the left merge; score them 0.
wards["overlap_score"] = wards["overlap_score"].fillna(0)

# Border overlap: stops that the spatial join matched to more than one ward.
# Number of distinct wards each stop was matched to.
ward_counts = stops_in_wards.groupby("stop_id")["Ward_Name"].nunique().reset_index(name="ward_count")
shared_stops = ward_counts[ward_counts["ward_count"] > 1]["stop_id"]
stops_multi = stops_in_wards[stops_in_wards["stop_id"].isin(shared_stops)]

# Per ward, how many of its stops are shared with at least one other ward.
border_overlap = stops_multi.groupby("Ward_Name").size().reset_index(name="border_overlap")

# Drop any pre-existing border_overlap* columns before merging. Otherwise
# pandas would suffix the clashing columns and the lookup of "border_overlap"
# below would raise KeyError.
wards = wards.drop(columns=[col for col in wards.columns if "border_overlap" in col], errors="ignore")
wards = wards.merge(border_overlap, on="Ward_Name", how="left")
wards["border_overlap"] = wards["border_overlap"].fillna(0)

# Share of a ward's stops that are shared. A zero stop_count is swapped for
# NaN so the division yields NaN rather than inf, and that NaN becomes 0.
wards["border_overlap_score"] = wards["border_overlap"] / wards["stop_count"].replace(0, np.nan)
wards["border_overlap_score"] = wards["border_overlap_score"].fillna(0)

# Average fare per ward.
# Pair every fare rule with its price, keeping only rows that have both a
# route and a price.
fare_map = fare_rules.merge(fare_attributes, on="fare_id", how="left")
fare_map = fare_map[["route_id", "price"]].dropna()

# Collapse the fare rows to one row per route (sum and count of prices) before
# joining. Joining the raw fare rows would repeat every (stop, route, ward)
# row once per fare row of that route, which is a many-to-many blow-up.
# Carrying sum and count keeps the ward average weighted by the number of
# fare rows, exactly as a mean over the fully expanded join would be.
route_price = (
    fare_map.groupby("route_id")["price"]
    .agg(price_sum="sum", price_count="count")
    .reset_index()
)
route_fare = stop_route_ward.merge(route_price, on="route_id", how="left")

# Routes without fares contribute NaN, which sum() skips. A ward with no
# priced route ends up as 0 / 0 = NaN and is zero-filled below.
ward_price = route_fare.groupby("Ward_Name")[["price_sum", "price_count"]].sum()
fare_avg = (ward_price["price_sum"] / ward_price["price_count"]).reset_index(name="avg_fare")

# Drop any pre-existing avg_fare* columns before merging. Otherwise pandas
# would suffix the clashing columns and the lookup of "avg_fare" below would
# raise KeyError.
wards = wards.drop(columns=[col for col in wards.columns if "avg_fare" in col], errors="ignore")
wards = wards.merge(fare_avg, on="Ward_Name", how="left")
wards["avg_fare"] = wards["avg_fare"].fillna(0)

# Persist the enriched wards twice: as GeoJSON (with geometry) and as a flat
# CSV without the index column.
# NOTE(review): wards was reprojected to EPSG:32643 earlier and is written in
# that CRS here; confirm downstream readers expect projected coordinates.
enriched_geojson_path = "../data/processed/wards_enriched.geojson"
enriched_csv_path = "../data/processed/wards_enriched.csv"

wards.to_file(enriched_geojson_path, driver="GeoJSON")
wards.to_csv(enriched_csv_path, index=False)

print(" Feature engineering complete with advanced features!")

✅ Feature engineering complete with advanced features!
