In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from scipy.spatial import distance_matrix
from shapely.geometry import Point
from scipy.spatial import cKDTree
from pykrige.ok import OrdinaryKriging

In [9]:
journey_data = pd.read_csv("merged_journey_data.csv")
nodes_data = pd.read_csv("nodes.csv")
income_data = pd.read_csv("DemographicDataset/raw/income.csv")
population_data = pd.read_csv("DemographicDataset/raw/2022 population.csv")[['lsoa21', 'population']]
population_data.rename(columns={"lsoa21": "lsoa21cd", "population": "Population"}, inplace=True)
population_data['Population'] = pd.to_numeric(population_data['Population'], errors='coerce').fillna(0)
population_data = population_data.groupby('lsoa21cd', as_index=False)['Population'].sum()

deprivation_data = pd.read_csv("DemographicDataset/raw/indices_of_deprivation.csv")[['LSOA code (2011)', 'Index of Multiple Deprivation (IMD) Score']]
deprivation_data.rename(columns={"LSOA code (2011)": "lsoa21cd", "Index of Multiple Deprivation (IMD) Score": "IMD Score"}, inplace=True)
deprivation_data['IMD Score'] = pd.to_numeric(deprivation_data['IMD Score'], errors='coerce').fillna(0)

boroughs_path = "GIS/"
borough_files = [f for f in os.listdir(boroughs_path) if f.endswith(".shp")]
lsoa_boundaries_list = [
gpd.read_file(os.path.join(boroughs_path, f)).to_crs("EPSG:4326")
for f in borough_files
]
lsoa_boundaries = gpd.GeoDataFrame(pd.concat(lsoa_boundaries_list, ignore_index=True))

# Merge population and deprivation data into lsoa_boundaries
lsoa_boundaries = lsoa_boundaries.merge(population_data, on='lsoa21cd', how='left')
lsoa_boundaries = lsoa_boundaries.merge(deprivation_data, on='lsoa21cd', how='left')

# Re-project to a projected CRS for accurate centroid calculation
lsoa_boundaries_projected = lsoa_boundaries.to_crs("EPSG:27700")
lsoa_boundaries_projected['centroid'] = lsoa_boundaries_projected.geometry.centroid

# Re-project centroids back to geographic CRS
lsoa_boundaries['centroid'] = lsoa_boundaries_projected['centroid'].to_crs("EPSG:4326")

centroids = lsoa_boundaries[['lsoa21cd', 'centroid', 'Population', 'IMD Score']].copy()
centroids['x'] = centroids['centroid'].x
centroids['y'] = centroids['centroid'].y

In [10]:
print(len(population_data))
print(len(deprivation_data))
print(len(lsoa_boundaries))

4994
4835
4994


In [11]:
potential_stations = nodes_data[nodes_data['name'] == 'Potential Node']
existing_stations = nodes_data[nodes_data['name'] != 'Potential Node']

potential_stations_gdf = gpd.GeoDataFrame(
    potential_stations[['lat', 'lon', 'id']].reset_index(drop=True),
    geometry=gpd.points_from_xy(potential_stations['lon'], potential_stations['lat']),
    crs="EPSG:4326"
)

existing_stations_gdf = gpd.GeoDataFrame(
    existing_stations[['lat', 'lon', 'id', 'name']].reset_index(drop=True),
    geometry=gpd.points_from_xy(existing_stations['lon'], existing_stations['lat']),
    crs="EPSG:4326"
)

potential_stations_with_lsoa = gpd.sjoin(potential_stations_gdf, lsoa_boundaries, how="left", predicate="within")
existing_stations_with_lsoa = gpd.sjoin(existing_stations_gdf, lsoa_boundaries, how="left", predicate="within")

potential_stations_with_lsoa = potential_stations_with_lsoa.dropna(subset=['lsoa21cd'])

In [12]:
potential_stations_with_lsoa

Unnamed: 0,lat,lon,id,geometry,index_right,lsoa21cd,lsoa21nm,msoa21cd,msoa21nm,lad22cd,lad22nm,Population,IMD Score,centroid
0,51.542030,-0.009190,potential_0,POINT (-0.00919 51.54203),3761.0,E01034217,Newham 038E,E02006995,Newham 038,E09000025,Newham,2298.0,,POINT (-0.01256 51.54405)
1,51.549220,-0.011890,potential_1,POINT (-0.01189 51.54922),3750.0,E01034213,Newham 038A,E02006995,Newham 038,E09000025,Newham,1856.0,,POINT (-0.01449 51.54959)
2,51.459730,-0.203730,potential_2,POINT (-0.20373 51.45973),4803.0,E01004511,Wandsworth 010B,E02000932,Wandsworth 010,E09000032,Wandsworth,1588.0,8.1,POINT (-0.19966 51.4591)
3,51.459090,-0.201900,potential_3,POINT (-0.2019 51.45909),4803.0,E01004511,Wandsworth 010B,E02000932,Wandsworth 010,E09000032,Wandsworth,1588.0,8.1,POINT (-0.19966 51.4591)
4,51.458840,-0.199950,potential_4,POINT (-0.19995 51.45884),4781.0,E01035493,Wandsworth 010K,E02000932,Wandsworth 010,E09000032,Wandsworth,1728.0,,POINT (-0.1957 51.45785)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1573,51.495660,-0.036900,potential_1573,POINT (-0.0369 51.49566),4212.0,E01004059,Southwark 008D,E02000814,Southwark 008,E09000028,Southwark,2119.0,17.3,POINT (-0.04151 51.49563)
1574,51.500170,-0.022640,potential_1574,POINT (-0.02264 51.50017),4529.0,E01032771,Tower Hamlets 033B,E02006854,Tower Hamlets 033,E09000030,Tower Hamlets,2130.0,13.2,POINT (-0.01967 51.50277)
1575,51.499248,0.002041,potential_1575,POINT (0.00204 51.49925),1773.0,E01034188,Greenwich 040A,E02006992,Greenwich 040,E09000011,Greenwich,1536.0,,POINT (0.00378 51.49832)
1576,51.493630,-0.010260,potential_1576,POINT (-0.01026 51.49363),4531.0,E01004217,Tower Hamlets 030C,E02000893,Tower Hamlets 030,E09000030,Tower Hamlets,2219.0,22.5,POINT (-0.01111 51.4901)


In [None]:
income_data.rename(columns={"LSOA code": "lsoa21cd"}, inplace=True)
income_data['50th percentile (£)'] = income_data['50th percentile (£)'].astype(str).str.replace(',', '').astype(float)

# Prepare data for Kriging interpolation

# Interpolate Population
ok_population = OrdinaryKriging(
    centroids['x'], centroids['y'], centroids['Population'], variogram_model='linear'
)
population_interp, _ = ok_population.execute(
    'points', potential_stations_gdf.geometry.x, potential_stations_gdf.geometry.y
)
potential_stations_gdf['Interpolated Population'] = population_interp

# Replace direct assignment with interpolated values
potential_stations_with_lsoa = pd.merge(
    potential_stations_gdf, income_data, on='lsoa21cd', how='left'
)
potential_stations_with_lsoa['Population'] = potential_stations_gdf['Interpolated Population']

existing_stations_with_lsoa = pd.merge(existing_stations_with_lsoa, income_data, on='lsoa21cd', how='left')
existing_stations_with_lsoa = pd.merge(existing_stations_with_lsoa, population_data, on='lsoa21cd', how='left').fillna(0)
existing_stations_with_lsoa = pd.merge(existing_stations_with_lsoa, deprivation_data, on='lsoa21cd', how='left').fillna(0)

KeyError: 'lsoa21'

In [None]:
start_counts = journey_data['Start station'].value_counts().reset_index()
start_counts.columns = ['Station Name', 'Start Count']

end_counts = journey_data['End station'].value_counts().reset_index()
end_counts.columns = ['Station Name', 'End Count']

station_demand = pd.merge(start_counts, end_counts, on='Station Name', how='outer').fillna(0)
station_demand['Total Demand'] = station_demand['Start Count'] + station_demand['End Count']
station_demand['Normalized Demand'] = station_demand['Total Demand'] / station_demand['Total Demand'].max()

existing_stations_with_income = pd.merge(
    existing_stations_with_lsoa, 
    station_demand, 
    left_on='name',
    right_on='Station Name', 
    how='left'
).fillna(0)

In [None]:
station_radius = 400
existing_tree = cKDTree(existing_stations_with_lsoa[['lat', 'lon']].values)

density_weights = np.zeros(len(potential_stations_with_lsoa))

for i, (lat, lon) in enumerate(potential_stations_with_lsoa[['lat', 'lon']].values):
    nearby_stations = existing_tree.query_ball_point([lat, lon], station_radius / 111_000)
    density_weights[i] = len(nearby_stations)

density_weights = np.exp(-density_weights)
potential_stations_with_lsoa['Density Weight'] = density_weights

In [None]:
potential_stations_with_lsoa['Wealth Equity Score'] = 1 - (
    potential_stations_with_lsoa['50th percentile (£)'] / potential_stations_with_lsoa['50th percentile (£)'].max()
)
potential_stations_with_lsoa['Wealth Equity Score'] = potential_stations_with_lsoa['Wealth Equity Score'].fillna(0)

max_imd = potential_stations_with_lsoa['IMD Score'].max()
max_income = potential_stations_with_lsoa['50th percentile (£)'].max()
potential_stations_with_lsoa['Normalized IMD Score'] = potential_stations_with_lsoa['IMD Score'] / max_imd
potential_stations_with_lsoa['Normalized Income Score'] = potential_stations_with_lsoa['50th percentile (£)'] / max_income

potential_stations_with_lsoa = potential_stations_with_lsoa.dropna(subset=['lat', 'lon'])
existing_stations_with_income = existing_stations_with_income.dropna(subset=['lat', 'lon'])

In [None]:
if 'geometry' in lsoa_boundaries.columns:
    lsoa_boundaries['area'] = lsoa_boundaries['geometry'].area
else:
    raise ValueError("missing geometry")

potential_stations_with_lsoa = potential_stations_with_lsoa.merge(
    lsoa_boundaries[['lsoa21cd', 'area']], on='lsoa21cd', how='left'
)

potential_stations_with_lsoa['Population Density'] = (
    potential_stations_with_lsoa['Population'] / potential_stations_with_lsoa['area']
)
potential_stations_with_lsoa['Population Density'] = potential_stations_with_lsoa['Population Density'].fillna(0)

if 'Population Density' not in potential_stations_with_lsoa.columns:
    raise ValueError("Population Density not found")

existing_stations_with_income.to_csv("processed_existing_stations.csv", index=False)
potential_stations_with_lsoa.to_csv("processed_potential_stations.csv", index=False)

In [None]:
existing_stations_with_income