In [None]:
import plotly.express as px

from lightgbm import LGBMRegressor, early_stopping, log_evaluation

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd
import geopandas as gpd
import json

In [None]:
# Load every recorded crime for 2022-2025.
crimes = pd.read_csv('data/all_crimes_2022-2025.csv')

# Cleaning: keep Metropolitan Police Service records only, discard the columns
# that are not used downstream, then drop every row that still has a missing value.
unused_columns = ['Context', 'Crime ID', 'Reported by', 'Falls within',
                  'Last outcome category', 'LSOA name', 'LSOA code', 'Location']
is_met_police = crimes['Falls within'] == 'Metropolitan Police Service'
crimes = (
    crimes.loc[is_met_police]
    .drop(columns=unused_columns)
    .dropna()
    .reset_index(drop=True)
)

# Test period: rows whose 'Month' contains '2024', split into non-burglary
# records (model inputs) and burglary records (model target).
crimes_test = crimes.loc[crimes['Month'].str.contains('2024')].reset_index(drop=True)
test_is_burglary = crimes_test['Crime type'] == 'Burglary'
test_X_data = crimes_test.loc[~test_is_burglary].reset_index(drop=True)
test_y_data = (
    crimes_test.loc[test_is_burglary]
    .drop(columns=['Crime type'])
    .reset_index(drop=True)
)

# Training period: only rows whose 'Month' contains '2023'.
# (Alternative considered: train on everything except 2024 and 2025.)
# Note: from here on `crimes` refers to the 2023 subset only.
crimes = crimes.loc[crimes['Month'].astype(str).str.contains('2023')]

train_is_burglary = crimes['Crime type'] == 'Burglary'
train_X_data = crimes.loc[~train_is_burglary].reset_index(drop=True)
train_y_data = (
    crimes.loc[train_is_burglary]
    .drop(columns=['Crime type'])
    .reset_index(drop=True)
)

In [None]:
# Ward boundary polygons, reprojected to EPSG:4326 (the CRS get_count assigns
# to the crime longitude/latitude points).
wards = gpd.read_file('geodata/wards2019.geojson').to_crs(epsg=4326)

In [None]:
def get_count(data, sort_by, wards=None, wards_path='geodata/wards2019.geojson'):
    """Count crime records per ward and attach the counts to the ward polygons.

    Each record in ``data`` is turned into a point from its 'Longitude' and
    'Latitude' columns, spatially joined to the ward that contains it, and the
    records are then counted per ``sort_by`` group. The counts are merged back
    onto the ward table so that wards without any record get a count of 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Crime records with 'Longitude' and 'Latitude' columns, plus any
        column named in ``sort_by``.
    sort_by : list of str
        Columns to group by when counting. Must include 'GSS_CODE', because
        the counts are merged onto the wards on that column.
    wards : geopandas.GeoDataFrame, optional
        Pre-loaded ward polygons. Must have a CRS set and the columns 'NAME',
        'GSS_CODE', 'DISTRICT', 'LAGSSCODE', 'HECTARES' and 'NONLD_AREA'.
        Pass this to avoid re-reading the file on every call. When omitted,
        the polygons are read from ``wards_path``.
    wards_path : str, optional
        File to read the ward polygons from when ``wards`` is not given.

    Returns
    -------
    geopandas.GeoDataFrame
        The ward table (without the City of London district and without the
        'DISTRICT', 'LAGSSCODE', 'HECTARES', 'NONLD_AREA' columns) with a
        'Count' column. When ``sort_by`` has more than one column there is one
        row per ward and group; wards with no records have a single row with
        Count 0 and missing values in the extra grouping columns.
    """
    if wards is None:
        wards = gpd.read_file(wards_path)
    # Always work in the same CRS as the crime points; returns a new frame,
    # so a caller-supplied `wards` is not modified.
    wards = wards.to_crs(epsg=4326)

    # Give every crime record a point geometry
    gdf_crimes = gpd.GeoDataFrame(
        data,
        geometry=gpd.points_from_xy(data['Longitude'], data['Latitude']),
        crs="EPSG:4326"
    )

    # Join crime points to the ward containing them, keeping ward name and code
    crimes_gdf_wards = gpd.sjoin(
        gdf_crimes,
        wards[['geometry', 'NAME', 'GSS_CODE']],
        how='left',
        predicate='within'
    )

    # Drop records named 'City of London' and records that fell in no ward
    crimes_gdf_wards = crimes_gdf_wards[
        (crimes_gdf_wards['NAME'] != 'City of London') &
        (crimes_gdf_wards['GSS_CODE'].notna())
    ].reset_index(drop=True)

    # Count records per group
    crime_counts = (
        crimes_gdf_wards
        .groupby(sort_by)
        .size()
        .reset_index(name='Count')
    )

    # Merge counts into the full ward table; wards without records get 0
    return_value = wards.merge(crime_counts, on='GSS_CODE', how='left')
    return_value['Count'] = return_value['Count'].fillna(0)

    # Remove the City of London district and the columns not needed downstream
    return_value = return_value[return_value['DISTRICT'] != 'City and County of the City of London']
    return return_value.drop(columns=['DISTRICT', 'LAGSSCODE', 'HECTARES', 'NONLD_AREA'])

In [None]:
# Features are counted per ward and crime type; the target only per ward.
feature_keys = ['GSS_CODE', 'Crime type']
target_keys = ['GSS_CODE']

# Training period
X_train_df = get_count(train_X_data, feature_keys)
y_train_df = get_count(train_y_data, target_keys)

# Test period
X_test_df = get_count(test_X_data, feature_keys)
y_test_df = get_count(test_y_data, target_keys)

In [None]:
def _pivot_by_crime_type(long_counts):
    """Reshape long per-ward, per-crime-type counts to one column per crime type.

    The result has one row per ('NAME', 'GSS_CODE', 'geometry') combination,
    those three keys as ordinary columns, and 0 where a ward has no count for
    a crime type.
    """
    wide = long_counts.pivot_table(
        index=['NAME', 'GSS_CODE', 'geometry'],
        columns='Crime type',
        values='Count',
        fill_value=0,
    ).reset_index()
    # Remove the leftover 'Crime type' label from the column axis
    wide.columns.name = None
    return wide


X_train_df = _pivot_by_crime_type(X_train_df)
X_test_df = _pivot_by_crime_type(X_test_df)

In [None]:
indep_vars = ['Anti-social behaviour',
       'Bicycle theft', 'Criminal damage and arson', 'Other theft', 'Robbery',
       'Shoplifting', 'Theft from the person', 'Vehicle crime']


def _aligned_counts(features_df, target_df, key='GSS_CODE'):
    """Return target_df's 'Count' as a one-column frame ordered like features_df.

    The pivoted feature frames are sorted by their pivot index, whereas the
    target frames keep the ward-file order, so the two must be matched on the
    ward code rather than by row position.
    """
    aligned = features_df[[key]].merge(
        pd.DataFrame(target_df[[key, 'Count']]),
        on=key,
        how='left',
        validate='many_to_one',  # exactly one target row per ward code
    )
    if aligned['Count'].isna().any():
        raise ValueError(f"Some feature rows have no matching target for key '{key}'")
    aligned.index = features_df.index
    return aligned[['Count']]


X_train = X_train_df[indep_vars]
y_train = _aligned_counts(X_train_df, y_train_df)
X_test = X_test_df[indep_vars]
y_test = _aligned_counts(X_test_df, y_test_df)

# Hold out part of the training data for early stopping, so the test set is
# never seen during model selection and the final RMSE stays unbiased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train.values.ravel(), test_size=0.2, random_state=42
)

# Initialize model; n_estimators is an upper bound, early stopping picks the actual number
model = LGBMRegressor(objective='regression', random_state=42, n_estimators = 10_000, n_jobs=-1)

# Hyperparameter grid to search
param_grid = {
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 40, 60],
    'max_depth': [-1, 5]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Fit the model; the extra keyword arguments are forwarded to LGBMRegressor.fit
grid_search.fit(
    X_fit,
    y_fit,
    eval_set=[(X_val, y_val)],
    callbacks=[
        early_stopping(stopping_rounds=20, verbose=False),
        log_evaluation(period=0)
    ]
)

# Best parameters from GridSearchCV
print("Best hyperparameters:", grid_search.best_params_)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

# Evaluate performance on the untouched test period
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Optimized RMSE: {rmse:.2f}')


In [None]:
# Wrap the raw prediction array in a named Series that shares X_test's row
# index, then display it.
pred_series = pd.Series(data=y_pred, name='Predicted_Count', index=X_test.index)
pred_series

In [None]:
# Attach each prediction to the ward code of the feature row it was made for.
# pred_series shares its index with X_test_df, so assign() lines them up.
# Joining on GSS_CODE (not on row position) is required: X_test_df is in
# pivot order while `wards` is in file order and still contains wards that
# were filtered out of the features.
pred_by_ward = pd.DataFrame(X_test_df[['GSS_CODE']]).assign(Predicted_Count=pred_series)

wards_with_pred = wards.merge(
    pred_by_ward,
    on='GSS_CODE',
    how='left'
)

# Keep only wards that actually received a prediction
wards_with_pred = wards_with_pred[wards_with_pred['Predicted_Count'].notna()]

In [None]:
# Average predicted count over the wards that received a prediction
predicted_counts = wards_with_pred['Predicted_Count']
predicted_counts.mean()

In [None]:
# Ward polygons as a GeoJSON dict; features are matched to rows via GSS_CODE
ward_geojson = json.loads(wards.to_json())

# Map appearance: colour scale, base map and initial view
predicted_map_options = dict(
    locations='GSS_CODE',
    featureidkey="properties.GSS_CODE",
    color='Predicted_Count',
    range_color=(0, 120),
    color_continuous_scale="OrRd",
    map_style="open-street-map",
    zoom=9,
    center={"lat": 51.5072, "lon": -0.1276},
    opacity=0.6,
    height=600,
)

fig = px.choropleth_map(wards_with_pred, geojson=ward_geojson, **predicted_map_options)
fig.update_layout(title='Predicted Count Heatmap by London Ward')
fig.show()

In [None]:
# Ward polygons as a GeoJSON dict; features are matched to rows via GSS_CODE
ward_geojson = json.loads(wards.to_json())

# Map appearance: colour scale, base map and initial view
actual_map_options = dict(
    locations='GSS_CODE',
    featureidkey="properties.GSS_CODE",
    color='Count',
    range_color=(0, 400),
    color_continuous_scale="OrRd",
    map_style="open-street-map",
    zoom=9,
    center={"lat": 51.5072, "lon": -0.1276},
    opacity=0.6,
    height=600,
)

fig = px.choropleth_map(y_test_df, geojson=ward_geojson, **actual_map_options)
fig.update_layout(title='actual count Heatmap by London Ward')
fig.show()