In [1]:
import polars as pl, numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

print(f"XGBoost version: {xgb.__version__}")

XGBoost version: 3.0.2


In [68]:
X_train_df = pl.read_parquet("data/X_train.parquet").drop("LSOA code")
X_test_df  = pl.read_parquet("data/X_test.parquet").drop("LSOA code")

X_train = pl.read_parquet("data/X_train.parquet").drop("LSOA code").to_numpy()
y_train = pl.read_parquet("data/y_train.parquet").to_numpy().ravel()
X_test  = pl.read_parquet("data/X_test.parquet").drop("LSOA code").to_numpy()
y_test  = pl.read_parquet("data/y_test.parquet").to_numpy().ravel()

feature_names = X_train_df.columns
len(feature_names)

70

In [70]:
my_model = XGBRegressor()
my_model.fit(X_train, y_train, verbose=False)

In [71]:
predictions = my_model.predict(X_test)
predictions_rounded = np.round(predictions).astype(int)

print("=" * 10 + " Base Model Evaluation " + "=" * 10)
print(f"{'Mean Squared Error':<25}: {mean_squared_error(y_test, predictions):>10.4f}")
print(f"{'Root Mean Squared Error':<25}: {np.sqrt(mean_squared_error(y_test, predictions)):>10.4f}")
print(f"{'Mean Absolute Error':<25}: {mean_absolute_error(y_test, predictions):>10.4f}")
print(f"{'R²':<25}: {r2_score(y_test, predictions):>10.4f}")
print(f"{'Explained variance':<25}: {explained_variance_score(y_test, predictions):>10.4f}")
print("=" * 43)

Mean Squared Error       :     1.1988
Root Mean Squared Error  :     1.0949
Mean Absolute Error      :     0.7758
R²                       :     0.2830
Explained variance       :     0.2867


In [72]:
optimized_model = XGBRegressor(
    colsample_bytree=0.679486272613669,
    learning_rate=0.16678305712187014,
    max_depth=4,
    min_child_weight=8,
    n_estimators=468,
    subsample=0.9085081386743783,
    objective="count:poisson",
    eval_metric="rmse",
    tree_method="hist",
    random_state=42,
    early_stopping_rounds=72,
    n_jobs=-1
)

adjusted_model = optimized_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

In [73]:
opt_predictions = adjusted_model.predict(X_test)

print("=" * 10 + " Optimized Model Evaluation " + "=" * 10)
print(f"{'Mean Squared Error':<25}: {mean_squared_error(y_test, opt_predictions):>10.4f}")
print(f"{'Root Mean Squared Error':<25}: {np.sqrt(mean_squared_error(y_test, opt_predictions)):>10.4f}")
print(f"{'Mean Absolute Error':<25}: {mean_absolute_error(y_test, opt_predictions):>10.4f}")
print(f"{'R²':<25}: {r2_score(y_test, opt_predictions):>10.4f}")
print(f"{'Explained variance':<25}: {explained_variance_score(y_test, opt_predictions):>10.4f}")
print("=" * 48)

Mean Squared Error       :     1.1635
Root Mean Squared Error  :     1.0787
Mean Absolute Error      :     0.7850
R²                       :     0.3041
Explained variance       :     0.3047


In [74]:
importance = optimized_model.feature_importances_
order = np.argsort(importance)[::-1]

# top_n = 108
# print("\nTop features (by gain):")
# for idx in order[:top_n]:
#     print(f"{feature_names[idx]:<35s} {importance[idx]:>10.5f}")

for idx in order:
    print(f"{feature_names[idx]:<45s} {importance[idx]:>30f}")

burglary_count_ewm_12                                               0.288946
revictimization_risk                                                0.149039
burglary_count_sum_12                                               0.114716
burglary_count_ewm_6                                                0.057576
burglary_count_lag_1                                                0.048654
neighbor_burglary_weighted_avg                                      0.033175
neighbor_burglary_avg                                               0.028088
burglary_count_tema_6                                               0.025454
neighbor_burglary_max                                               0.019741
burglary_count_lag_3                                                0.018084
ALL_PROPERTIES                                                      0.016568
burglary_count_hma_4                                                0.014441
burglary_count_lag_6                                                0.011316