# Gradient Boosting Hotspot Modeling
Train a gradient boosting regressor on the cleaned Manila dataset to forecast next-year hotspot counts per district. This mirrors the loading pattern of the existing notebooks but swaps in scikit-learn's `GradientBoostingRegressor` for comparison.

In [1]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# Make the project root importable.
# When executed as a script, the root is two levels above this file; inside a
# notebook kernel it is the parent of the 'notebook' folder, or the working
# directory itself when the kernel was started somewhere else.
if '__file__' not in globals():
    _working_dir = Path.cwd()
    project_root = _working_dir.parent if _working_dir.name == 'notebook' else _working_dir
else:
    project_root = Path(__file__).resolve().parent.parent

_root_entry = str(project_root)
if _root_entry not in sys.path:
    # Prepend so project modules win over same-named installed packages
    sys.path.insert(0, _root_entry)

In [2]:
# Resolve the notebook directory, the output folder, and the cleaned dataset
if '__file__' in globals():
    NOTEBOOK_DIR = Path(__file__).resolve().parent
else:
    NOTEBOOK_DIR = Path.cwd()

# Output folder is created up front so later cells can write into it
OUTPUT_DIR = NOTEBOOK_DIR / 'outputs'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Look for the CSV beside the notebook first, then in the sibling 'notebook' folder
DATA_PATH = NOTEBOOK_DIR / 'Missing People - cleaned.csv'
if not DATA_PATH.exists():
    fallback_path = NOTEBOOK_DIR.parent / 'notebook' / 'Missing People - cleaned.csv'
    if not fallback_path.exists():
        raise FileNotFoundError("Run 01_data_preprocessing.ipynb to regenerate 'Missing People - cleaned.csv'.")
    DATA_PATH = fallback_path

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Person_ID,Person ID,AGE,GENDER,Date Reported Missing,Time Reported Missing,Date Last Seen,Location Last Seen,Post URL,Time_Obj,Age_Group,City_Cleaned,Barangay_Cleaned,District_Cleaned,Latitude,Longitude,Location_Match_Level,Location_Match_Score,Year,Hour_Missing
0,MP-0001,,59,Male,2020-01-14,12:48 PM,2019-12-14,"Malate, Manila",https://www.facebook.com/share/p/1Fp5H7uddW/,2025-12-11 12:48:00,Adult,Manila City,,Malate,14.5714,120.9904,district,0.95,2020.0,12.0
1,MP-0002,,41,Male,2020-01-24,5:12 PM,2021-01-16,"Sampaloc, Manila",https://www.facebook.com/share/p/1CwZW3pbpf/,2025-12-11 17:12:00,Adult,Manila City,,Sampaloc,14.6133,121.0003,district,0.95,2020.0,17.0
2,MP-0003,,43,Male,2020-02-09,7:03 PM,,"Tondo, Manila",https://www.facebook.com/share/p/1CoiXoTEjb/,2025-12-11 19:03:00,Adult,Manila City,,Tondo,14.6186,120.9681,district,0.95,2020.0,19.0
3,MP-0004,,14,Male,2020-02-15,12:19 PM,,"Binondo, Manila",https://www.facebook.com/share/p/17Umn23xj9/,2025-12-11 12:19:00,Young Teen,Manila City,,Binondo,14.6006,120.9754,district,0.95,2020.0,12.0
4,MP-0005,,16,Male,2020-03-23,12:25,2025-03-11,"Paco,. Manila",https://www.facebook.com/share/p/1BhMzYvEJN/,2025-12-11 12:25:00,Teen,Manila City,,Paco,14.5833,120.9961,district,0.95,2020.0,12.0


In [3]:
# Basic cleaning and feature assembly
def _optional_column(frame, name):
    """Return ``frame[name]`` when the column exists, else an all-NA Series on the same index."""
    if name in frame.columns:
        return frame[name]
    return pd.Series(pd.NA, index=frame.index)


df['Date Reported Missing'] = pd.to_datetime(df['Date Reported Missing'], errors='coerce')
# Prefer the existing Year column; fill gaps from the year of the report date
year_series = _optional_column(df, 'Year')
df['Year'] = year_series.fillna(df['Date Reported Missing'].dt.year)
# Nullable integer dtype keeps rows whose year is still unknown as <NA>
df['Year'] = df['Year'].astype('Int64')

# Use district when available, otherwise fall back to barangay label
district_series = _optional_column(df, 'District_Cleaned')
barangay_series = _optional_column(df, 'Barangay_Cleaned')
df['District_Key'] = (
    district_series.fillna(barangay_series)
    .fillna('Unknown')
    .astype(str)
    .str.strip()
    # A whitespace-only label strips down to '' and would otherwise form its
    # own group (and a stray 'dist_' dummy column); treat it as Unknown.
    .replace('', 'Unknown')
)
df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
df['Longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')

# Aggregate yearly counts per district with centroid features.
# Rows without a usable year are dropped; Count is the number of rows per
# (district, year) and the centroid is the mean of the available coordinates.
grouped = (
    df.dropna(subset=['Year'])
    .groupby(['District_Key', 'Year'])
    .agg(Count=('Person_ID', 'size'),
         Lat_Mean=('Latitude', 'mean'),
         Lon_Mean=('Longitude', 'mean'))
    .reset_index()
)

# One-hot encode districts.
# Groups with no coordinates at all get the mean of the group centroids.
X_base = grouped[['Year', 'Lat_Mean', 'Lon_Mean']].fillna(grouped[['Lat_Mean', 'Lon_Mean']].mean())
district_dummies = pd.get_dummies(grouped['District_Key'], prefix='dist')
X = pd.concat([X_base, district_dummies], axis=1)
y = grouped['Count']

X.head(), y.head()

(   Year  Lat_Mean  Lon_Mean  dist_Binondo  dist_Ermita  dist_Intramuros  \
 0  2020   14.6006  120.9754          True        False            False   
 1  2021   14.6006  120.9754          True        False            False   
 2  2023   14.6006  120.9754          True        False            False   
 3  2024   14.6006  120.9754          True        False            False   
 4  2025   14.6006  120.9754          True        False            False   
 
    dist_Malate  dist_Paco  dist_Pandacan  dist_Port Area  dist_Quiapo  \
 0        False      False          False           False        False   
 1        False      False          False           False        False   
 2        False      False          False           False        False   
 3        False      False          False           False        False   
 4        False      False          False           False        False   
 
    dist_Sampaloc  dist_San Andres  dist_San Miguel  dist_Santa Ana  \
 0          False        

In [4]:
# Hold out 20% of the district-year rows for validation, then fit the booster
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

# Score the held-out rows: mean absolute error and root mean squared error
val_pred = gbr.predict(X_val)
mae = mean_absolute_error(y_val, val_pred)
squared_error = mean_squared_error(y_val, val_pred)
rmse = np.sqrt(squared_error)
mae, rmse

(1.8600604886856502, np.float64(2.600018809733669))

In [5]:
# Forecast next-year counts for all districts
latest_year = int(grouped['Year'].max())
next_year = latest_year + 1

# One row per district: its most recent centroid, stamped with the forecast year
latest_centroids = grouped.sort_values('Year').groupby('District_Key').tail(1)
forecast_df = latest_centroids[['District_Key', 'Lat_Mean', 'Lon_Mean']].assign(Year=next_year)

# Same imputation as the training features: missing centroids take the overall mean
centroid_fill = grouped[['Lat_Mean', 'Lon_Mean']].mean()
X_forecast_base = forecast_df[['Year', 'Lat_Mean', 'Lon_Mean']].fillna(centroid_fill)
forecast_dummies = pd.get_dummies(forecast_df['District_Key'], prefix='dist')

# Align with the training feature matrix: same columns, same order,
# zero-filled where a training dummy has no counterpart here
X_forecast = (
    pd.concat([X_forecast_base, forecast_dummies], axis=1)
    .reindex(columns=X.columns, fill_value=0)
)

# Counts cannot be negative, so clip the raw regressor output at zero
pred_counts = gbr.predict(X_forecast)
forecast_df = (
    forecast_df.assign(Predicted_Count=np.maximum(0, pred_counts))
    [['District_Key', 'Predicted_Count']]
    .sort_values('Predicted_Count', ascending=False)
)
forecast_df.head(15)

Unnamed: 0,Barangay District,Latitude,Longitude,Predicted_Cases,Prev_Year_Count
14,Tondo,14.6186,120.9681,12.94562,13.0
12,Santa Cruz,14.615,120.983,3.881321,4.0
7,Intramuros,14.5904,120.977,2.862444,1.0
10,Sampaloc,14.6133,121.0003,2.716217,4.0
2,San Andres,14.5746,121.0039,1.945931,2.0
3,Paco,14.5833,120.9961,1.797092,2.0
9,Binondo,14.6006,120.9754,1.331546,3.0
11,Santa Ana,14.5802,121.0116,1.323357,1.0
4,Quiapo,14.5998,120.9844,1.122464,1.0
8,Malate,14.5714,120.9904,1.119617,1.0


In [6]:
# Write the ranked forecast table into the outputs folder (row index omitted)
forecast_path = OUTPUT_DIR.joinpath('gradient_boosting_predictions.csv')
forecast_df.to_csv(path_or_buf=forecast_path, index=False)

# Report the location relative to the notebook directory, then show the full path
print(f'Saved predictions to {forecast_path.relative_to(NOTEBOOK_DIR)}')
forecast_path

Saved predictions to outputs\gradient_boosting_predictions.csv


Saved predictions to outputs\gradient_boosting_predictions.csv


WindowsPath('c:/Users/keanu/Desktop/College/4th-yr-1st/Data Mining/missing-person-heatmap/notebook/outputs/gradient_boosting_predictions.csv')