In [None]:
## Yearly Trained Urban Core Model

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os, re, json, warnings, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm

from sklearn.model_selection import train_test_split, KFold, ParameterSampler
from sklearn.metrics import r2_score
import xgboost as xgb
import shap
from joblib import Parallel, delayed

# Render and save matplotlib figures at 120 DPI.
plt.rcParams["figure.dpi"] = 120
plt.rcParams["savefig.dpi"] = 120

# ── Paths ────────────────────────────────────────────────────────────────────
# Input table; it is read once further down and filtered on its "year" column.
CSV = "folder/AllCities_pixels_2003_2020.csv"
# Base output location, created up front.
OUT_DIR = "folder/Urban_Core_Year_Wise_Results/outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Strip a trailing _YYYY (19xx/20xx) if present; each year's directory is
# later built as f"{OUT_ROOT}_{YEAR}".
OUT_ROOT = re.sub(r"_(?:19|20)\d{2}$", "", OUT_DIR)

TARGET = "HI"  # predicted ensemble HI
# Columns removed from the feature matrix X before training.
# NOTE(review): "lon" is listed but "lat" is not, so a "lat" column, if
# present, stays in X as a feature — confirm this is intended.
NON_FEATURE = ["row", "col", "year", "lon", "city", "Residual_HI_1km"]
# Further columns dropped from X; names missing from the table are ignored.
DROP_EXTRA = ["LST", "WSA", "POP", "GEO", "RAD", "DIST2COAST", "DPTnorm", "IMP", "HI_obs_12to1km"]
# Maximum rows kept per year, taken from the top of that year's subset before
# rows with a missing target are dropped; None disables the cap.
LIMIT = 368_077
# NOTE(review): N_SHAP is not referenced anywhere in the visible code.
N_SHAP = 5000

USE_GPU    = True  # selects gpu_hist / gpu_predictor instead of hist / auto
NUM_CORES  = 64    # n_jobs of the final refit model
N_TRIALS   = 5     # number of hyper-parameter combinations sampled
CV_FOLDS   = 3     # K in the K-fold cross-validation used during HPO
EARLY_STOP = 50    # early_stopping_rounds passed to every fit

# Discrete search space for the random hyper-parameter search.
param_grid = {
    "n_estimators":      [300, 500, 800, 1200, 1500],
    "learning_rate":     [0.01, 0.03, 0.05, 0.1],
    "max_depth":         [3, 4, 5, 6, 8],
    "min_child_weight":  [1, 3, 5, 8],
    "subsample":         [0.6, 0.8, 1.0],
    "colsample_bytree":  [0.6, 0.8, 1.0],
    "gamma":             [0, 0.1, 0.3, 1.0],
    "reg_alpha":         [0.0, 0.1, 0.5, 1.0],
    "reg_lambda":        [0.5, 1.0, 2.0, 5.0],
}
# Drawn once with a fixed seed, so every year evaluates the same candidates.
sampler = list(ParameterSampler(param_grid, n_iter=N_TRIALS, random_state=42))

# ── SHAP export helpers (GPU-safe) ───────────────────────────────────────────
# Cap numerical library threads inside workers (avoid thread storms).
# setdefault leaves any value already present in the environment untouched.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
# If you want to hide GPUs for the SHAP stage entirely, uncomment:
# os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")

# Worker count for the parallel SHAP export: the N_JOBS environment variable
# if set, otherwise (CPU count - 1) with a floor of 1; never more than 8.
N_JOBS = int(os.environ.get("N_JOBS", max(1, (os.cpu_count() or 2) - 1)))
N_JOBS = min(N_JOBS, 8)
# Maximum number of rows handed to a single predict() call.
PRED_BATCH = 200_000
BG_SAMPLE_N = None  # set to e.g. 1000 for a lighter SHAP background

# Module-level cache populated and read by _get_explainer.
_GLOBAL = {"explainer": None, "model": None, "feature_names": None}

def _force_cpu_predictor(mdl):
    """Best-effort switch of *mdl* to single-threaded CPU prediction.

    The booster-level parameters are tried first. If that raises for any
    reason, the same intent is applied through the wrapper's ``set_params``.
    A failure of the fallback is reported as a warning, never raised.
    """
    cpu_settings = {'predictor': 'cpu_predictor', 'nthread': 1}
    try:
        mdl.get_booster().set_param(cpu_settings)
        return
    except Exception:
        # Deliberate: fall through to the wrapper-level parameters below.
        pass

    try:
        mdl.set_params(predictor='cpu_predictor')
        if hasattr(mdl, 'n_jobs'):
            mdl.set_params(n_jobs=1)
    except Exception as err:
        warnings.warn(f"Could not force CPU predictor: {err}")

def _predict_cpu_batched(mdl, X_block, batch=PRED_BATCH):
    """Predict on *X_block*, splitting it into chunks of at most *batch* rows.

    Inputs that fit in one chunk go through a single ``predict`` call;
    larger inputs are predicted slice by slice (via ``.iloc``) and the
    pieces are concatenated in order.
    """
    total = len(X_block)
    if total <= batch:
        return mdl.predict(X_block)
    pieces = [
        mdl.predict(X_block.iloc[start:start + batch])
        for start in range(0, total, batch)
    ]
    return np.concatenate(pieces)

def _get_explainer(model, X_train, feature_names):
    """Return a cached ``(explainer, model, feature_names)`` triple for *model*.

    The explainer is built on first use and whenever *model* is a different
    object from the cached one. Without that identity check the first model's
    explainer would be served for every later model handled in the same
    process (for example when the export runs in-process with one job), so
    later years would be explained and predicted with an earlier year's model.

    Args:
        model: fitted estimator to explain; it is switched to CPU prediction.
        X_train: background data for the explainer. When ``BG_SAMPLE_N`` is
            set and smaller than ``len(X_train)``, a fixed-seed sample of
            that size is used instead.
        feature_names: column names forwarded to the explainer.

    Returns:
        Tuple ``(explainer, model, feature_names)`` taken from the cache.
    """
    cache_is_stale = (
        _GLOBAL["explainer"] is None or _GLOBAL["model"] is not model
    )
    if cache_is_stale:
        _force_cpu_predictor(model)
        X_bg = X_train
        if BG_SAMPLE_N is not None and len(X_train) > BG_SAMPLE_N:
            X_bg = X_train.sample(BG_SAMPLE_N, random_state=42)
        explainer = shap.Explainer(model, X_bg, feature_names=feature_names)
        # Fill the cache only once the explainer exists, so a failed build
        # never leaves a half-updated entry behind.
        _GLOBAL["model"] = model
        _GLOBAL["feature_names"] = list(feature_names)
        _GLOBAL["explainer"] = explainer
    return _GLOBAL["explainer"], _GLOBAL["model"], _GLOBAL["feature_names"]

def _safe_city_name(city):
    """Turn *city* into a filename-friendly token.

    Every run of characters that are neither word characters nor ``-`` is
    replaced by a single ``_``; leading and trailing underscores are removed.
    """
    text = str(city)
    collapsed = re.sub(r"[^\w\-]+", "_", text)
    return collapsed.strip("_")

def _process_one_group(city, yr, idx, X_block, meta_block, *,
                       save_dir: Path, target_name: str,
                       model, X_train, feature_names):
    """Compute SHAP values for one (city, year) group and write two CSVs.

    Writes ``shap_<city>_<yr>.csv`` (metadata, prediction, base value and one
    SHAP column per feature for every row) and
    ``shap_summary_<city>_<yr>.csv`` (per-feature mean and mean-absolute
    SHAP) into *save_dir*.

    ``idx`` and ``target_name`` are accepted but not used in the body.

    Returns:
        ``(city, yr, n_rows, None)`` on success, or
        ``(city, yr, 0, repr(exc))`` if anything raised, so that one failing
        group does not abort the whole batch.
    """
    try:
        explainer, mdl, featnames = _get_explainer(model, X_train, feature_names)
        n_rows = X_block.shape[0]

        explanation = explainer(X_block)
        shap_vals = getattr(explanation, "values", np.array(explanation))
        base_vals = np.atleast_1d(
            getattr(explanation, "base_values", explainer.expected_value)
        )
        # A single base value is repeated so that every row carries one.
        if base_vals.size == 1:
            base_vals = np.repeat(base_vals, n_rows)

        shap_frame = pd.DataFrame(shap_vals, columns=featnames, index=X_block.index)

        row_info = meta_block.copy()
        row_info["city"] = city
        row_info["year"] = yr
        row_info["HI_pred"] = _predict_cpu_batched(mdl, X_block)
        row_info["base_value"] = base_vals

        # Both halves are re-indexed from 0 so they line up positionally.
        combined = pd.concat(
            [row_info.reset_index(drop=True), shap_frame.reset_index(drop=True)],
            axis=1,
        )

        city_slug = _safe_city_name(city)
        combined.to_csv(save_dir / f"shap_{city_slug}_{yr}.csv", index=False)

        summary = pd.DataFrame({
            "feature": featnames,
            "mean_shap": np.nanmean(shap_vals, axis=0),
            "mean_abs_shap": np.nanmean(np.abs(shap_vals), axis=0),
            "n_rows": n_rows,
            "city": city,
            "year": yr,
        })
        summary.to_csv(save_dir / f"shap_summary_{city_slug}_{yr}.csv", index=False)

        # Release the large intermediates before the next group is processed.
        del explanation, shap_vals, base_vals, shap_frame, combined, summary
        gc.collect()
        return (city, yr, n_rows, None)
    except Exception as exc:
        return (city, yr, 0, repr(exc))

def run_shap_export(df, X, X_train, best, target_name, out_dir_year):
    """Export SHAP values for every (city, year) group of *df* in parallel.

    One task per group is dispatched to ``_process_one_group`` through joblib;
    the per-group CSVs land in ``<out_dir_year>/shap_city_year``. Failures of
    individual groups are collected and printed (first ten), not raised.

    Rows are selected by position (``.iloc``), so *df* and *X* only have to be
    row-aligned; they do not need a default ``RangeIndex``.

    Args:
        df: source rows; must contain ``city``, ``year`` and *target_name*.
        X: feature matrix with exactly one row per row of *df*, same order.
        X_train: background data forwarded to the explainer.
        best: fitted model to explain.
        target_name: target column in *df*; exported as ``HI_true``.
        out_dir_year: directory under which ``shap_city_year`` is created.

    Raises:
        KeyError: if *df* lacks ``city`` or ``year`` (or *target_name*).
        ValueError: if *X* and *df* have different numbers of rows.
    """
    if not {"city", "year"}.issubset(df.columns):
        raise KeyError("df must contain 'city' and 'year' to export by city-year.")
    if len(X) != len(df):
        raise ValueError(
            f"X and df must be row-aligned: len(X)={len(X)} != len(df)={len(df)}"
        )

    SAVE_DIR = Path(out_dir_year) / "shap_city_year"
    SAVE_DIR.mkdir(parents=True, exist_ok=True)

    # Only the grouping keys are needed to derive row positions, so the full
    # frame is not copied.
    keys = df[["city", "year"]].copy()
    keys["_rowid"] = np.arange(len(df))
    groups = keys.groupby(["city", "year"], sort=True)

    feature_names = list(X.columns)
    meta_keep = [c for c in ["row", "col", "lon", "lat"] if c in df.columns] + [target_name]
    meta_src = df[meta_keep]

    tasks = []
    for (city, yr), g in groups:
        pos = g["_rowid"].to_numpy()
        if pos.size == 0:
            continue
        # `pos` holds positions, not labels, hence .iloc rather than .loc.
        Xg = X.iloc[pos]
        meta = meta_src.iloc[pos].rename(columns={target_name: "HI_true"})
        tasks.append((city, yr, pos, Xg, meta))

    results = Parallel(n_jobs=N_JOBS, backend="loky", prefer="processes")(
        delayed(_process_one_group)(
            city, yr, idx, Xg, meta,
            save_dir=SAVE_DIR,
            target_name=target_name,
            model=best,
            X_train=X_train,
            feature_names=feature_names,
        )
        for (city, yr, idx, Xg, meta) in tqdm(tasks, total=len(tasks), desc="SHAP city-year (parallel)")
    )

    errs = [(c, y, e) for (c, y, _, e) in results if e]
    if errs:
        print("⚠️ Some groups failed:")
        for c, y, e in errs[:10]:
            print(f"  {c}-{y}: {e}")
        if len(errs) > 10:
            print(f"  ... and {len(errs)-10} more")

    print(f"✅ Wrote per city×year SHAP CSVs to: {str(SAVE_DIR)}  (groups={len(tasks)}, n_jobs={N_JOBS})")

# ── Load all data once ───────────────────────────────────────────────────────
# The full table is read a single time and sliced per year inside the loop.
df_all = pd.read_csv(CSV)
if TARGET not in df_all.columns:
    raise KeyError(f"Target '{TARGET}' not found in CSV.")

summary_rows = []  # collect per-year metrics

# ── Train, evaluate, save, SHAP per year ─────────────────────────────────────
# Per year: random-search HPO with K-fold CV, early-stopped refit, held-out
# test evaluation, per-city diagnostics, then the SHAP export.
# NOTE(review): `gpu_hist`, `predictor` and `fit(early_stopping_rounds=...)`
# look like xgboost 1.x-era API — confirm against the installed version.
for YEAR in range(2003, 2021):
    print("\n" + "="*80)
    print(f"Year {YEAR}")

    df = df_all.loc[df_all["year"] == YEAR]
    if df.empty:
        print(f"No rows for {YEAR}, skipping.")
        continue

    if LIMIT is not None:
        df = df.iloc[:LIMIT]
    # dropna/reset_index return a new frame, so no explicit copies are needed.
    df = df.dropna(subset=[TARGET]).reset_index(drop=True)
    if df.empty:
        # Nothing to train on; train_test_split would raise on an empty frame.
        print(f"No rows with a non-null '{TARGET}' for {YEAR}, skipping.")
        continue

    drop_for_X = [TARGET] + NON_FEATURE + DROP_EXTRA
    X = df.drop(columns=drop_for_X, errors="ignore")
    X = X.apply(pd.to_numeric, errors="coerce")
    y = df[TARGET].astype(float)

    print(f"TARGET = {TARGET} | YEAR={YEAR} | n_samples={len(y)} | n_features={X.shape[1]}")
    print(f"Dropped from X: {sorted(set(drop_for_X) & set(df.columns))}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    # HPO: every sampled parameter set is scored by its mean K-fold R^2.
    # The validation fold also drives early stopping, so the CV score is
    # slightly optimistic; the held-out test split below is the honest number.
    best_score  = -np.inf
    best_params = None
    desc = f"HPO ({'GPU' if USE_GPU else 'CPU'}, {CV_FOLDS}-fold CV, {YEAR})"
    pbar = tqdm(sampler, total=len(sampler), desc=desc, unit="trial")
    # Same seeded splitter for every trial, so it is created once.
    kf = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=42)

    for params in pbar:
        cv_scores = []

        model = xgb.XGBRegressor(
            **params,
            objective="reg:squarederror",
            tree_method="gpu_hist" if USE_GPU else "hist",
            predictor="gpu_predictor" if USE_GPU else "auto",
            n_jobs=1,
            random_state=42
        )

        for tr_idx, val_idx in kf.split(X_train):
            X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                early_stopping_rounds=EARLY_STOP,
                verbose=False
            )
            y_val_pred = model.predict(X_val)
            cv_scores.append(r2_score(y_val, y_val_pred))

        mean_score = float(np.mean(cv_scores))
        if mean_score > best_score:
            best_score  = mean_score
            best_params = params
        pbar.set_postfix(best_R2=f"{best_score:.4f}", last_R2=f"{mean_score:.4f}")
    pbar.close()

    if best_params is None:
        # Happens when no trial was run or no trial produced a comparable
        # score; fail with a clear message instead of on `.items()` below.
        raise RuntimeError(
            f"Hyper-parameter search selected no parameters for {YEAR} "
            f"({len(sampler)} trial(s))."
        )

    print("\nBest CV R^2:", f"{best_score:.4f}")
    print("Best params:")
    for k, v in best_params.items():
        print(f"  {k}: {v}")

    # Refit with early stopping on an inner train/validation split of the
    # training data; the test split stays untouched until evaluation.
    best = xgb.XGBRegressor(
        **best_params,
        objective="reg:squarederror",
        tree_method="gpu_hist" if USE_GPU else "hist",
        predictor="gpu_predictor" if USE_GPU else "auto",
        n_jobs=NUM_CORES,
        random_state=42
    )
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, y_train, test_size=0.20, random_state=42
    )
    best.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=EARLY_STOP,
        verbose=True
    )
    best_iter = getattr(best, "best_iteration", None)
    if best_iter is not None:
        print("Early-stopping best_iteration:", best_iter)

    # Evaluate
    y_pred = best.predict(X_test)
    test_r2 = r2_score(y_test, y_pred)
    print(f"Test R^2 ({TARGET}, {YEAR}): {test_r2:.4f}")

    # Per-year output dir
    OUT_DIR_Y = f"{OUT_ROOT}_{YEAR}"
    os.makedirs(OUT_DIR_Y, exist_ok=True)

    # Save metrics & params & model
    metrics_row = {
        "year": YEAR,
        "n_train": int(len(X_train)),
        "n_test": int(len(X_test)),
        "best_cv_r2": float(best_score),
        "test_r2": float(test_r2),
        "best_iteration": int(best_iter) if best_iter is not None else None
    }
    pd.DataFrame([metrics_row]).to_csv(os.path.join(OUT_DIR_Y, f"metrics_{YEAR}.csv"), index=False)

    with open(os.path.join(OUT_DIR_Y, f"best_params_{YEAR}.json"), "w") as f:
        json.dump(best_params, f, indent=2)

    model_path = os.path.join(OUT_DIR_Y, f"model_xgb_{YEAR}.json")
    best.save_model(model_path)
    print(f"Saved model to {model_path}  (load with xgb.XGBRegressor().load_model(path))")

    summary_rows.append(metrics_row)

    # Per-city diagnostics on the held-out test rows (optional).
    if "city" in df.columns:
        test_idx = X_test.index
        df_test = df.loc[test_idx].copy()
        df_test["y_true"] = y_test.values
        df_test["y_pred"] = y_pred
        df_test["resid"]  = df_test["y_true"] - df_test["y_pred"]
        df_test["abs_err"] = df_test["resid"].abs()

        rows = []
        for city, g in df_test.groupby("city"):
            # R^2 is undefined when the city's true values are constant.
            r2c = r2_score(g["y_true"], g["y_pred"]) if g["y_true"].nunique() >= 2 else np.nan
            rmse = float(np.sqrt(np.mean(g["resid"] ** 2)))
            mae  = float(np.mean(g["abs_err"]))
            rows.append({"city": city, "n": len(g), "R2": r2c, "RMSE": rmse, "MAE": mae})
        # Explicit columns keep sort_values working even with no rows.
        city_metrics = pd.DataFrame(
            rows, columns=["city", "n", "R2", "RMSE", "MAE"]
        ).sort_values("R2", na_position="last")
        city_metrics.to_csv(os.path.join(OUT_DIR_Y, f"per_city_metrics_{YEAR}.csv"), index=False)

        # Leave-one-city-out: how much overall test R^2 changes without a city.
        overall_r2 = r2_score(df_test["y_true"], df_test["y_pred"])
        deltas = []
        for city, g in df_test.groupby("city"):
            mask = df_test["city"] != city
            if mask.sum() >= 2 and df_test.loc[mask, "y_true"].nunique() >= 2:
                r2_wo = r2_score(df_test.loc[mask, "y_true"], df_test.loc[mask, "y_pred"])
                deltas.append({"city": city, "n": len(g), "delta_R2_if_removed": r2_wo - overall_r2})
        # `deltas` is empty when only one city is present; without explicit
        # columns the sort below would raise KeyError on the missing column.
        pd.DataFrame(deltas, columns=["city", "n", "delta_R2_if_removed"])\
            .sort_values("delta_R2_if_removed", ascending=False)\
            .to_csv(os.path.join(OUT_DIR_Y, f"city_deltaR2_if_removed_{YEAR}.csv"), index=False)

        cols_to_keep = ["city", "year", "lon", "lat", "y_true", "y_pred", "resid", "abs_err"]
        keep = [c for c in cols_to_keep if c in df_test.columns]
        df_test.sort_values("abs_err", ascending=False).loc[:, keep].head(50)\
            .to_csv(os.path.join(OUT_DIR_Y, f"worst_rows_top50_{YEAR}.csv"), index=False)

    # ── SHAP export per city × year (files end with _{YEAR}) ─────────────────
    run_shap_export(df=df, X=X, X_train=X_train, best=best,
                    target_name=TARGET, out_dir_year=OUT_DIR_Y)

# Save overall summary across years
if summary_rows:
    # OUT_ROOT differs from OUT_DIR when OUT_DIR ends in _YYYY, so make sure
    # the directory exists before writing into it.
    os.makedirs(OUT_ROOT, exist_ok=True)
    summary_path = os.path.join(OUT_ROOT, "model_performance_summary.csv")
    pd.DataFrame(summary_rows).to_csv(summary_path, index=False)
    print(f"\n📄 Wrote per-year performance summary to {summary_path}")


In [None]:
## Yearly Trained Semiurban Model

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os, re, json, warnings, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm

from sklearn.model_selection import train_test_split, KFold, ParameterSampler
from sklearn.metrics import r2_score
import xgboost as xgb
import shap
from joblib import Parallel, delayed

# Render and save matplotlib figures at 120 DPI.
plt.rcParams["figure.dpi"] = 120
plt.rcParams["savefig.dpi"] = 120

# ── Paths ────────────────────────────────────────────────────────────────────
# Input table; it is read once further down and filtered on its "year" column.
CSV = "folder/Raw_Data_Urban_Semiurban_Extent/AllCities_pixels_2003_2020.csv"
# Base output location, created up front.
OUT_DIR = "folder/Global_XGBoost_Model/Semiurban_Year_Wise_Results/outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Strip a trailing _YYYY (19xx/20xx) if present; each year's directory is
# later built as f"{OUT_ROOT}_{YEAR}".
OUT_ROOT = re.sub(r"_(?:19|20)\d{2}$", "", OUT_DIR)

TARGET = "HI"  # predicted ensemble HI
# Columns removed from the feature matrix X before training.
# NOTE(review): "lon" is listed but "lat" is not, so a "lat" column, if
# present, stays in X as a feature — confirm this is intended.
NON_FEATURE = ["row", "col", "year", "lon", "city", "Residual_HI_1km"]
# Further columns dropped from X; names missing from the table are ignored.
DROP_EXTRA = ["LST", "WSA", "POP", "GEO", "RAD", "DIST2COAST", "DPTnorm", "IMP", "HI_obs_12to1km"]
# Maximum rows kept per year, taken from the top of that year's subset before
# rows with a missing target are dropped; None disables the cap.
LIMIT = 368_077
# NOTE(review): N_SHAP is not referenced anywhere in the visible code.
N_SHAP = 5000

USE_GPU    = True  # selects gpu_hist / gpu_predictor instead of hist / auto
NUM_CORES  = 64    # n_jobs of the final refit model
N_TRIALS   = 5     # number of hyper-parameter combinations sampled
CV_FOLDS   = 3     # K in the K-fold cross-validation used during HPO
EARLY_STOP = 50    # early_stopping_rounds passed to every fit

# Discrete search space for the random hyper-parameter search.
param_grid = {
    "n_estimators":      [300, 500, 800, 1200, 1500],
    "learning_rate":     [0.01, 0.03, 0.05, 0.1],
    "max_depth":         [3, 4, 5, 6, 8],
    "min_child_weight":  [1, 3, 5, 8],
    "subsample":         [0.6, 0.8, 1.0],
    "colsample_bytree":  [0.6, 0.8, 1.0],
    "gamma":             [0, 0.1, 0.3, 1.0],
    "reg_alpha":         [0.0, 0.1, 0.5, 1.0],
    "reg_lambda":        [0.5, 1.0, 2.0, 5.0],
}
# Drawn once with a fixed seed, so every year evaluates the same candidates.
sampler = list(ParameterSampler(param_grid, n_iter=N_TRIALS, random_state=42))

# ── SHAP export helpers (GPU-safe) ───────────────────────────────────────────
# Cap numerical library threads inside workers (avoid thread storms).
# setdefault leaves any value already present in the environment untouched.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
# If you want to hide GPUs for the SHAP stage entirely, uncomment:
# os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")

# Worker count for the parallel SHAP export: the N_JOBS environment variable
# if set, otherwise (CPU count - 1) with a floor of 1; never more than 8.
N_JOBS = int(os.environ.get("N_JOBS", max(1, (os.cpu_count() or 2) - 1)))
N_JOBS = min(N_JOBS, 8)
# Maximum number of rows handed to a single predict() call.
PRED_BATCH = 200_000
BG_SAMPLE_N = None  # set to e.g. 1000 for a lighter SHAP background

# Module-level cache populated and read by _get_explainer.
_GLOBAL = {"explainer": None, "model": None, "feature_names": None}

def _force_cpu_predictor(mdl):
    """Best-effort switch of *mdl* to single-threaded CPU prediction.

    The booster-level parameters are tried first. If that raises for any
    reason, the same intent is applied through the wrapper's ``set_params``.
    A failure of the fallback is reported as a warning, never raised.
    """
    cpu_settings = {'predictor': 'cpu_predictor', 'nthread': 1}
    try:
        mdl.get_booster().set_param(cpu_settings)
        return
    except Exception:
        # Deliberate: fall through to the wrapper-level parameters below.
        pass

    try:
        mdl.set_params(predictor='cpu_predictor')
        if hasattr(mdl, 'n_jobs'):
            mdl.set_params(n_jobs=1)
    except Exception as err:
        warnings.warn(f"Could not force CPU predictor: {err}")

def _predict_cpu_batched(mdl, X_block, batch=PRED_BATCH):
    """Predict on *X_block*, splitting it into chunks of at most *batch* rows.

    Inputs that fit in one chunk go through a single ``predict`` call;
    larger inputs are predicted slice by slice (via ``.iloc``) and the
    pieces are concatenated in order.
    """
    total = len(X_block)
    if total <= batch:
        return mdl.predict(X_block)
    pieces = [
        mdl.predict(X_block.iloc[start:start + batch])
        for start in range(0, total, batch)
    ]
    return np.concatenate(pieces)

def _get_explainer(model, X_train, feature_names):
    """Return a cached ``(explainer, model, feature_names)`` triple for *model*.

    The explainer is built on first use and whenever *model* is a different
    object from the cached one. Without that identity check the first model's
    explainer would be served for every later model handled in the same
    process (for example when the export runs in-process with one job), so
    later years would be explained and predicted with an earlier year's model.

    Args:
        model: fitted estimator to explain; it is switched to CPU prediction.
        X_train: background data for the explainer. When ``BG_SAMPLE_N`` is
            set and smaller than ``len(X_train)``, a fixed-seed sample of
            that size is used instead.
        feature_names: column names forwarded to the explainer.

    Returns:
        Tuple ``(explainer, model, feature_names)`` taken from the cache.
    """
    cache_is_stale = (
        _GLOBAL["explainer"] is None or _GLOBAL["model"] is not model
    )
    if cache_is_stale:
        _force_cpu_predictor(model)
        X_bg = X_train
        if BG_SAMPLE_N is not None and len(X_train) > BG_SAMPLE_N:
            X_bg = X_train.sample(BG_SAMPLE_N, random_state=42)
        explainer = shap.Explainer(model, X_bg, feature_names=feature_names)
        # Fill the cache only once the explainer exists, so a failed build
        # never leaves a half-updated entry behind.
        _GLOBAL["model"] = model
        _GLOBAL["feature_names"] = list(feature_names)
        _GLOBAL["explainer"] = explainer
    return _GLOBAL["explainer"], _GLOBAL["model"], _GLOBAL["feature_names"]

def _safe_city_name(city):
    """Turn *city* into a filename-friendly token.

    Every run of characters that are neither word characters nor ``-`` is
    replaced by a single ``_``; leading and trailing underscores are removed.
    """
    text = str(city)
    collapsed = re.sub(r"[^\w\-]+", "_", text)
    return collapsed.strip("_")

def _process_one_group(city, yr, idx, X_block, meta_block, *,
                       save_dir: Path, target_name: str,
                       model, X_train, feature_names):
    """Compute SHAP values for one (city, year) group and write two CSVs.

    Writes ``shap_<city>_<yr>.csv`` (metadata, prediction, base value and one
    SHAP column per feature for every row) and
    ``shap_summary_<city>_<yr>.csv`` (per-feature mean and mean-absolute
    SHAP) into *save_dir*.

    ``idx`` and ``target_name`` are accepted but not used in the body.

    Returns:
        ``(city, yr, n_rows, None)`` on success, or
        ``(city, yr, 0, repr(exc))`` if anything raised, so that one failing
        group does not abort the whole batch.
    """
    try:
        explainer, mdl, featnames = _get_explainer(model, X_train, feature_names)
        n_rows = X_block.shape[0]

        explanation = explainer(X_block)
        shap_vals = getattr(explanation, "values", np.array(explanation))
        base_vals = np.atleast_1d(
            getattr(explanation, "base_values", explainer.expected_value)
        )
        # A single base value is repeated so that every row carries one.
        if base_vals.size == 1:
            base_vals = np.repeat(base_vals, n_rows)

        shap_frame = pd.DataFrame(shap_vals, columns=featnames, index=X_block.index)

        row_info = meta_block.copy()
        row_info["city"] = city
        row_info["year"] = yr
        row_info["HI_pred"] = _predict_cpu_batched(mdl, X_block)
        row_info["base_value"] = base_vals

        # Both halves are re-indexed from 0 so they line up positionally.
        combined = pd.concat(
            [row_info.reset_index(drop=True), shap_frame.reset_index(drop=True)],
            axis=1,
        )

        city_slug = _safe_city_name(city)
        combined.to_csv(save_dir / f"shap_{city_slug}_{yr}.csv", index=False)

        summary = pd.DataFrame({
            "feature": featnames,
            "mean_shap": np.nanmean(shap_vals, axis=0),
            "mean_abs_shap": np.nanmean(np.abs(shap_vals), axis=0),
            "n_rows": n_rows,
            "city": city,
            "year": yr,
        })
        summary.to_csv(save_dir / f"shap_summary_{city_slug}_{yr}.csv", index=False)

        # Release the large intermediates before the next group is processed.
        del explanation, shap_vals, base_vals, shap_frame, combined, summary
        gc.collect()
        return (city, yr, n_rows, None)
    except Exception as exc:
        return (city, yr, 0, repr(exc))

def run_shap_export(df, X, X_train, best, target_name, out_dir_year):
    """Export SHAP values for every (city, year) group of *df* in parallel.

    One task per group is dispatched to ``_process_one_group`` through joblib;
    the per-group CSVs land in ``<out_dir_year>/shap_city_year``. Failures of
    individual groups are collected and printed (first ten), not raised.

    Rows are selected by position (``.iloc``), so *df* and *X* only have to be
    row-aligned; they do not need a default ``RangeIndex``.

    Args:
        df: source rows; must contain ``city``, ``year`` and *target_name*.
        X: feature matrix with exactly one row per row of *df*, same order.
        X_train: background data forwarded to the explainer.
        best: fitted model to explain.
        target_name: target column in *df*; exported as ``HI_true``.
        out_dir_year: directory under which ``shap_city_year`` is created.

    Raises:
        KeyError: if *df* lacks ``city`` or ``year`` (or *target_name*).
        ValueError: if *X* and *df* have different numbers of rows.
    """
    if not {"city", "year"}.issubset(df.columns):
        raise KeyError("df must contain 'city' and 'year' to export by city-year.")
    if len(X) != len(df):
        raise ValueError(
            f"X and df must be row-aligned: len(X)={len(X)} != len(df)={len(df)}"
        )

    SAVE_DIR = Path(out_dir_year) / "shap_city_year"
    SAVE_DIR.mkdir(parents=True, exist_ok=True)

    # Only the grouping keys are needed to derive row positions, so the full
    # frame is not copied.
    keys = df[["city", "year"]].copy()
    keys["_rowid"] = np.arange(len(df))
    groups = keys.groupby(["city", "year"], sort=True)

    feature_names = list(X.columns)
    meta_keep = [c for c in ["row", "col", "lon", "lat"] if c in df.columns] + [target_name]
    meta_src = df[meta_keep]

    tasks = []
    for (city, yr), g in groups:
        pos = g["_rowid"].to_numpy()
        if pos.size == 0:
            continue
        # `pos` holds positions, not labels, hence .iloc rather than .loc.
        Xg = X.iloc[pos]
        meta = meta_src.iloc[pos].rename(columns={target_name: "HI_true"})
        tasks.append((city, yr, pos, Xg, meta))

    results = Parallel(n_jobs=N_JOBS, backend="loky", prefer="processes")(
        delayed(_process_one_group)(
            city, yr, idx, Xg, meta,
            save_dir=SAVE_DIR,
            target_name=target_name,
            model=best,
            X_train=X_train,
            feature_names=feature_names,
        )
        for (city, yr, idx, Xg, meta) in tqdm(tasks, total=len(tasks), desc="SHAP city-year (parallel)")
    )

    errs = [(c, y, e) for (c, y, _, e) in results if e]
    if errs:
        print("⚠️ Some groups failed:")
        for c, y, e in errs[:10]:
            print(f"  {c}-{y}: {e}")
        if len(errs) > 10:
            print(f"  ... and {len(errs)-10} more")

    print(f"✅ Wrote per city×year SHAP CSVs to: {str(SAVE_DIR)}  (groups={len(tasks)}, n_jobs={N_JOBS})")

# ── Load all data once ───────────────────────────────────────────────────────
# The full table is read a single time and sliced per year inside the loop.
df_all = pd.read_csv(CSV)
if TARGET not in df_all.columns:
    raise KeyError(f"Target '{TARGET}' not found in CSV.")

summary_rows = []  # collect per-year metrics

# ── Train, evaluate, save, SHAP per year ─────────────────────────────────────
# Per year: random-search HPO with K-fold CV, early-stopped refit, held-out
# test evaluation, per-city diagnostics, then the SHAP export.
# NOTE(review): `gpu_hist`, `predictor` and `fit(early_stopping_rounds=...)`
# look like xgboost 1.x-era API — confirm against the installed version.
for YEAR in range(2003, 2021):
    print("\n" + "="*80)
    print(f"Year {YEAR}")

    df = df_all.loc[df_all["year"] == YEAR]
    if df.empty:
        print(f"No rows for {YEAR}, skipping.")
        continue

    if LIMIT is not None:
        df = df.iloc[:LIMIT]
    # dropna/reset_index return a new frame, so no explicit copies are needed.
    df = df.dropna(subset=[TARGET]).reset_index(drop=True)
    if df.empty:
        # Nothing to train on; train_test_split would raise on an empty frame.
        print(f"No rows with a non-null '{TARGET}' for {YEAR}, skipping.")
        continue

    drop_for_X = [TARGET] + NON_FEATURE + DROP_EXTRA
    X = df.drop(columns=drop_for_X, errors="ignore")
    X = X.apply(pd.to_numeric, errors="coerce")
    y = df[TARGET].astype(float)

    print(f"TARGET = {TARGET} | YEAR={YEAR} | n_samples={len(y)} | n_features={X.shape[1]}")
    print(f"Dropped from X: {sorted(set(drop_for_X) & set(df.columns))}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    # HPO: every sampled parameter set is scored by its mean K-fold R^2.
    # The validation fold also drives early stopping, so the CV score is
    # slightly optimistic; the held-out test split below is the honest number.
    best_score  = -np.inf
    best_params = None
    desc = f"HPO ({'GPU' if USE_GPU else 'CPU'}, {CV_FOLDS}-fold CV, {YEAR})"
    pbar = tqdm(sampler, total=len(sampler), desc=desc, unit="trial")
    # Same seeded splitter for every trial, so it is created once.
    kf = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=42)

    for params in pbar:
        cv_scores = []

        model = xgb.XGBRegressor(
            **params,
            objective="reg:squarederror",
            tree_method="gpu_hist" if USE_GPU else "hist",
            predictor="gpu_predictor" if USE_GPU else "auto",
            n_jobs=1,
            random_state=42
        )

        for tr_idx, val_idx in kf.split(X_train):
            X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                early_stopping_rounds=EARLY_STOP,
                verbose=False
            )
            y_val_pred = model.predict(X_val)
            cv_scores.append(r2_score(y_val, y_val_pred))

        mean_score = float(np.mean(cv_scores))
        if mean_score > best_score:
            best_score  = mean_score
            best_params = params
        pbar.set_postfix(best_R2=f"{best_score:.4f}", last_R2=f"{mean_score:.4f}")
    pbar.close()

    if best_params is None:
        # Happens when no trial was run or no trial produced a comparable
        # score; fail with a clear message instead of on `.items()` below.
        raise RuntimeError(
            f"Hyper-parameter search selected no parameters for {YEAR} "
            f"({len(sampler)} trial(s))."
        )

    print("\nBest CV R^2:", f"{best_score:.4f}")
    print("Best params:")
    for k, v in best_params.items():
        print(f"  {k}: {v}")

    # Refit with early stopping on an inner train/validation split of the
    # training data; the test split stays untouched until evaluation.
    best = xgb.XGBRegressor(
        **best_params,
        objective="reg:squarederror",
        tree_method="gpu_hist" if USE_GPU else "hist",
        predictor="gpu_predictor" if USE_GPU else "auto",
        n_jobs=NUM_CORES,
        random_state=42
    )
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, y_train, test_size=0.20, random_state=42
    )
    best.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=EARLY_STOP,
        verbose=True
    )
    best_iter = getattr(best, "best_iteration", None)
    if best_iter is not None:
        print("Early-stopping best_iteration:", best_iter)

    # Evaluate
    y_pred = best.predict(X_test)
    test_r2 = r2_score(y_test, y_pred)
    print(f"Test R^2 ({TARGET}, {YEAR}): {test_r2:.4f}")

    # Per-year output dir
    OUT_DIR_Y = f"{OUT_ROOT}_{YEAR}"
    os.makedirs(OUT_DIR_Y, exist_ok=True)

    # Save metrics & params & model
    metrics_row = {
        "year": YEAR,
        "n_train": int(len(X_train)),
        "n_test": int(len(X_test)),
        "best_cv_r2": float(best_score),
        "test_r2": float(test_r2),
        "best_iteration": int(best_iter) if best_iter is not None else None
    }
    pd.DataFrame([metrics_row]).to_csv(os.path.join(OUT_DIR_Y, f"metrics_{YEAR}.csv"), index=False)

    with open(os.path.join(OUT_DIR_Y, f"best_params_{YEAR}.json"), "w") as f:
        json.dump(best_params, f, indent=2)

    model_path = os.path.join(OUT_DIR_Y, f"model_xgb_{YEAR}.json")
    best.save_model(model_path)
    print(f"Saved model to {model_path}  (load with xgb.XGBRegressor().load_model(path))")

    summary_rows.append(metrics_row)

    # Per-city diagnostics on the held-out test rows (optional).
    if "city" in df.columns:
        test_idx = X_test.index
        df_test = df.loc[test_idx].copy()
        df_test["y_true"] = y_test.values
        df_test["y_pred"] = y_pred
        df_test["resid"]  = df_test["y_true"] - df_test["y_pred"]
        df_test["abs_err"] = df_test["resid"].abs()

        rows = []
        for city, g in df_test.groupby("city"):
            # R^2 is undefined when the city's true values are constant.
            r2c = r2_score(g["y_true"], g["y_pred"]) if g["y_true"].nunique() >= 2 else np.nan
            rmse = float(np.sqrt(np.mean(g["resid"] ** 2)))
            mae  = float(np.mean(g["abs_err"]))
            rows.append({"city": city, "n": len(g), "R2": r2c, "RMSE": rmse, "MAE": mae})
        # Explicit columns keep sort_values working even with no rows.
        city_metrics = pd.DataFrame(
            rows, columns=["city", "n", "R2", "RMSE", "MAE"]
        ).sort_values("R2", na_position="last")
        city_metrics.to_csv(os.path.join(OUT_DIR_Y, f"per_city_metrics_{YEAR}.csv"), index=False)

        # Leave-one-city-out: how much overall test R^2 changes without a city.
        overall_r2 = r2_score(df_test["y_true"], df_test["y_pred"])
        deltas = []
        for city, g in df_test.groupby("city"):
            mask = df_test["city"] != city
            if mask.sum() >= 2 and df_test.loc[mask, "y_true"].nunique() >= 2:
                r2_wo = r2_score(df_test.loc[mask, "y_true"], df_test.loc[mask, "y_pred"])
                deltas.append({"city": city, "n": len(g), "delta_R2_if_removed": r2_wo - overall_r2})
        # `deltas` is empty when only one city is present; without explicit
        # columns the sort below would raise KeyError on the missing column.
        pd.DataFrame(deltas, columns=["city", "n", "delta_R2_if_removed"])\
            .sort_values("delta_R2_if_removed", ascending=False)\
            .to_csv(os.path.join(OUT_DIR_Y, f"city_deltaR2_if_removed_{YEAR}.csv"), index=False)

        cols_to_keep = ["city", "year", "lon", "lat", "y_true", "y_pred", "resid", "abs_err"]
        keep = [c for c in cols_to_keep if c in df_test.columns]
        df_test.sort_values("abs_err", ascending=False).loc[:, keep].head(50)\
            .to_csv(os.path.join(OUT_DIR_Y, f"worst_rows_top50_{YEAR}.csv"), index=False)

    # ── SHAP export per city × year (files end with _{YEAR}) ─────────────────
    run_shap_export(df=df, X=X, X_train=X_train, best=best,
                    target_name=TARGET, out_dir_year=OUT_DIR_Y)

# Save overall summary across years
if summary_rows:
    # OUT_ROOT differs from OUT_DIR when OUT_DIR ends in _YYYY, so make sure
    # the directory exists before writing into it.
    os.makedirs(OUT_ROOT, exist_ok=True)
    summary_path = os.path.join(OUT_ROOT, "model_performance_summary.csv")
    pd.DataFrame(summary_rows).to_csv(summary_path, index=False)
    print(f"\n📄 Wrote per-year performance summary to {summary_path}")
