# 03b – Ablation (ROC AUC & PR AUC) + simple spatial CV
Compares feature sets, evaluating each with both ROC AUC and PR AUC (PR AUC is the more informative metric when classes are imbalanced).
Saves a CSV summary to `figures/ablation_scores.csv`.

In [1]:
%pip install -q -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import numpy as np
import joblib
import pandas as pd
from pathlib import Path
import sys

# Add the project root to sys.path so 'src' can be imported
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
	sys.path.append(str(project_root))
 
from src.models import train_baseline_rf
from src.visualization import plot_probability_map


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier

# Coordinate features; every ablation variant built below starts from this array.
Xc = np.load(project_root / 'data/processed/X_coords.npy')

# Prefer the "critical" label file when it is present on disk; otherwise fall
# back to the full label file. LABEL_SET records which one was picked.
crit_labels_path = project_root / "data/processed/y_labels_crit.npy"
LABEL_SET = "critical" if os.path.exists(crit_labels_path) else "all"
if LABEL_SET == "critical":
    y = np.load(crit_labels_path)
else:
    y = np.load(project_root / "data/processed/y_labels.npy")

# Quick sanity report: number of positives and positive rate of the chosen labels.
n_positive = y.sum()
positive_rate = float(y.mean())
print("Using label set:", LABEL_SET, "| positives:", n_positive, "| rate:", positive_rate)

# Grid object; the spatial CV further down reprojects it and reads its geometry.
grid = joblib.load(project_root / "data/processed/grid_gdf.joblib")
# Build cumulative feature variants.
#
# Each optional layer that exists on disk is stacked on top of the richest
# variant built so far, so a missing intermediate file never causes an
# already-added layer to be dropped from later variants. (The previous
# hand-written fallback chain for the "+mag" variant skipped
# "coords+geo+grav+grad", silently losing the gradient column whenever the
# geochem file was absent.)
#
# NOTE: the dictionary keys are fixed labels kept for compatibility with the
# saved CSV; if an intermediate file is missing, the label of a later variant
# still mentions that layer even though it is not part of the stacked arrays.
processed_dir = project_root / "data/processed"
variants = {"coords": [Xc]}

# (variant label, file name, reshape 1-D array into a single column?) in stacking order.
optional_layers = [
    ("coords+geo", "X_geo.npy", False),
    ("coords+geo+grav", "X_gravity.npy", True),
    ("coords+geo+grav+grad", "X_gravity_grad.npy", True),
    ("coords+geo+grav+grad+geochem", "X_geochem.npy", False),
    ("coords+geo+grav+grad+geochem+mag", "X_mag.npy", False),
]

base = [Xc]
for label, file_name, as_column in optional_layers:
    layer_path = processed_dir / file_name
    if not os.path.exists(layer_path):
        continue
    layer = np.load(layer_path)
    if as_column:
        # Gravity layers are reshaped to (n, 1) so they can be hstack-ed with 2-D blocks.
        layer = layer.reshape(-1, 1)
    # Build a new list each time so earlier variants keep their own part lists.
    base = base + [layer]
    variants[label] = base


def make_X(parts):
    """Stack the feature blocks in *parts* side by side into one array.

    Thin wrapper around ``np.hstack``: 2-D blocks are joined column-wise,
    1-D blocks are concatenated end to end.
    """
    stacked = np.hstack(parts)
    return stacked

# (A) Stratified 5-fold CV
# Shuffled folds with a fixed seed; the same splitter is reused for every variant.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rows = []
for name, parts in variants.items():
    X = make_X(parts)
    fold_scores = []  # one (roc_auc, pr_auc) pair per fold
    for train_idx, test_idx in skf.split(X, y):
        model = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
        model.fit(X[train_idx], y[train_idx])
        # Column 1 of predict_proba is used as the score for both metrics.
        scores = model.predict_proba(X[test_idx])[:, 1]
        fold_scores.append((
            roc_auc_score(y[test_idx], scores),
            average_precision_score(y[test_idx], scores),
        ))
    roc_per_fold, pr_per_fold = zip(*fold_scores)
    # Report the unweighted mean over folds for each metric.
    rows.append({
        "variant": name,
        "cv": "stratified-5x",
        "roc_auc": float(np.mean(roc_per_fold)),
        "pr_auc": float(np.mean(pr_per_fold)),
    })

# --- Spatial block CV (4×4 LOBO): block assignment + diagnostics ---

# Separate name for the binary labels so they cannot be confused with the
# y-coordinate arrays defined below.
y_labels = y

# 1) Reproject to EPSG:5070 and take centroid coordinates (meters)
grid_proj = grid.to_crs("EPSG:5070")
centroids = grid_proj.geometry.centroid
x_cent, y_cent = centroids.x.values, centroids.y.values

# 2) Quartile edges per axis, so the four strips along each axis hold
#    roughly the same number of cells
quartile_levels = [0, 0.25, 0.5, 0.75, 1.0]
xbins = np.quantile(x_cent, quartile_levels)
ybins = np.quantile(y_cent, quartile_levels)

# digitize returns 1-based bin numbers and places the axis maximum one bin
# past the last edge; subtracting 1 and clipping maps every cell into 0..3.
xi = (np.digitize(x_cent, xbins) - 1).clip(0, 3)
yi = (np.digitize(y_cent, ybins) - 1).clip(0, 3)
block_id = 4 * yi + xi  # 0..15

# 3) Diagnostics: cell count, positives and negatives in each of the 16 blocks
blk_total = np.bincount(block_id, minlength=16)
blk_pos = np.bincount(block_id, weights=y_labels, minlength=16)  # label-weighted count
blk_neg = blk_total - blk_pos
print("Spatial 4x4 block totals:", blk_total.tolist())
print("Spatial 4x4 block positives:", blk_pos.astype(int).tolist())
print("Spatial 4x4 block negatives:", blk_neg.astype(int).tolist())

def has_both_classes(idx):
    """Tell whether the rows selected by *idx* contain both label values.

    Reads the module-level ``y_labels`` array. An empty selection counts as
    not having both classes. The check compares the label sum with the
    selection size, so it assumes 0/1 labels.
    """
    n_selected = len(idx)
    if n_selected == 0:
        return False
    n_positive = y_labels[idx].sum()
    # At least one positive, and not every selected row positive.
    return 0 < n_positive < n_selected

# Leave-one-block-out CV: each of the 16 spatial blocks is held out in turn.
rows_spatial = []
for name, parts in variants.items():
    X = np.hstack(parts)
    roc_scores, pr_scores = [], []
    for b in range(16):
        held_out = block_id == b
        te = np.flatnonzero(held_out)
        tr = np.flatnonzero(~held_out)
        # Both metrics need both classes in the test block, and the model
        # needs both classes to train on; skip the block otherwise.
        if not (has_both_classes(te) and has_both_classes(tr)):
            continue

        clf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
        clf.fit(X[tr], y_labels[tr])
        p = clf.predict_proba(X[te])[:, 1]

        roc_scores.append(roc_auc_score(y_labels[te], p))
        pr_scores.append(average_precision_score(y_labels[te], p))

    # One score is appended per evaluated block, so the list length is the
    # number of blocks that were actually used.
    used_blocks = len(roc_scores)
    if not roc_scores:
        # No block was usable: record NaN rather than averaging an empty list.
        summary = {
            "variant": name,
            "cv": "spatial-LOBO(4x4) used_blocks=0",
            "roc_auc": np.nan,
            "pr_auc":  np.nan,
        }
    else:
        summary = {
            "variant": name,
            "cv": f"spatial-LOBO(4x4) used_blocks={used_blocks}",
            "roc_auc": float(np.mean(roc_scores)),
            "pr_auc":  float(np.mean(pr_scores)),
        }
    rows_spatial.append(summary)

# Append the spatial results to the stratified ones for a single summary table.
rows += rows_spatial

# Summary table: grouped by CV scheme, best ROC AUC first within each scheme.
df = pd.DataFrame(rows).sort_values(["cv", "roc_auc"], ascending=[True, False])
print(df.to_string(index=False))

# Create the directory the CSV is actually written to. The notebook runs from
# a sub-folder of the project (project_root is cwd's parent), so a bare
# "figures" would be created next to the notebook rather than under
# project_root, and to_csv would fail if project_root/figures did not exist.
figures_dir = project_root / "figures"
os.makedirs(figures_dir, exist_ok=True)
df.to_csv(figures_dir / "ablation_scores.csv", index=False)
print("Saved figures/ablation_scores.csv")

Using label set: critical | positives: 3162 | rate: 0.4135495684017787
Spatial 4x4 block totals: [319, 386, 390, 817, 346, 449, 460, 656, 453, 516, 533, 409, 794, 560, 528, 30]
Spatial 4x4 block positives: [166, 159, 185, 267, 105, 164, 242, 378, 225, 200, 322, 227, 303, 114, 93, 12]
Spatial 4x4 block negatives: [153, 227, 205, 550, 241, 285, 218, 278, 228, 316, 211, 182, 491, 446, 435, 18]
                         variant                               cv  roc_auc   pr_auc
coords+geo+grav+grad+geochem+mag spatial-LOBO(4x4) used_blocks=16 0.634547 0.552882
            coords+geo+grav+grad spatial-LOBO(4x4) used_blocks=16 0.618328 0.532575
    coords+geo+grav+grad+geochem spatial-LOBO(4x4) used_blocks=16 0.613070 0.530901
                 coords+geo+grav spatial-LOBO(4x4) used_blocks=16 0.605786 0.523955
                      coords+geo spatial-LOBO(4x4) used_blocks=16 0.593128 0.503629
                          coords spatial-LOBO(4x4) used_blocks=16 0.582689 0.496634
coords+geo+grav+gr