# 06 – Target shortlist (High anomaly, manageable uncertainty)
Trains a calibrated random forest (RF), combines its probabilities with the Bayesian mean and uncertainty,
and writes data/processed/targets.csv: the top 50 grid rows by RF probability among those whose Bayesian uncertainty falls in the lower 60%.


In [2]:

import os
import numpy as np
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from pathlib import Path

# Build the target shortlist: fit a calibrated random forest on the stacked
# feature arrays, join its probabilities with the Bayesian mean/std arrays,
# keep rows at or below the 60th percentile of Bayesian std, and write the
# 50 highest-RF-probability rows to data/processed/targets.csv.
project_root = Path.cwd().parent
processed_dir = project_root / 'data/processed'

# Fail fast: the Bayesian outputs are required further down, so check for and
# load them before spending time fitting the 500-tree calibrated forest.
mean_path = processed_dir / 'mean_probs.npy'
std_path  = processed_dir / 'std_probs.npy'
if not (mean_path.exists() and std_path.exists()):
    raise FileNotFoundError("Run 04_bayesian_logreg.ipynb to produce mean/std arrays.")

bayes_mean = np.load(mean_path)
bayes_std  = np.load(std_path)

# Load required inputs.
Xc = np.load(processed_dir / 'X_coords.npy')
y  = np.load(processed_dir / 'y_labels.npy')
# NOTE: joblib.load unpickles arbitrary objects; only load files produced by
# this project's own pipeline.
grid = joblib.load(processed_dir / 'grid_gdf.joblib')

# Optional feature arrays: (file name, reshape to a single column?).
# A file is used only if it exists and has one row per row of Xc.
optional_features = [
    ('X_geo.npy', False),
    ('X_gravity.npy', True),
    ('X_gravity_grad.npy', True),
]

parts = [Xc]
for feature_file, as_column in optional_features:
    feature_path = processed_dir / feature_file
    if not feature_path.exists():
        continue
    feature = np.load(feature_path)
    if as_column:
        feature = feature.reshape(-1, 1)
    if feature.shape[0] == Xc.shape[0]:
        parts.append(feature)
    else:
        # Still best-effort, but make the dropped feature visible instead of
        # silently training on fewer columns.
        print(f"Skipping {feature_file}: {feature.shape[0]} rows, expected {Xc.shape[0]}")

X = np.hstack(parts)

# Calibrated RF (isotonic)
rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
cal = CalibratedClassifierCV(rf, method="isotonic", cv=5)
cal.fit(X, y)
# NOTE(review): probabilities are predicted on the same rows used for fitting,
# so these are in-sample scores — confirm that is intended for ranking.
rf_probs = cal.predict_proba(X)[:, 1]

# Assemble and filter
df = grid.copy()
df['rf_p'] = rf_probs
df['bayes_mean'] = bayes_mean
df['bayes_std'] = bayes_std

# 60th percentile of Bayesian std (NaNs ignored); rows at or below it are kept.
unc_thresh = float(np.nanquantile(df['bayes_std'], 0.6))  # lower 60% uncertainty
targets = (df.query('bayes_std <= @unc_thresh')
             .sort_values('rf_p', ascending=False)
             .head(50)
             .copy())

# Write a portable CSV (centroids instead of WKT geometries)
# NOTE(review): centroid x/y are labelled lon/lat; this presumes the grid is in
# a geographic CRS — TODO confirm.
centroids = targets.geometry.centroid
targets['lon'] = centroids.x
targets['lat'] = centroids.y
out_cols = ['lon', 'lat', 'rf_p', 'bayes_mean', 'bayes_std']
processed_dir.mkdir(parents=True, exist_ok=True)
targets[out_cols].to_csv(processed_dir / 'targets.csv', index=False)
print("Saved data/processed/targets.csv (top 50 with moderate uncertainty)")


FileNotFoundError: Run 04_bayesian_logreg.ipynb to produce mean/std arrays.