# Calibrator Debug Notebook

This notebook helps you debug why the **LR calibrator** is producing unexpectedly low probabilities.

It focuses on the most common root causes:
- reading the wrong `predict_proba` column (the calibrator's `classes_` order is not `[0, 1]`, so column 1 is not the fake class)
- wrong calibrator file loaded
- feature mismatch / distribution shift
- extreme disagreement between CNN and calibrator

It will:
1. Load calibrator and print `classes_`, coefficients, intercept.
2. Define a safe `proba_fake()` helper (select by class label).
3. Score a handful of files and compare **CNN-only** vs **calibrated**.
4. Recompute calibrator probabilities two ways (`[:,1]` vs `proba_fake`) to detect column mistakes.


In [None]:
# ---- Imports ----
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import joblib

# Your scorer module (must exist)
from src.inference.scorer import AudioSpoofScorer, ScorerConfig


In [None]:
# ---- Paths / config ----
# Locations of the model artifacts and evaluation data used by the cells below.
# Relative paths are resolved against the notebook's current working directory.

# CNN checkpoint passed to AudioSpoofScorer(cnn_model_path=...).
CNN_MODEL = "models/cnn/audio_cnn_balanced_best.pt"

# Point this at the exact calibrator you think you are using
# (it is loaded directly with joblib for inspection and also passed to the scorer).
CALIBRATOR_PATH = "models/calibrators/agg_lr_real_or_fake_new.joblib"

# Device string passed to AudioSpoofScorer(device=...).
DEVICE = "cpu"

# Dataset dirs (must have real/ and fake/ subfolders) for quick comparison
REAL_OR_FAKE_DIR = "data/audio/processed/real_or_fake"
# Leaving this as None skips the optional ASVspoof comparison cell.
ASVSPOOF_DIR = None  # optionally set e.g. "data/audio/processed/asvspoof_2021_df"

# Sampling
# Upper bound on files drawn from each of real/ and fake/ (sample_files n_per_class).
N_PER_CLASS = 50
# Seed for the random.Random instance that picks the sample, so runs are repeatable.
SEED = 42

# Fail-fast visibility: a False here means a later cell cannot find that path.
print("CNN_MODEL exists:", Path(CNN_MODEL).exists())
print("CALIBRATOR exists:", Path(CALIBRATOR_PATH).exists())
print("REAL_OR_FAKE_DIR exists:", Path(REAL_OR_FAKE_DIR).exists())


In [None]:
# ---- Load calibrator and inspect ----
# SECURITY NOTE: joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only point CALIBRATOR_PATH at artifacts you trust.
obj = joblib.load(CALIBRATOR_PATH)

# The artifact is either the bare estimator or a dict bundle that keeps the
# estimator under "aggregator" (optionally next to "feature_names").
cal = obj["aggregator"] if isinstance(obj, dict) and "aggregator" in obj else obj

print("Calibrator type:", type(cal))
print("classes_:", getattr(cal, "classes_", None))
# scikit-learn estimators record the fitted input width here; None means the
# loaded object does not expose it.
print("n_features_in_:", getattr(cal, "n_features_in_", None))

# Feature names matter for diagnosing a feature mismatch whatever the estimator
# type is, so show them even when the estimator has no coef_.
if isinstance(obj, dict) and "feature_names" in obj:
    print("feature_names:", obj["feature_names"])

# Coefs / intercept (for LogisticRegression / linear models)
if hasattr(cal, "coef_"):
    print("coef_.shape:", cal.coef_.shape)
    print("coef_:", cal.coef_)
    print("intercept_:", cal.intercept_)
else:
    print("No coef_ (might be a pipeline or wrapped estimator).")


In [None]:
# ---- Safe probability extraction: pick the column for 'fake' by label ----
def proba_fake(calibrator, X):
    """Return the 'fake' column of ``calibrator.predict_proba(X)``.

    The column is located through ``calibrator.classes_`` rather than assumed
    to be index 1. Labels are tried in this order: the integer ``1``, then the
    strings ``"fake"``, ``"spoof"``, ``"bonafide_fake"`` and ``"1"``.

    Raises:
        ValueError: if none of those labels appears in ``classes_`` (including
            the case where the calibrator has no ``classes_`` attribute).
    """
    proba = calibrator.predict_proba(X)
    classes = list(getattr(calibrator, "classes_", []))

    # Integer label first, then the known string spellings, in priority order.
    for fake_label in (1, "fake", "spoof", "bonafide_fake", "1"):
        if fake_label in classes:
            return proba[:, classes.index(fake_label)]

    raise ValueError(f"Cannot find fake class in calibrator.classes_: {classes}")

# Sanity check on an all-zero dummy batch.
# Size the batch from the fitted estimator when it exposes n_features_in_, so a
# calibrator trained on a different feature count is reported instead of
# crashing this cell. Fall back to 5, the number of aggregate features this
# notebook feeds the calibrator.
n_features = int(getattr(cal, "n_features_in_", 5))
if n_features != 5:
    print(f"⚠️ Calibrator expects {n_features} features, but this notebook builds 5. "
          "Check that the right calibrator file is loaded.")

X_dummy = np.zeros((3, n_features), dtype=np.float32)

proba_dummy = cal.predict_proba(X_dummy)
print("predict_proba shape:", proba_dummy.shape)

# Column 1 only exists when the calibrator was fitted on at least two classes.
if proba_dummy.ndim == 2 and proba_dummy.shape[1] > 1:
    print("predict_proba[:,1] sample:", proba_dummy[:, 1])
else:
    print("predict_proba has no column 1 (single-class calibrator?).")

try:
    print("proba_fake() sample:", proba_fake(cal, X_dummy))
except Exception as e:
    print("proba_fake() failed:", e)


In [None]:
# ---- Instantiate scorer (calibrator ON) ----
# Build the scoring configuration first so its settings are easy to read and tweak.
_scorer_config = ScorerConfig(
    n_segments=6,
    clip_seconds=4.0,
    cnn_agg="median",
    low_thr=0.3,
    high_thr=0.7,
)

# Same CNN checkpoint, calibrator file and device as configured at the top.
scorer = AudioSpoofScorer(
    cnn_model_path=CNN_MODEL,
    calibrator_path=CALIBRATOR_PATH,
    device=DEVICE,
    config=_scorer_config,
)

# Confirm the scorer really holds a calibrator and show its class order.
print("Scorer calibrator loaded:", scorer.calibrator is not None)
print("Scorer calibrator classes_:", getattr(scorer.calibrator, "classes_", None))


In [None]:
# ---- Helper: list & sample files ----
import random

# File suffixes (lower-case, with the dot) that count as audio.
AUDIO_EXTS = {".wav", ".flac", ".mp3", ".ogg", ".m4a"}


def list_audio_files(root: Path):
    """Return all audio files below ``root`` (searched recursively), sorted by path.

    A file qualifies when its lower-cased suffix is in ``AUDIO_EXTS``.
    """
    return sorted(p for p in root.rglob("*") if p.is_file() and p.suffix.lower() in AUDIO_EXTS)


def sample_files(base_dir: str, n_per_class: int = 50, seed: int = 42):
    """Collect labelled audio files from ``base_dir/real`` and ``base_dir/fake``.

    Args:
        base_dir: directory expected to contain ``real/`` and ``fake/`` subfolders.
        n_per_class: maximum number of files kept per class. When a class has
            more files, a random subset of this size is drawn. ``None`` keeps
            every file.
        seed: seed for the ``random.Random`` instance used for the draw, so the
            same seed gives the same sample.

    Returns:
        A list of ``(path, label)`` tuples, real files (label 0) first and fake
        files (label 1) second, each group sorted by path.

    Warns:
        UserWarning: when a class folder is missing or holds no audio files.
            The function still returns what it found, but every downstream
            comparison would otherwise be silently one-sided.
    """
    import warnings

    base = Path(base_dir)
    rng = random.Random(seed)

    labelled = []
    # Real is processed before fake so the shared RNG is consumed in a fixed order.
    for folder, label in (("real", 0), ("fake", 1)):
        class_dir = base / folder
        found = list_audio_files(class_dir) if class_dir.is_dir() else []
        if not found:
            warnings.warn(
                f"No audio files found under {class_dir}; class {label} will be missing from the sample.",
                stacklevel=2,
            )
        if n_per_class is not None and len(found) > n_per_class:
            # rng.sample returns the picks in random order, so restore path order.
            found = sorted(rng.sample(found, n_per_class))
        labelled.extend((p, label) for p in found)

    return labelled

# Draw the evaluation sample for the real_or_fake dataset.
files_rof = sample_files(REAL_OR_FAKE_DIR, n_per_class=N_PER_CLASS, seed=SEED)
print("Sampled files:", len(files_rof), "(real+fake)")
# Indexing an empty sample would raise IndexError, so report the likely cause instead.
if files_rof:
    print("Example:", files_rof[0])
else:
    print("No audio files found under", REAL_OR_FAKE_DIR, "- check that real/ and fake/ exist and contain audio.")


In [None]:
# ---- Score files and compare CNN vs calibrator (API output) ----
# Signals every usable row must carry. A report without one of them is recorded
# as a failed row rather than crashing the loop on float(None).
REQUIRED_SIGNALS = ("cnn_only_score", "cnn_median", "cnn_max", "cnn_var", "total_seconds", "silence_ratio")

# Fixed column set so the frame has every column even when no file was scored.
RESULT_COLUMNS = [
    "path", "y_true", "ok", "error",
    "cnn_only_score", "calibrated_score_api",
    "cnn_median", "cnn_max", "cnn_var", "total_seconds", "silence_ratio",
    "effective_clip_seconds", "effective_n_segments",
]

rows = []
for p, y in files_rof:
    rep = scorer.score_file(str(p), threshold=None)  # no decision needed
    if not rep.get("ok", False):
        rows.append({"path": str(p), "y_true": y, "ok": False, "error": rep.get("error")})
        continue

    sig = rep.get("signals") or {}
    meta = rep.get("meta") or {}

    missing = [name for name in REQUIRED_SIGNALS if sig.get(name) is None]
    if missing:
        rows.append({"path": str(p), "y_true": y, "ok": False, "error": f"missing signals: {missing}"})
        continue

    row = {"path": str(p), "y_true": y, "ok": True}
    row.update({name: float(sig.get(name)) for name in REQUIRED_SIGNALS})
    # The calibrated score is optional: keep the row and mark it NaN when absent.
    cal_score = sig.get("calibrated_score")
    row["calibrated_score_api"] = float(cal_score) if cal_score is not None else np.nan
    row["effective_clip_seconds"] = meta.get("effective_clip_seconds")
    row["effective_n_segments"] = meta.get("effective_n_segments")
    rows.append(row)

df = pd.DataFrame(rows, columns=RESULT_COLUMNS)
# astype(bool) keeps the mask boolean even when the frame is empty.
df_ok = df[df["ok"].astype(bool)].copy()
print(f"Scored OK: {len(df_ok)} / {len(df)} files")
df_ok.head()


In [None]:
# ---- Recompute calibrator probs locally using BOTH methods ----
# Feature matrix for the calibrator: one row per scored file, columns in the order listed.
X = df_ok[
    ["cnn_median", "cnn_max", "cnn_var", "total_seconds", "silence_ratio"]
].to_numpy(dtype=np.float32)

# Naive read of column 1 versus the label-aware lookup done by proba_fake().
p_col1 = cal.predict_proba(X)[:, 1]
p_safe = proba_fake(cal, X)

# Attach both variants next to the scorer's own output for side-by-side comparison.
df_ok = df_ok.assign(
    cal_proba_col1=p_col1.astype(float),
    cal_proba_safe=p_safe.astype(float),
)

df_ok[["cnn_only_score", "calibrated_score_api", "cal_proba_col1", "cal_proba_safe"]].head(15)


In [None]:
# ---- Which method matches the scorer output? ----
# Mean absolute difference above which a local recomputation is considered NOT
# to reproduce the scorer output. Generous enough to absorb float32/float64 noise.
MATCH_TOL = 1e-3

api_scores = df_ok["calibrated_score_api"]
if api_scores.notna().any():
    diff_col1 = float(np.nanmean(np.abs(api_scores - df_ok["cal_proba_col1"])))
    diff_safe = float(np.nanmean(np.abs(api_scores - df_ok["cal_proba_safe"])))
else:
    # nanmean over an all-NaN column would only emit a RuntimeWarning and return NaN.
    diff_col1 = diff_safe = float("nan")

print("Mean abs diff vs scorer - [:,1]:", diff_col1)
print("Mean abs diff vs scorer - proba_fake:", diff_safe)

if np.isnan(diff_safe):
    print("⚠️ The scorer returned no calibrated scores, so there is nothing to compare. "
          "Check that the scorer actually loaded the calibrator.")
elif min(diff_col1, diff_safe) > MATCH_TOL:
    print("⚠️ Neither [:,1] nor proba_fake reproduces the scorer output. "
          "Suspect a different calibrator file or different features / feature order inside the scorer.")
elif np.allclose(df_ok["cal_proba_col1"], df_ok["cal_proba_safe"]):
    # Both methods read the same column, so a tie says nothing against the scorer.
    print("✅ [:,1] and proba_fake give identical probabilities and match the scorer; "
          "the column choice is not the problem.")
elif diff_safe < diff_col1:
    print("✅ Scorer output matches class-aware probability (or safe column).")
else:
    print("⚠️ Scorer output matches [:,1]. If classes_ isn't [0,1], this can be wrong.")


In [None]:
# ---- Disagreement analysis ----
# delta is the absolute gap between the CNN-only score and the scorer's calibrated score.
_score_gap = df_ok["cnn_only_score"] - df_ok["calibrated_score_api"]
df_ok["delta"] = _score_gap.abs()
print(df_ok["delta"].describe())

# Show the biggest CNN vs calibrator disagreements
_ranked = df_ok.sort_values("delta", ascending=False)
_ranked[
    ["path", "y_true", "cnn_only_score", "calibrated_score_api", "delta", "total_seconds", "silence_ratio", "cnn_var"]
].head(25)


In [None]:
# ---- Visualize score distributions ----
def _overlay_class_hists(score_col, real_label, fake_label, title):
    """Draw overlaid histograms of ``df_ok[score_col]`` for real (0) and fake (1) files."""
    real_scores = df_ok.loc[df_ok["y_true"] == 0, score_col].values
    fake_scores = df_ok.loc[df_ok["y_true"] == 1, score_col].values

    plt.figure()
    plt.hist(real_scores, bins=30, alpha=0.7, label=real_label)
    plt.hist(fake_scores, bins=30, alpha=0.7, label=fake_label)
    plt.title(title)
    plt.xlabel(score_col)
    plt.ylabel("count")
    plt.legend()
    plt.show()


# Raw CNN score first, then the calibrated score as reported by the scorer.
_overlay_class_hists(
    "cnn_only_score", "real (cnn)", "fake (cnn)",
    "CNN-only score distribution (real_or_fake)",
)
_overlay_class_hists(
    "calibrated_score_api", "real (cal)", "fake (cal)",
    "Calibrated score distribution (scorer output) (real_or_fake)",
)


In [None]:
# ---- Optional: run the same comparison for ASVSPOOF_DIR ----
if ASVSPOOF_DIR:
    files_asv = sample_files(ASVSPOOF_DIR, n_per_class=N_PER_CLASS, seed=SEED)

    # Signals a row needs before it can be converted to float and plotted.
    asv_required = ("cnn_only_score", "total_seconds", "silence_ratio", "cnn_var")
    # Fixed columns so the frame keeps its shape even when nothing was scored.
    asv_columns = [
        "path", "y_true", "cnn_only_score", "calibrated_score_api",
        "total_seconds", "silence_ratio", "cnn_var",
    ]

    rows2 = []
    n_skipped = 0  # failed or incomplete reports, counted instead of vanishing silently
    for p, y in files_asv:
        rep = scorer.score_file(str(p), threshold=None)
        sig = rep.get("signals") or {}
        if not rep.get("ok", False) or any(sig.get(name) is None for name in asv_required):
            n_skipped += 1
            continue
        cal_score = sig.get("calibrated_score")
        rows2.append({
            "path": str(p),
            "y_true": y,
            "cnn_only_score": float(sig.get("cnn_only_score")),
            "calibrated_score_api": float(cal_score) if cal_score is not None else np.nan,
            "total_seconds": float(sig.get("total_seconds")),
            "silence_ratio": float(sig.get("silence_ratio")),
            "cnn_var": float(sig.get("cnn_var")),
        })
    df_asv = pd.DataFrame(rows2, columns=asv_columns)
    print("ASV rows:", len(df_asv))
    if n_skipped:
        print("ASV files skipped (scoring failed or signals missing):", n_skipped)
    display(df_asv.head())

    if df_asv.empty:
        print("No ASV files were scored successfully; skipping plots.")
    else:
        plt.figure()
        plt.hist(df_asv[df_asv["y_true"]==0]["cnn_only_score"].values, bins=30, alpha=0.7, label="real (cnn)")
        plt.hist(df_asv[df_asv["y_true"]==1]["cnn_only_score"].values, bins=30, alpha=0.7, label="fake (cnn)")
        plt.title("CNN-only score distribution (ASV)")
        plt.legend(); plt.show()

        plt.figure()
        plt.hist(df_asv[df_asv["y_true"]==0]["calibrated_score_api"].values, bins=30, alpha=0.7, label="real (cal)")
        plt.hist(df_asv[df_asv["y_true"]==1]["calibrated_score_api"].values, bins=30, alpha=0.7, label="fake (cal)")
        plt.title("Calibrated score distribution (ASV)")
        plt.legend(); plt.show()
else:
    print("ASVSPOOF_DIR is None. Set it above if you want to test ASV dataset too.")


## What to do with the results

- If `classes_` is not `[0, 1]` (with `1` meaning fake), and the scorer uses `[:, 1]`, fix the scorer to select by class label.
- If the calibrator distribution is clearly wrong for short clips, gate calibrator usage or refit it using features computed with your current scoring policy.
- If the CNN-only scores separate real from fake but the calibrated scores do not, the calibrator is the issue (not the CNN).
