# Audio Authenticity API / Scorer Quick Test Notebook

This notebook helps you sanity-check your **local scorer (the same code the FastAPI service uses)** against:

- a single file
- a directory structured as:
  - `.../real/`
  - `.../fake/`

It will print example outputs, compute confusion matrices at a threshold, and show basic score distributions.

> Assumes `src/inference/scorer.py` provides `AudioSpoofScorer` and `ScorerConfig`, and that your CNN checkpoint and (optionally) a calibrator are available on disk.


In [None]:
# ---- Imports ----
import os
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, roc_auc_score

# Import your scorer (same one used by the API)
from src.inference.scorer import AudioSpoofScorer, ScorerConfig


In [None]:
# ---- Configure paths ----
# Model artefacts loaded by the scorer.
CNN_MODEL = "models/cnn/audio_cnn_balanced_best.pt"
CALIBRATOR = "models/calibrators/agg_lr_real_or_fake_new.joblib"  # set to None to disable
DEVICE = "cpu"  # or "cuda"

# Dataset directories (must have real/ and fake/ subfolders)
REAL_OR_FAKE_DIR = "data/audio/processed/real_or_fake"
ASVSPOOF21_DIR = None  # e.g. "data/audio/asvspoof21_eval" (must have real/ and fake/ for directory eval)

# Single-file test (optional; set to None to skip)
TEST_FILE = "data/audio/processed/real_or_fake/real/file20.wav_16k.wav_norm.wav_mono.wav_silence.wav"
THRESHOLD = 0.5


def path_exists(path):
    """Return whether *path* exists on disk, or None when *path* is unset.

    Optional settings may be None; ``Path(None)`` raises TypeError, so unset
    values are reported as None instead of being probed.
    """
    return Path(path).exists() if path else None


# Quick visibility check so a wrong working directory is spotted before loading anything.
print('CNN_MODEL exists:', path_exists(CNN_MODEL))
print('CALIBRATOR exists:', path_exists(CALIBRATOR))
print('REAL_OR_FAKE_DIR exists:', path_exists(REAL_OR_FAKE_DIR))
print('TEST_FILE exists:', path_exists(TEST_FILE))


In [None]:
# ---- Instantiate scorer ----
# Only hand the calibrator to the scorer when it is configured AND present on disk;
# otherwise the scorer is built without one.
if CALIBRATOR and Path(CALIBRATOR).exists():
    calibrator_path = CALIBRATOR
else:
    calibrator_path = None

# Segmenting / aggregation / tier-threshold settings passed to the scorer.
scorer_config = ScorerConfig(
    n_segments=6,
    clip_seconds=4.0,
    cnn_agg="median",
    low_thr=0.3,
    high_thr=0.7,
)

scorer = AudioSpoofScorer(
    cnn_model_path=CNN_MODEL,
    calibrator_path=calibrator_path,
    device=DEVICE,
    config=scorer_config,
)

calibrator_enabled = scorer.calibrator is not None
print("Loaded scorer. Calibrator enabled:", calibrator_enabled)
print("CNN cfg sample_rate:", scorer.cfg.get("sample_rate"))


In [None]:
# ---- Score a single file ----
# TEST_FILE is optional: when it is unset, skip scoring and leave `report` as None,
# mirroring how the optional ASVSPOOF21_DIR cell behaves.
if TEST_FILE:
    report = scorer.score_file(TEST_FILE, threshold=THRESHOLD)
else:
    report = None
    print("TEST_FILE is None. Set it above to run.")
report


In [None]:
# ---- Helper: score a dataset directory ----
AUDIO_EXTS = {".wav", ".flac", ".mp3", ".ogg", ".m4a"}  # torchaudio support varies by build


def list_audio_files(root: Path):
    """Return every audio file below *root*, searched recursively and sorted by path.

    A path qualifies when it is a regular file whose lower-cased suffix is in
    ``AUDIO_EXTS``.
    """
    matches = []
    for candidate in root.rglob("*"):
        # Cheap suffix test first; directories that happen to carry an audio
        # suffix are rejected by the is_file() check.
        if candidate.suffix.lower() not in AUDIO_EXTS:
            continue
        if candidate.is_file():
            matches.append(candidate)
    matches.sort()
    return matches

def score_dataset_dir(base_dir: str, threshold: float = 0.5):
    """Score every audio file under ``base_dir/real`` and ``base_dir/fake``.

    Uses the notebook-level ``scorer`` and ``list_audio_files``. Files under
    ``real/`` are labelled ``y_true=0`` and files under ``fake/`` ``y_true=1``.

    Args:
        base_dir: Directory that contains the ``real`` and ``fake`` subfolders.
        threshold: Decision threshold forwarded to ``scorer.score_file``.

    Returns:
        A DataFrame with one row per audio file and a fixed set of columns.
        Rows whose report is not ``ok`` keep ``path``/``y_true``/``ok``/``error``
        and have missing values in the score columns. The column set is the
        same even when no file was found or every file failed, so downstream
        cells can always index ``df["ok"]`` and the score columns.

    Raises:
        NotADirectoryError: If ``real`` or ``fake`` is missing under ``base_dir``.
    """
    base = Path(base_dir)
    real_dir = base / "real"
    fake_dir = base / "fake"
    # Explicit raise instead of `assert`, which is stripped when Python runs with -O.
    if not (real_dir.is_dir() and fake_dir.is_dir()):
        raise NotADirectoryError(f"Expected {real_dir} and {fake_dir}")

    # Keys copied verbatim from rep["signals"] for successfully scored files.
    signal_keys = (
        "cnn_only_score",
        "calibrated_score",
        "cnn_median",
        "cnn_max",
        "cnn_var",
        "silence_ratio",
        "total_seconds",
    )

    rows = []
    for label_dir, y_true in ((real_dir, 0), (fake_dir, 1)):
        for f in list_audio_files(label_dir):
            rep = scorer.score_file(str(f), threshold=threshold)
            row = {"path": str(f), "y_true": y_true, "ok": bool(rep.get("ok", False))}
            if row["ok"]:
                signals = rep["signals"]
                row["score"] = rep["score"]
                row["tier"] = rep["tier"]
                row["decision"] = rep["decision"]
                row.update({key: signals[key] for key in signal_keys})
            else:
                # Keep failed files in the table so they can be inspected later.
                row["error"] = rep.get("error")
            rows.append(row)

    columns = ["path", "y_true", "ok", "error", "score", "tier", "decision", *signal_keys]
    df = pd.DataFrame(rows, columns=columns)
    # With zero rows these columns would otherwise be object-typed, which breaks
    # boolean masking such as df[df["ok"]].
    df = df.astype({"y_true": int, "ok": bool})
    return df

def summarize_df(df: pd.DataFrame, score_col: str = "score", threshold: float = 0.5):
    """Summarise binary classification quality of the successfully scored rows.

    Labels follow the table produced by ``score_dataset_dir``: ``y_true`` is 0
    for real and 1 for fake, and a file is predicted fake when its score is
    ``>= threshold``.

    Args:
        df: Table with ``ok``, ``y_true`` and ``score_col`` columns.
        score_col: Name of the column holding the score to evaluate.
        threshold: Scores at or above this value are predicted as fake (1).

    Returns:
        A dict with:
          - ``n_ok``: number of rows with ``ok`` true.
          - ``n_scored``: number of ok rows with a finite score; all metrics
            below are computed on these rows only.
          - ``auc``: ROC AUC, or None when fewer than two classes are present.
          - ``real_accuracy`` / ``fake_accuracy`` / ``overall_accuracy``:
            None when there are no rows to compute them from.
          - ``precision``: precision for the fake class, 0.0 when nothing is
            predicted fake.
          - ``confusion_matrix``: 2x2 nested list, rows = true, cols = predicted,
            in the order given by ``labels``.
          - ``labels``: ``["real", "fake"]``.

    Raises:
        KeyError: If there are ok rows but ``score_col`` is not a column.
    """
    # An empty table (no files found) may not even have an "ok" column.
    ok_rows = df[df["ok"].astype(bool)] if "ok" in df.columns else df.iloc[0:0]
    n_ok = int(len(ok_rows))

    if n_ok:
        scores = ok_rows[score_col].to_numpy(dtype=float)
        # Missing scores (e.g. calibrated_score with the calibrator disabled)
        # become NaN; roc_auc_score rejects them, so evaluate finite scores only.
        usable = np.isfinite(scores)
        probs = scores[usable]
        y_true = ok_rows["y_true"].to_numpy(dtype=int)[usable]
    else:
        probs = np.empty(0, dtype=float)
        y_true = np.empty(0, dtype=int)
    y_pred = (probs >= threshold).astype(int)

    if len(y_true):
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1]).tolist()
    else:
        cm = [[0, 0], [0, 0]]

    # AUC is undefined unless both classes are present.
    auc = None
    if len(np.unique(y_true)) > 1:
        auc_value = float(roc_auc_score(y_true, probs))
        auc = None if np.isnan(auc_value) else auc_value

    is_real = y_true == 0
    is_fake = y_true == 1
    real_acc = float((y_pred[is_real] == 0).mean()) if is_real.any() else None
    fake_acc = float((y_pred[is_fake] == 1).mean()) if is_fake.any() else None
    overall_acc = float((y_pred == y_true).mean()) if len(y_true) else None

    tp = int(((y_pred == 1) & is_fake).sum())
    fp = int(((y_pred == 1) & is_real).sum())
    precision = tp / (tp + fp) if (tp + fp) else 0.0

    return {
        "n_ok": n_ok,
        "n_scored": int(len(y_true)),
        "auc": auc,
        "real_accuracy": real_acc,
        "fake_accuracy": fake_acc,
        "overall_accuracy": overall_acc,
        "precision": float(precision),
        "confusion_matrix": cm,
        "labels": ["real", "fake"],
    }


In [None]:
# ---- Score REAL_OR_FAKE dataset ----
# One row per audio file found under REAL_OR_FAKE_DIR/real and REAL_OR_FAKE_DIR/fake.
df_rof = score_dataset_dir(base_dir=REAL_OR_FAKE_DIR, threshold=THRESHOLD)
df_rof.head(n=5)


In [None]:
# ---- Summary (final score) ----
# Metrics for the final `score` column, thresholded at THRESHOLD.
summary_final = summarize_df(df_rof, "score", THRESHOLD)
summary_final


In [None]:
# ---- Summary (CNN-only score) ----
# Same metrics, evaluated on the `cnn_only_score` column.
summary_cnn = summarize_df(df_rof, "cnn_only_score", THRESHOLD)
summary_cnn


In [None]:
# ---- Investigate constant segment scores (like your API output) ----
# Near-zero cnn_var across many files means the per-segment scores are (almost) identical.
# (Often indicates audio shorter than clip_seconds and being heavily padded, or model saturation.)
d = df_rof.loc[df_rof["ok"]].copy()
print("Total ok:", len(d))

# Same min/median/max report for the duration and the per-file segment variance.
for heading, column in (
    ("Duration seconds - min/median/max:", "total_seconds"),
    ("cnn_var - min/median/max:", "cnn_var"),
):
    values = d[column]
    print(heading, values.min(), values.median(), values.max())

# How many have (near) zero variance?
near_constant = d["cnn_var"] < 1e-6
print("Fraction with cnn_var < 1e-6:", float(near_constant.mean()))


In [None]:
# ---- Score distributions (histograms) ----
d = df_rof[df_rof["ok"]].copy()

# One figure per score column, with the real and fake classes overlaid.
for score_col_name, plot_title in (
    ("score", "Score distribution (final score)"),
    ("cnn_only_score", "Score distribution (CNN-only)"),
):
    plt.figure()
    for y_value, class_label in ((0, "real"), (1, "fake")):
        class_scores = d.loc[d["y_true"] == y_value, score_col_name].values
        plt.hist(class_scores, bins=30, alpha=0.7, label=class_label)
    plt.title(plot_title)
    plt.xlabel(score_col_name)
    plt.ylabel("count")
    plt.legend()
    plt.show()


In [None]:
# ---- Optional: score ASVspoof directory (if structured as real/ fake/) ----
if not ASVSPOOF21_DIR:
    # Nothing configured: tell the user how to enable this cell.
    print("ASVSPOOF21_DIR is None. Set it above to run.")
else:
    df_asv = score_dataset_dir(ASVSPOOF21_DIR, threshold=THRESHOLD)
    print("ASV ok:", df_asv["ok"].sum(), "of", len(df_asv))
    # Report the final score first, then the CNN-only score.
    for heading, column in (
        ("Final score summary:", "score"),
        ("CNN-only summary:", "cnn_only_score"),
    ):
        print(heading, summarize_df(df_asv, score_col=column, threshold=THRESHOLD))


## Notes / Debug Tips

If you see patterns like:
- **all segments identical** (cnn_var ≈ 0)
- **everything predicted fake** or **everything predicted real**

Common causes:
- many files shorter than `clip_seconds` → heavy padding + identical segment windows
- mismatch between training and inference transforms (cfg must match checkpoint)
- label direction inversion (interpreting `sigmoid(logit)` as P(fake) when it is P(real))
- calibrator applied out-of-domain (try disabling calibrator and re-check)

This notebook prints duration statistics and shows distributions so you can spot these quickly.
