In [None]:
# -------------------------------------------------------------------------
#  DIRECT-NEURAL-BIASING OPTUNA STUDY — 1-D SENSITIVITY PLOTS
#  - loads CSV
#  - keeps COMPLETE trials
#  - picks the best trial as baseline
#  - for every hyper-parameter, varies it while fixing the rest
#  - plots TP / FP counts   (with selectable scaling)
#  - plots TP-rate (TP / GT) & FP burden (FP / GT)
# -------------------------------------------------------------------------

import re
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# ─────────────────────────────────── SETTINGS ─────────────────────────────
CSV_PATH = "results/optuna_study_full_summary.csv"   # change if needed
scale    = "twin"                            # "twin" | "norm" | "raw"
                                           #   twin → dual y-axes
                                           #   norm → counts normalised 0-1
                                           #   raw  → original (single axis)

# ─────────────────────────── LOAD & PREPARE DATA ──────────────────────────
df = pd.read_csv(CSV_PATH)
df = df[df["state"] == "COMPLETE"].copy()
if df.empty:
    # Fail early with a readable message instead of an opaque idxmax() error.
    raise ValueError(f"No COMPLETE trials found in {CSV_PATH!r}.")

# --- detect which fractions exist via user_attrs_tp_frac_* ---------------
# Keep each suffix exactly as it is spelled in the CSV header.  Rebuilding the
# column name from the parsed float would turn "1" into "1.0" (or "0.50" into
# "0.5"), and the tp/fp/fn lookups would then raise KeyError.
m = re.compile(r"user_attrs_tp_frac_(.*)")
suffix_by_fraction = {}
for c in df.columns:
    hit = m.match(c)
    if hit is None:
        continue
    try:
        suffix_by_fraction[float(hit.group(1))] = hit.group(1)
    except ValueError:
        continue                            # non-numeric suffix → not a fraction
fractions = sorted(suffix_by_fraction)
if not fractions:
    raise ValueError("No user_attrs_tp_frac_* columns found in the CSV.")
frac = fractions[-1]                        # largest fraction (full data)
frac_suffix = suffix_by_fraction[frac]      # header spelling of that fraction

tp_col = f"user_attrs_tp_frac_{frac_suffix}"
fp_col = f"user_attrs_fp_frac_{frac_suffix}"
fn_col = f"user_attrs_fn_frac_{frac_suffix}"

# --- parameter columns ----------------------------------------------------
param_cols  = [c for c in df.columns if c.startswith("params_")]
param_names = {c: c.replace("params_", "") for c in param_cols}

# --- choose baseline trial (highest objective_score) ---------------------
best_idx = df["objective_score"].idxmax()
baseline = df.loc[best_idx, param_cols]

print(f"Using fraction={frac}")
print("Baseline parameters from best trial:")
display(baseline.rename(param_names))

# ─────────────────────────── HELPER: PLOT ONE PARAM ───────────────────────
def plot_tp_fp_vs_param(df, var_col, baseline, scale="twin"):
    """Plot TP/FP counts and TP/GT, FP/GT ratios against one parameter.

    Only trials whose *other* parameter columns equal ``baseline`` are used,
    so the plots show a 1-D slice through the search space.  Two figures are
    shown: a count plot and a rate plot.  Nothing is drawn (and ``None`` is
    returned) when fewer than two distinct values of ``var_col`` remain or
    when no row has a defined rate.

    Reads the module-level names ``param_names``, ``param_cols``, ``tp_col``,
    ``fp_col`` and ``fn_col``.

    Args:
        df: Trials table containing the parameter and TP/FP/FN columns.
        var_col: Name of the ``params_*`` column to vary on the x-axis.
        baseline: Mapping/Series giving the reference value of every column
            in ``param_cols``.
        scale: ``"twin"`` draws TP and FP on separate y-axes; ``"norm"``
            min-max scales both counts to 0-1 on one axis; any other value
            plots the raw counts on a single axis.
    """
    name = param_names[var_col]

    # Rows where all *other* parameters equal the baseline.  A missing
    # baseline value (NaN) must match missing values: NaN == NaN is False,
    # which would otherwise discard every row, including the baseline itself.
    mask = pd.Series(True, index=df.index)
    for p in param_cols:
        if p == var_col:
            continue
        ref = baseline[p]
        if pd.isna(ref):
            mask &= df[p].isna()
        else:
            mask &= df[p] == ref
    sub = df[mask]
    if sub[var_col].nunique() < 2:
        return                                  # nothing varies → skip

    # Numeric parameters go on a numeric axis; anything that cannot be cast
    # (e.g. string-valued categorical choices) is plotted as categories.
    try:
        x = sub[var_col].astype(float)
    except (TypeError, ValueError):
        x = sub[var_col].astype(str)

    # counts & rates
    tp = sub[tp_col]
    fp = sub[fp_col]
    fn = sub[fn_col]

    # Zero ground truth would give inf, which notna() does not filter; turn
    # those denominators into NaN so the affected rows are dropped below.
    gt = (tp + fn).astype(float)
    gt = gt.where(gt != 0)
    tpr = tp / gt
    fpr = fp / gt

    valid = tpr.notna() & fpr.notna()
    x, tp, fp, tpr, fpr = x[valid], tp[valid], fp[valid], tpr[valid], fpr[valid]
    if len(x) == 0:
        return

    # ----- COUNT PLOT ------------------------------------------------------
    if scale == "twin":
        fig, ax1 = plt.subplots(figsize=(6, 4))
        ax2 = ax1.twinx()
        ax1.scatter(x, tp, label="TP", color="C0", s=40)
        ax2.scatter(x, fp, label="FP", color="C1", marker="x", s=40)
        ax1.set_ylabel("TP count")
        ax2.set_ylabel("FP count")
        ax1.set_xlabel(name)
        ax1.set_title(f"TP & FP counts vs {name} (dual axes)")
        fig.tight_layout()
        # combine the legend entries of both axes into a single legend
        lines, labels = [], []
        for ax in (ax1, ax2):
            handles, texts = ax.get_legend_handles_labels()
            lines.extend(handles)
            labels.extend(texts)
        ax1.legend(lines, labels, loc="best")
        plt.show()
    else:
        if scale == "norm":
            # "or 1" keeps a constant series from dividing by zero
            tp_plot = (tp - tp.min()) / (tp.max() - tp.min() or 1)
            fp_plot = (fp - fp.min()) / (fp.max() - fp.min() or 1)
            ylabel = "normalised count (0–1)"
            norm_note = " (normalised)"
        else:
            tp_plot, fp_plot = tp, fp
            ylabel = "count"
            norm_note = ""

        plt.figure(figsize=(6, 4))
        plt.scatter(x, tp_plot, label="TP count", s=40)
        plt.scatter(x, fp_plot, label="FP count", marker="x", s=40)
        plt.title(f"TP & FP counts{norm_note} vs {name}")
        plt.xlabel(name)
        plt.ylabel(ylabel)
        plt.legend()
        plt.tight_layout()
        plt.show()

    # ----- RATE PLOT -------------------------------------------------------
    plt.figure(figsize=(6, 4))
    plt.scatter(x, tpr, label="TP rate (sensitivity)", s=40)
    plt.scatter(x, fpr, label="FP / GT", marker="x", s=40)
    plt.title(f"Rates vs {name}  (others fixed)")
    plt.xlabel(name)
    plt.ylabel("rate")
    plt.legend()
    plt.tight_layout()
    plt.show()

# ─────────────────────────── PLOT ALL PARAMETERS ──────────────────────────
# One pair of figures (counts + rates) per hyper-parameter column; columns
# that do not vary around the baseline are skipped inside the helper.
for col in param_cols:
    plot_tp_fp_vs_param(df=df, var_col=col, baseline=baseline, scale=scale)


In [None]:
# -------------------------------------------------------------------------
#  ONE-GRAPH “HOLISTIC” VIEW OF TP / FP TRADE-OFFS  (FIXED)
# -------------------------------------------------------------------------
import re
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# ── settings ───────────────────────────────────────────────────────────────
CSV_PATH = "results/optuna_study_full_summary.csv"   # change if needed
rate_type = "per_gt"                         # "per_gt" | "per_det"

# ── load & keep COMPLETE trials ────────────────────────────────────────────
df = pd.read_csv(CSV_PATH)
df = df[df["state"] == "COMPLETE"].copy()
if df.empty:
    # Fail early with a readable message instead of an opaque idxmax() error.
    raise ValueError(f"No COMPLETE trials found in {CSV_PATH!r}.")

# ── detect the fraction stored in user_attrs --------------------------------
# Keep each suffix exactly as spelled in the CSV header: formatting the parsed
# float back into a name would turn "1" into "1.0" and break the lookups.
m = re.compile(r"user_attrs_tp_frac_(.*)")
suffix_by_fraction = {}
for c in df.columns:
    hit = m.match(c)
    if hit is None:
        continue
    try:
        suffix_by_fraction[float(hit.group(1))] = hit.group(1)
    except ValueError:
        continue                                       # non-numeric suffix
fractions = sorted(suffix_by_fraction)
if not fractions:
    raise ValueError("No user_attrs_tp_frac_* columns found.")
frac = fractions[-1]                                 # use full-data fraction
frac_suffix = suffix_by_fraction[frac]

tp_col = f"user_attrs_tp_frac_{frac_suffix}"
fp_col = f"user_attrs_fp_frac_{frac_suffix}"
fn_col = f"user_attrs_fn_frac_{frac_suffix}"

# ── parameter columns & baseline trial -------------------------------------
param_cols  = [c for c in df.columns if c.startswith("params_")]
param_names = {c: c.replace("params_", "") for c in param_cols}

best_idx = df["objective_score"].idxmax()
baseline = df.loc[best_idx, param_cols]
base_tp, base_fp, base_fn = df.loc[best_idx, [tp_col, fp_col, fn_col]]

# The x-axis label depends only on rate_type, so fix it up front; assigning
# it inside the loop left it undefined whenever every parameter was skipped.
fp_label = "FP / (TP+FP)" if rate_type == "per_det" else "FP / GT"

# ── prepare the figure -----------------------------------------------------
plt.figure(figsize=(7, 6))

for var_col in param_cols:
    # rows where ONLY this parameter varies; a missing (NaN) baseline value
    # must match missing values because NaN == NaN is False
    mask = pd.Series(True, index=df.index)
    for p in param_cols:
        if p == var_col:
            continue
        ref = baseline[p]
        if pd.isna(ref):
            mask &= df[p].isna()
        else:
            mask &= df[p] == ref
    sub = df[mask]
    if sub[var_col].nunique() < 2:
        continue                                       # nothing varies → skip

    sub = sub.sort_values(var_col)                    # clean line order
    tp  = sub[tp_col]
    fp  = sub[fp_col]
    fn  = sub[fn_col]

    # Zero denominators become NaN (float dtype is kept) so those rows are
    # dropped by the notna() filter instead of producing inf.
    gt   = (tp + fn).astype(float)
    gt   = gt.where(gt != 0)
    sens = tp / gt                                    # TP / GT  (sensitivity)

    if rate_type == "per_det":
        det = (tp + fp).astype(float)
        fpr = fp / det.where(det != 0)                # FP / (TP+FP)
    else:
        fpr = fp / gt                                 # FP / GT

    valid = sens.notna() & fpr.notna()
    if valid.sum() == 0:
        continue

    plt.plot(fpr[valid], sens[valid],
             marker="o", linewidth=1.5, markersize=5,
             label=param_names[var_col])

# baseline point ------------------------------------------------------------
# Guard the scalar divisions too; a NaN coordinate simply draws no marker.
base_gt   = base_tp + base_fn
base_den  = (base_tp + base_fp) if rate_type == "per_det" else base_gt
base_sens = base_tp / base_gt if base_gt else float("nan")
base_fpr  = base_fp / base_den if base_den else float("nan")
plt.scatter([base_fpr], [base_sens], marker="*", s=150,
            color="k", label="baseline (best trial)", zorder=5)

# cosmetics -----------------------------------------------------------------
plt.xlabel(fp_label)
plt.ylabel("TP / GT  (sensitivity)")
plt.title("Trade-off between sensitivity and false-positive burden\n"
          "(each coloured line = one hyper-parameter)")
plt.grid(True, alpha=0.3)
plt.legend(bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()


In [None]:
# ---------- EFFECT-SIZE SUMMARY TABLE ------------------------------------
# For every parameter that varies around the baseline, record the TP/FP count
# range and the spread (max - min) of sensitivity and FP rate.  Relies on df,
# param_cols, param_names, baseline, tp_col, fp_col, fn_col and rate_type
# already being defined in the session.
summary = []
summary_columns = [
    "parameter",
    "min TP", "max TP",
    "min FP", "max FP",
    "Δsensitivity", "ΔFP rate",
]

for p in param_cols:
    # rows where every other parameter equals the baseline; a missing (NaN)
    # baseline value must match missing values because NaN == NaN is False
    mask = pd.Series(True, index=df.index)
    for q in param_cols:
        if q == p:
            continue
        ref = baseline[q]
        if pd.isna(ref):
            mask &= df[q].isna()
        else:
            mask &= df[q] == ref
    sub = df[mask]
    if sub[p].nunique() < 2:
        continue

    tp, fp, fn = sub[tp_col], sub[fp_col], sub[fn_col]

    # Mask zero *denominators* only.  Replacing zeros in the quotient would
    # discard genuine 0.0 rates (understating the spread) while leaving the
    # inf produced by division by zero in place.
    gt   = (tp + fn).astype(float)
    gt   = gt.where(gt != 0)
    sens = tp / gt
    if rate_type == "per_det":
        det = (tp + fp).astype(float)
        fpr = fp / det.where(det != 0)                # FP / (TP+FP)
    else:
        fpr = fp / gt                                 # FP / GT

    summary.append({
        "parameter": param_names[p],
        "min TP": int(tp.min()), "max TP": int(tp.max()),
        "min FP": int(fp.min()), "max FP": int(fp.max()),
        "Δsensitivity": float(sens.max() - sens.min()),
        "ΔFP rate": float(fpr.max() - fpr.min()),
    })

# Explicit columns keep set_index() working when no parameter varied.
pd.DataFrame(summary, columns=summary_columns).set_index("parameter")
