In [11]:
# ============================================================
# NO SUPERVISADO con dos carpetas:
#   ./sanos   (negativos)
#   ./nosanos (positivos)
#
# ROI RBC + SLIC + features (LAB/LCH/grad/Gabor)
# KMeans GLOBAL -> etiqueta "parásito" por score (C* + textura - dist hue magenta)
# Selección de umbral (F1) sobre parasite_area_fraction_RBC
# + GRÁFICAS: ROC, PR, F1-vs-umbral, histogramas, matriz de confusión
# ============================================================

from pathlib import Path
import numpy as np
import cv2
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import json

from skimage import color as skcolor, img_as_ubyte
from skimage.segmentation import slic, mark_boundaries
from skimage.filters import gabor
from skimage.measure import label, regionprops
from skimage.morphology import (remove_small_objects, remove_small_holes,
                                binary_opening, binary_closing, disk)

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score,
    confusion_matrix, ConfusionMatrixDisplay
)

# ------------------ Carpetas ------------------
DIR_NEG = "./sampled/sanos"    # sin parásito (y_true = 0)
DIR_POS = "./sampled/nosanos"  # con parásito (y_true = 1)

OUT_DIR = Path("./out_unsup_2sets")
OUT_DIR.mkdir(parents=True, exist_ok=True)

MODEL_PATH = OUT_DIR / "model_unsup.joblib"
PER_IMAGE_CSV = OUT_DIR / "results_per_image.csv"
THRESH_REPORT = OUT_DIR / "threshold_report.txt"

# ------------------ SLIC / Gabor / Clustering ------------------
K_SUPERPIXELS = 600
COMPACTNESS = 10.0
SIGMA_SLIC = 1.0

GABOR_FREQUENCIES = [0.1, 0.2, 0.3]
GABOR_THETAS = [0, np.pi/6, np.pi/3, np.pi/2, 2*np.pi/3, 5*np.pi/6]

K_CLUSTERS = 4  # clusters globales

# ------------------ Morfología ------------------
OPENING_RADIUS = 2
CLOSING_RADIUS = 2
HOLE_MIN_SIZE = 64
COMP_MIN_SIZE = 64

# ------------------ Scoring "parásito" dentro de RBC ------------------
TARGET_H_DEG = 320.0     # magenta/púrpura
H_BAND_DEG   = 50.0
W_C = 1.0                # peso C* (croma)
W_T = 0.7                # peso textura (gradL + gabor)
W_H = 0.6                # penalización distancia de hue

# ------------------ Utils ------------------
def read_rgb_float01(path: str) -> np.ndarray:
    arr = cv2.imread(path, cv2.IMREAD_COLOR)
    if arr is None:
        raise FileNotFoundError(f"No se puede leer: {path}")
    arr = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
    return (arr.astype(np.float32) / 255.0)

def normalize_by_ref(rgb: np.ndarray, ref: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    norm = rgb / (ref + eps)
    norm = np.nan_to_num(norm, nan=0.0, posinf=0.0, neginf=0.0)
    return np.clip(norm, 0.0, 1.0).astype(np.float32)

def gradients_L_from_lab(lab: np.ndarray) -> np.ndarray:
    L = lab[..., 0]
    Ln = (L - L.min()) / (L.max() - L.min() + 1e-8)
    gx = cv2.Sobel(Ln.astype(np.float32), cv2.CV_32F, 1, 0, ksize=3)
    gy = cv2.Sobel(Ln.astype(np.float32), cv2.CV_32F, 0, 1, ksize=3)
    return np.sqrt(gx*gx + gy*gy)

def gabor_bank(gray01: np.ndarray, freqs, thetas):
    feats = []
    for f in freqs:
        for t in thetas:
            real, imag = gabor(gray01, frequency=f, theta=t)
            feats.append(np.sqrt(real**2 + imag**2).astype(np.float32))
    return feats

def pixel_features(rgb01: np.ndarray):
    lab = skcolor.rgb2lab(rgb01)
    lch = skcolor.lab2lch(lab)
    L = lab[...,0].astype(np.float32)
    a = lab[...,1].astype(np.float32)
    b = lab[...,2].astype(np.float32)
    C = lch[...,1].astype(np.float32)
    h = lch[...,2].astype(np.float32)
    gradL = gradients_L_from_lab(lab).astype(np.float32)
    Ln = (L - L.min()) / (L.max() - L.min() + 1e-8)
    gabs = gabor_bank(Ln, GABOR_FREQUENCIES, GABOR_THETAS)
    feat_list = [L, a, b, C, h, gradL] + gabs
    feats = np.stack(feat_list, axis=-1)  # (H,W,F)
    return feats, lab

def rbc_mask_from_lab(rgb01: np.ndarray) -> np.ndarray:
    lab = skcolor.rgb2lab(rgb01)
    lch = skcolor.lab2lch(lab)
    L = lab[...,0]; a = lab[...,1]; C = lch[...,1]
    tL = max(20.0, np.percentile(L, 60))
    tC = max(5.0,  np.percentile(C, 50))
    ta = max(0.0,  np.percentile(a, 45))
    mask = (L > tL) & (C > tC) & (a > ta)
    mask = remove_small_objects(mask, min_size=400)
    mask = binary_opening(mask, disk(3))
    mask = binary_closing(mask, disk(5))
    mask = remove_small_holes(mask, area_threshold=200)
    labimg = label(mask)
    if labimg.max() == 0:
        return mask
    areas = [(r.label, r.area) for r in regionprops(labimg)]
    best = max(areas, key=lambda x: x[1])[0]
    mask = (labimg == best)
    mask = remove_small_holes(mask, area_threshold=2000)
    return mask

def assign_superpixels_to_roi(segments: np.ndarray, roi_mask: np.ndarray, thr=0.5):
    spx_ids = np.unique(segments)
    in_roi = []
    for sid in spx_ids:
        m = (segments == sid)
        frac = np.count_nonzero(roi_mask & m) / float(np.count_nonzero(m))
        in_roi.append(frac >= thr)
    return spx_ids, np.array(in_roi, dtype=bool)

def circular_hue_distance_deg(h, target):
    d = np.abs((h - target + 180) % 360 - 180)
    return d

def choose_parasite_cluster_by_score(agg_means, agg_stds, labels, spx_ids, spx_in_roi, F_each):
    mean_C = agg_means[:, 3]
    mean_h = agg_means[:, 4]
    mean_grad = agg_means[:, 5]
    n_gab = F_each - 6
    if n_gab > 0:
        gabor_means = agg_means[:, 6:6+n_gab].mean(axis=1)
        texture = 0.5*mean_grad + 0.5*gabor_means
    else:
        texture = mean_grad
    hue_dist = circular_hue_distance_deg(mean_h, TARGET_H_DEG)
    hue_penalty = np.clip(hue_dist / H_BAND_DEG, 0.0, 1.0)
    def rnorm(x):
        p1, p99 = np.percentile(x, 1), np.percentile(x, 99)
        return np.clip((x - p1) / (p99 - p1 + 1e-8), 0, 1)
    Cn = rnorm(mean_C)
    Tn = rnorm(texture)
    spx_score = W_C*Cn + W_T*Tn - W_H*hue_penalty
    spx_score[~spx_in_roi] = -np.inf
    k_scores = []
    for k in np.unique(labels):
        m = (labels == k) & spx_in_roi
        if not np.any(m):
            k_scores.append(-np.inf)
        else:
            k_scores.append(np.mean(spx_score[m]))
    parasite_k = int(np.argmax(k_scores))
    return parasite_k, k_scores

def overlay_mask(rgb01, mask, alpha=0.45):
    out = rgb01.copy()
    color = np.array([1.0, 0.0, 1.0], dtype=np.float32)
    out[mask] = (1 - alpha) * out[mask] + alpha * color
    return out

def put_tag(img_rgb01: np.ndarray, text: str, pos=(10, 30)):
    img8 = img_as_ubyte(img_rgb01.copy())
    cv2.putText(img8, text, pos, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,0,0), 3, cv2.LINE_AA)
    cv2.putText(img8, text, pos, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,255,255), 1, cv2.LINE_AA)
    return img8

def colorize_clusters(segments: np.ndarray, labels: np.ndarray):
    H, W = segments.shape
    cmap = plt.get_cmap("tab10")
    out = np.zeros((H, W, 3), dtype=np.float32)
    for sid, lbl in zip(np.unique(segments), labels):
        out[segments == sid] = cmap(lbl % 10)[:3]
    return out

def collect_images(root: str):
    exts = ["*.png", "*.jpg", "*.jpeg", "*.tif", "*.tiff", "*.bmp"]
    files = []
    for ext in exts:
        files += list(Path(root).rglob(ext))
    return sorted([p for p in files if not p.name.endswith("_ref.png")])

def load_prepare(img_path: Path):
    ref_path = Path(str(img_path).rsplit(".", 1)[0] + "_ref.png")
    rgb = read_rgb_float01(str(img_path))
    if ref_path.exists():
        ref = read_rgb_float01(str(ref_path))
        rgbn = normalize_by_ref(rgb, ref)
    else:
        rgbn = rgb
    feats_px, lab = pixel_features(rgbn)
    H, W, F = feats_px.shape
    segments = slic(rgbn, n_segments=K_SUPERPIXELS, compactness=COMPACTNESS,
                    sigma=SIGMA_SLIC, start_label=0, channel_axis=-1)
    spx_ids = np.unique(segments)
    X = feats_px.reshape(-1, F)
    s = segments.reshape(-1)
    means = np.zeros((spx_ids.size, F), dtype=np.float32)
    stds  = np.zeros_like(means)
    sizes = np.zeros((spx_ids.size,), dtype=np.int32)
    for i, sid in enumerate(spx_ids):
        m = (s == sid)
        Xi = X[m]
        means[i] = Xi.mean(axis=0)
        stds[i]  = Xi.std(axis=0)
        sizes[i] = np.count_nonzero(m)
    agg = np.concatenate([means, stds], axis=1)
    rbc_mask = rbc_mask_from_lab(rgbn)
    spx_ids, spx_in_roi = assign_superpixels_to_roi(segments, rbc_mask, thr=0.5)
    return dict(rgb=rgbn, segments=segments, agg=agg, means=means, stds=stds,
                spx_ids=spx_ids, sizes=sizes, F_each=F, rbc_mask=rbc_mask)

# ------------------ Gráficas auxiliares ------------------
def plot_roc_pr(y_true, scores, out_dir: Path):
    fpr, tpr, _ = roc_curve(y_true, scores)
    auc = roc_auc_score(y_true, scores)
    prec, rec, _ = precision_recall_curve(y_true, scores)
    ap = average_precision_score(y_true, scores)

    plt.figure(figsize=(5.5,5))
    plt.plot(fpr, tpr, lw=2, label=f"AUC={auc:.3f}")
    plt.plot([0,1], [0,1], "--", lw=1)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC curve")
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_dir / "roc_curve.png", dpi=160)
    plt.close()

    plt.figure(figsize=(5.5,5))
    plt.plot(rec, prec, lw=2, label=f"AP={ap:.3f}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision–Recall curve")
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_dir / "precision_recall_curve.png", dpi=160)
    plt.close()

def plot_f1_vs_threshold(y_true, scores, grid, out_dir: Path):
    f1s, accs = [], []
    for thr in grid:
        y_pred = (scores >= thr).astype(int)
        f1s.append(f1_score(y_true, y_pred, zero_division=0))
        accs.append(accuracy_score(y_true, y_pred))
    plt.figure(figsize=(6,4))
    plt.plot(grid, f1s, label="F1", lw=2)
    plt.plot(grid, accs, label="Accuracy", lw=1.5)
    plt.xlabel("threshold")
    plt.ylabel("score")
    plt.title("F1 / Accuracy vs threshold")
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_dir / "f1_accuracy_vs_threshold.png", dpi=160)
    plt.close()

def plot_score_histograms(df, out_dir: Path, col="parasite_area_fraction_RBC"):
    plt.figure(figsize=(6,4))
    for name, sub in df.groupby("set"):
        plt.hist(sub[col], bins=30, alpha=0.6, label=name)
    plt.xlabel(col)
    plt.ylabel("count")
    plt.title(f"Histogram of {col} by set")
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_dir / f"hist_{col}_by_set.png", dpi=160)
    plt.close()

def plot_confusion(y_true, y_pred, out_path: Path, title="Confusion matrix"):
    cm = confusion_matrix(y_true, y_pred, labels=[0,1])
    disp = ConfusionMatrixDisplay(cm, display_labels=["sano(0)","nosano(1)"])
    fig, ax = plt.subplots(figsize=(4.8,4.8))
    disp.plot(ax=ax, cmap="Blues", values_format="d", colorbar=False)
    ax.set_title(title)
    plt.tight_layout()
    plt.savefig(out_path, dpi=160)
    plt.close()

# ------------------ Pipeline principal ------------------
def run_pipeline():
    # 1) Cargar listas con etiqueta de verdad
    imgs_neg = [(p, 0) for p in collect_images(DIR_NEG)]
    imgs_pos = [(p, 1) for p in collect_images(DIR_POS)]
    all_imgs = imgs_neg + imgs_pos
    if not all_imgs:
        raise RuntimeError("No se encontraron imágenes en ./sanos o ./nosanos")

    print(f"Sanos (negativos): {len(imgs_neg)} | Nosanos (positivos): {len(imgs_pos)}")

    # 2) Cargar y acumular superpíxeles
    per_image = []
    all_agg = []
    for p, y in all_imgs:
        data = load_prepare(p)
        per_image.append((p, y, data))
        all_agg.append(data["agg"])
        print(f"[LOAD] {p.name} (y={y}) superpix={data['agg'].shape[0]}")

    X_all = np.vstack(all_agg).astype(np.float32)

    # 3) Escalado + KMeans global
    scaler = StandardScaler().fit(X_all)
    Xs_all = scaler.transform(X_all)
    kmeans = KMeans(n_clusters=K_CLUSTERS, n_init=50, random_state=42).fit(Xs_all)
    joblib.dump({"scaler": scaler, "kmeans": kmeans}, MODEL_PATH)
    print(f"[OK] Modelo guardado en {MODEL_PATH}")

    # 4) Pasada por imagen
    rows = []
    for p, y, data in per_image:
        rgb = data["rgb"]; seg = data["segments"]
        agg = data["agg"]; means = data["means"]; stds = data["stds"]
        spx_ids = data["spx_ids"]; sizes = data["sizes"]; F_each = data["F_each"]
        rbc_mask = data["rbc_mask"]

        X_img = scaler.transform(agg.astype(np.float32))
        lbls = kmeans.predict(X_img)

        _, spx_in_roi = assign_superpixels_to_roi(seg, rbc_mask, thr=0.5)
        parasite_k, k_scores = choose_parasite_cluster_by_score(means, stds, lbls, spx_ids, spx_in_roi, F_each)

        mask_par = np.isin(seg, spx_ids[lbls == parasite_k])
        mask_par_roi = mask_par & rbc_mask

        pix_par_global = int(np.count_nonzero(mask_par))
        pix_tot_global = int(mask_par.size)
        frac_global = pix_par_global / max(1, pix_tot_global)

        pix_par_roi = int(np.count_nonzero(mask_par_roi))
        pix_tot_roi = int(np.count_nonzero(rbc_mask))
        frac_roi = pix_par_roi / max(1, pix_tot_roi)

        # Guardar overlays por set
        split_dir = OUT_DIR / ("neg" if y == 0 else "pos")
        split_dir.mkdir(parents=True, exist_ok=True)
        clusters_rgb = colorize_clusters(seg, lbls)
        cv2.imwrite(str(split_dir / f"{p.stem}_clusters_overlay.png"),
                    cv2.cvtColor(img_as_ubyte(mark_boundaries(clusters_rgb, seg)), cv2.COLOR_RGB2BGR))
        overlay_par = overlay_mask(rgb, mask_par_roi, alpha=0.45)
        tag = f"Parasite(ROI) k={parasite_k}, score={k_scores[parasite_k]:.2f}, fracROI={frac_roi:.3f}"
        out_tag = put_tag(overlay_par, tag)
        cv2.imwrite(str(split_dir / f"{p.stem}_parasite_overlay.png"),
                    cv2.cvtColor(out_tag, cv2.COLOR_RGB2BGR))

        rows.append({
            "image": p.name,
            "set": "sanos" if y == 0 else "nosanos",
            "y_true": y,
            "parasite_cluster": parasite_k,
            "score_parasite_cluster": float(k_scores[parasite_k]),
            "pixels_parasite_global": pix_par_global,
            "pixels_total_global": pix_tot_global,
            "parasite_area_fraction_global": frac_global,
            "pixels_parasite_RBC": pix_par_roi,
            "pixels_RBC": pix_tot_roi,
            "parasite_area_fraction_RBC": frac_roi
        })

    df = pd.DataFrame(rows)
    df.to_csv(PER_IMAGE_CSV, index=False)
    print(f"[OK] Guardado: {PER_IMAGE_CSV}")

    # 5) Selección de umbral sobre el score por imagen
    #    Por defecto usamos la fracción en la ROI del glóbulo:
    scores = df["parasite_area_fraction_RBC"].to_numpy()
    # >>> Si prefieres usar el score del clúster:
    # scores = df["score_parasite_cluster"].to_numpy()

    y_true = df["y_true"].to_numpy().astype(int)

    if len(scores) == 0:
        raise RuntimeError("No se generaron scores por imagen.")

    uniq = np.unique(scores)
    if len(uniq) < 5:
        grid = np.linspace(scores.min(), scores.max(), 101)
    else:
        grid = np.quantile(scores, np.linspace(0, 1, 201))

    best = {"thr": None, "f1": -1, "acc": 0, "prec": 0, "rec": 0, "auc": None}
    for thr in grid:
        y_pred = (scores >= thr).astype(int)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred, zero_division=0)
        rec = recall_score(y_true, y_pred)
        if f1 > best["f1"]:
            best.update(dict(thr=float(thr), f1=float(f1), acc=float(acc),
                             prec=float(prec), rec=float(rec)))

    # AUC (independiente del umbral)
    try:
        auc = roc_auc_score(y_true, scores)
        best["auc"] = float(auc)
    except Exception:
        best["auc"] = None

    # 6) Aplicar mejor umbral y guardar predicciones
    df["y_pred"] = (scores >= best["thr"]).astype(int)
    df["correct"] = (df["y_pred"] == df["y_true"]).astype(int)
    df.to_csv(PER_IMAGE_CSV, index=False)

    # 7) Guardar reporte de umbral
    with open(THRESH_REPORT, "w", encoding="utf-8") as f:
        f.write("=== Threshold selection on parasite_area_fraction_RBC ===\n")
        f.write(json.dumps(best, indent=2, ensure_ascii=False))
        f.write("\n\nConfusion by set:\n")
        for s in ["sanos", "nosanos"]:
            sub = df[df["set"] == s]
            if len(sub) == 0:
                continue
            acc_s = accuracy_score(sub["y_true"], sub["y_pred"])
            prec_s = precision_score(sub["y_true"], sub["y_pred"], zero_division=0)
            rec_s = recall_score(sub["y_true"], sub["y_pred"])
            f1_s = f1_score(sub["y_true"], sub["y_pred"], zero_division=0)
            f.write(f"- {s}: acc={acc_s:.3f}, prec={prec_s:.3f}, rec={rec_s:.3f}, f1={f1_s:.3f}\n")
    print(f"[OK] Umbral y métricas: {THRESH_REPORT}")

    # 8) GRÁFICAS
    # ROC y PR
    plot_roc_pr(y_true, scores, OUT_DIR)
    # F1 vs umbral (y accuracy)
    plot_f1_vs_threshold(y_true, scores, grid, OUT_DIR)
    # Histograma de scores por set
    plot_score_histograms(df, OUT_DIR, col="parasite_area_fraction_RBC")

    # Matriz de confusión global y por set
    plot_confusion(y_true, df["y_pred"].to_numpy(), OUT_DIR / "confusion_matrix_overall.png",
                   title="Confusion matrix (overall)")
    for s in ["sanos", "nosanos"]:
        sub = df[df["set"] == s]
        if len(sub):
            plot_confusion(sub["y_true"].to_numpy(), sub["y_pred"].to_numpy(),
                           OUT_DIR / f"confusion_matrix_{s}.png",
                           title=f"Confusion matrix ({s})")

    # 9) Resumen consola
    print("\n=== Mejor umbral en parasite_area_fraction_RBC ===")
    print(best)
    print("\nAciertos por set:")
    print(df.pivot_table(index="set", values="correct", aggfunc=["mean","sum","count"]))

if __name__ == "__main__":
    run_pipeline()
    print("Listo ✅  Figuras en ./out_unsup_2sets/")

Sanos (negativos): 125 | Nosanos (positivos): 125
[LOAD] sano_000.png (y=0) superpix=526
[LOAD] sano_002.png (y=0) superpix=586
[LOAD] sano_005.png (y=0) superpix=558
[LOAD] sano_009.png (y=0) superpix=459
[LOAD] sano_010.png (y=0) superpix=612
[LOAD] sano_011.png (y=0) superpix=618
[LOAD] sano_012.png (y=0) superpix=486
[LOAD] sano_013.png (y=0) superpix=610
[LOAD] sano_014.png (y=0) superpix=662
[LOAD] sano_015.png (y=0) superpix=532
[LOAD] sano_016.png (y=0) superpix=641
[LOAD] sano_017.png (y=0) superpix=467
[LOAD] sano_018.png (y=0) superpix=546
[LOAD] sano_019.png (y=0) superpix=613
[LOAD] sano_022.png (y=0) superpix=646
[LOAD] sano_024.png (y=0) superpix=502
[LOAD] sano_025.png (y=0) superpix=453
[LOAD] sano_026.png (y=0) superpix=666
[LOAD] sano_027.png (y=0) superpix=433
[LOAD] sano_028.png (y=0) superpix=436
[LOAD] sano_029.png (y=0) superpix=549
[LOAD] sano_030.png (y=0) superpix=562
[LOAD] sano_031.png (y=0) superpix=584
[LOAD] sano_032.png (y=0) superpix=426
[LOAD] sano_03

[LOAD] 167_seg_012.png (y=1) superpix=666
[LOAD] 169_seg_026.png (y=1) superpix=433
[LOAD] 169_seg_041.png (y=1) superpix=443
[LOAD] 170_seg_013.png (y=1) superpix=531
[LOAD] 171_seg_005.png (y=1) superpix=457
[LOAD] 171_seg_042.png (y=1) superpix=451
[LOAD] 173_seg_002.png (y=1) superpix=521
[LOAD] 174_seg_027.png (y=1) superpix=548
[LOAD] 175_seg_009.png (y=1) superpix=558
[LOAD] 175_seg_030.png (y=1) superpix=484
[LOAD] 176_seg_015.png (y=1) superpix=520
[LOAD] 176_seg_055.png (y=1) superpix=439
[LOAD] 176_seg_075.png (y=1) superpix=434
[LOAD] 176_seg_100.png (y=1) superpix=810
[LOAD] 207_seg_000.png (y=1) superpix=615
[LOAD] 209_seg_001.png (y=1) superpix=542
[LOAD] 266_seg_001.png (y=1) superpix=498
[LOAD] 285_seg_038.png (y=1) superpix=546
[LOAD] 288_seg_011.png (y=1) superpix=616
[LOAD] 292_seg_012.png (y=1) superpix=568
[LOAD] 295_seg_006.png (y=1) superpix=670
[LOAD] 295_seg_008.png (y=1) superpix=597
[LOAD] 299_seg_012.png (y=1) superpix=447
[LOAD] 300_seg_031.png (y=1) super

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[OK] Umbral y métricas: out_unsup_2sets/threshold_report.txt

=== Mejor umbral en parasite_area_fraction_RBC ===
{'thr': 0.0008108108108107561, 'f1': 0.6877828054298643, 'acc': 0.724, 'prec': 0.7916666666666666, 'rec': 0.608, 'auc': 0.7106239999999999}

Aciertos por set:
           mean     sum   count
        correct correct correct
set                            
nosanos   0.608      76     125
sanos     0.840     105     125
Listo ✅  Figuras en ./out_unsup_2sets/


In [None]:
# ============================================================
# VALIDACIÓN (usa modelo ya entrenado)
#   ./val_sanos    (negativos)
#   ./val_nosanos  (positivos)
#
# Carga scaler+kmeans + umbral (robusto), calcula scores por imagen,
# selecciona/aplica umbral robusto y genera métricas + gráficas.
# ============================================================

from pathlib import Path
import os, re, json
import numpy as np
import cv2
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from skimage import color as skcolor, img_as_ubyte
from skimage.segmentation import slic, mark_boundaries
from skimage.filters import gabor
from skimage.measure import label, regionprops
from skimage.morphology import (remove_small_objects, remove_small_holes,
                                binary_opening, binary_closing, disk)

from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score,
    confusion_matrix, ConfusionMatrixDisplay
)

# ----------- Rutas -----------
DIR_VAL_NEG = "./val/sanos"
DIR_VAL_POS = "./val/nosanos"

TRAIN_OUT_DIR = Path("./out_unsup_2sets")                     # donde quedó tu modelo y threshold
MODEL_PATH = TRAIN_OUT_DIR / "model_unsup.joblib"             # << se carga aquí
THRESH_REPORT = TRAIN_OUT_DIR / "threshold_report.txt"        # << umbral entrenado (opcional)

VAL_OUT_DIR = Path("./out_unsup_val")
VAL_OUT_DIR.mkdir(parents=True, exist_ok=True)
PER_IMAGE_CSV = VAL_OUT_DIR / "results_per_image.csv"
VAL_METRICS_JSON = VAL_OUT_DIR / "val_metrics.json"

# ----------- Parámetros (mismos del entrenamiento) -----------
K_SUPERPIXELS = 600
COMPACTNESS = 10.0
SIGMA_SLIC = 1.0

GABOR_FREQUENCIES = [0.1, 0.2, 0.3]
GABOR_THETAS = [0, np.pi/6, np.pi/3, np.pi/2, 2*np.pi/3, 5*np.pi/6]

# Scoring del clúster "parásito"
TARGET_H_DEG = 320.0
H_BAND_DEG   = 50.0
W_C = 1.0
W_T = 0.7
W_H = 0.6

SAVE_OVERLAYS = True     # guarda PNGs de overlay por imagen
MIN_RBC_PIX   = 2000     # sanidad: tamaño mínimo del RBC para confiar en la ROI

# ------------------ Utils de imagen/feats ------------------
def read_rgb_float01(path: str) -> np.ndarray:
    arr = cv2.imread(path, cv2.IMREAD_COLOR)
    if arr is None:
        raise FileNotFoundError(f"No se puede leer: {path}")
    arr = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
    return (arr.astype(np.float32) / 255.0)

def normalize_by_ref(rgb: np.ndarray, ref: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    norm = rgb / (ref + eps)
    norm = np.nan_to_num(norm, nan=0.0, posinf=0.0, neginf=0.0)
    return np.clip(norm, 0.0, 1.0).astype(np.float32)

def gradients_L_from_lab(lab: np.ndarray) -> np.ndarray:
    L = lab[..., 0]
    Ln = (L - L.min()) / (L.max() - L.min() + 1e-8)
    gx = cv2.Sobel(Ln.astype(np.float32), cv2.CV_32F, 1, 0, ksize=3)
    gy = cv2.Sobel(Ln.astype(np.float32), cv2.CV_32F, 0, 1, ksize=3)
    return np.sqrt(gx*gx + gy*gy)

def gabor_bank(gray01: np.ndarray, freqs, thetas):
    feats = []
    for f in freqs:
        for t in thetas:
            real, imag = gabor(gray01, frequency=f, theta=t)
            feats.append(np.sqrt(real**2 + imag**2).astype(np.float32))
    return feats

def pixel_features(rgb01: np.ndarray):
    lab = skcolor.rgb2lab(rgb01)
    lch = skcolor.lab2lch(lab)
    L = lab[...,0].astype(np.float32)
    a = lab[...,1].astype(np.float32)
    b = lab[...,2].astype(np.float32)
    C = lch[...,1].astype(np.float32)
    h = lch[...,2].astype(np.float32)
    gradL = gradients_L_from_lab(lab).astype(np.float32)
    Ln = (L - L.min()) / (L.max() - L.min() + 1e-8)
    gabs = gabor_bank(Ln, GABOR_FREQUENCIES, GABOR_THETAS)
    feat_list = [L, a, b, C, h, gradL] + gabs
    feats = np.stack(feat_list, axis=-1)  # (H,W,F)
    return feats, lab

def rbc_mask_from_lab(rgb01: np.ndarray) -> np.ndarray:
    lab = skcolor.rgb2lab(rgb01)
    lch = skcolor.lab2lch(lab)
    L = lab[...,0]; a = lab[...,1]; C = lch[...,1]
    tL = max(20.0, np.percentile(L, 60))
    tC = max(5.0,  np.percentile(C, 50))
    ta = max(0.0,  np.percentile(a, 45))
    mask = (L > tL) & (C > tC) & (a > ta)
    mask = remove_small_objects(mask, min_size=400)
    mask = binary_opening(mask, disk(3))
    mask = binary_closing(mask, disk(5))
    mask = remove_small_holes(mask, area_threshold=200)
    labimg = label(mask)
    if labimg.max() == 0:
        return mask
    areas = [(r.label, r.area) for r in regionprops(labimg)]
    best = max(areas, key=lambda x: x[1])[0]
    mask = (labimg == best)
    mask = remove_small_holes(mask, area_threshold=2000)
    return mask

def assign_superpixels_to_roi(segments: np.ndarray, roi_mask: np.ndarray, thr=0.5):
    spx_ids = np.unique(segments)
    in_roi = []
    for sid in spx_ids:
        m = (segments == sid)
        frac = np.count_nonzero(roi_mask & m) / float(np.count_nonzero(m))
        in_roi.append(frac >= thr)
    return spx_ids, np.array(in_roi, dtype=bool)

def circular_hue_distance_deg(h, target):
    d = np.abs((h - target + 180) % 360 - 180)
    return d

def choose_parasite_cluster_by_score(agg_means, agg_stds, labels, spx_ids, spx_in_roi, F_each):
    # agg_means: columnas [L, a, b, C, h, gradL, gabor...]
    mean_C = agg_means[:, 3]
    mean_h = agg_means[:, 4]
    mean_grad = agg_means[:, 5]
    n_gab = F_each - 6
    if n_gab > 0:
        gabor_means = agg_means[:, 6:6+n_gab].mean(axis=1)
        texture = 0.5*mean_grad + 0.5*gabor_means
    else:
        texture = mean_grad
    hue_dist = circular_hue_distance_deg(mean_h, TARGET_H_DEG)
    hue_penalty = np.clip(hue_dist / H_BAND_DEG, 0.0, 1.0)

    def rnorm(x):
        p1, p99 = np.percentile(x, 1), np.percentile(x, 99)
        return np.clip((x - p1) / (p99 - p1 + 1e-8), 0, 1)

    Cn = rnorm(mean_C)
    Tn = rnorm(texture)
    spx_score = W_C*Cn + W_T*Tn - W_H*hue_penalty
    spx_score[~spx_in_roi] = -np.inf

    k_scores = []
    for k in np.unique(labels):
        m = (labels == k) & spx_in_roi
        if not np.any(m):
            k_scores.append(-np.inf)
        else:
            k_scores.append(np.mean(spx_score[m]))
    parasite_k = int(np.argmax(k_scores))
    return parasite_k, k_scores

def overlay_mask(rgb01, mask, alpha=0.45):
    out = rgb01.copy()
    color = np.array([1.0, 0.0, 1.0], dtype=np.float32)
    out[mask] = (1 - alpha) * out[mask] + alpha * color
    return out

def put_tag(img_rgb01: np.ndarray, text: str, pos=(10, 30)):
    img8 = img_as_ubyte(img_rgb01.copy())
    cv2.putText(img8, text, pos, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,0,0), 3, cv2.LINE_AA)
    cv2.putText(img8, text, pos, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,255,255), 1, cv2.LINE_AA)
    return img8

def colorize_clusters(segments: np.ndarray, labels: np.ndarray):
    H, W = segments.shape
    cmap = plt.get_cmap("tab10")
    out = np.zeros((H, W, 3), dtype=np.float32)
    for sid, lbl in zip(np.unique(segments), labels):
        out[segments == sid] = cmap(lbl % 10)[:3]
    return out

def collect_images(root: str):
    exts = ["*.png", "*.jpg", "*.jpeg", "*.tif", "*.tiff", "*.bmp"]
    files = []
    for ext in exts:
        files += list(Path(root).rglob(ext))
    return sorted([p for p in files if not p.name.endswith("_ref.png")])

def load_prepare(img_path: Path):
    ref_path = Path(str(img_path).rsplit(".", 1)[0] + "_ref.png")
    rgb = read_rgb_float01(str(img_path))
    if ref_path.exists():
        ref = read_rgb_float01(str(ref_path))
        rgbn = normalize_by_ref(rgb, ref)
    else:
        rgbn = rgb
    feats_px, lab = pixel_features(rgbn)
    H, W, F = feats_px.shape
    segments = slic(rgbn, n_segments=K_SUPERPIXELS, compactness=COMPACTNESS,
                    sigma=SIGMA_SLIC, start_label=0, channel_axis=-1)
    spx_ids = np.unique(segments)
    X = feats_px.reshape(-1, F)
    s = segments.reshape(-1)
    means = np.zeros((spx_ids.size, F), dtype=np.float32)
    stds  = np.zeros_like(means)
    for i, sid in enumerate(spx_ids):
        m = (s == sid)
        Xi = X[m]
        means[i] = Xi.mean(axis=0)
        stds[i]  = Xi.std(axis=0)
    agg = np.concatenate([means, stds], axis=1)
    rbc_mask = rbc_mask_from_lab(rgbn)
    spx_ids, spx_in_roi = assign_superpixels_to_roi(segments, rbc_mask, thr=0.5)
    return dict(rgb=rgbn, segments=segments, agg=agg, means=means, stds=stds,
                spx_ids=spx_ids, F_each=F, rbc_mask=rbc_mask)

# ------------------ Umbral ROBUSTO y sanidad ------------------
def load_threshold_report_robust(path: Path):
    """Devuelve float(thr) o None. Imprime diagnóstico de ruta y contenido."""
    abs_path = path.resolve()
    print(f"[THR] Intentando leer: {abs_path}")
    if not path.exists():
        try:
            listing = "\n - ".join([p.name for p in path.parent.iterdir()])
        except Exception:
            listing = "(no se pudo listar la carpeta)"
        print(f"[THR] No existe el archivo. Contenido de la carpeta:{os.linesep} - {listing}")
        return None

    txt = path.read_text(encoding="utf-8", errors="ignore")
    i, j = txt.find("{"), txt.rfind("}")
    thr = None
    if i != -1 and j != -1 and j > i:
        blob = txt[i:j+1]
        try:
            js = json.loads(blob)
            thr = js.get("thr", None)
        except Exception as e:
            print(f"[THR] JSON principal no parseó: {e}")
    if thr is None:
        m = re.search(r'"thr"\s*:\s*([-+eE0-9\.]+)', txt)
        if m:
            try:
                thr = float(m.group(1))
            except:
                thr = None

    if thr is None:
        print("[THR] No se pudo extraer 'thr' del archivo.")
    else:
        print(f"[THR] Umbral cargado: {float(thr):.8f}")
    return None if thr is None else float(thr)

def safe_frac_roi(pix_par_roi, pix_tot_roi, min_rbc=MIN_RBC_PIX):
    """(frac_roi, roi_bad_flag). Si ROI es muy pequeña, frac=0 y bad=True."""
    if pix_tot_roi < min_rbc:
        return 0.0, True
    return pix_par_roi / float(max(1, pix_tot_roi)), False

def midpoints_threshold_grid(scores: np.ndarray):
    """Umbrales en puntos medios entre scores únicos (evita usar el mínimo)."""
    u = np.unique(scores[~np.isnan(scores)])
    if len(u) <= 1:
        return np.array([u[0] + 1e-9]) if len(u) == 1 else np.array([0.5])
    mids = (u[1:] + u[:-1]) / 2.0
    return np.concatenate([np.array([u[0] + 1e-8]), mids])

def best_threshold_by_f1(scores: np.ndarray, y_true: np.ndarray):
    grid = midpoints_threshold_grid(scores)
    best = {"thr": None, "f1": -1, "acc": 0, "prec": 0, "rec": 0}
    for thr in grid:
        y_pred = (scores > thr).astype(int)  # '>' evita degenerar con el mínimo
        f1 = f1_score(y_true, y_pred, zero_division=0)
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred, zero_division=0)
        rec = recall_score(y_true, y_pred)
        if f1 > best["f1"]:
            best.update(dict(thr=float(thr), f1=float(f1), acc=float(acc),
                             prec=float(prec), rec=float(rec)))
    return best, grid

def apply_threshold(scores: np.ndarray, thr: float):
    """Aplica con '>' y advierte si todo queda 1 o 0."""
    y_pred = (scores > thr).astype(int)
    if (y_pred == 1).all() or (y_pred == 0).all():
        print("[WARN] Predicciones degeneradas: todo 1 o todo 0. Ajusta el threshold o revisa scores.")
    return y_pred

def choose_threshold_robust(thr_loaded, scores, y_true, df=None, fallback_quantile=0.95):
    """
    Valida/ajusta el umbral:
      - si thr_loaded es None, NaN, <= min(scores) o >= max(scores) -> reoptimiza por F1
      - si aún degenera -> usa cuantíl de 'sanos' (control de FPR)
    """
    smin, smax = float(np.min(scores)), float(np.max(scores))
    def degenerate(thr):
        return (thr is None) or (not np.isfinite(thr)) or (thr <= smin) or (thr >= smax)

    if degenerate(thr_loaded):
        print(f"[THR] Umbral cargado inválido o extremo: {thr_loaded}. Re-optimizo por F1…")
        best, grid = best_threshold_by_f1(scores, y_true)
        thr_used = best["thr"]
        print(f"[THR] Mejor F1 -> thr={thr_used:.6f} (F1={best['f1']:.3f})")
    else:
        thr_used = float(thr_loaded)
        grid = None
        print(f"[THR] Usando umbral cargado: {thr_used:.6f}")

    y_pred_try = (scores > thr_used).astype(int)
    if (y_pred_try == 1).all() or (y_pred_try == 0).all():
        print("[THR] Aún degenerado con ese umbral. Calibro por sanos (cuantil).")
        if df is not None and (y_true == 0).any():
            thr_used = float(np.quantile(scores[y_true == 0], fallback_quantile))
            print(f"[THR] Nuevo thr (q={fallback_quantile:.2f} en sanos) = {thr_used:.6f}")
        else:
            thr_used = float(np.quantile(scores, 0.5))
            print(f"[THR] Sin sanos disponibles; uso mediana global = {thr_used:.6f}")
        grid = None
    return thr_used, grid

# ------------------ Gráficas ------------------
def plot_roc_pr(y_true, scores, out_dir: Path):
    fpr, tpr, _ = roc_curve(y_true, scores)
    auc = roc_auc_score(y_true, scores)
    prec, rec, _ = precision_recall_curve(y_true, scores)
    ap = average_precision_score(y_true, scores)

    plt.figure(figsize=(5.2,5))
    plt.plot(fpr, tpr, lw=2, label=f"AUC={auc:.3f}")
    plt.plot([0,1], [0,1], "--", lw=1)
    plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
    plt.title("ROC curve (validation)"); plt.legend()
    plt.tight_layout(); plt.savefig(out_dir / "val_roc_curve.png", dpi=160); plt.close()

    plt.figure(figsize=(5.2,5))
    plt.plot(rec, prec, lw=2, label=f"AP={ap:.3f}")
    plt.xlabel("Recall"); plt.ylabel("Precision")
    plt.title("Precision–Recall (validation)"); plt.legend()
    plt.tight_layout(); plt.savefig(out_dir / "val_precision_recall_curve.png", dpi=160); plt.close()

def plot_f1_vs_threshold(y_true, scores, grid, out_dir: Path):
    f1s, accs = [], []
    for thr in grid:
        y_pred = (scores > thr).astype(int)
        f1s.append(f1_score(y_true, y_pred, zero_division=0))
        accs.append(accuracy_score(y_true, y_pred))
    plt.figure(figsize=(6,4))
    plt.plot(grid, f1s, label="F1", lw=2)
    plt.plot(grid, accs, label="Accuracy", lw=1.5)
    plt.xlabel("threshold"); plt.ylabel("score"); plt.title("F1 / Acc vs threshold (validation)")
    plt.legend(); plt.tight_layout(); plt.savefig(out_dir / "val_f1_accuracy_vs_threshold.png", dpi=160); plt.close()

def plot_hist_with_thr(df, out_path, col="parasite_area_fraction_RBC", thr=None):
    plt.figure(figsize=(6,4))
    for name, sub in df.groupby("set"):
        plt.hist(sub[col], bins=30, alpha=0.6, label=name)
    if thr is not None:
        plt.axvline(thr, color="k", linestyle="--", linewidth=1.5, label=f"thr={thr:.4f}")
    plt.xlabel(col); plt.ylabel("count"); plt.title(f"Validation histogram of {col} by set")
    plt.legend(); plt.tight_layout(); plt.savefig(out_path, dpi=160); plt.close()

def plot_confusion(y_true, y_pred, out_path: Path, title="Confusion matrix (validation)"):
    cm = confusion_matrix(y_true, y_pred, labels=[0,1])
    disp = ConfusionMatrixDisplay(cm, display_labels=["sano(0)","nosano(1)"])
    fig, ax = plt.subplots(figsize=(4.8,4.8))
    disp.plot(ax=ax, cmap="Blues", values_format="d", colorbar=False)
    ax.set_title(title)
    plt.tight_layout(); plt.savefig(out_path, dpi=160); plt.close()

# ------------------ VALIDACIÓN (usa modelo entrenado) ------------------
def evaluate_validation():
    # A) Carga modelo entrenado
    if not MODEL_PATH.exists():
        raise FileNotFoundError(f"No se encontró el modelo entrenado: {MODEL_PATH}")
    bundle = joblib.load(MODEL_PATH)
    scaler = bundle["scaler"]; kmeans = bundle["kmeans"]
    print("[OK] Modelo cargado:", MODEL_PATH)

    # B) Carga umbral entrenado (robusto)
    thr_loaded = load_threshold_report_robust(THRESH_REPORT)

    # C) Carga imágenes de validación
    imgs_neg = [(p, 0) for p in collect_images(DIR_VAL_NEG)]
    imgs_pos = [(p, 1) for p in collect_images(DIR_VAL_POS)]
    all_imgs = imgs_neg + imgs_pos
    if not all_imgs:
        raise RuntimeError("No se encontraron imágenes en ./val_sanos o ./val_nosanos")
    print(f"[VAL] Sanos: {len(imgs_neg)} | Nosanos: {len(imgs_pos)}")

    # D) Pasada por imagen con modelo cargado
    rows = []
    for p, y in all_imgs:
        data = load_prepare(p)
        rgb = data["rgb"]; seg = data["segments"]
        agg = data["agg"]; means = data["means"]; stds = data["stds"]
        spx_ids = data["spx_ids"]; F_each = data["F_each"]
        rbc_mask = data["rbc_mask"]

        X_img = scaler.transform(agg.astype(np.float32))
        lbls = kmeans.predict(X_img)

        _, spx_in_roi = assign_superpixels_to_roi(seg, rbc_mask, thr=0.5)
        parasite_k, k_scores = choose_parasite_cluster_by_score(means, stds, lbls, spx_ids, spx_in_roi, F_each)

        # máscaras
        mask_par = np.isin(seg, spx_ids[lbls == parasite_k])
        mask_par_roi = mask_par & rbc_mask

        # conteos y fracciones (con sanidad de ROI)
        pix_par_global = int(np.count_nonzero(mask_par))
        pix_tot_global = int(mask_par.size)
        frac_global = pix_par_global / float(max(1, pix_tot_global))

        pix_par_roi = int(np.count_nonzero(mask_par_roi))
        pix_tot_roi = int(np.count_nonzero(rbc_mask))
        frac_roi, roi_bad = safe_frac_roi(pix_par_roi, pix_tot_roi, min_rbc=MIN_RBC_PIX)

        # overlays (opcionales)
        if SAVE_OVERLAYS:
            split_dir = VAL_OUT_DIR / ("neg" if y == 0 else "pos")
            split_dir.mkdir(parents=True, exist_ok=True)
            clusters_rgb = colorize_clusters(seg, lbls)
            cv2.imwrite(str(split_dir / f"{p.stem}_clusters_overlay.png"),
                        cv2.cvtColor(img_as_ubyte(mark_boundaries(clusters_rgb, seg)), cv2.COLOR_RGB2BGR))
            overlay_par = overlay_mask(rgb, mask_par_roi, alpha=0.45)
            tag = f"VAL Parasite(ROI) k={parasite_k}, score={k_scores[parasite_k]:.2f}, fracROI={frac_roi:.3f}"
            out_tag = put_tag(overlay_par, tag)
            cv2.imwrite(str(split_dir / f"{p.stem}_parasite_overlay.png"),
                        cv2.cvtColor(out_tag, cv2.COLOR_RGB2BGR))

        rows.append({
            "image": p.name,
            "set": "sanos" if y == 0 else "nosanos",
            "y_true": y,
            "parasite_cluster": parasite_k,
            "score_parasite_cluster": float(k_scores[parasite_k]),
            "parasite_area_fraction_global": frac_global,
            "parasite_area_fraction_RBC": frac_roi,
            "rbc_invalid": int(roi_bad)
        })

    df = pd.DataFrame(rows)
    df.to_csv(PER_IMAGE_CSV, index=False)
    print(f"[VAL] Guardado: {PER_IMAGE_CSV}")

    # E) Scores y etiquetas
    scores = df["parasite_area_fraction_RBC"].to_numpy()
    # >>> Para usar el score del clúster:
    # scores = df["score_parasite_cluster"].to_numpy()
    y_true = df["y_true"].to_numpy().astype(int)

    print(f"[VAL] scores: min={np.min(scores):.6f}, p50={np.median(scores):.6f}, max={np.max(scores):.6f}")

    # F) Eligir/validar umbral de manera robusta
    thr_used, grid = choose_threshold_robust(thr_loaded, scores, y_true, df=df, fallback_quantile=0.95)

    # G) Métricas (ROC/PR) y gráficas
    plot_roc_pr(y_true, scores, VAL_OUT_DIR)
    if grid is not None:
        plot_f1_vs_threshold(y_true, scores, grid, VAL_OUT_DIR)

    # histograma con línea de umbral
    plot_hist_with_thr(df, VAL_OUT_DIR / "val_hist_with_thr.png",
                       col="parasite_area_fraction_RBC", thr=thr_used)

    # H) Predicciones con el umbral usado (aplica con '>')
    y_pred = apply_threshold(scores, thr_used)
    df["y_pred"] = y_pred
    df["correct"] = (y_pred == y_true).astype(int)
    df.to_csv(PER_IMAGE_CSV, index=False)

    # I) Métricas globales
    metrics = {
        "threshold_used": float(thr_used),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred)),
    }
    try:
        metrics["auc"] = float(roc_auc_score(y_true, scores))
    except Exception:
        metrics["auc"] = None

    # J) Matrices de confusión
    plot_confusion(y_true, y_pred, VAL_OUT_DIR / "val_confusion_matrix_overall.png")
    for s in ["sanos", "nosanos"]:
        sub = df[df["set"] == s]
        if len(sub):
            plot_confusion(sub["y_true"].to_numpy(), sub["y_pred"].to_numpy(),
                           VAL_OUT_DIR / f"val_confusion_matrix_{s}.png",
                           title=f"Confusion matrix (validation, {s})")

    # K) Guardar métricas globales a JSON
    with open(VAL_METRICS_JSON, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2, ensure_ascii=False)

    # Resumen
    print("\n[VAL] Métricas globales:")
    for k, v in metrics.items():
        print(f"- {k}: {v:.4f}" if isinstance(v, float) else f"- {k}: {v}")
    print("\n[VAL] Aciertos por set:")
    print(df.pivot_table(index="set", values="correct", aggfunc=["mean","sum","count"]))

if __name__ == "__main__":
    evaluate_validation()
    print("Listo ✅  Resultados en ./out_unsup_val/")

[OK] Modelo cargado: out_unsup_2sets/model_unsup.joblib
[THR] Intentando leer: /home/alejandro/Codes/Malaria/Segmentacion/out_unsup_2sets/threshold_report.txt
[THR] Umbral cargado: 0.00081081
[VAL] Sanos: 150 | Nosanos: 150
