In [1]:
# ============================================================
# NO SUPERVISADO con ROI del glóbulo rojo + scoring colorimétrico
# SLIC + features (LAB/LCH/grad/Gabor) + KMeans GLOBAL (train) -> aplica en test
# - Segmenta RBC (ROI) en LAB (L*, C*, a*) + morfología
# - Asigna superpíxeles a ROI (mayoría)
# - Elige clúster "parásito" por score = wC*C* + wT*texture - wH*dist_hue_magenta
# - Guarda overlays y CSV con fracciones globales y relativas al RBC
# ============================================================

from pathlib import Path
import numpy as np
import cv2
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from skimage import color as skcolor, img_as_ubyte
from skimage.segmentation import slic, mark_boundaries
from skimage.filters import gabor
from skimage.measure import label, regionprops
from skimage.morphology import (remove_small_objects, remove_small_holes,
                                binary_opening, binary_closing, disk)

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# ------------------ Configuración ------------------
ROOT_TRAIN = "./img"
ROOT_TEST  = "./img_testing"

OUT_TRAIN = Path(ROOT_TRAIN) / "out_unsup_train"
OUT_TEST  = Path(ROOT_TEST)  / "out_unsup_test"
OUT_TRAIN.mkdir(parents=True, exist_ok=True)
OUT_TEST.mkdir(parents=True, exist_ok=True)

MODEL_PATH = "model_unsup.joblib"

# SLIC
K_SUPERPIXELS = 600
COMPACTNESS = 10.0
SIGMA_SLIC = 1.0

# Gabor (textura)
GABOR_FREQUENCIES = [0.1, 0.2, 0.3]
GABOR_THETAS = [0, np.pi/6, np.pi/3, np.pi/2, 2*np.pi/3, 5*np.pi/6]

# Clustering
K_CLUSTERS = 3

# Morfología (para limpiar máscaras)
OPENING_RADIUS = 2
CLOSING_RADIUS = 2
HOLE_MIN_SIZE = 64
COMP_MIN_SIZE = 64

# ------------------ Scoring del "parásito" dentro del RBC ------------------
# Hue en LCH: 0°=rojo, 90°=amarillo, 180°=verde, 270°=azul. Magenta/púrpura ≈ 300–360° (y 0–20°).
TARGET_H_DEG = 320.0      # centro "púrpura"
H_BAND_DEG   = 50.0       # tolerancia
W_C = 1.0                 # peso C* (croma)
W_T = 0.7                 # peso textura (grad L* + Gabor)
W_H = 0.6                 # peso penalización por distancia angular del hue objetivo

# ------------------ Utilidades base ------------------
def read_rgb_float01(path: str) -> np.ndarray:
    arr = cv2.imread(path, cv2.IMREAD_COLOR)
    if arr is None:
        raise FileNotFoundError(f"No se puede leer: {path}")
    arr = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
    return (arr.astype(np.float32) / 255.0)

def normalize_by_ref(rgb: np.ndarray, ref: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    norm = rgb / (ref + eps)
    norm = np.nan_to_num(norm, nan=0.0, posinf=0.0, neginf=0.0)
    return np.clip(norm, 0.0, 1.0).astype(np.float32)

def gradients_L_from_lab(lab: np.ndarray) -> np.ndarray:
    L = lab[..., 0]
    Ln = (L - L.min()) / (L.max() - L.min() + 1e-8)
    gx = cv2.Sobel(Ln.astype(np.float32), cv2.CV_32F, 1, 0, ksize=3)
    gy = cv2.Sobel(Ln.astype(np.float32), cv2.CV_32F, 0, 1, ksize=3)
    return np.sqrt(gx*gx + gy*gy)

def gabor_bank(gray01: np.ndarray, freqs, thetas):
    feats = []
    for f in freqs:
        for t in thetas:
            real, imag = gabor(gray01, frequency=f, theta=t)
            feats.append(np.sqrt(real**2 + imag**2).astype(np.float32))
    return feats  # lista (H,W)

def pixel_features(rgb01: np.ndarray):
    lab = skcolor.rgb2lab(rgb01)
    lch = skcolor.lab2lch(lab)
    L = lab[...,0].astype(np.float32)
    a = lab[...,1].astype(np.float32)
    b = lab[...,2].astype(np.float32)
    C = lch[...,1].astype(np.float32)
    h = lch[...,2].astype(np.float32)  # grados (0..360)
    gradL = gradients_L_from_lab(lab).astype(np.float32)
    Ln = (L - L.min()) / (L.max() - L.min() + 1e-8)
    gabs = gabor_bank(Ln, GABOR_FREQUENCIES, GABOR_THETAS)
    feat_list = [L, a, b, C, h, gradL] + gabs
    feats = np.stack(feat_list, axis=-1)  # (H,W,F)
    return feats, lab

# ------------------ Máscara del glóbulo (RBC) en LAB ------------------
def rbc_mask_from_lab(rgb01: np.ndarray) -> np.ndarray:
    """
    Segmenta el glóbulo:
    - L* > tL  (fondo negro es L muy bajo)
    - C* > tC  (el glóbulo tiene algo de croma, el fondo no)
    - a* > ta  (suele ser rojizo)
    Luego morfología + mayor componente + relleno.
    """
    lab = skcolor.rgb2lab(rgb01)
    lch = skcolor.lab2lch(lab)
    L = lab[...,0]; a = lab[...,1]; C = lch[...,1]

    # Umbrales adaptativos simples
    tL = max(20.0, np.percentile(L, 60))     # evita fondo oscuro
    tC = max(5.0,  np.percentile(C, 50))     # croma mínimo
    ta = max(0.0,  np.percentile(a, 45))     # rojizo (>0)

    mask = (L > tL) & (C > tC) & (a > ta)

    # Morfología
    mask = remove_small_objects(mask, min_size=400)
    mask = binary_opening(mask, disk(3))
    mask = binary_closing(mask, disk(5))
    mask = remove_small_holes(mask, area_threshold=200)

    # Mayor componente conectada
    labimg = label(mask)
    if labimg.max() == 0:
        return mask
    areas = [(r.label, r.area) for r in regionprops(labimg)]
    best = max(areas, key=lambda x: x[1])[0]
    mask = (labimg == best)
    mask = remove_small_holes(mask, area_threshold=2000)
    return mask

def assign_superpixels_to_roi(segments: np.ndarray, roi_mask: np.ndarray, thr=0.5):
    """
    Devuelve un boolean array por superpíxel: True si >thr de sus píxeles caen en ROI.
    """
    spx_ids = np.unique(segments)
    in_roi = []
    for sid in spx_ids:
        m = (segments == sid)
        frac = np.count_nonzero(roi_mask & m) / float(np.count_nonzero(m))
        in_roi.append(frac >= thr)
    return spx_ids, np.array(in_roi, dtype=bool)

# ------------------ Scoring de clúster "parásito" dentro del RBC ------------------
def circular_hue_distance_deg(h, target):
    """distancia angular mínima en grados (0..180). h y target en grados [0..360)."""
    d = np.abs((h - target + 180) % 360 - 180)
    return d

def choose_parasite_cluster_by_score(agg_means, agg_stds, labels, spx_ids, spx_in_roi, F_each):
    """
    agg (N, 2F) = [means | stds], indices:
      mean_L=0, mean_a=1, mean_b=2, mean_C=3, mean_h=4, mean_gradL=5, mean_gabors...
    Selecciona el clúster con mayor score dentro del ROI:
      score_k = W_C*mean(C*) + W_T*mean(texture) - W_H*mean(hue_distance_to_magenta)
    """
    # columnas
    mean_C = agg_means[:, 3]
    mean_h = agg_means[:, 4]
    mean_grad = agg_means[:, 5]

    # textura: gradL + energía gabor media
    n_gab = F_each - 6
    if n_gab > 0:
        gabor_means = agg_means[:, 6:6+n_gab].mean(axis=1)
        texture = 0.5*mean_grad + 0.5*gabor_means
    else:
        texture = mean_grad

    # distancia de matiz a magenta (normalizada a 0..1 en banda H_BAND_DEG)
    hue_dist = circular_hue_distance_deg(mean_h, TARGET_H_DEG)
    hue_penalty = np.clip(hue_dist / H_BAND_DEG, 0.0, 1.0)  # 0 (ideal) .. 1 (lejos)

    # normalizaciones robustas por percentiles (evita outliers)
    def rnorm(x):
        p1, p99 = np.percentile(x, 1), np.percentile(x, 99)
        return np.clip((x - p1) / (p99 - p1 + 1e-8), 0, 1)

    Cn = rnorm(mean_C)
    Tn = rnorm(texture)

    # score por superpíxel (solo ROI)
    spx_score = W_C*Cn + W_T*Tn - W_H*hue_penalty
    spx_score[~spx_in_roi] = -np.inf  # ignora fuera del glóbulo

    # score por clúster: media de scores de sus superpíxeles ROI
    k_scores = []
    for k in np.unique(labels):
        m = (labels == k) & spx_in_roi
        if not np.any(m):
            k_scores.append(-np.inf)
        else:
            k_scores.append(np.mean(spx_score[m]))
    parasite_k = int(np.argmax(k_scores))
    return parasite_k, k_scores

# ------------------ IO y pipeline ------------------
def collect_images(root: str):
    exts = ["*.png", "*.jpg", "*.jpeg", "*.tif", "*.tiff", "*.bmp"]
    files = []
    for ext in exts:
        files += list(Path(root).rglob(ext))
    return sorted([p for p in files if not p.name.endswith("_ref.png")])

def load_prepare(img_path: Path):
    ref_path = Path(str(img_path).rsplit(".", 1)[0] + "_ref.png")
    rgb = read_rgb_float01(str(img_path))
    if ref_path.exists():
        ref = read_rgb_float01(str(ref_path))
        rgbn = normalize_by_ref(rgb, ref)
    else:
        rgbn = rgb
    feats_px, lab = pixel_features(rgbn)
    H, W, F = feats_px.shape
    segments = slic(rgbn, n_segments=K_SUPERPIXELS, compactness=COMPACTNESS,
                    sigma=SIGMA_SLIC, start_label=0, channel_axis=-1)

    # agregación
    spx_ids = np.unique(segments)
    X = feats_px.reshape(-1, F)
    s = segments.reshape(-1)
    means = np.zeros((spx_ids.size, F), dtype=np.float32)
    stds  = np.zeros_like(means)
    sizes = np.zeros((spx_ids.size,), dtype=np.int32)
    for i, sid in enumerate(spx_ids):
        m = (s == sid)
        Xi = X[m]
        means[i] = Xi.mean(axis=0)
        stds[i]  = Xi.std(axis=0)
        sizes[i] = np.count_nonzero(m)
    agg = np.concatenate([means, stds], axis=1)  # (N, 2F)

    # máscara RBC y asignación
    rbc_mask = rbc_mask_from_lab(rgbn)
    spx_ids, spx_in_roi = assign_superpixels_to_roi(segments, rbc_mask, thr=0.5)

    return dict(rgb=rgbn, segments=segments, agg=agg, means=means, stds=stds,
                spx_ids=spx_ids, sizes=sizes, F_each=F, rbc_mask=rbc_mask)

def overlay_mask(rgb01, mask, alpha=0.45):
    out = rgb01.copy()
    color = np.array([1.0, 0.0, 1.0], dtype=np.float32)
    out[mask] = (1 - alpha) * out[mask] + alpha * color
    return out

def put_tag(img_rgb01: np.ndarray, text: str, pos=(10, 30)):
    img8 = img_as_ubyte(img_rgb01.copy())
    cv2.putText(img8, text, pos, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,0,0), 3, cv2.LINE_AA)
    cv2.putText(img8, text, pos, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,255,255), 1, cv2.LINE_AA)
    return img8

def colorize_clusters(segments: np.ndarray, labels: np.ndarray):
    H, W = segments.shape
    cmap = plt.get_cmap("tab10")
    out = np.zeros((H, W, 3), dtype=np.float32)
    for sid, lbl in zip(np.unique(segments), labels):
        out[segments == sid] = cmap(lbl % 10)[:3]
    return out

# ------------------ TRAIN ------------------
def train_unsup():
    imgs = collect_images(ROOT_TRAIN)
    if not imgs:
        raise RuntimeError("No hay imágenes en ./img")

    per_image = []
    all_agg = []
    for p in imgs:
        data = load_prepare(p)
        per_image.append((p, data))
        all_agg.append(data["agg"])
        print(f"[TRAIN] {p.name}: superpix={data['agg'].shape[0]}")

    X_all = np.vstack(all_agg).astype(np.float32)
    scaler = StandardScaler().fit(X_all)
    Xs_all = scaler.transform(X_all)

    kmeans = KMeans(n_clusters=K_CLUSTERS, n_init=50, random_state=42)
    kmeans.fit(Xs_all)

    joblib.dump({"scaler": scaler, "kmeans": kmeans}, MODEL_PATH)
    print(f"[OK] Modelo guardado en {MODEL_PATH}")

    rows = []
    for p, data in per_image:
        rgb = data["rgb"]; seg = data["segments"]
        agg = data["agg"]; means = data["means"]; stds = data["stds"]
        spx_ids = data["spx_ids"]; sizes = data["sizes"]; F_each = data["F_each"]
        rbc_mask = data["rbc_mask"]

        X_img = scaler.transform(agg.astype(np.float32))
        lbls = kmeans.predict(X_img)

        # scoring dentro del RBC
        _, spx_in_roi = assign_superpixels_to_roi(seg, rbc_mask, thr=0.5)
        parasite_k, k_scores = choose_parasite_cluster_by_score(means, stds, lbls, spx_ids, spx_in_roi, F_each)

        # máscaras y fracciones
        mask_par = np.isin(seg, spx_ids[lbls == parasite_k])
        mask_par_roi = mask_par & rbc_mask

        pix_par_global = int(np.count_nonzero(mask_par))
        pix_tot_global = int(mask_par.size)
        frac_global = pix_par_global / max(1, pix_tot_global)

        pix_par_roi = int(np.count_nonzero(mask_par_roi))
        pix_tot_roi = int(np.count_nonzero(rbc_mask))
        frac_roi = pix_par_roi / max(1, pix_tot_roi)

        # overlays
        clusters_rgb = colorize_clusters(seg, lbls)
        cv2.imwrite(str(OUT_TRAIN / f"{p.stem}_clusters_overlay.png"),
                    cv2.cvtColor(img_as_ubyte(mark_boundaries(clusters_rgb, seg)), cv2.COLOR_RGB2BGR))

        overlay_par = overlay_mask(rgb, mask_par_roi, alpha=0.45)
        tag = f"Parasite (ROI RBC) -> k={parasite_k}, score={k_scores[parasite_k]:.2f}, fracROI={frac_roi:.3f}"
        out = put_tag(overlay_par, tag)
        cv2.imwrite(str(OUT_TRAIN / f"{p.stem}_parasite_overlay.png"),
                    cv2.cvtColor(out, cv2.COLOR_RGB2BGR))

        # CSV con fracciones
        rows.append({
            "image": p.name,
            "parasite_cluster": parasite_k,
            "score_parasite_cluster": k_scores[parasite_k],
            "pixels_parasite_global": pix_par_global,
            "pixels_total_global": pix_tot_global,
            "parasite_area_fraction_global": frac_global,
            "pixels_parasite_RBC": pix_par_roi,
            "pixels_RBC": pix_tot_roi,
            "parasite_area_fraction_RBC": frac_roi
        })

        # CSV superpíxeles con cluster
        F = F_each
        nice = ["L","a","b","C","h","gradL"]
        n_gab = F - len(nice)
        col_means = [f"mean_{n}" for n in nice] + [f"mean_gabor_{i}" for i in range(n_gab)]
        col_stds  = [f"std_{n}"  for n in nice] + [f"std_gabor_{i}"  for i in range(n_gab)]
        cols = col_means + col_stds
        df = pd.DataFrame(agg, columns=cols)
        df["superpixel"] = spx_ids
        df["cluster"] = lbls
        df["in_RBC"] = spx_in_roi.astype(int)
        df.to_csv(OUT_TRAIN / f"{p.stem}_superpixel_features_kmeans.csv", index=False)

    pd.DataFrame(rows).to_csv(OUT_TRAIN / "tag_summary_train.csv", index=False)
    print(f"[OK] tag_summary_train.csv en {OUT_TRAIN}")

# ------------------ TEST ------------------
def test_unsup():
    imgs = collect_images(ROOT_TEST)
    if not imgs:
        raise RuntimeError("No hay imágenes en ./img_testing")

    bundle = joblib.load(MODEL_PATH)
    scaler = bundle["scaler"]; kmeans = bundle["kmeans"]

    rows = []
    for p in imgs:
        data = load_prepare(p)
        rgb = data["rgb"]; seg = data["segments"]
        agg = data["agg"]; means = data["means"]; stds = data["stds"]
        spx_ids = data["spx_ids"]; sizes = data["sizes"]; F_each = data["F_each"]
        rbc_mask = data["rbc_mask"]

        X_img = scaler.transform(agg.astype(np.float32))
        lbls = kmeans.predict(X_img)

        _, spx_in_roi = assign_superpixels_to_roi(seg, rbc_mask, thr=0.5)
        parasite_k, k_scores = choose_parasite_cluster_by_score(means, stds, lbls, spx_ids, spx_in_roi, F_each)

        mask_par = np.isin(seg, spx_ids[lbls == parasite_k])
        mask_par_roi = mask_par & rbc_mask

        pix_par_global = int(np.count_nonzero(mask_par))
        pix_tot_global = int(mask_par.size)
        frac_global = pix_par_global / max(1, pix_tot_global)

        pix_par_roi = int(np.count_nonzero(mask_par_roi))
        pix_tot_roi = int(np.count_nonzero(rbc_mask))
        frac_roi = pix_par_roi / max(1, pix_tot_roi)

        clusters_rgb = colorize_clusters(seg, lbls)
        cv2.imwrite(str(OUT_TEST / f"{p.stem}_clusters_overlay.png"),
                    cv2.cvtColor(img_as_ubyte(mark_boundaries(clusters_rgb, seg)), cv2.COLOR_RGB2BGR))

        overlay_par = overlay_mask(rgb, mask_par_roi, alpha=0.45)
        tag = f"Parasite (ROI RBC) -> k={parasite_k}, score={k_scores[parasite_k]:.2f}, fracROI={frac_roi:.3f}"
        out = put_tag(overlay_par, tag)
        cv2.imwrite(str(OUT_TEST / f"{p.stem}_parasite_overlay.png"),
                    cv2.cvtColor(out, cv2.COLOR_RGB2BGR))

        rows.append({
            "image": p.name,
            "parasite_cluster": parasite_k,
            "score_parasite_cluster": k_scores[parasite_k],
            "pixels_parasite_global": pix_par_global,
            "pixels_total_global": pix_tot_global,
            "parasite_area_fraction_global": frac_global,
            "pixels_parasite_RBC": pix_par_roi,
            "pixels_RBC": pix_tot_roi,
            "parasite_area_fraction_RBC": frac_roi
        })

        # CSV superpíxeles
        F = F_each
        nice = ["L","a","b","C","h","gradL"]
        n_gab = F - len(nice)
        col_means = [f"mean_{n}" for n in nice] + [f"mean_gabor_{i}" for i in range(n_gab)]
        col_stds  = [f"std_{n}"  for n in nice] + [f"std_gabor_{i}"  for i in range(n_gab)]
        cols = col_means + col_stds
        df = pd.DataFrame(agg, columns=cols)
        df["superpixel"] = spx_ids
        df["cluster"] = lbls
        df["in_RBC"] = spx_in_roi.astype(int)
        df.to_csv(OUT_TEST / f"{p.stem}_superpixel_features_kmeans.csv", index=False)

    pd.DataFrame(rows).to_csv(OUT_TEST / "tag_summary_test.csv", index=False)
    print(f"[OK] tag_summary_test.csv en {OUT_TEST}")

# ------------------ Main ------------------
if __name__ == "__main__":
    print("=== ENTRENAR ===")
    train_unsup()
    print("=== PROBAR ===")
    test_unsup()
    print("Listo ✅")



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 736, in start
    self.io_loop.start()
  File "/ho

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 736, in start
    self.io_loop.start()
  File "/ho

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 736, in start
    self.io_loop.start()
  File "/ho

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 736, in start
    self.io_loop.start()
  File "/ho

AttributeError: _ARRAY_API not found

=== ENTRENAR ===
[TRAIN] 000_seg_000.png: superpix=545
[TRAIN] 000_seg_000_seg.png: superpix=538
[TRAIN] 000_seg_001.png: superpix=559
[TRAIN] 000_seg_001_seg.png: superpix=556
[TRAIN] 000_seg_002.png: superpix=589
[TRAIN] 000_seg_002_seg.png: superpix=581
[TRAIN] 000_seg_003.png: superpix=634
[TRAIN] 000_seg_003_seg.png: superpix=630
[TRAIN] 000_seg_004.png: superpix=579
[TRAIN] 000_seg_004_seg.png: superpix=578
[TRAIN] 000_seg_005.png: superpix=609
[TRAIN] 000_seg_005_seg.png: superpix=606
[TRAIN] 000_seg_006.png: superpix=538
[TRAIN] 000_seg_006_seg.png: superpix=538
[TRAIN] 000_seg_007.png: superpix=561
[TRAIN] 000_seg_007_seg.png: superpix=559
[TRAIN] 000_seg_008.png: superpix=593
[TRAIN] 000_seg_008_seg.png: superpix=590
[TRAIN] 000_seg_009.png: superpix=561
[TRAIN] 000_seg_009_seg.png: superpix=567
[TRAIN] 000_seg_010.png: superpix=771
[TRAIN] 000_seg_010_seg.png: superpix=762
[TRAIN] 000_seg_011.png: superpix=639
[TRAIN] 000_seg_011_seg.png: superpix=636
[TRAIN] 000_seg_012.png

In [4]:
# ============================================
# Etiquetado a nivel de imagen (sí/no parásito)
# a partir de parasite_area_fraction
# ============================================
import os
import pandas as pd

TRAIN_SUM = "./img/out_unsup_train/tag_summary_train.csv"
TEST_SUM  = "./img_testing/out_unsup_test/tag_summary_test.csv"

# umbral configurable (ej. 0.05 = 5% del área)
THRESHOLD = 0.1

def tag_images(csv_path, threshold=THRESHOLD):
    if not os.path.exists(csv_path):
        print(f"⚠️ No encontrado: {csv_path}")
        return None
    df = pd.read_csv(csv_path)
    if "parasite_area_fraction_RBC" not in df.columns:
        raise ValueError("Falta columna 'parasite_area_fraction'")

    # 1) etiqueta por “cualquier área > 0”
    df["image_tag_any"] = (df["parasite_area_fraction_RBC"] > 0).map({True:"parasite", False:"no_parasite"})

    # 2) etiqueta por umbral
    df["image_tag_thresh"] = (df["parasite_area_fraction_RBC"] >= threshold).map({True:"parasite", False:"no_parasite"})

    # guardar
    out_csv = csv_path.replace(".csv", f"_with_image_tags_thr{threshold:.3f}.csv")
    df.to_csv(out_csv, index=False)

    # resumen
    print(f"\n{os.path.basename(out_csv)}")
    print("— Regla ANY      :", df["image_tag_any"].value_counts().to_dict())
    print(f"— Regla THRESH={threshold:.3f} :", df["image_tag_thresh"].value_counts().to_dict())
    if (df["image_tag_any"] == "parasite").all():
        print("✔ Todas las imágenes fueron marcadas como 'parasite' por la regla ANY (coincide con tu observación).")
    return df

if __name__ == "__main__":
    print("Etiquetando TRAIN:")
    df_tr = tag_images(TRAIN_SUM, THRESHOLD)
    print("\nEtiquetando TEST:")
    df_te = tag_images(TEST_SUM, THRESHOLD)


Etiquetando TRAIN:

tag_summary_train_with_image_tags_thr0.100.csv
— Regla ANY      : {'no_parasite': 51, 'parasite': 17}
— Regla THRESH=0.100 : {'no_parasite': 53, 'parasite': 15}

Etiquetando TEST:

tag_summary_test_with_image_tags_thr0.100.csv
— Regla ANY      : {'parasite': 43, 'no_parasite': 23}
— Regla THRESH=0.100 : {'parasite': 37, 'no_parasite': 29}
