In [7]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ========== Paths ==========
csv_path = "/data/GitHub/Breast-AI-model/src/metrics_rc2/metrics_per_patient_mean.csv"
out_dir  = "/data/GitHub/Breast-AI-model/src/metrics_rc2/boxplots_cleaned"
os.makedirs(out_dir, exist_ok=True)

# ========== Métricas ==========
metric_list = ["DSC", "IoU", "AHD", "VS", "Accuracy", "Sensitivity", "Specificity", "AUC"]

# ========== Utilidades ==========
def sanitize_filename(s: str) -> str:
    return (
        str(s).strip().lower()
        .replace(" ", "_").replace("/", "_").replace("\\", "_")
        .replace("__", "_")
    )

def ensure_numeric(df: pd.DataFrame, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

# ========== Cargar ==========
df = pd.read_csv(csv_path)

# === Crop flag a booleano y filtrar False ===
def to_bool(x):
    if pd.isna(x): return np.nan
    if isinstance(x, (bool, np.bool_)): return bool(x)
    if isinstance(x, (int, np.integer)): return x != 0
    s = str(x).strip().lower()
    if s in ("false","f","0","no","n"): return False
    if s in ("true","t","1","yes","y"): return True
    return np.nan

if "Crop flag" not in df.columns:
    raise ValueError("No encuentro la columna 'Crop flag' en el CSV.")

df["Crop flag"] = df["Crop flag"].apply(to_bool)
df_f = df[df["Crop flag"] == False].copy()

# === Normalizar / inferir Cancer vs No cancer usando ambas columnas ===
def _norm(s):
    if pd.isna(s): return ""
    # colapsa espacios múltiples, baja a minúsculas
    return re.sub(r"\s+", " ", str(s).strip().lower())

def infer_cancer_status(row):
    cstat = _norm(row.get("Cancer/no cancer", ""))
    categ = _norm(row.get("Category", ""))

    # 1) Excluir bx pending
    if "bx" in cstat and "pending" in cstat:
        return np.nan

    # 2) Reglas explícitas de NO CANCER (primero, para no capturar 'no cancer' como 'cancer')
    if re.search(r"\bno[\s_-]*cancer\b", cstat):
        return "no cancer"
    if cstat in {"no", "none", "neg", "negativo"}:
        return "no cancer"
    if "mass" in cstat:
        return "no cancer"

    # 3) Si Category tiene 'cancer' => cancer
    if re.search(r"\bcancer\b", categ):
        return "cancer"

    # 4) Si Cancer/no cancer dice 'cancer' a secas => cancer
    if re.search(r"\bcancer\b", cstat):
        return "cancer"

    # 5) Por descarte => no cancer
    return "no cancer"

df_f["Cancer"] = df_f.apply(infer_cancer_status, axis=1)

# === Limpiar Category (colapsa variantes de 'Miscellaneous') ===
def clean_category(val):
    if pd.isna(val): return np.nan
    s = str(val).strip()
    if s.lower().startswith("miscellaneous"):
        return "Miscellaneous"
    return s

df_f["Category"] = df_f["Category"].apply(clean_category)

# === Asegurar que las métricas son numéricas ===
ensure_numeric(df_f, metric_list)

# === Helpers de gráficos ===
def save_boxplots(df_in: pd.DataFrame, group_col: str, metrics, title_suffix: str, out_dir: str,
                  ndigits: int = 3, annotate: bool = True):
    import numpy as np
    import matplotlib.pyplot as plt

    # niveles del grupo (orden alfabético, sin NaN)
    levels = sorted([lv for lv in df_in[group_col].dropna().unique()], key=lambda x: str(x).lower())

    for met in metrics:
        if met not in df_in.columns:
            continue

        # datos por nivel
        data = [df_in.loc[df_in[group_col] == lv, met].dropna().values for lv in levels]
        if all(len(d) == 0 for d in data):
            continue

        # medias por nivel (pueden ser NaN si no hay datos)
        means = [np.nan if len(d) == 0 else float(np.nanmean(d)) for d in data]

        fig, ax = plt.subplots(figsize=(9, 5), dpi=130)

        bp = ax.boxplot(
            data,
            labels=levels,
            showmeans=True,          # dibuja triángulo de la media
            patch_artist=True
        )

        ax.set_title(f"{met} by {group_col} {title_suffix}")
        ax.set_xlabel(group_col)
        ax.set_ylabel(met)
        plt.xticks(rotation=15, ha="right")

        # ---- Anotar el valor de la media junto al triángulo ----
        if annotate:
            for i, m in enumerate(means, start=1):
                if np.isnan(m):
                    continue
                # pequeño corrimiento a la derecha y un poquito arriba
                ax.annotate(f"{m:.{ndigits}f}",
                            xy=(i, m),
                            xytext=(10, 0),               # 10 px a la derecha
                            textcoords="offset points",
                            ha="left", va="center",
                            fontsize=9,
                            bbox=dict(boxstyle="round,pad=0.15",
                                      fc="white", ec="none", alpha=0.6))

        fig.tight_layout()
        fname = f"{sanitize_filename(met)}_by_{sanitize_filename(group_col)}{sanitize_filename(title_suffix)}.png"
        fig.savefig(os.path.join(out_dir, fname))
        plt.close(fig)


def save_scatterplots(df_in: pd.DataFrame, group_col: str, metrics, title_suffix: str, out_dir: str):
    levels = sorted([lv for lv in df_in[group_col].dropna().unique()], key=lambda x: str(x).lower())
    for met in metrics:
        if met not in df_in.columns: 
            continue

        plt.figure(figsize=(9, 5), dpi=130)
        for i, lv in enumerate(levels, start=1):
            vals = df_in.loc[df_in[group_col] == lv, met].dropna().values
            if len(vals) == 0: 
                continue
            jitter = np.random.uniform(-0.12, 0.12, size=len(vals))
            plt.scatter(np.full_like(vals, i, dtype=float) + jitter, vals, alpha=0.75)

        plt.title(f"{met} by {group_col} {title_suffix} (scatters)")
        plt.xlabel(group_col)
        plt.ylabel(met)
        plt.xticks(range(1, len(levels)+1), levels, rotation=15, ha="right")
        plt.tight_layout()

        fname = f"{sanitize_filename(met)}_scatter_by_{sanitize_filename(group_col)}{sanitize_filename(title_suffix)}.png"
        plt.savefig(os.path.join(out_dir, fname))
        plt.close()

# ========== Reporte rápido ==========
print("Recuentos (Crop flag = False):")
print("Cancer:")
print(df_f["Cancer"].value_counts(dropna=False))
print("\nCategory (top 20):")
print(df_f["Category"].value_counts(dropna=False).head(20))

#Promedio de métricas 
df_mean = df_f[metric_list].mean().round(3)
print("\nPromedio de métricas:")
print(df_mean)

# ========== Graficar ==========
# 1) Cancer/no cancer (limpio, sin 'bx pending')
df_cancer = df_f.dropna(subset=["Cancer"]).copy()
title1 = ""

save_boxplots  (df_cancer, "Cancer",   metric_list, title1, out_dir)
save_scatterplots(df_cancer, "Cancer", metric_list, title1, out_dir)

# 2) Category (agrupada)
df_cat = df_f.dropna(subset=["Category"]).copy()
title2 = ""

save_boxplots  (df_cat, "Category",   metric_list, title2, out_dir)
save_scatterplots(df_cat, "Category", metric_list, title2, out_dir)

print(f"\n✅ Gráficos guardados en: {out_dir}")


Recuentos (Crop flag = False):
Cancer:
Cancer
no cancer    47
cancer       11
NaN           1
Name: count, dtype: int64

Category (top 20):
Category
Miscellaneous                         15
Cyst                                  11
Fibroadenoma                          10
Cancer                                 8
No cancer                              2
NaN                                    2
Mass                                   2
cyst                                   2
Cancer (invasive ductal carcinoma)     1
Miscellanous (lactating adenoma)       1
Lipoma                                 1
Skin lesion                            1
lactational adenoma or galactocele     1
Fibroadenoma + cyst                    1
Miscellanous (benign mass)             1
Name: count, dtype: int64

Promedio de métricas:
DSC             0.311
IoU             0.246
AHD            69.281
VS              0.389
Accuracy        0.984
Sensitivity     0.286
Specificity     0.998
AUC             0.644
dtype: floa

  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(
  bp = ax.boxplot(



✅ Gráficos guardados en: /data/GitHub/Breast-AI-model/src/metrics_rc2/boxplots_cleaned
