In [1]:
from __future__ import annotations
from pathlib import Path
import re, json, math
import numpy as np
import pandas as pd

REPO_ROOT = Path(".")
FOLDERS = ["VQC-AM-Diabetes-Hamiltonian"]  # cambia se serve

# Match: model_Encoding_<enc>_numLayers_<L>_Hadamard_<...>_Reuploading_<...>_(accuracy|loss).json
FNAME_RE = re.compile(
    r"model_Encoding_(?P<encoding>[A-Za-z0-9\-]+)"
    r"_numLayers_(?P<layers>\d+)"
    r"(?:_Hadamard_(?P<hadamard>True|False))?"
    r"_Reuploading_(?P<ru>True|False)"
    r"_(?P<kind>accuracy|loss)\.json$"
)

def meta_from_filename(fp: Path) -> dict | None:
    m = FNAME_RE.search(fp.name)
    if not m:
        return None
    g = m.groupdict()
    return {
        "encoding": g["encoding"],
        "layers": int(g["layers"]),
        "ru": (g["ru"] == "True"),
        "hadamard": None if g["hadamard"] is None else (g["hadamard"] == "True"),
        "kind": g["kind"],
    }

def _safe_div(num: float, den: float, *, zero_division: float = 0.0) -> float:
    # stile sklearn: se den==0 ritorna zero_division (tipicamente 0)
    return float(num) / float(den) if den else float(zero_division)

def metrics_from_counts(tp: int, fp: int, tn: int, fn: int, *, zero_division: float = 0.0) -> dict:
    tp, fp, tn, fn = map(int, (tp, fp, tn, fn))

    precision = _safe_div(tp, tp + fp, zero_division=zero_division)
    recall    = _safe_div(tp, tp + fn, zero_division=zero_division)  # TPR
    tnr       = _safe_div(tn, tn + fp, zero_division=zero_division)  # specificity
    acc       = _safe_div(tp + tn, tp + tn + fp + fn, zero_division=zero_division)
    bal_acc   = (recall + tnr) / 2.0
    f1        = _safe_div(2 * precision * recall, precision + recall, zero_division=zero_division)

    return {
        "acc": acc,
        "precision": precision,
        "recall": recall,
        "balanced_accuracy": bal_acc,
        "f1": f1,
    }

def stats_from_accuracy_json(fp: Path, *, zero_division: float = 0.0) -> dict | None:
    meta = meta_from_filename(fp)
    if meta is None or meta["kind"] != "accuracy":
        return None

    obj = json.loads(fp.read_text(encoding="utf-8"))
    if not isinstance(obj, list) or len(obj) == 0:
        return None

    per_run_rows = []
    for run in obj:
        for split in ("train", "test"):
            tp = run.get(f"tp_{split}")
            fpv = run.get(f"fp_{split}")
            tn = run.get(f"tn_{split}")
            fn = run.get(f"fn_{split}")
            if tp is None or fpv is None or tn is None or fn is None:
                continue

            m = metrics_from_counts(tp, fpv, tn, fn, zero_division=zero_division)
            m["split"] = split
            per_run_rows.append(m)

    if not per_run_rows:
        return None

    df = pd.DataFrame(per_run_rows)

    out = {
        "layers": meta["layers"],
        "ru": meta["ru"],
        "encoding": meta["encoding"],
        "hadamard": meta["hadamard"],
        "n_runs": int(df[df["split"] == "test"].shape[0]) if (df["split"] == "test").any() else int(df.shape[0]),
        "source_file": str(fp),
    }

    for split in ("train", "test"):
        sub = df[df["split"] == split]
        if sub.empty:
            continue
        for col in ("acc", "precision", "recall", "balanced_accuracy", "f1"):
            s = sub[col].astype("float64")
            out[f"{split}_{col}_mean"] = s.mean()
            out[f"{split}_{col}_var"]  = s.var(ddof=1)
            out[f"{split}_{col}_std"]  = s.std(ddof=1)

    return out

def stats_from_loss_json(fp: Path) -> dict | None:
    meta = meta_from_filename(fp)
    if meta is None or meta["kind"] != "loss":
        return None

    obj = json.loads(fp.read_text(encoding="utf-8"))
    if not isinstance(obj, list) or len(obj) == 0:
        return None

    out = {
        "layers": meta["layers"],
        "ru": meta["ru"],
        "encoding": meta["encoding"],
        "hadamard": meta["hadamard"],
        "source_file_loss": str(fp),
    }

    for split in ("train", "test"):
        vals = [float(run[split]) for run in obj if isinstance(run, dict) and split in run]
        if vals:
            s = pd.Series(vals, dtype="float64")
            out[f"{split}_loss_mean"] = s.mean()
            out[f"{split}_loss_var"]  = s.var(ddof=1)
            out[f"{split}_loss_std"]  = s.std(ddof=1)
            out["n_runs_loss"] = int(s.shape[0])

    return out

def load_all_models(repo_root: Path, folders: list[str], *, zero_division: float = 0.0) -> tuple[pd.DataFrame, pd.DataFrame]:
    acc_rows, loss_rows = [], []
    for folder in folders:
        base = repo_root / folder
        for fp in base.glob("**/*.json"):
            meta = meta_from_filename(fp)
            if meta is None:
                continue
            if meta["kind"] == "accuracy":
                rec = stats_from_accuracy_json(fp, zero_division=zero_division)
                if rec is not None:
                    acc_rows.append(rec)
            elif meta["kind"] == "loss":
                rec = stats_from_loss_json(fp)
                if rec is not None:
                    loss_rows.append(rec)

    if not acc_rows:
        raise RuntimeError("Non ho trovato file *_accuracy.json compatibili (o regex non matcha).")

    df_acc = pd.DataFrame(acc_rows)
    df_loss = pd.DataFrame(loss_rows) if loss_rows else pd.DataFrame()

    return df_acc, df_loss

def merge_acc_loss(df_acc: pd.DataFrame, df_loss: pd.DataFrame) -> pd.DataFrame:
    if df_loss is None or df_loss.empty:
        return df_acc.copy()
    keys = ["encoding", "layers", "ru", "hadamard"]
    return df_acc.merge(df_loss, on=keys, how="left")

def best_worst_by_layers_ru(df: pd.DataFrame, *, metric: str = "test_f1_mean") -> pd.DataFrame:
    """
    Per ogni (layers, ru) prende:
    - best: riga con max(metric)
    - worst: riga con min(metric)
    """
    need = {"layers", "ru", metric}
    missing = need - set(df.columns)
    if missing:
        raise ValueError(f"Mancano colonne necessarie: {missing}")

    gcols = ["layers", "ru"]
    rows = []
    for (layers, ru), grp in df.groupby(gcols, dropna=False):
        grp2 = grp.dropna(subset=[metric])
        if grp2.empty:
            continue
        best = grp2.loc[grp2[metric].idxmax()]
        worst = grp2.loc[grp2[metric].idxmin()]

        rows.append({
            "layers": layers,
            "ru": ru,
            "best_encoding": best.get("encoding"),
            "best_source": best.get("source_file"),
            metric: float(best[metric]),
            "worst_encoding": worst.get("encoding"),
            "worst_source": worst.get("source_file"),
            f"worst_{metric}": float(worst[metric]),
        })
    return pd.DataFrame(rows).sort_values(["layers", "ru"]).reset_index(drop=True)

# ==========================
# ESECUZIONE
# ==========================
df_acc, df_loss = load_all_models(REPO_ROOT, FOLDERS, zero_division=0.0)
df_models = merge_acc_loss(df_acc, df_loss)

table = best_worst_by_layers_ru(df_models, metric="test_f1_mean")  # oppure "test_acc_mean"

print("MODELS (1 riga per JSON/modello):")
print(df_models.head())

print("\nBEST/WORST (per Layers, RU):")
print(table)


MODELS (1 riga per JSON/modello):
   layers     ru     encoding hadamard  n_runs  \
0       8   True  Hamiltonian     None      10   
1       4  False  Hamiltonian     None      10   
2       8  False  Hamiltonian     None      10   
3       2   True  Hamiltonian     None      10   
4       2  False  Hamiltonian     None      10   

                                         source_file  train_acc_mean  \
0  VQC-AM-Diabetes-Hamiltonian/model_Encoding_Ham...        0.773453   
1  VQC-AM-Diabetes-Hamiltonian/model_Encoding_Ham...        0.651466   
2  VQC-AM-Diabetes-Hamiltonian/model_Encoding_Ham...        0.651466   
3  VQC-AM-Diabetes-Hamiltonian/model_Encoding_Ham...        0.684365   
4  VQC-AM-Diabetes-Hamiltonian/model_Encoding_Ham...        0.651466   

   train_acc_var  train_acc_std  train_precision_mean  ...  test_f1_var  \
0   1.552920e-04   1.246162e-02              0.714309  ...     0.000527   
1   1.369550e-32   1.170278e-16              0.000000  ...     0.000000   
2   1.3

In [4]:
df_models.columns

Index(['layers', 'ru', 'encoding', 'hadamard', 'n_runs', 'source_file',
       'train_acc_mean', 'train_acc_var', 'train_acc_std',
       'train_precision_mean', 'train_precision_var', 'train_precision_std',
       'train_recall_mean', 'train_recall_var', 'train_recall_std',
       'train_balanced_accuracy_mean', 'train_balanced_accuracy_var',
       'train_balanced_accuracy_std', 'train_f1_mean', 'train_f1_var',
       'train_f1_std', 'test_acc_mean', 'test_acc_var', 'test_acc_std',
       'test_precision_mean', 'test_precision_var', 'test_precision_std',
       'test_recall_mean', 'test_recall_var', 'test_recall_std',
       'test_balanced_accuracy_mean', 'test_balanced_accuracy_var',
       'test_balanced_accuracy_std', 'test_f1_mean', 'test_f1_var',
       'test_f1_std', 'source_file_loss', 'train_loss_mean', 'train_loss_var',
       'train_loss_std', 'n_runs_loss', 'test_loss_mean', 'test_loss_var',
       'test_loss_std'],
      dtype='object')

In [6]:
df_models[["layers", "ru", "encoding", "test_recall_mean", "test_recall_var"]].sort_values(by=["layers", "ru", "test_recall_mean"], ascending=[True, True, False]).to_latex("diabetes__recall_hamiltonian.tex", index=False, float_format="%.4f")

In [7]:
# do it for test_precision_mean
df_models[["layers", "ru", "encoding", "test_precision_mean", "test_precision_var"]].sort_values(by=["layers", "ru", "test_precision_mean"], ascending=[True, True, False]).to_latex("diabetes_precision_hamiltonian.tex", index=False, float_format="%.4f")

In [8]:
# do it for balanced_accuracy
df_models[["layers", "ru", "encoding", "test_balanced_accuracy_mean", "test_balanced_accuracy_var"]].sort_values(by=["layers", "ru", "test_balanced_accuracy_mean"], ascending=[True, True, False]).to_latex("diabetes_balanced_accuracy_hamiltonian.tex", index=False, float_format="%.4f")

In [9]:
# do it for accuracy
df_models[["layers", "ru", "encoding", "test_acc_mean", "test_acc_var"]].sort_values(by=["layers", "ru", "test_acc_mean"], ascending=[True, True, False]).to_latex("diabetes_accuracy_hamiltonian.tex", index=False, float_format="%.4f")

In [10]:
# do it for f1 score
df_models[["layers", "ru", "encoding", "test_f1_mean", "test_f1_var"]].sort_values(by=["layers", "ru", "test_f1_mean"], ascending=[True, True, False]).to_latex("diabetes_f1_hamiltonian.tex", index=False, float_format="%.4f")    