# Paper Tables Generator
Reads `full_HDBSCAN_metadata.csv` (train/test) and `best_per_model_HDBSCAN.csv` to produce LaTeX tables.

In [None]:
import re
import pandas as pd

# Input CSVs: the full HDBSCAN metadata tables (named "train" and "test" after
# the variables they are loaded into) and the best-per-model summary table.
TRAIN_FULL = "../compare_flu/results/full_HDBSCAN_metadata.csv"
TEST_FULL = "../compare_flu2018-2020/results/full_HDBSCAN_metadata.csv"
BEST_MODEL = "../compare_flu2018-2020/results/best_per_model_HDBSCAN.csv"

# Load all three tables in one pass; the path order matches the unpacking order.
train, test, best = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_FULL, TEST_FULL, BEST_MODEL)
)

# Quick shape report plus a peek at the first rows (shown as the cell output).
print("train:", train.shape, "  test:", test.shape, "  best:", best.shape)
train.head(2)

In [None]:

# Lookup tables that turn the raw tokens embedded in method names into the
# labels printed in the tables.

# Base-model token -> model name as shown in the tables.
MODEL_DISPLAY = dict([
    ("t33-650M", "ESM-2(650M)"),
    ("t36-3B", "ESM-2(3B)"),
    ("t48-15B", "ESM-2(15B)"),
    ("protbert", "ProtBert"),
    ("prot5", "ProtT5"),
    ("CARP", "CARP"),
])

# Pooling suffix -> pooling label. The keys also define which trailing
# "-<token>" of a model name is treated as a pooling strategy.
POOL_DISPLAY = dict([
    ("mean", "mean"),
    ("bos", "BOS"),
    ("attentionmean", "attn-mean"),
    ("sitemean", "site-mean"),
])

# Dimensionality-reduction token -> label.
DIM_DISPLAY = dict([
    ("t-sne", "t-SNE"),
    ("umap", "UMAP"),
    ("pca", "PCA"),
    ("mds", "MDS"),
])

# Matches a trailing "_cluster_at_<number>" in a cluster-column name.
_EPSILON_SUFFIX = re.compile(r'_cluster_at_([\d.]+)$')


def extract_epsilon(predicted_clusters_column):
    """Return the epsilon value encoded at the end of a cluster-column name.

    The name is expected to end in ``_cluster_at_<number>``; the number is
    returned as a float.  ``None`` is returned when the suffix is absent or
    when the captured text is not a valid number (for example ``1.2.3``), so a
    single malformed name cannot abort a whole ``Series.apply``.

    The argument is converted with ``str()`` first, so non-string values such
    as NaN simply yield ``None``.
    """
    match = _EPSILON_SUFFIX.search(str(predicted_clusters_column))
    if match is None:
        return None
    try:
        return float(match.group(1))
    except ValueError:
        # The character class also admits strings like "." or "1.2.3".
        return None

def parse_method(method):
    """Split a ``reduced_...`` method name into its components.

    Expected layout (the trailing ``_All`` and the pooling suffix are optional):

        reduced_<dim>_<metric>_<components>_<model>[-<pooling>][_All]

    Returns a dict with the keys ``dim``, ``metric``, ``components`` (int),
    ``base_model`` and ``pooling``.  Every value is ``None`` when the name does
    not start with ``reduced_``, has fewer than five ``_``-separated fields, or
    has a non-integer components field.  ``pooling`` is ``None`` when the model
    part does not end in one of the ``POOL_DISPLAY`` keys.
    """
    unparsed = {
        "dim": None,
        "metric": None,
        "components": None,
        "base_model": None,
        "pooling": None,
    }
    if not method.startswith("reduced_"):
        return unparsed

    # Drop the optional "_All" suffix before splitting on underscores.
    trimmed = method[:-4] if method.endswith("_All") else method
    fields = trimmed.split("_")
    if len(fields) < 5:
        return unparsed

    _, dim, metric, raw_components, *model_fields = fields
    try:
        components = int(raw_components)
    except ValueError:
        return unparsed

    # The model name may itself contain underscores, so re-join the remainder.
    model_full = "_".join(model_fields)

    # A trailing "-<token>" counts as pooling only if it is a known pooling key.
    head, _, tail = model_full.rpartition("-")
    if tail in POOL_DISPLAY:
        pooling, base_model = tail, head
    else:
        pooling, base_model = None, model_full

    return {
        "dim": dim,
        "metric": metric,
        "components": components,
        "base_model": base_model,
        "pooling": pooling,
    }

# Sanity check: run the parser on a few method names, including two that do
# not start with "reduced_" and therefore parse to all-None.
for example in (
    "reduced_t-sne_cosine_1_t33-650M-mean_All",
    "reduced_t-sne_euclidean_2_CARP",
    "reduced_t-sne_euclidean_2_t48-15B-attentionmean_All",
    "t-sne",
    "genetic",
):
    print(example, "->", parse_method(example))

# Attach the epsilon parsed from each row's cluster-column name.
cluster_columns = train["predicted_clusters_column"]
train["epsilon"] = cluster_columns.apply(extract_epsilon)


## Table 1 – Baseline scores (genetic dim-reduction methods)

In [None]:
# Method names that count as baselines, in the order they appear in the table.
BASELINE_METHODS = ["t-sne", "umap", "pca", "mds"]

# Best (lowest-NVI) training row per baseline method.
# drop_duplicates keeps the *whole* best row. GroupBy.first() would instead
# take the first non-null value of each column independently, which can pair
# the best row's scores with a different row's epsilon when epsilon is missing.
train_base = (
    train[train["method"].isin(BASELINE_METHODS)]
    .sort_values("normalized_vi")
    .drop_duplicates(subset="method", keep="first")
    .reset_index(drop=True)[["method", "epsilon", "normalized_vi",
                             "adjusted_rand_score", "normalized_mutual_info_score"]]
    .rename(columns={
        "normalized_vi":              "train_nvi",
        "adjusted_rand_score":        "train_ari",
        "normalized_mutual_info_score": "train_nmi",
    })
)

# Test scores for the same methods.
# NOTE(review): this assumes the test table holds one row per baseline method;
# if it holds several, the merge below yields several rows per method - confirm.
test_base = (
    test[test["method"].isin(BASELINE_METHODS)]
    [["method", "normalized_vi", "adjusted_rand_score", "normalized_mutual_info_score"]]
    .rename(columns={
        "normalized_vi":              "test_nvi",
        "adjusted_rand_score":        "test_ari",
        "normalized_mutual_info_score": "test_nmi",
    })
)

baseline = train_base.merge(test_base, on="method")
# enforce display order
order = {m: i for i, m in enumerate(BASELINE_METHODS)}
baseline["_ord"] = baseline["method"].map(order)
baseline = baseline.sort_values("_ord").drop(columns="_ord")
# Replace raw method tokens with their display labels (unknown tokens pass through).
baseline["method"] = baseline["method"].map(lambda m: DIM_DISPLAY.get(m, m))

baseline

In [None]:
def latex_baseline(df):
    """Print the baseline-scores LaTeX table for ``df`` to stdout.

    ``df`` must provide the columns ``method``, ``epsilon``, ``train_nvi``,
    ``test_nvi``, ``train_nmi``, ``test_nmi``, ``train_ari`` and ``test_ari``.
    One table row is emitted per DataFrame row, in the DataFrame's order.
    Scores are printed with four decimals; a missing epsilon is shown as
    ``--``.  Nothing is returned.
    """
    rows = []
    for _, r in df.iterrows():
        # A missing epsilon (None/NaN) would print as "nan" or fail to format.
        eps = r['epsilon'] if pd.notna(r['epsilon']) else "--"
        rows.append(
            f"{r['method']:<8} & {eps:<5} "
            f"& {r['train_nvi']:.4f} & {r['test_nvi']:.4f} "
            f"& {r['train_nmi']:.4f} & {r['test_nmi']:.4f} "
            f"& {r['train_ari']:.4f} & {r['test_ari']:.4f} \\\\"
        )
    # This is a raw string, so the LaTeX row terminator is written as two
    # backslashes; four would be printed verbatim and add an empty table row.
    header = r"""
\begin{table}[h]
\centering
\caption{Baseline scores. Lower NVI is better, higher NMI and ARI are better.}
\label{tab:baseline-results}
\begin{tabular}{|l|l|l|l|l|l|l|l|}
\hline
Method & HDBSCAN~$\epsilon$ & Train NVI & Test NVI & Train NMI & Test NMI & Train ARI & Test ARI\\
\hline"""
    footer = r"""
\hline
\end{tabular}
\end{table}"""
    print(header)
    print("\n".join(rows))
    print(footer)

# Print the LaTeX source for Table 1 (baseline scores) to stdout.
latex_baseline(baseline)

## Table 2 – Top 10 training PLM results (mean pooling)

In [None]:

# Rows produced from protein-language-model embeddings: method names that
# start with "reduced_" and are not one of the baseline methods.
# na=False: a missing method name is simply not a PLM row.
plm_mask = (
    train["method"].str.startswith("reduced_", na=False)
    & ~train["method"].isin(BASELINE_METHODS)
)
plm_train = train[plm_mask].copy()

# Expand each method name into dim / metric / components / base_model / pooling.
parsed = plm_train["method"].apply(parse_method)
plm_train = plm_train.join(pd.DataFrame(parsed.tolist(), index=plm_train.index))
# Keep mean pooling, plus models whose name carries no pooling suffix.
plm_train = plm_train[plm_train["pooling"].isin(["mean"]) | plm_train["pooling"].isna()]

# Ten lowest-NVI training configurations.
top10 = (
    plm_train.sort_values("normalized_vi")
    .head(10)
    [["base_model", "dim", "metric", "components", "epsilon",
      "normalized_vi", "adjusted_rand_score", "normalized_mutual_info_score"]]
    .copy()
)
# Map raw tokens to display labels; unknown tokens pass through unchanged.
# (The original referenced an undefined name `model_label` here.)
top10["base_model"] = top10["base_model"].map(lambda m: MODEL_DISPLAY.get(m, m))
top10["dim"]        = top10["dim"].map(lambda d: DIM_DISPLAY.get(d, d))
top10["metric"]     = top10["metric"].str.capitalize()
top10


In [None]:
def latex_top10(df):
    """Print the top-10 training-results LaTeX table for ``df`` to stdout.

    ``df`` must provide the columns ``base_model``, ``dim``, ``metric``,
    ``components``, ``epsilon``, ``normalized_vi``,
    ``normalized_mutual_info_score`` and ``adjusted_rand_score``.  One table
    row is emitted per DataFrame row, in the DataFrame's order.  Scores are
    printed with four decimals, ``components`` as an integer, and a missing
    epsilon as ``--``.  Nothing is returned.
    """
    rows = []
    for _, r in df.iterrows():
        # A missing epsilon (None/NaN) would print as "nan" or fail to format.
        eps = r['epsilon'] if pd.notna(r['epsilon']) else "--"
        rows.append(
            f"{r['base_model']:<14} & {r['dim']:<6} & {r['metric']:<10} "
            f"& {int(r['components'])} & {eps:<5} "
            f"& {r['normalized_vi']:.4f} "
            f"& {r['normalized_mutual_info_score']:.4f} "
            f"& {r['adjusted_rand_score']:.4f} \\\\"
        )
    # This is a raw string, so the LaTeX row terminator is written as two
    # backslashes; four would be printed verbatim and add an empty table row.
    header = r"""
\begin{table}[h]
\centering
\caption{Top 10 training results using protein language model embeddings and mean-pooling.
Lower NVI is better, higher NMI and ARI are better.}
\label{tab:top10-results}
\begin{tabular}{|l|l|l|c|c|c|c|c|}
\hline
Model & Dim.~red. & Metric & Dims & HDBSCAN~$\epsilon$ & NVI & NMI & ARI\\
\hline"""
    footer = r"""
\hline
\end{tabular}
\end{table}"""
    print(header)
    print("\n".join(rows))
    print(footer)

# Print the LaTeX source for Table 2 (top-10 training results) to stdout.
latex_top10(top10)

## Table 3 – Best configuration per model (train + test)

In [None]:

# Epsilon of the best (lowest-NVI) training row for every method.
# drop_duplicates keeps the whole best row. GroupBy.first() would instead
# return the first *non-null* epsilon in the group, which may belong to a
# different (worse) row. Rows without a method name are dropped first, as
# groupby did implicitly.
train_opt = (
    train.dropna(subset=["method"])
    .sort_values("normalized_vi")
    .drop_duplicates(subset="method", keep="first")
    .reset_index(drop=True)[["method", "epsilon"]]
)

# Left merge: methods in `best` with no match in `train` keep a NaN epsilon.
best_with_eps = best.merge(train_opt, on="method", how="left")

parsed_df = pd.DataFrame(best_with_eps["method"].apply(parse_method).tolist(), index=best_with_eps.index)
# NOTE(review): parse_method never returns a "gene" key, so parsed_df.get("gene")
# is None and this creates an all-None "gene" column - confirm it is wanted.
for col in ["dim", "metric", "components", "base_model", "pooling", "gene"]:
    best_with_eps[col] = parsed_df.get(col)

# Drop the non-mean pooling variants. Take a copy so the column assignments
# below write to an independent frame rather than to a slice.
best_with_eps = best_with_eps[~best_with_eps["method"].str.contains("sitemean|attentionmean|bos")].copy()

def fill_display(row):
    """Return the dim-reduction label for a row.

    Rows whose method name could not be parsed have no ``dim``; for those the
    method name itself is looked up in ``DIM_DISPLAY``.  Unknown tokens are
    returned unchanged.
    """
    if pd.isna(row.get("dim")):
        return DIM_DISPLAY.get(row["method"], row["method"])
    return DIM_DISPLAY.get(row["dim"], row["dim"])

best_with_eps["dim_label"]    = best_with_eps.apply(fill_display, axis=1)
# NOTE(review): relies on a "model" column being present in the best-per-model CSV.
best_with_eps["model_label"]  = best_with_eps["model"].map(lambda m: MODEL_DISPLAY.get(m, m))
best_with_eps["metric_label"] = best_with_eps["metric"].fillna("").str.capitalize()

cols = ["model_label", "dim_label", "metric_label", "components", "epsilon",
        "train_normalized_vi", "test_normalized_vi",
        "train_normalized_mutual_info_score", "test_normalized_mutual_info_score",
        "train_adjusted_rand_score", "test_adjusted_rand_score"]
best_with_eps[cols]


In [None]:
def latex_best_per_model(df):
    """Print the best-configuration-per-model LaTeX table for ``df`` to stdout.

    ``df`` must provide ``model_label``, ``dim_label``, ``metric_label``,
    ``components``, ``epsilon`` and the ``train_``/``test_`` variants of
    ``normalized_vi``, ``normalized_mutual_info_score`` and
    ``adjusted_rand_score``.  One table row is emitted per DataFrame row, in
    the DataFrame's order.  Scores are printed with four decimals; a missing
    ``components`` or ``epsilon`` and an empty ``metric_label`` are shown as
    ``--``.  Nothing is returned.
    """
    rows = []
    for _, r in df.iterrows():
        comp = int(r['components']) if pd.notna(r.get('components')) else "--"
        met  = r['metric_label'] if r['metric_label'] else "--"
        # A missing epsilon (None/NaN) would print as "nan" or fail to format.
        eps  = r['epsilon'] if pd.notna(r['epsilon']) else "--"
        rows.append(
            f"{r['model_label']:<14} & {r['dim_label']:<7} & {met:<10} "
            f"& {comp} & {eps:<5} "
            f"& {r['train_normalized_vi']:.4f} & {r['test_normalized_vi']:.4f} "
            f"& {r['train_normalized_mutual_info_score']:.4f} & {r['test_normalized_mutual_info_score']:.4f} "
            f"& {r['train_adjusted_rand_score']:.4f} & {r['test_adjusted_rand_score']:.4f} \\\\"
        )
    # This is a raw string, so the LaTeX row terminator is written as two
    # backslashes; four would be printed verbatim and add an empty table row.
    header = r"""
\begin{table}[h]
\centering
\caption{Best configuration for each model type.
Lower NVI is better, higher NMI and ARI are better.}
\label{tab:best-config}
\begin{tabular}{|l|l|l|c|c|c|c|c|c|c|c|}
\hline
Model & Dim.~red. & Metric & Dims & $\epsilon$ & Train NVI & Test NVI & Train NMI & Test NMI & Train ARI & Test ARI\\
\hline"""
    footer = r"""
\hline
\end{tabular}
\end{table}"""
    print(header)
    print("\n".join(rows))
    print(footer)

# Print the LaTeX source for Table 3 (best configuration per model) to stdout.
latex_best_per_model(best_with_eps)

## Table 4 – Pooling method comparison (test NVI/NMI/ARI per model × pooling)

In [None]:

# Work on a copy of the best-per-model table and attach the fields parsed
# from each method name.
best_all = best.copy()
method_fields = best_all["method"].apply(parse_method).tolist()
parsed_df = pd.DataFrame(method_fields, index=best_all.index)
for field in ("dim", "metric", "components", "base_model", "pooling"):
    best_all[field] = parsed_df.get(field)
best_all["base_model_label"] = best_all["base_model"].map(
    lambda m: MODEL_DISPLAY.get(m, m)
)

# Only rows whose method name carries an explicit pooling suffix take part.
has_pooling = ~best_all["pooling"].isna()
pool_rows = best_all[has_pooling].copy()
pool_rows["pooling_label"] = pool_rows["pooling"].map(POOL_DISPLAY)


def _pooling_pivot(value_col, agg):
    """Model x pooling table of ``value_col``, aggregated with ``agg``, rounded to 4 decimals."""
    table = pool_rows.pivot_table(
        index="base_model_label",
        columns="pooling_label",
        values=value_col,
        aggfunc=agg,
    )
    return table.round(4)


# NVI is lower-is-better, so keep the minimum; NMI and ARI keep the maximum.
pivot_nvi = _pooling_pivot("test_normalized_vi", "min")
pivot_nmi = _pooling_pivot("test_normalized_mutual_info_score", "max")
pivot_ari = _pooling_pivot("test_adjusted_rand_score", "max")

for title, table in (
    ("NVI by pooling:", pivot_nvi),
    ("\nNMI by pooling:", pivot_nmi),
    ("\nARI by pooling:", pivot_ari),
):
    print(title)
    display(table)


In [None]:
def latex_pooling_table(pool_rows, metric_col, caption, label):
    """Print a model x pooling LaTeX table of ``metric_col`` to stdout.

    Parameters:
        pool_rows: DataFrame with a ``pooling_label`` column, the column named
            by ``metric_col``, and a model-label column (``base_model_label``,
            or ``model_label`` if the former is absent).
        metric_col: name of the score column to tabulate.
        caption: text placed in the table caption.
        label: LaTeX label for the table.

    Columns follow a fixed pooling order, restricted to the pooling labels
    present in ``pool_rows``.  There is one row per model, in ``groupby``
    order; a model with no value for a pooling strategy gets ``--``.  Scores
    are printed with four decimals.  Nothing is returned.
    """
    POOL_ORDER = ["mean", "BOS", "attn-mean", "site-mean"]
    seen_labels = set(pool_rows["pooling_label"])
    present = [p for p in POOL_ORDER if p in seen_labels]

    # The pooling frame built in this notebook carries "base_model_label";
    # grouping by "model_label" raised KeyError.  Fall back to "model_label"
    # for callers that pass a frame using that name.
    model_col = "base_model_label" if "base_model_label" in pool_rows.columns else "model_label"

    header = (
        f"\\begin{{table}}[h]\n\\centering\n"
        f"\\caption{{{caption}}}\n\\label{{{label}}}\n"
        f"\\begin{{tabular}}{{|l|{'c|' * len(present)}}}\n\\hline\n"
        f"Model & {' & '.join(present)} \\\\\n\\hline"
    )
    rows = []
    for model, grp in pool_rows.groupby(model_col):
        # NOTE(review): if a model has several rows for one pooling label, the
        # last one wins here, whereas the pivot tables take the min/max.
        vals = dict(zip(grp["pooling_label"], grp[metric_col]))
        cells = " & ".join(f"{vals[p]:.4f}" if p in vals else "--" for p in present)
        rows.append(f"{model} & {cells} \\\\")
    footer = "\\hline\n\\end{tabular}\n\\end{table}"
    print(header)
    print("\n".join(rows))
    print(footer)

# Print one LaTeX table per test metric, separated by a blank line.
pooling_table_specs = (
    ("test_normalized_vi",
     "Test NVI by model and pooling strategy. Lower is better.",
     "tab:pooling-nvi"),
    ("test_normalized_mutual_info_score",
     "Test NMI by model and pooling strategy. Higher is better.",
     "tab:pooling-nmi"),
    ("test_adjusted_rand_score",
     "Test ARI by model and pooling strategy. Higher is better.",
     "tab:pooling-ari"),
)
for position, (score_col, table_caption, table_label) in enumerate(pooling_table_specs):
    if position > 0:
        print()
    latex_pooling_table(pool_rows, score_col, table_caption, table_label)