In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path

# =====================================================
# Config
# =====================================================
# Input tables
GRID_RESULTS_CSV = "all_model_param_results.csv"  # many rows per (Dataset, Model)
BASELINE_SUMMARY_CSV = "results_summary.csv"      # one row per (Dataset, Model)

# Metrics used to rank candidate rows when choosing the "best" one
PRIMARY_METRIC = "CV_F1_mean"                     # selection metric for "best"
TIEBREAKER_METRIC = "CV_Accuracy_mean"            # tiebreaker

# Directory that receives every file this script writes
OUTDIR = Path("comparison_outputs")
OUTDIR.mkdir(exist_ok=True)

# =====================================================
# 1) Load inputs
# =====================================================
grid, base = (pd.read_csv(src) for src in (GRID_RESULTS_CSV, BASELINE_SUMMARY_CSV))

# Normalize column names (strip surrounding whitespace) so the by-name
# lookups below match.
for frame in (grid, base):
    frame.columns = [label.strip() for label in frame.columns]

# Columns the later steps access by name, listed per table.
_EXPECTED_COLUMNS = (
    (grid, (
        "Dataset", "Model", "Params",
        "CV_Accuracy_mean", "CV_Accuracy_std", "CV_F1_mean", "CV_F1_std",
        "Fit_time_mean_s", "Fit_time_std_s", "Score_time_mean_s", "Score_time_std_s",
        "Holdout_Accuracy", "Holdout_F1_macro", "Holdout_Train_time_s", "Holdout_Infer_time_s",
        "Rank_in_dataset_model",
    )),
    (base, (
        "Dataset", "Model", "Accuracy", "F1_macro", "Train_time_s", "Infer_time_s",
        "CV_Accuracy_mean", "CV_Accuracy_std", "CV_F1_mean", "CV_F1_std",
    )),
)

# Any expected column that is absent is created as all-NaN, so the column
# selections further down never raise KeyError.
for frame, expected in _EXPECTED_COLUMNS:
    for col in expected:
        if col not in frame.columns:
            frame[col] = np.nan

# =====================================================
# 2) Pick best per (Dataset, Model) from the grid
# =====================================================
def pick_best(group, primary=PRIMARY_METRIC, tiebreak=TIEBREAKER_METRIC):
    """Return the top-ranked row of *group* as a one-row DataFrame.

    Rows are ordered by ``primary`` (descending), then ``tiebreak``
    (descending), then ``Score_time_mean_s`` (ascending); any of these
    columns that is absent from *group* is skipped. NaN values sort last.
    When none of the columns is present, the first row is returned as-is.
    """
    # (column, ascending?) in priority order; higher metrics are better,
    # lower scoring time is better.
    priority = (
        (primary, False),
        (tiebreak, False),
        ("Score_time_mean_s", True),
    )
    frame = group.copy()
    usable = [(name, asc) for name, asc in priority if name in frame.columns]
    if not usable:
        return frame.iloc[[0]]

    by = [name for name, _ in usable]
    directions = [asc for _, asc in usable]
    ranked = frame.sort_values(by, ascending=directions, na_position="last")
    return ranked.iloc[[0]]

# Pick the winning row of every (Dataset, Model) group.
# The groups are iterated explicitly instead of going through GroupBy.apply:
# apply() operating on the grouping columns is deprecated in recent pandas,
# and once those columns are excluded "Dataset"/"Model" would vanish from the
# result. Iterating yields each group with all of its columns, in sorted key
# order, which is the same row order apply() produced.
_best_rows = [
    pick_best(rows)
    for _, rows in grid.groupby(["Dataset", "Model"])
]
best = (
    pd.concat(_best_rows, ignore_index=True)
    if _best_rows
    # No groups (empty grid, or all keys NaN): keep an empty frame that still
    # carries every column so the column selections below keep working.
    else grid.iloc[0:0].copy()
)

# Keep a readable Params column (string)
def canonical_params(s):
    """Return a key-sorted JSON string for the parameter value *s*.

    * A string holding valid JSON is parsed and re-serialized with sorted keys.
    * Any non-string value (e.g. NaN or None) yields ``"{}"``.
    * A string that cannot be parsed is returned unchanged via ``str(s)``.
    """
    if not isinstance(s, str):
        # Non-string input is treated as "no parameters".
        return json.dumps({}, sort_keys=True)
    try:
        return json.dumps(json.loads(s), sort_keys=True)
    except Exception:
        # Best effort: fall back to the raw text rather than failing.
        return str(s)

# Canonical (key-sorted JSON) parameter string for every selected row.
best["Params_canonical"] = best["Params"].map(canonical_params)

# Columns carried forward from the selected rows, grouped by meaning.
best_keep = [
    # identity
    "Dataset", "Model", "Params_canonical",
    # cross-validation scores
    "CV_Accuracy_mean", "CV_Accuracy_std", "CV_F1_mean", "CV_F1_std",
    # cross-validation timings
    "Fit_time_mean_s", "Fit_time_std_s", "Score_time_mean_s", "Score_time_std_s",
    # hold-out scores and timings
    "Holdout_Accuracy", "Holdout_F1_macro", "Holdout_Train_time_s", "Holdout_Infer_time_s",
]
best = best[best_keep]
best = best.rename(columns={"Params_canonical": "Best_Params"})

# =====================================================
# 3) Prepare baseline columns (rename for clear suffixes)
# =====================================================
# Baseline column -> name it takes in the comparison table. The "Base_"
# prefix keeps these apart from the tuned columns of the same name.
_baseline_names = {
    **{
        name: f"Base_{name}"
        for name in ("CV_Accuracy_mean", "CV_Accuracy_std", "CV_F1_mean", "CV_F1_std")
    },
    "Accuracy": "Base_Holdout_Accuracy",
    "F1_macro": "Base_Holdout_F1_macro",
    "Train_time_s": "Base_Train_time_s",
    "Infer_time_s": "Base_Infer_time_s",
}

# Merge keys first, then the renamed baseline columns in mapping order.
base_keep = ["Dataset", "Model", *_baseline_names.values()]
base_ren = base.rename(columns=_baseline_names)[base_keep]

# =====================================================
# 4) Merge tuned-best vs baseline
# =====================================================
# Left join: every tuned row is kept; rows without a matching baseline get
# NaN in the Base_* columns.
cmp_df = best.merge(base_ren, how="left", on=["Dataset", "Model"])

# Absolute deltas (tuned minus baseline) for each tuned/baseline column pair.
for _tuned_col, _base_col in (
    ("CV_F1_mean", "Base_CV_F1_mean"),
    ("CV_Accuracy_mean", "Base_CV_Accuracy_mean"),
    ("Holdout_F1_macro", "Base_Holdout_F1_macro"),
    ("Holdout_Accuracy", "Base_Holdout_Accuracy"),
):
    cmp_df[f"Delta_{_tuned_col}"] = cmp_df[_tuned_col] - cmp_df[_base_col]

# Relative (%) improvements (safe divide)
def rel_impr(tuned, base):
    """Return the percentage change of *tuned* relative to *base*.

    Computes ``100 * (tuned - base) / base`` element-wise. Positions where
    either value is missing, or where *base* is zero, yield NaN instead.
    The result is whatever ``np.where`` returns (a NumPy array).
    """
    # Mask of positions where the ratio is defined.
    defined = pd.notna(tuned) & pd.notna(base) & (base != 0)
    pct_change = 100.0 * (tuned - base) / base
    return np.where(defined, pct_change, np.nan)

# Relative (%) improvement of tuned over baseline for each metric pair.
for _tuned_col, _base_col in (
    ("CV_F1_mean", "Base_CV_F1_mean"),
    ("CV_Accuracy_mean", "Base_CV_Accuracy_mean"),
    ("Holdout_F1_macro", "Base_Holdout_F1_macro"),
    ("Holdout_Accuracy", "Base_Holdout_Accuracy"),
):
    cmp_df[f"Rel_%_{_tuned_col}"] = rel_impr(cmp_df[_tuned_col], cmp_df[_base_col])


def _beat_flag(delta):
    """Return True/False where *delta* is known and NaN where it is missing.

    A plain ``delta > 0`` evaluates to False for NaN, which would report
    "did not beat the baseline" for rows that have no baseline (or no tuned
    score) at all. Keeping NaN makes "unknown" distinguishable from "no".
    The result is an object-dtype Series aligned with *delta*'s index.
    """
    return pd.Series(
        [np.nan if pd.isna(d) else bool(d > 0) for d in delta],
        index=delta.index,
        dtype=object,
    )


# Flags: did tuning beat baseline? (NaN when the delta itself is unknown)
for _flag_col, _delta_col in (
    ("Beat_Baseline_on_CV_F1", "Delta_CV_F1_mean"),
    ("Beat_Baseline_on_CV_Accuracy", "Delta_CV_Accuracy_mean"),
    ("Beat_Baseline_on_Holdout_F1", "Delta_Holdout_F1_macro"),
    ("Beat_Baseline_on_Holdout_Acc", "Delta_Holdout_Accuracy"),
):
    cmp_df[_flag_col] = _beat_flag(cmp_df[_delta_col])

# Nice ordering: columns grouped by meaning, flattened in this order.
_column_groups = {
    "identity": ["Dataset", "Model", "Best_Params"],
    "tuned_cv": ["CV_Accuracy_mean", "CV_Accuracy_std", "CV_F1_mean", "CV_F1_std"],
    "tuned_holdout": ["Holdout_Accuracy", "Holdout_F1_macro"],
    "baseline_cv": [
        "Base_CV_Accuracy_mean", "Base_CV_Accuracy_std",
        "Base_CV_F1_mean", "Base_CV_F1_std",
    ],
    "baseline_holdout": ["Base_Holdout_Accuracy", "Base_Holdout_F1_macro"],
    "delta": [
        "Delta_CV_Accuracy_mean", "Delta_CV_F1_mean",
        "Delta_Holdout_Accuracy", "Delta_Holdout_F1_macro",
    ],
    "relative": [
        "Rel_%_CV_Accuracy_mean", "Rel_%_CV_F1_mean",
        "Rel_%_Holdout_Accuracy", "Rel_%_Holdout_F1_macro",
    ],
    "timing": [
        "Fit_time_mean_s", "Score_time_mean_s",
        "Base_Train_time_s", "Base_Infer_time_s",
    ],
    "flags": [
        "Beat_Baseline_on_CV_Accuracy", "Beat_Baseline_on_CV_F1",
        "Beat_Baseline_on_Holdout_Acc", "Beat_Baseline_on_Holdout_F1",
    ],
}
order_cols = [name for names in _column_groups.values() for name in names]
cmp_df = cmp_df[order_cols]

# Round every numeric column to 4 decimals for readability.
num_cols = cmp_df.select_dtypes(include=[np.number]).columns
_rounded = cmp_df[num_cols].round(4)
cmp_df[num_cols] = _rounded

# Save CSV
cmp_csv_path = OUTDIR / "tuned_vs_baseline_comparison.csv"
cmp_df.to_csv(cmp_csv_path, index=False)
print(f"✅ Wrote: {cmp_csv_path.resolve()}")
print(f"✅ Wrote: {cmp_csv_path.resolve()}")

# =====================================================
# 5) Quick text summary per dataset/model (human-readable)
# =====================================================
def _summary_line(ds, mdl, r):
    """Format the one-line CV-F1 comparison for one (dataset, model) row."""
    return (
        f"[{ds} | {mdl}] "
        f"Baseline CV-F1={r['Base_CV_F1_mean']:.4f} → Tuned CV-F1={r['CV_F1_mean']:.4f} "
        f"(Δ={r['Delta_CV_F1_mean']:+.4f}, {r['Rel_%_CV_F1_mean']:+.2f}%), "
        f"Params={r['Best_Params']}"
    )


# One line per (Dataset, Model) group, taken from the group's first row.
lines = [
    _summary_line(ds, mdl, rows.iloc[0])
    for (ds, mdl), rows in cmp_df.groupby(["Dataset", "Model"])
]

txt_path = OUTDIR / "tuned_vs_baseline_summary.txt"
txt_path.write_text("\n".join(lines), encoding="utf-8")
print(f"✅ Wrote: {txt_path.resolve()}")

# =====================================================
# 6) (Optional) LaTeX table (short comparison)
# =====================================================
short_cols = [
    "Dataset", "Model",
    "Base_CV_Accuracy_mean", "CV_Accuracy_mean", "Delta_CV_Accuracy_mean",
    "Base_CV_F1_mean", "CV_F1_mean", "Delta_CV_F1_mean",
    "Best_Params",
]
latex_df = cmp_df[short_cols].copy()
latex_df["Dataset"] = latex_df["Dataset"].astype(str)
latex_df["Model"] = latex_df["Model"].astype(str)

# Let pandas escape LaTeX special characters (escape=True). Escaping only the
# Dataset/Model cells by hand left the column headers (all of which contain
# "_") and the Best_Params JSON (underscores and braces) raw, which does not
# compile. The manual replacement is dropped because combining it with
# escape=True would escape the inserted backslashes a second time.
latex = latex_df.to_latex(
    index=False,
    escape=True,
    float_format="%.4f",
    caption="Baseline vs. tuned best per (dataset, model). Deltas are tuned minus baseline.",
    label="tab:tuned_vs_baseline",
)

tex_path = OUTDIR / "tuned_vs_baseline_table.tex"
tex_path.write_text(latex, encoding="utf-8")

print(f"✅ Wrote: {tex_path.resolve()}")
print("\n--- Quick Summary ---")
print("\n".join(lines))

  grid.groupby(["Dataset","Model"], group_keys=False)


✅ Wrote: /Users/vp/PycharmProjects/MachineLearning/Practice1/62_CayoPletikosicRavuril_Exercise1_Report/Pipelines/comparison_outputs/tuned_vs_baseline_comparison.csv
✅ Wrote: /Users/vp/PycharmProjects/MachineLearning/Practice1/62_CayoPletikosicRavuril_Exercise1_Report/Pipelines/comparison_outputs/tuned_vs_baseline_summary.txt
✅ Wrote: /Users/vp/PycharmProjects/MachineLearning/Practice1/62_CayoPletikosicRavuril_Exercise1_Report/Pipelines/comparison_outputs/tuned_vs_baseline_table.tex

--- Quick Summary ---
[dataset1 | NeuralNet] Baseline CV-F1=0.7280 → Tuned CV-F1=0.7138 (Δ=-0.0142, -1.95%), Params={"mlpclassifier__alpha": 0.001, "mlpclassifier__early_stopping": true, "mlpclassifier__hidden_layer_sizes": [100, 100], "mlpclassifier__learning_rate_init": 0.01, "mlpclassifier__max_iter": 300}
[dataset1 | RandomForest] Baseline CV-F1=0.8896 → Tuned CV-F1=0.9024 (Δ=+0.0127, +1.43%), Params={"randomforestclassifier__class_weight": "balanced", "randomforestclassifier__max_depth": 20, "randomfor