# Optuna Per-Experiment Best Results

This notebook extracts and displays the best hyperparameters found by per-experiment Optuna tuning,
along with their **test-set** evaluation metrics (error rate, macro-F1, ECE).

Test metrics are parsed from `study.log` files since the Optuna objective only records dev macro-F1.

In [47]:
# Configuration
# Both values are forwarded unchanged to extract_results() in the loading cell.
# RESULTS_DIR is a relative path, so it depends on where the kernel was started
# (NOTE(review): presumably the notebook's own directory — confirm).
RESULTS_DIR = "../results/optuna/per_experiment"
N_TRIALS = 10  # Trial count to load (None = latest available)

In [48]:
import sys
sys.path.insert(0, "..")

import pandas as pd
from extract_optuna_test_results import extract_results

# Load one record per tuned experiment using the settings from the
# configuration cell, then report how many records came back.
results = extract_results(RESULTS_DIR, n_trials=N_TRIALS)
n_loaded = len(results)
print(f"Loaded {n_loaded} experiment results")

Loaded 83 experiment results


## Full Results Table

All experiments with best hyperparameters, dev F1, and test metrics.

In [49]:
# Build DataFrame: one row per experiment result.
# Keys copied verbatim from every result record (identifiers + metrics).
metric_keys = (
    "event",
    "budget",
    "seed_set",
    "dev_macro_f1",
    "test_macro_f1",
    "test_error_rate",
    "test_ece",
)
# Hyperparameters pulled out of the nested "best_params" mapping when present.
param_keys = (
    "lr",
    "batch_size",
    "cotrain_epochs",
    "finetune_patience",
    "weight_decay",
    "warmup_ratio",
)

rows = []
for result in results:
    record = {key: result[key] for key in metric_keys}
    best_params = result.get("best_params")
    if best_params:
        # A hyperparameter absent from best_params becomes None; records
        # without best_params get no hyperparameter keys at all and end up
        # as NaN in the frame.
        record.update((key, best_params.get(key)) for key in param_keys)
    rows.append(record)

# Stable ordering for display: event, then budget, then seed set.
df = (
    pd.DataFrame(rows)
    .sort_values(["event", "budget", "seed_set"])
    .reset_index(drop=True)
)

# Format for display: four decimals for scores and ratios, two for the error
# rate, six for the learning rate. Columns not listed keep the default rendering.
format_dict = {
    **dict.fromkeys(
        ["dev_macro_f1", "test_macro_f1", "test_ece", "weight_decay", "warmup_ratio"],
        "{:.4f}",
    ),
    "test_error_rate": "{:.2f}",
    "lr": "{:.6f}",
}
df.style.format(format_dict, na_rep="-")

Unnamed: 0,event,budget,seed_set,dev_macro_f1,test_macro_f1,test_error_rate,test_ece,lr,batch_size,cotrain_epochs,finetune_patience,weight_decay,warmup_ratio
0,california_wildfires_2018,5,1,0.6306,0.6079,29.16,0.1533,0.000148,16,17,6,0.0979,0.1507
1,california_wildfires_2018,5,2,0.6286,0.6014,27.52,0.1896,0.00014,32,20,10,0.0993,0.1355
2,california_wildfires_2018,5,3,0.6313,0.6242,27.99,0.1788,3.4e-05,8,6,6,0.029,0.101
3,california_wildfires_2018,10,1,0.6272,0.5982,31.14,0.0862,0.000161,32,15,9,0.0142,0.1848
4,california_wildfires_2018,10,2,0.6492,0.656,26.97,0.1266,2.5e-05,16,11,9,0.0041,0.0892
5,california_wildfires_2018,10,3,0.6416,0.6376,29.09,0.1073,7.9e-05,8,7,4,0.073,0.266
6,california_wildfires_2018,25,1,0.6302,0.6229,28.54,0.0398,2.5e-05,8,5,5,0.0353,0.2122
7,california_wildfires_2018,25,2,0.6689,0.6488,29.09,0.1107,7.2e-05,64,9,10,0.0304,0.0574
8,california_wildfires_2018,25,3,0.6393,0.6418,27.93,0.0599,8e-05,8,12,4,0.0304,0.211
9,california_wildfires_2018,50,1,0.6491,0.6598,27.45,0.131,3.2e-05,16,17,6,0.0581,0.0452


## Summary: Mean Test Macro-F1 and ECE by Event and Budget

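Averaged over the seed sets available for each (event, budget) combination (normally 3; a combination with a missing result is averaged over fewer). The `mean` row and column are unweighted means of the displayed cell means, so they can differ slightly from the per-event and per-budget means computed over individual runs further below.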

In [50]:
def _mean_pivot_with_margins(frame: pd.DataFrame, value_col: str) -> pd.DataFrame:
    """Return an event x budget table of mean ``value_col`` with "mean" margins.

    Args:
        frame: Table with "event" and "budget" columns plus ``value_col``.
        value_col: Name of the metric column to average within each
            (event, budget) cell.

    Returns:
        The pivot table with an extra "mean" column (row-wise mean of the
        budget columns) and an extra "mean" row (column-wise mean, taken
        after the "mean" column was added, so it covers that column too).

    Note:
        The margins are unweighted means of the cell means. When cells hold
        different numbers of runs they can differ slightly from a mean taken
        over the individual runs.
    """
    pivot = frame.pivot_table(
        values=value_col,
        index="event",
        columns="budget",
        aggfunc="mean",
    )
    pivot["mean"] = pivot.mean(axis=1)
    pivot.loc["mean"] = pivot.mean(axis=0)
    return pivot


# Pivot table: event x budget, values = mean test_macro_f1 over seeds
pivot_f1 = _mean_pivot_with_margins(df, "test_macro_f1")

print("Test Macro-F1")
display(pivot_f1.style.format("{:.4f}", na_rep="-").background_gradient(cmap="YlGn", axis=None))

# Pivot table: event x budget, values = mean test_ece over seeds
pivot_ece = _mean_pivot_with_margins(df, "test_ece")

print("\nTest ECE (lower is better)")
display(pivot_ece.style.format("{:.4f}", na_rep="-").background_gradient(cmap="YlOrRd", axis=None))

Test Macro-F1


budget,5,10,25,50,mean
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
california_wildfires_2018,0.6112,0.6306,0.6378,0.6615,0.6353
canada_wildfires_2016,0.5656,0.5956,0.5962,0.5807,0.5845
cyclone_idai_2019,0.5746,0.6005,0.6095,0.5869,0.5929
hurricane_dorian_2019,0.5606,0.5764,0.5768,0.5952,0.5772
hurricane_florence_2018,0.6622,0.6665,0.6835,0.6713,0.6709
hurricane_harvey_2017,0.6255,0.6442,0.6686,0.6745,0.6532
hurricane_irma_2017,0.6207,0.6351,0.6526,0.6519,0.6401
mean,0.6029,0.6213,0.6321,0.6317,0.622



Test ECE (lower is better)


budget,5,10,25,50,mean
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
california_wildfires_2018,0.1739,0.1067,0.0701,0.0927,0.1109
canada_wildfires_2016,0.1065,0.1067,0.0873,0.1091,0.1024
cyclone_idai_2019,0.0867,0.0894,0.1035,0.1065,0.0965
hurricane_dorian_2019,0.2321,0.2059,0.1341,0.1392,0.1778
hurricane_florence_2018,0.1704,0.1311,0.1069,0.102,0.1276
hurricane_harvey_2017,0.2401,0.1582,0.1593,0.1306,0.1721
hurricane_irma_2017,0.2746,0.2137,0.1408,0.1793,0.2021
mean,0.1835,0.1445,0.1146,0.1228,0.1413


## Summary: Mean Test Macro-F1 by Event

In [51]:
# Per-event summary over every budget and seed set, best test macro-F1 first.
# Each entry maps an output column to (source column, aggregation).
event_agg_spec = {
    "test_macro_f1_mean": ("test_macro_f1", "mean"),
    "test_macro_f1_std": ("test_macro_f1", "std"),
    "test_ece_mean": ("test_ece", "mean"),
    "dev_macro_f1_mean": ("dev_macro_f1", "mean"),
    "test_error_rate_mean": ("test_error_rate", "mean"),
    "count": ("test_macro_f1", "count"),
}
by_event = (
    df.groupby("event")
    .agg(**event_agg_spec)
    .sort_values("test_macro_f1_mean", ascending=False)
)

# Four decimals for scores, two for the error rate; "count" keeps its default rendering.
event_formats = dict.fromkeys(
    ["test_macro_f1_mean", "test_macro_f1_std", "test_ece_mean", "dev_macro_f1_mean"],
    "{:.4f}",
)
event_formats["test_error_rate_mean"] = "{:.2f}"

by_event.style.format(event_formats, na_rep="-").background_gradient(
    cmap="YlGn", subset=["test_macro_f1_mean"]
)

Unnamed: 0_level_0,test_macro_f1_mean,test_macro_f1_std,test_ece_mean,dev_macro_f1_mean,test_error_rate_mean,count
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
hurricane_florence_2018,0.6709,0.0147,0.1276,0.6922,24.88,12
hurricane_harvey_2017,0.6532,0.0228,0.1721,0.6644,30.55,12
hurricane_irma_2017,0.639,0.018,0.2042,0.6398,33.71,11
california_wildfires_2018,0.6353,0.0239,0.1109,0.6424,28.27,12
cyclone_idai_2019,0.5929,0.031,0.0965,0.5375,27.28,12
canada_wildfires_2016,0.5845,0.0215,0.1024,0.6956,24.48,12
hurricane_dorian_2019,0.5772,0.0177,0.1778,0.5987,36.61,12


## Summary: Mean Test Macro-F1 by Budget

In [52]:
# Per-budget summary over every event and seed set, kept in budget order.
# Each entry maps an output column to (source column, aggregation).
budget_agg_spec = {
    "test_macro_f1_mean": ("test_macro_f1", "mean"),
    "test_macro_f1_std": ("test_macro_f1", "std"),
    "test_ece_mean": ("test_ece", "mean"),
    "dev_macro_f1_mean": ("dev_macro_f1", "mean"),
    "test_error_rate_mean": ("test_error_rate", "mean"),
    "count": ("test_macro_f1", "count"),
}
by_budget = df.groupby("budget").agg(**budget_agg_spec)

# Four decimals for scores, two for the error rate; "count" keeps its default rendering.
budget_formats = dict.fromkeys(
    ["test_macro_f1_mean", "test_macro_f1_std", "test_ece_mean", "dev_macro_f1_mean"],
    "{:.4f}",
)
budget_formats["test_error_rate_mean"] = "{:.2f}"

by_budget.style.format(budget_formats, na_rep="-").background_gradient(
    cmap="YlGn", subset=["test_macro_f1_mean"]
)

Unnamed: 0_level_0,test_macro_f1_mean,test_macro_f1_std,test_ece_mean,dev_macro_f1_mean,test_error_rate_mean,count
budget,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,0.6029,0.0393,0.1835,0.6221,29.74,21
10,0.6213,0.0353,0.1445,0.6297,29.58,21
25,0.6321,0.0395,0.1146,0.646,29.0,21
50,0.6307,0.0429,0.12,0.6578,29.04,20
