# LG-CoTrain: Alternative Early Stopping — Quick Comparison

This notebook runs a **fast comparison** of all 6 stopping strategies using only
**budget=50, seed set 1** across all 10 events — the minimum number of experiments
needed to see whether any strategy is consistently better.

| Strategy | Key Idea |
|---|---|
| `baseline` | Original: stop when ensemble macro-F1 plateaus for `patience` epochs |
| `no_early_stopping` | Run all `finetune_max_epochs`; restore best-ever checkpoint (upper bound) |
| `per_class_patience` | Stop only when **every** class F1 has individually plateaued |
| `weighted_macro_f1` | Weight rare classes more in the stopping metric |
| `balanced_dev` | Resample dev set to equal class sizes for the stopping signal |
| `scaled_threshold` | Require a larger improvement delta for highly imbalanced events |

**Total experiments**: 6 strategies × 10 events × 1 budget × 1 seed = **60 runs**
(vs 6 × 10 × 4 × 3 = 720 for a full 3-seed, 4-budget sweep).

Results are stored in `results/{pseudo_label_source}-quick-stop-{strategy}/`
(e.g. `results/gpt-4o-quick-stop-baseline/`) to keep them separate from
full-run results. See **notebook 05** for the complete sweep.

In [1]:
import json
import sys
import time
from pathlib import Path


def _find_repo_root(marker: str = "lg_cotrain") -> Path:
    for candidate in [Path().resolve()] + list(Path().resolve().parents):
        if (candidate / marker).is_dir():
            return candidate
    raise RuntimeError(
        f"Cannot find repo root: no ancestor directory contains '{marker}/'. "
        "Run the notebook from inside the repository."
    )


# Put the repository root on sys.path so the `lg_cotrain` import below works
# no matter which sub-directory the kernel was started in. The membership
# check keeps re-running this cell from adding duplicate entries.
repo_root = _find_repo_root()
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

import matplotlib.pyplot as plt
import numpy as np

from lg_cotrain.run_all import run_all_experiments


class ProgressTracker:
    """Track global progress across all strategies × events × budgets × seeds.

    The tracker is seeded with the number of experiments that were already
    finished before this session started, so the displayed ``done/total`` and
    percentage describe the whole sweep. The ETA, however, is derived only
    from experiments completed *during this session*: ``start_time`` marks the
    start of the session, so dividing the session's elapsed time by the
    all-time completed count would understate the per-experiment cost after a
    resume.

    Attributes (set in ``__init__``):
        total: Total number of experiments in the sweep.
        done: Number of experiments completed so far (including earlier sessions).
        start_time: ``time.time()`` timestamp at which this session started.
    """

    def __init__(self, total: int, already_done: int, start_time: float):
        self.total = total
        self.done = already_done
        self.start_time = start_time
        # Baseline used to work out how many experiments this session completed.
        self._done_at_start = already_done

    def update(self, event, budget, seed_set, status):
        """Record one more completed experiment and print a progress line.

        ``event``, ``budget`` and ``seed_set`` are accepted so the method can
        be used as a completion callback with that signature; only ``status``
        is shown in the printed line.
        """
        self.done += 1
        elapsed = time.time() - self.start_time
        pct = 100.0 * self.done / self.total if self.total else 0
        elapsed_h = elapsed / 3600
        # Never report a negative amount of remaining work.
        remaining = max(self.total - self.done, 0)
        # Average cost per experiment is measured over this session only.
        session_done = self.done - self._done_at_start
        eta_h = (elapsed / session_done) * remaining / 3600 if session_done > 0 else 0
        print(
            f"  [PROGRESS] {self.done}/{self.total} ({pct:.1f}%)"
            f"  |  Elapsed: {elapsed_h:.2f}h  |  ETA: {eta_h:.2f}h  |  {status}"
        )


# Echo the resolved root so it is obvious which checkout the notebook is using.
print(f"Repo root: {repo_root}")

Repo root: D:\Workspace\Co-Training


In [2]:
# ---- Configuration ----

# Pseudo-label source; also used as the prefix of every results folder name.
PSEUDO_LABEL_SOURCE = "gpt-4o"

# Quick-run scope: 50 labels/class, seed set 1 only
RUN_BUDGETS = [50]
RUN_SEEDS = [1]

# Stopping strategies under comparison, in the order they are run and reported.
STRATEGIES = [
    "baseline",
    "no_early_stopping",
    "per_class_patience",
    "weighted_macro_f1",
    "balanced_dev",
    "scaled_threshold",
]

DATA_ROOT = str(repo_root / "data")

# Discover all events: every sub-directory of <DATA_ROOT>/original is one event.
TARGET_EVENTS = [
    entry.name
    for entry in (Path(DATA_ROOT) / "original").iterdir()
    if entry.is_dir()
]
TARGET_EVENTS.sort()

# Each strategy gets its own results sub-folder under <repo_root>/results.
STRATEGY_RESULTS_ROOTS = {}
for strategy_name in STRATEGIES:
    folder_name = f"{PSEUDO_LABEL_SOURCE}-quick-stop-{strategy_name}"
    STRATEGY_RESULTS_ROOTS[strategy_name] = str(repo_root / "results" / folder_name)

total_runs = len(STRATEGIES) * len(TARGET_EVENTS) * len(RUN_BUDGETS) * len(RUN_SEEDS)

# Echo the effective configuration so the saved output documents the run.
print(f"Strategies : {STRATEGIES}")
print(f"Events     : {TARGET_EVENTS}")
print(f"Budget     : {RUN_BUDGETS}  |  Seed sets: {RUN_SEEDS}")
print(f"Total runs : {total_runs}")
print()
for strategy_name, root_dir in STRATEGY_RESULTS_ROOTS.items():
    print(f"  {strategy_name:<25} → {root_dir}")

Strategies : ['baseline', 'no_early_stopping', 'per_class_patience', 'weighted_macro_f1', 'balanced_dev', 'scaled_threshold']
Events     : ['california_wildfires_2018', 'canada_wildfires_2016', 'cyclone_idai_2019', 'hurricane_dorian_2019', 'hurricane_florence_2018', 'hurricane_harvey_2017', 'hurricane_irma_2017', 'hurricane_maria_2017', 'kaikoura_earthquake_2016', 'kerala_floods_2018']
Budget     : [50]  |  Seed sets: [1]
Total runs : 60

  baseline                  → D:\Workspace\Co-Training\results\gpt-4o-quick-stop-baseline
  no_early_stopping         → D:\Workspace\Co-Training\results\gpt-4o-quick-stop-no_early_stopping
  per_class_patience        → D:\Workspace\Co-Training\results\gpt-4o-quick-stop-per_class_patience
  weighted_macro_f1         → D:\Workspace\Co-Training\results\gpt-4o-quick-stop-weighted_macro_f1
  balanced_dev              → D:\Workspace\Co-Training\results\gpt-4o-quick-stop-balanced_dev
  scaled_threshold          → D:\Workspace\Co-Training\results\gpt-4o-quick

## Running Experiments

The cell below loops over every strategy and, for each one, runs all events at
each configured budget × seed combination.
If the cell crashes or is interrupted, re-run it — experiments whose
`metrics.json` file already exists are automatically skipped.

In [3]:
# Count already-completed experiments across all strategies (for accurate ETA from the start)
def _metrics_path(strategy, event, budget, seed_set):
    """Return the expected location of one experiment's metrics.json."""
    run_dir = Path(STRATEGY_RESULTS_ROOTS[strategy]) / event / f"{budget}_set{seed_set}"
    return run_dir / "metrics.json"


# One flag per (strategy, event, budget, seed) combination: True when its
# metrics.json is already on disk.
finished_flags = [
    _metrics_path(strategy, event, budget, seed_set).exists()
    for strategy in STRATEGIES
    for event in TARGET_EVENTS
    for budget in RUN_BUDGETS
    for seed_set in RUN_SEEDS
]
already_done = sum(finished_flags)
total_experiments = len(finished_flags)

print(f"Total experiments : {total_experiments}")
print(f"Already completed : {already_done}")
print(f"Remaining         : {total_experiments - already_done}")
print()

overall_start = time.time()
tracker = ProgressTracker(total_experiments, already_done, overall_start)
all_strategy_results = {}  # strategy -> event -> list[result_dict]

banner = "=" * 65
for strategy in STRATEGIES:
    results_root = STRATEGY_RESULTS_ROOTS[strategy]
    strat_start = time.time()
    print(f"\n{banner}")
    print(f"Strategy: {strategy}")
    print(f"{banner}")

    # Filled event by event, so an interrupted run keeps whatever finished.
    per_event_results = {}
    all_strategy_results[strategy] = per_event_results

    for event in TARGET_EVENTS:
        # tracker.update is handed over as the per-experiment completion
        # callback; NOTE(review): confirm it is not invoked for skipped runs,
        # otherwise the seeded `already_done` count would be double-counted.
        per_event_results[event] = run_all_experiments(
            event,
            budgets=RUN_BUDGETS,
            seed_sets=RUN_SEEDS,
            pseudo_label_source=PSEUDO_LABEL_SOURCE,
            stopping_strategy=strategy,
            data_root=DATA_ROOT,
            results_root=results_root,
            _on_experiment_done=tracker.update,
        )

    strat_elapsed = time.time() - strat_start
    print(f"\n  Strategy '{strategy}' done in {strat_elapsed / 3600:.2f}h ({strat_elapsed / 60:.1f}min)")

total_elapsed = time.time() - overall_start
print(f"\n{banner}")
print(f"All experiments complete in {total_elapsed / 3600:.2f}h ({total_elapsed / 60:.1f}min)")

Total experiments : 60
Already completed : 0
Remaining         : 60


Strategy: baseline


  from .autonotebook import tqdm as notebook_tqdm


[1/1] budget=50, seed=1 -- starting...


2026-02-19 16:27:59,014 - lg_cotrain - INFO - Starting LG-CoTrain: event=california_wildfires_2018, budget=50, seed_set=1
2026-02-19 16:27:59,047 - lg_cotrain - INFO - Detected 10 classes for event california_wildfires_2018: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'missing_or_found_people', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
2026-02-19 16:27:59,057 - lg_cotrain - INFO - D_l1: 250, D_l2: 250, D_LG: 4663
2026-02-19 16:27:59,059 - lg_cotrain - INFO - === Phase 1: Weight Generation ===
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1120.39it/s, Materializing param=bert.pooler.dense.weight]
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1129.41it/s, Materializing param=bert.pooler.dense.weight]
2026-02-19 16:28:18,900 - lg_cotrain - INFO - Phase 1 epoch 1/7: mean_prob1=0.1094

[1/1] budget=50, seed=1 -- done (macro_f1=0.6379)
  [PROGRESS] 1/60 (1.7%)  |  Elapsed: 0.23h  |  ETA: 13.84h  |  done

Batch complete: 1 ran, 0 skipped, 0 failed (840.6s total)
[1/1] budget=50, seed=1 -- starting...


2026-02-19 16:41:58,889 - lg_cotrain - INFO - Starting LG-CoTrain: event=canada_wildfires_2016, budget=50, seed_set=1
2026-02-19 16:41:58,904 - lg_cotrain - INFO - Detected 8 classes for event canada_wildfires_2016: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
2026-02-19 16:41:58,909 - lg_cotrain - INFO - D_l1: 182, D_l2: 182, D_LG: 1205
2026-02-19 16:41:58,911 - lg_cotrain - INFO - === Phase 1: Weight Generation ===
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1084.91it/s, Materializing param=bert.pooler.dense.weight]
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1127.53it/s, Materializing param=bert.pooler.dense.weight]
2026-02-19 16:42:05,986 - lg_cotrain - INFO - Phase 1 epoch 1/7: mean_prob1=0.1182, mean_prob2=0.1448
2026-02-19 16:42:11,827 - lg_cotrain - INF

[1/1] budget=50, seed=1 -- done (macro_f1=0.6098)
  [PROGRESS] 2/60 (3.3%)  |  Elapsed: 0.30h  |  ETA: 8.82h  |  done

Batch complete: 1 ran, 0 skipped, 0 failed (250.1s total)
[1/1] budget=50, seed=1 -- starting...


2026-02-19 16:46:08,997 - lg_cotrain - INFO - Starting LG-CoTrain: event=cyclone_idai_2019, budget=50, seed_set=1
2026-02-19 16:46:09,023 - lg_cotrain - INFO - Detected 10 classes for event cyclone_idai_2019: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'missing_or_found_people', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
2026-02-19 16:46:09,028 - lg_cotrain - INFO - D_l1: 227, D_l2: 226, D_LG: 2300
2026-02-19 16:46:09,028 - lg_cotrain - INFO - === Phase 1: Weight Generation ===
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1129.14it/s, Materializing param=bert.pooler.dense.weight]
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1151.38it/s, Materializing param=bert.pooler.dense.weight]
2026-02-19 16:46:20,386 - lg_cotrain - INFO - Phase 1 epoch 1/7: mean_prob1=0.1059, mean_prob2=0.1

[1/1] budget=50, seed=1 -- done (macro_f1=0.6427)
  [PROGRESS] 3/60 (5.0%)  |  Elapsed: 0.42h  |  ETA: 8.05h  |  done

Batch complete: 1 ran, 0 skipped, 0 failed (430.5s total)
[1/1] budget=50, seed=1 -- starting...


2026-02-19 16:53:19,571 - lg_cotrain - INFO - Starting LG-CoTrain: event=hurricane_dorian_2019, budget=50, seed_set=1
2026-02-19 16:53:19,606 - lg_cotrain - INFO - Detected 9 classes for event hurricane_dorian_2019: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
2026-02-19 16:53:19,615 - lg_cotrain - INFO - D_l1: 221, D_l2: 221, D_LG: 4887
2026-02-19 16:53:19,615 - lg_cotrain - INFO - === Phase 1: Weight Generation ===
Loading weights: 100%|████████████████| 199/199 [00:00<00:00, 996.39it/s, Materializing param=bert.pooler.dense.weight]
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1175.32it/s, Materializing param=bert.pooler.dense.weight]
2026-02-19 16:53:39,615 - lg_cotrain - INFO - Phase 1 epoch 1/7: mean_prob1=0.1125, mean_prob2=0.1088
2026-02-19 16:53

[1/1] budget=50, seed=1 -- done (macro_f1=0.5864)
  [PROGRESS] 4/60 (6.7%)  |  Elapsed: 0.66h  |  ETA: 9.21h  |  done

Batch complete: 1 ran, 0 skipped, 0 failed (842.5s total)
[1/1] budget=50, seed=1 -- starting...


2026-02-19 17:07:22,077 - lg_cotrain - INFO - Starting LG-CoTrain: event=hurricane_florence_2018, budget=50, seed_set=1
2026-02-19 17:07:22,107 - lg_cotrain - INFO - Detected 9 classes for event hurricane_florence_2018: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
2026-02-19 17:07:22,115 - lg_cotrain - INFO - D_l1: 219, D_l2: 219, D_LG: 3946
2026-02-19 17:07:22,116 - lg_cotrain - INFO - === Phase 1: Weight Generation ===
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1179.48it/s, Materializing param=bert.pooler.dense.weight]
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1201.99it/s, Materializing param=bert.pooler.dense.weight]
2026-02-19 17:07:38,908 - lg_cotrain - INFO - Phase 1 epoch 1/7: mean_prob1=0.1223, mean_prob2=0.1138
2026-02-19 1

[1/1] budget=50, seed=1 -- done (macro_f1=0.6995)
  [PROGRESS] 5/60 (8.3%)  |  Elapsed: 0.86h  |  ETA: 9.41h  |  done

Batch complete: 1 ran, 0 skipped, 0 failed (712.3s total)
[1/1] budget=50, seed=1 -- starting...


2026-02-19 17:19:14,295 - lg_cotrain - INFO - Starting LG-CoTrain: event=hurricane_harvey_2017, budget=50, seed_set=1
2026-02-19 17:19:14,331 - lg_cotrain - INFO - Detected 9 classes for event hurricane_harvey_2017: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
2026-02-19 17:19:14,341 - lg_cotrain - INFO - D_l1: 225, D_l2: 225, D_LG: 5928
2026-02-19 17:19:14,342 - lg_cotrain - INFO - === Phase 1: Weight Generation ===
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1222.80it/s, Materializing param=bert.pooler.dense.weight]
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1153.28it/s, Materializing param=bert.pooler.dense.weight]
2026-02-19 17:19:37,795 - lg_cotrain - INFO - Phase 1 epoch 1/7: mean_prob1=0.1204, mean_prob2=0.1221
2026-02-19 17:20

[1/1] budget=50, seed=1 -- done (macro_f1=0.6787)
  [PROGRESS] 6/60 (10.0%)  |  Elapsed: 1.14h  |  ETA: 10.30h  |  done

Batch complete: 1 ran, 0 skipped, 0 failed (1040.1s total)
[1/1] budget=50, seed=1 -- starting...


2026-02-19 17:36:34,425 - lg_cotrain - INFO - Starting LG-CoTrain: event=hurricane_irma_2017, budget=50, seed_set=1
2026-02-19 17:36:34,463 - lg_cotrain - INFO - Detected 9 classes for event hurricane_irma_2017: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
2026-02-19 17:36:34,474 - lg_cotrain - INFO - D_l1: 225, D_l2: 225, D_LG: 6129
2026-02-19 17:36:34,476 - lg_cotrain - INFO - === Phase 1: Weight Generation ===
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1124.00it/s, Materializing param=bert.pooler.dense.weight]
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1104.08it/s, Materializing param=bert.pooler.dense.weight]
2026-02-19 17:36:58,662 - lg_cotrain - INFO - Phase 1 epoch 1/7: mean_prob1=0.1219, mean_prob2=0.1217
2026-02-19 17:37:21,

[1/1] budget=50, seed=1 -- done (macro_f1=0.6391)
  [PROGRESS] 7/60 (11.7%)  |  Elapsed: 1.46h  |  ETA: 11.04h  |  done

Batch complete: 1 ran, 0 skipped, 0 failed (1129.0s total)
[1/1] budget=50, seed=1 -- starting...


2026-02-19 17:55:23,368 - lg_cotrain - INFO - Starting LG-CoTrain: event=hurricane_maria_2017, budget=50, seed_set=1
2026-02-19 17:55:23,397 - lg_cotrain - INFO - Detected 9 classes for event hurricane_maria_2017: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
2026-02-19 17:55:23,406 - lg_cotrain - INFO - D_l1: 225, D_l2: 225, D_LG: 4644
2026-02-19 17:55:23,408 - lg_cotrain - INFO - === Phase 1: Weight Generation ===
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1215.67it/s, Materializing param=bert.pooler.dense.weight]
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1132.21it/s, Materializing param=bert.pooler.dense.weight]
2026-02-19 17:55:42,753 - lg_cotrain - INFO - Phase 1 epoch 1/7: mean_prob1=0.1183, mean_prob2=0.1190
2026-02-19 17:56:0

[1/1] budget=50, seed=1 -- done (macro_f1=0.6801)
  [PROGRESS] 8/60 (13.3%)  |  Elapsed: 1.69h  |  ETA: 10.96h  |  done

Batch complete: 1 ran, 0 skipped, 0 failed (823.5s total)
[1/1] budget=50, seed=1 -- starting...


2026-02-19 18:09:06,970 - lg_cotrain - INFO - Starting LG-CoTrain: event=kaikoura_earthquake_2016, budget=50, seed_set=1
2026-02-19 18:09:06,981 - lg_cotrain - INFO - Detected 9 classes for event kaikoura_earthquake_2016: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
2026-02-19 18:09:06,986 - lg_cotrain - INFO - D_l1: 209, D_l2: 208, D_LG: 1119
2026-02-19 18:09:06,987 - lg_cotrain - INFO - === Phase 1: Weight Generation ===
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1117.98it/s, Materializing param=bert.pooler.dense.weight]
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1145.92it/s, Materializing param=bert.pooler.dense.weight]
2026-02-19 18:09:14,180 - lg_cotrain - INFO - Phase 1 epoch 1/7: mean_prob1=0.1232, mean_prob2=0.1112
2026-02-19

[1/1] budget=50, seed=1 -- done (macro_f1=0.7705)
  [PROGRESS] 9/60 (15.0%)  |  Elapsed: 1.76h  |  ETA: 9.95h  |  done

Batch complete: 1 ran, 0 skipped, 0 failed (251.1s total)
[1/1] budget=50, seed=1 -- starting...


2026-02-19 18:13:18,091 - lg_cotrain - INFO - Starting LG-CoTrain: event=kerala_floods_2018, budget=50, seed_set=1
2026-02-19 18:13:18,129 - lg_cotrain - INFO - Detected 9 classes for event kerala_floods_2018: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
2026-02-19 18:13:18,141 - lg_cotrain - INFO - D_l1: 220, D_l2: 219, D_LG: 5149
2026-02-19 18:13:18,144 - lg_cotrain - INFO - === Phase 1: Weight Generation ===
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1114.11it/s, Materializing param=bert.pooler.dense.weight]
Loading weights: 100%|███████████████| 199/199 [00:00<00:00, 1056.03it/s, Materializing param=bert.pooler.dense.weight]
2026-02-19 18:13:40,911 - lg_cotrain - INFO - Phase 1 epoch 1/7: mean_prob1=0.1190, mean_prob2=0.1323
2026-02-19 18:14:01,66

KeyboardInterrupt: 

In [None]:
# Load any results that already existed (re-run safe)

# Allow this cell to run on a fresh kernel without first executing the
# (potentially hours-long) training cell: start from an empty in-memory store
# and fill it purely from the metrics.json files on disk.
try:
    all_strategy_results
except NameError:
    all_strategy_results = {}  # strategy -> event -> list[result_dict]

# The quick comparison is defined for a single budget/seed pair. Take it from
# the configuration cell rather than hard-coding 50 / 1, so changing the
# config does not silently empty the lookup below.
QUICK_BUDGET = RUN_BUDGETS[0]
QUICK_SEED = RUN_SEEDS[0]

for strategy in STRATEGIES:
    results_root = Path(STRATEGY_RESULTS_ROOTS[strategy])
    per_event = all_strategy_results.setdefault(strategy, {})

    for event in TARGET_EVENTS:
        # In-memory results from the training cell take precedence over disk.
        if event in per_event:
            continue
        results = []
        for budget in RUN_BUDGETS:
            for seed_set in RUN_SEEDS:
                path = results_root / event / f"{budget}_set{seed_set}" / "metrics.json"
                if path.exists():
                    # Explicit encoding: do not depend on the platform default.
                    with open(path, encoding="utf-8") as f:
                        results.append(json.load(f))
        if results:
            per_event[event] = results

# Build flat lookup: lookup[strategy][event] -> result dict (or None)
# Each (strategy, event) has at most one result for the quick budget/seed pair.
lookup = {}
for strategy in STRATEGIES:
    lookup[strategy] = {}
    for event in TARGET_EVENTS:
        results = all_strategy_results.get(strategy, {}).get(event, [])
        # Falsy entries are skipped; matching assumes each result dict carries
        # "budget" and "seed_set" keys — TODO confirm against metrics.json.
        match = next(
            (
                r for r in results
                if r and r.get("budget") == QUICK_BUDGET and r.get("seed_set") == QUICK_SEED
            ),
            None,
        )
        lookup[strategy][event] = match

# Availability matrix. Every cell is formatted with the same width as the
# header cells so the columns stay aligned.
name_w = 26
cell_w = 13
print(f"Results available (budget={QUICK_BUDGET}, seed={QUICK_SEED}):")
print(f"{'Strategy':<{name_w}}" + "".join(f" {e[:12]:<{cell_w}}" for e in TARGET_EVENTS))
print("-" * (name_w + (cell_w + 1) * len(TARGET_EVENTS)))
for strategy in STRATEGIES:
    row = f"{strategy:<{name_w}}"
    for event in TARGET_EVENTS:
        mark = "OK" if lookup[strategy][event] else "--"
        row += f" {mark:<{cell_w}}"
    print(row)

In [None]:
# Summary table: strategies (rows) × events (columns), value = macro-F1
# Plus a delta-from-baseline table.

# Abbreviate each event to the first four letters of every word,
# e.g. "hurricane_irma_2017" -> "hurr irma 2017".
short_labels = [" ".join(w[:4] for w in e.split("_")) for e in TARGET_EVENTS]

# Columns must be wide enough for the longest header label (and never narrower
# than 10), and every value cell is padded to the same width as its header so
# the table stays aligned.
name_w = 26
col_w = max([10] + [len(sl) for sl in short_labels])
header_cells = "".join(f" {sl:<{col_w}}" for sl in short_labels)
rule_w = name_w + (col_w + 1) * len(TARGET_EVENTS)
na_cell = f" {'N/A':<{col_w}}"


def _macro_f1(strategy, event):
    """Return the test macro-F1 stored for (strategy, event), or None if there is no result."""
    r = lookup[strategy][event]
    return r["test_macro_f1"] if r else None


print("Macro-F1  (budget=50, seed=1)")
print(f"{'Strategy':<{name_w}}" + header_cells + "  Mean")
print("-" * (rule_w + 6))

baseline_row = {}  # event -> baseline macro-F1 (or None)
for strategy in STRATEGIES:
    vals = [_macro_f1(strategy, event) for event in TARGET_EVENTS]
    row = f"{strategy:<{name_w}}"
    for f1 in vals:
        row += f" {f1:<{col_w}.4f}" if f1 is not None else na_cell
    # The mean is taken over the events that actually have a result.
    valid = [v for v in vals if v is not None]
    row += f"  {sum(valid) / len(valid):.4f}" if valid else "  N/A"
    print(row)
    if strategy == "baseline":
        baseline_row = dict(zip(TARGET_EVENTS, vals))

print()
print("Delta vs baseline  (+) = better:")
print(f"{'Strategy':<{name_w}}" + header_cells + "  Mean Δ")
print("-" * (rule_w + 8))

for strategy in STRATEGIES:
    if strategy == "baseline":
        continue
    row = f"{strategy:<{name_w}}"
    deltas = []
    for event in TARGET_EVENTS:
        f1 = _macro_f1(strategy, event)
        base = baseline_row.get(event)
        if f1 is not None and base is not None:
            d = f1 - base
            deltas.append(d)
            # "+" in the format spec prints an explicit sign for both directions.
            row += f" {d:<+{col_w}.4f}"
        else:
            row += na_cell
    if deltas:
        mean_delta = sum(deltas) / len(deltas)
        row += f"  {mean_delta:+.4f}"
    else:
        row += "  N/A"
    print(row)

In [None]:
# Grouped bar chart: one group per event, one bar per strategy
# All at budget=50, seed=1

n_events = len(TARGET_EVENTS)
n_strategies = len(STRATEGIES)
# Each group of bars occupies 0.8 of the slot between two x ticks.
bar_width = 0.8 / n_strategies
colors = plt.cm.tab10(np.linspace(0, 1, n_strategies))
x = np.arange(n_events)

fig, ax = plt.subplots(figsize=(max(14, n_events * 1.4), 5))

for i, strategy in enumerate(STRATEGIES):
    # Events without a result are drawn as zero-height bars.
    f1s = []
    for event in TARGET_EVENTS:
        result = lookup[strategy][event]
        f1s.append(result["test_macro_f1"] if result else 0)
    # Centre the group of bars on the event's tick position.
    offset = (i - n_strategies / 2 + 0.5) * bar_width
    ax.bar(
        x + offset,
        f1s,
        bar_width * 0.9,
        label=strategy,
        color=colors[i],
        alpha=0.85,
    )

tick_labels = [e.replace("_", "\n") for e in TARGET_EVENTS]
ax.set_xticks(x)
ax.set_xticklabels(tick_labels, fontsize=8)
ax.set_ylabel("Test Macro-F1")
ax.set_ylim(0, 1)
ax.set_title(
    f"Stopping Strategy Comparison — Budget=50, Seed=1\n(pseudo-labels: {PSEUDO_LABEL_SOURCE})",
    fontsize=11,
)
ax.legend(loc="upper right", fontsize=8, framealpha=0.8)
ax.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Per-class F1 heatmap for each event
# Rows = strategies, Columns = classes, Value = F1 at budget=50, seed=1

from lg_cotrain.data_loading import CLASS_LABELS

for event in TARGET_EVENTS:
    # Only strategies that have a result with per-class scores get a row.
    strategies_with_data = [
        s for s in STRATEGIES
        if lookup[s][event] and "test_per_class_f1" in lookup[s][event]
    ]
    if not strategies_with_data:
        print(f"No per-class data for {event}, skipping.")
        continue

    # Assumes "test_per_class_f1" is a flat sequence of scores of equal length
    # for every strategy of the same event — TODO confirm against metrics.json.
    data = np.array([
        lookup[s][event]["test_per_class_f1"]
        for s in strategies_with_data
    ])  # shape: (n_strategies, n_classes)

    # Size the grid from the data itself, not from the global label list: the
    # number of classes can differ between events, and indexing by
    # len(CLASS_LABELS) would then run past the last column.
    n_classes = data.shape[1]
    if n_classes == len(CLASS_LABELS):
        class_names = list(CLASS_LABELS)
    else:
        # Which labels the columns correspond to cannot be determined here, so
        # fall back to positional names instead of showing misleading labels.
        class_names = [f"class {j}" for j in range(n_classes)]
        print(
            f"Note: {event} has {n_classes} per-class scores but CLASS_LABELS has "
            f"{len(CLASS_LABELS)} entries; using positional column labels."
        )

    fig, ax = plt.subplots(
        figsize=(max(9, n_classes * 0.75), len(strategies_with_data) * 0.65 + 1.8)
    )
    im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=0, vmax=1)

    ax.set_xticks(range(n_classes))
    ax.set_xticklabels(class_names, rotation=45, ha="right", fontsize=8)
    ax.set_yticks(range(len(strategies_with_data)))
    ax.set_yticklabels(strategies_with_data, fontsize=9)
    ax.set_title(
        f"{event}  |  Budget=50, Seed=1  |  Per-class F1 by strategy",
        fontsize=10,
    )

    # Annotate every cell; use black text on mid-range values and white text
    # on values near either end of the 0..1 scale.
    for i in range(len(strategies_with_data)):
        for j in range(n_classes):
            val = data[i, j]
            color = "black" if 0.25 < val < 0.75 else "white"
            ax.text(j, i, f"{val:.2f}", ha="center", va="center", fontsize=7, color=color)

    fig.colorbar(im, ax=ax, label="F1 Score")
    plt.tight_layout()
    plt.show()
    # One figure per event: release it so figures do not accumulate in memory.
    plt.close(fig)

In [None]:
# Rebuild multi-tab dashboard so all quick-run result sets appear as tabs.

from lg_cotrain.dashboard import discover_result_sets, generate_html_multi

TOP_RESULTS_ROOT = str(repo_root / "results")

# The loop below relies on this being a mapping of result-set name -> path.
result_sets = discover_result_sets(TOP_RESULTS_ROOT)
print(f"Discovered {len(result_sets)} result set(s):")
for name, path in result_sets.items():
    print(f"  {name:<35} → {path}")

html = generate_html_multi(result_sets, data_root=DATA_ROOT)
dashboard_path = Path(TOP_RESULTS_ROOT) / "dashboard.html"
# Write UTF-8 explicitly. Without an encoding, write_text() uses the locale's
# preferred encoding (e.g. cp1252 on Windows), which can raise
# UnicodeEncodeError or garble any non-ASCII characters in the generated page.
dashboard_path.write_text(html, encoding="utf-8")
print(f"\nDashboard written to: {dashboard_path}")
print("Open in a browser to compare strategies side-by-side.")