# Hyperparameter Tuning Phase 2: EuroBERT-210M
## Eingeengter Suchbereich basierend auf Phase 1

Verfeinerte Hyperparameter-Suche basierend auf den Top-5-Trials aus Phase 1.

**Phase 1 Ergebnisse (Best F1 Macro CV Mean: 0.8393):**
- Learning Rate: 3.7e-5 bis 5e-5 (oberes Ende performt besser)
- Scheduler: `linear` dominiert (4/5 Top Trials)
- Epochs: 13-15 (mehr Training = besser)
- Dropout: 0.26-0.40 (mittlerer Bereich)
- Batch Size: 4 dominiert (4/5 Top Trials)

**Phase 2 Strategie:**
- Eingeengte Suchbereiche um Top-5-Region
- `linear` Scheduler und Batch Size 4 fixiert
- NaN-Detection fuer sofortiges Pruning instabiler Trials
- 20 Trials fuer feinere Suche im engen Raum

**Voraussetzung:** GPU-Runtime (L4 empfohlen), `HF_TOKEN` in Colab Secrets hinterlegt.

In [None]:
# === SETUP ===
import os, sys

# Clone the thesis repository on first run, otherwise pull the latest changes.
REPO = "/content/news_articles_classification_thesis"
if not os.path.exists(REPO):
    !git clone https://github.com/ZorbeyOezcan/news_articles_classification_thesis.git {REPO}
else:
    !cd {REPO} && git pull -q

# Dependencies (optuna, plotly and kaleido are the additions needed for HPT).
!pip install -q transformers[sentencepiece] datasets huggingface_hub \
    scikit-learn matplotlib seaborn tqdm pandas accelerate evaluate \
    optuna plotly kaleido

# Mount Google Drive (persistent storage for reports and the Optuna DB).
from google.colab import drive
drive.mount("/content/drive", force_remount=False)

# Make pipeline_utils importable by putting its directory first on sys.path.
PIPELINE_DIR = f"{REPO}/Python/classification_pipeline"
if PIPELINE_DIR not in sys.path:
    sys.path.insert(0, PIPELINE_DIR)

# Reload after import so a freshly pulled pipeline_utils replaces a copy that
# is already loaded in this kernel.
import importlib
import pipeline_utils as pu
importlib.reload(pu)

# Hugging Face login with the token stored under "HF_TOKEN" in Colab's userdata.
from huggingface_hub import login
from google.colab import userdata
login(token=userdata.get("HF_TOKEN"))

# Auto-shutdown watchdog: upper bound for the runtime's lifetime, in hours.
import threading, time as _time
MAX_RUNTIME_HOURS = 6  # Phase 2 may take somewhat longer (20 trials)

def _auto_shutdown():
    """Release the Colab runtime after ``MAX_RUNTIME_HOURS`` as a safety net.

    Sleeps for the configured number of hours, prints a notice and then asks
    Colab to unassign the runtime. The shutdown is best effort: a failure is
    reported on stdout and never raised, because this function runs in a
    background thread where an exception would go unnoticed.
    """
    _time.sleep(MAX_RUNTIME_HOURS * 3600)
    print(f"\n[WATCHDOG] Max Runtime ({MAX_RUNTIME_HOURS}h) erreicht. Runtime wird beendet.")
    try:
        # Imported lazily so the function can be defined outside of Colab.
        from google.colab import runtime
        runtime.unassign()
    except Exception as exc:
        # Make the failure visible: a silently failed shutdown leaves the
        # (billed) runtime running without any hint in the output.
        print(f"[WATCHDOG] Runtime konnte nicht beendet werden: {exc!r}")
# Run the watchdog as a daemon thread so it never keeps the interpreter alive.
_watchdog_thread = threading.Thread(target=_auto_shutdown, daemon=True)
_watchdog_thread.start()

# Status lines confirming where reports go and when the watchdog fires.
for _status_line in (
    f"Reports-Ordner: {pu.REPORTS_DIR}",
    f"Setup abgeschlossen. Watchdog: Runtime wird nach max {MAX_RUNTIME_HOURS}h beendet.",
):
    print(_status_line)

In [None]:
# ===== HPT PHASE 2 CONFIGURATION =====
import torch
import numpy as np

# Hub id of the model that is fine-tuned, plus a short name used in report file names.
MODEL_ID = "EuroBERT/EuroBERT-210m"
MODEL_SHORT_NAME = "eurobert_210m"
# Token limit passed to the tokenizer; longer texts are truncated.
MAX_LENGTH = 2048
# Seed for the NumPy split shuffle, the stratified folds and (offset per fold) the Trainer.
RANDOM_SEED = 42

# ----- Cross-Validation -----
N_FOLDS = 3

# ----- Optuna -----
N_TRIALS = 20  # More trials within the narrowed search space
STUDY_NAME = "eurobert_210m_hpt_phase2"
OPTUNA_SEED = 42
# Passed as ``timeout`` to study.optimize(); None means no time limit.
OPTUNA_TIMEOUT = None

# ----- Fixed Training Parameters -----
# These values are forwarded unchanged to TrainingArguments / the early-stopping
# callback for every trial and fold.
FIXED_GRADIENT_CHECKPOINTING = False
FIXED_LOGGING_STEPS = 10
FIXED_REPORT_TO = "tensorboard"
FIXED_DATALOADER_NUM_WORKERS = 4
FIXED_EARLY_STOPPING_PATIENCE = 3
FIXED_GROUP_BY_LENGTH = True

# Mixed precision and evaluation batch size are derived from the detected GPU.
# Without a GPU the notebook stops right here.
if not torch.cuda.is_available():
    raise RuntimeError("HPT benoetigt eine GPU! Bitte Colab Runtime aendern.")

_gpu_cap = torch.cuda.get_device_capability()
_gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9

# Compute capability major version >= 8 selects bf16, anything older fp16.
FIXED_BF16 = _gpu_cap[0] >= 8
FIXED_FP16 = not FIXED_BF16
FIXED_OPTIM = "adamw_torch_fused"

# Larger cards get a larger evaluation batch (thresholds in GB of total memory).
FIXED_BATCH_SIZE_EVAL = 32 if _gpu_mem >= 40 else (16 if _gpu_mem >= 20 else 8)

print(f"GPU: {torch.cuda.get_device_name(0)} ({_gpu_mem:.1f} GB, CC {_gpu_cap[0]}.{_gpu_cap[1]})")
print(f"  FP16={FIXED_FP16}, BF16={FIXED_BF16}, Eval Batch={FIXED_BATCH_SIZE_EVAL}")
print(f"  Gradient Checkpointing: {FIXED_GRADIENT_CHECKPOINTING}")

# ----- Phase 2: narrowed search ranges (based on the Phase 1 top 5) -----
# Phase 1 top-5 ranges:
#   learning_rate:      3.7e-5 .. 5.0e-5
#   weight_decay:       0.029  .. 0.071
#   warmup_ratio:       0.083  .. 0.145
#   label_smoothing:    0.023  .. 0.067
#   classifier_dropout: 0.264  .. 0.395
#   epochs:             13 .. 15
#   scheduler:          linear (4/5)
#   batch_size:         4 (4/5)
# Each entry is a (low, high) pair that is unpacked into the matching
# trial.suggest_* call of the objective.
HP_RANGES = {
    "learning_rate": (2.5e-5, 5.5e-5),              # slightly widened (sampled on a log scale)
    "weight_decay": (0.02, 0.08),
    "warmup_ratio": (0.06, 0.16),
    "label_smoothing_factor": (0.01, 0.08),
    "classifier_dropout": (0.20, 0.45),
    "num_train_epochs": (12, 16),                    # integer range
}

# Fixed based on the Phase 1 findings:
FIXED_LR_SCHEDULER_TYPE = "linear"       # 4 of the top 5 trials
FIXED_BATCH_SIZE_TRAIN = 4               # 4 of the top 5 trials
EFFECTIVE_BATCH_SIZE = 16
# Gradient accumulation makes up the difference between per-device and effective batch size.
FIXED_GRAD_ACCUM = EFFECTIVE_BATCH_SIZE // FIXED_BATCH_SIZE_TRAIN  # = 4

# ----- Labels -----
# The list order defines the integer class ids used for training.
ALL_LABELS = [
    "Klima / Energie", "Zuwanderung", "Renten", "Soziales Gef\u00e4lle",
    "AfD/Rechte", "Arbeitslosigkeit", "Wirtschaftslage", "Politikverdruss",
    "Gesundheitswesen, Pflege", "Kosten/L\u00f6hne/Preise",
    "Ukraine/Krieg/Russland", "Bundeswehr/Verteidigung", "Andere",
]

# Split configuration: number of articles per class reserved for the frozen test set.
TEST_PER_CLASS = 30

# Echo the effective configuration so it is captured in the cell output.
_config_summary = (
    f"\nOptuna HPT Phase 2: {N_TRIALS} Trials, {N_FOLDS}-Fold CV",
    f"Modell: {MODEL_ID}",
    f"Max Length: {MAX_LENGTH}",
    f"Fixiert: scheduler={FIXED_LR_SCHEDULER_TYPE}, batch_size={FIXED_BATCH_SIZE_TRAIN}, grad_accum={FIXED_GRAD_ACCUM}",
    f"Effektive Batch Size: {EFFECTIVE_BATCH_SIZE}",
    f"Labels: {len(ALL_LABELS)} Klassen",
    "\nEingeengte Suchbereiche:",
)
for _summary_line in _config_summary:
    print(_summary_line)

# One line per tunable hyperparameter with its (low, high) bounds.
for _hp_name, _hp_bounds in HP_RANGES.items():
    print(f"  {_hp_name}: {_hp_bounds}")

In [None]:
# ===== DATEN LADEN & CUSTOM SPLIT =====
import pandas as pd
from datasets import load_dataset

# Seed NumPy's global RNG so the per-class shuffles below are reproducible.
np.random.seed(RANDOM_SEED)

# Merge the published train and test splits into one pool of labelled articles;
# the notebook draws its own test split from that pool.
ds = load_dataset(pu.DATASET_ID)
train_hf = ds["train"].to_pandas()
test_hf = ds["test"].to_pandas()
all_labelled = pd.concat([train_hf, test_hf], ignore_index=True)

print(f"Gesamtpool gelabelter Artikel: {len(all_labelled)}")
print(f"Klassen im Datensatz: {all_labelled['label'].nunique()}")
print()

# Rows whose label is not listed in ALL_LABELS are never picked up by the loop
# below. Report them instead of dropping them silently.
unknown_mask = ~all_labelled["label"].isin(ALL_LABELS)
if unknown_mask.any():
    print(f"WARNUNG: {int(unknown_mask.sum())} Artikel mit Labels ausserhalb von ALL_LABELS werden ignoriert:")
    unknown_counts = all_labelled.loc[unknown_mask, "label"].value_counts(dropna=False)
    for unknown_label, unknown_count in unknown_counts.items():
        print(f"  {unknown_label}: {unknown_count}")
    print()

# --- Test split (identical to Phase 1) ---
# A class needs at least twice TEST_PER_CLASS articles to give up the full test
# quota; smaller classes contribute half of their articles instead.
min_articles_for_full_quota = 2 * TEST_PER_CLASS

test_indices = []
rest_indices = []

for label in ALL_LABELS:
    label_mask = all_labelled["label"] == label
    label_indices = all_labelled[label_mask].index.tolist()
    n_total = len(label_indices)

    if n_total < min_articles_for_full_quota:
        n_test = n_total // 2
        print(f"  {label}: nur {n_total} Artikel -> {n_test} fuer Test (Haelfte)")
    else:
        n_test = TEST_PER_CLASS

    # In-place shuffle driven by the seeded global RNG; the call order per class
    # must stay as is to reproduce the same split.
    np.random.shuffle(label_indices)
    test_indices.extend(label_indices[:n_test])
    rest_indices.extend(label_indices[n_test:])

test_df = all_labelled.loc[test_indices].reset_index(drop=True)
cv_pool_df = all_labelled.loc[rest_indices].reset_index(drop=True)

print(f"\nTest (eingefroren):  {len(test_df)} Artikel")
print(f"CV-Pool (fuer Folds): {len(cv_pool_df)} Artikel")

# Class distribution of the pool that feeds the CV folds.
print("\nCV-Pool Klassenverteilung:")
cv_dist = cv_pool_df["label"].value_counts().sort_index()
for label, count in cv_dist.items():
    print(f"  {label}: {count}")
print(f"  TOTAL: {len(cv_pool_df)}")

In [None]:
# ===== LABEL ENCODING =====
# Class names map to consecutive integer ids in ALL_LABELS order.
id2label = dict(enumerate(ALL_LABELS))
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

# Attach the integer id to both frames first ...
for _frame in (cv_pool_df, test_df):
    _frame["label_id"] = _frame["label"].map(label2id)

# ... then verify that every label was found in the mapping.
assert not cv_pool_df["label_id"].isna().any(), "Unbekannte Labels im CV-Pool!"
assert not test_df["label_id"].isna().any(), "Unbekannte Labels im Test-Set!"

print("Label-Mapping:")
for class_name, class_id in label2id.items():
    print(f"  {class_id:>2}: {class_name}")
print(f"\nAnzahl Klassen: {len(ALL_LABELS)}")

In [None]:
# ===== TOKENIZER + EUROBERT ROPE FIX =====
from transformers import AutoTokenizer
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS

# Load the tokenizer that belongs to MODEL_ID. trust_remote_code=True permits
# running custom code shipped with the Hub repository.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

def _default_rope_init(config, device=None, **kwargs):
    """Compute inverse frequencies for plain (unscaled) rotary position embeddings.

    Args:
        config: Model configuration. Reads ``rope_theta`` (default 10000.0),
            ``partial_rotary_factor`` (default 1.0) and ``head_dim``. When
            ``head_dim`` is missing or ``None`` it is derived as
            ``hidden_size // num_attention_heads``.
        device: Device for the returned tensor; ``None`` leaves it on the
            default device.
        **kwargs: Accepted and ignored.

    Returns:
        A tuple ``(inv_freq, 1.0)``: ``inv_freq`` is a float tensor with one
        entry per even index in ``range(0, dim, 2)``, and the second element is
        the constant factor 1.0.
    """
    base = getattr(config, "rope_theta", 10000.0)
    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)

    # Resolve the fallback lazily: hidden_size / num_attention_heads are only
    # touched when head_dim is absent, and an explicit head_dim=None is treated
    # like a missing attribute instead of crashing in the multiplication below.
    head_dim = getattr(config, "head_dim", None)
    if head_dim is None:
        head_dim = config.hidden_size // config.num_attention_heads

    dim = int(head_dim * partial_rotary_factor)
    exponents = torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim
    inv_freq = 1.0 / (base ** exponents)
    return inv_freq, 1.0

# Store the function under the "default" key of transformers' RoPE init table,
# overwriting whatever was registered under that key before.
ROPE_INIT_FUNCTIONS["default"] = _default_rope_init
print("ROPE_INIT_FUNCTIONS gepatcht.")

def tokenize_fn(examples):
    """Tokenize the ``"text"`` entries of a batch, truncated to ``MAX_LENGTH``.

    No padding is requested here.
    """
    encode_options = {"max_length": MAX_LENGTH, "truncation": True}
    return tokenizer(examples["text"], **encode_options)


print(f"Tokenizer geladen: {MODEL_ID}")

In [None]:
# ===== OPTUNA OBJECTIVE MIT K-FOLD CV =====
import gc
import shutil
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
)

# Collator handed to every Trainer; built once and shared across trials.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Precompute the fold indices once so every trial is scored on identical splits.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)
fold_indices = list(skf.split(cv_pool_df, cv_pool_df["label_id"]))

print(f"Stratified {N_FOLDS}-Fold CV:")
for fold_no, (train_rows, val_rows) in enumerate(fold_indices, start=1):
    # Show the rarest validation class per fold as a sanity check.
    val_class_counts = cv_pool_df.iloc[val_rows]["label"].value_counts()
    rarest_class = val_class_counts.idxmin()
    rarest_count = val_class_counts.min()
    print(
        f"  Fold {fold_no}: Train={len(train_rows)}, Val={len(val_rows)}, "
        f"min Val-Klasse: {rarest_class} ({rarest_count} Samples)"
    )


def compute_metrics_simple(eval_pred):
    """Return only the macro-averaged F1 score to keep HPT evaluation lean.

    Args:
        eval_pred: Pair ``(logits, labels)``; predictions are the argmax of
            the logits over the last axis.

    Returns:
        Dict with the single key ``"f1_macro"``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    macro_f1 = f1_score(labels, predicted_ids, average="macro", zero_division=0)
    return {"f1_macro": macro_f1}


def create_model(classifier_dropout):
    """Build a fresh sequence-classification model with the given dropout value.

    The configuration of ``MODEL_ID`` is loaded, the label setup and
    ``classifier_dropout`` are written onto it, and the model is instantiated
    from the pretrained weights with that configuration.

    NOTE(review): whether the remote-code classification head actually reads
    ``config.classifier_dropout`` is not visible here. Verify against the model
    implementation, otherwise this hyperparameter has no effect.

    Args:
        classifier_dropout: Value stored as ``config.classifier_dropout``.

    Returns:
        The instantiated model.
    """
    config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Applied in this order: the label count first, then the explicit mappings.
    config_overrides = (
        ("num_labels", len(ALL_LABELS)),
        ("id2label", id2label),
        ("label2id", label2id),
        ("classifier_dropout", classifier_dropout),
    )
    for attr_name, attr_value in config_overrides:
        setattr(config, attr_name, attr_value)

    return AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        config=config,
        ignore_mismatched_sizes=True,
        trust_remote_code=True,
    )


def _cleanup_cuda():
    """Free Python garbage, then hand cached CUDA memory back to the driver.

    The garbage collector runs first: tensors that are only kept alive by
    reference cycles are released by it, and only memory that is no longer
    held by any tensor can be returned by ``torch.cuda.empty_cache()``.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def objective(trial):
    """Optuna objective: mean macro F1 over the precomputed CV folds.

    Samples one hyperparameter set from ``HP_RANGES``, then trains and
    evaluates a fresh model on every entry of ``fold_indices``. After each fold
    the running mean is reported to the trial so the pruner can stop it early.

    Args:
        trial: The Optuna trial used for sampling, reporting and pruning.

    Returns:
        Mean of the per-fold ``eval_f1_macro`` values.

    Raises:
        optuna.TrialPruned: On a CUDA out-of-memory error, on a NaN evaluation
            loss or a fold F1 below 0.05, or when the pruner asks to stop.
    """

    # --- Sample hyperparameters (narrowed Phase 2 ranges) ---
    lr = trial.suggest_float("learning_rate", *HP_RANGES["learning_rate"], log=True)
    wd = trial.suggest_float("weight_decay", *HP_RANGES["weight_decay"])
    warmup = trial.suggest_float("warmup_ratio", *HP_RANGES["warmup_ratio"])
    label_smooth = trial.suggest_float("label_smoothing_factor", *HP_RANGES["label_smoothing_factor"])
    dropout = trial.suggest_float("classifier_dropout", *HP_RANGES["classifier_dropout"])
    epochs = trial.suggest_int("num_train_epochs", *HP_RANGES["num_train_epochs"])

    # Parameters fixed after Phase 1
    scheduler = FIXED_LR_SCHEDULER_TYPE
    batch_size = FIXED_BATCH_SIZE_TRAIN
    grad_accum = FIXED_GRAD_ACCUM

    print(f"\n{'='*60}")
    print(f"Trial {trial.number}: lr={lr:.2e}, wd={wd:.3f}, warmup={warmup:.3f}, "
          f"ls={label_smooth:.3f}, dropout={dropout:.3f}, epochs={epochs}")
    print(f"  (fixiert: sched={scheduler}, batch={batch_size}, grad_accum={grad_accum})")
    print(f"{'='*60}")

    fold_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(fold_indices):
        print(f"  Fold {fold_idx + 1}/{N_FOLDS}...", end=" ", flush=True)

        _cleanup_cuda()

        # --- Fold datasets ---
        fold_train_df = cv_pool_df.iloc[train_idx]
        fold_val_df = cv_pool_df.iloc[val_idx]

        train_ds = Dataset.from_pandas(
            fold_train_df[["text", "label_id"]].rename(columns={"label_id": "labels"})
        )
        val_ds = Dataset.from_pandas(
            fold_val_df[["text", "label_id"]].rename(columns={"label_id": "labels"})
        )

        train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
        val_ds = val_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
        train_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
        val_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

        fold_output_dir = f"/content/hpt_tmp/trial_{trial.number}_fold_{fold_idx}"
        fold_logging_dir = f"/content/hpt_tmp/tb_logs/trial_{trial.number:03d}_fold_{fold_idx}"

        training_args = TrainingArguments(
            output_dir=fold_output_dir,
            num_train_epochs=epochs,
            learning_rate=lr,
            weight_decay=wd,
            warmup_ratio=warmup,
            label_smoothing_factor=label_smooth,
            lr_scheduler_type=scheduler,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=FIXED_BATCH_SIZE_EVAL,
            gradient_accumulation_steps=grad_accum,
            bf16=FIXED_BF16,
            fp16=FIXED_FP16,
            gradient_checkpointing=FIXED_GRADIENT_CHECKPOINTING,
            optim=FIXED_OPTIM,
            group_by_length=FIXED_GROUP_BY_LENGTH,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="f1_macro",
            greater_is_better=True,
            logging_strategy="steps",
            logging_steps=FIXED_LOGGING_STEPS,
            logging_dir=fold_logging_dir,
            report_to=FIXED_REPORT_TO,
            # A different seed per fold, identical across trials.
            seed=RANDOM_SEED + fold_idx,
            dataloader_num_workers=FIXED_DATALOADER_NUM_WORKERS,
            dataloader_pin_memory=True,
            disable_tqdm=False,
        )

        # Bound before ``try`` so the ``finally`` clause can always reset them,
        # even when model creation itself fails.
        model = None
        trainer = None

        try:
            model = create_model(dropout)

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_ds,
                eval_dataset=val_ds,
                data_collator=data_collator,
                compute_metrics=compute_metrics_simple,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=FIXED_EARLY_STOPPING_PATIENCE)],
            )

            trainer.train()

            # load_best_model_at_end=True: this evaluates the best checkpoint.
            eval_result = trainer.evaluate()
            fold_f1 = eval_result["eval_f1_macro"]

            # NaN detection: prune immediately on a NaN loss or a collapsed model.
            if np.isnan(eval_result.get("eval_loss", 0)) or fold_f1 < 0.05:
                print(f"NaN/collapse detected (F1={fold_f1:.4f}) — pruning Trial")
                raise optuna.TrialPruned()

            fold_scores.append(fold_f1)
            print(f"F1 Macro = {fold_f1:.4f}")

        except torch.cuda.OutOfMemoryError:
            # Memory and checkpoint cleanup happen in ``finally`` below.
            print(f"\n  OOM in Trial {trial.number}, Fold {fold_idx + 1}! Ueberspringe Trial.")
            raise optuna.TrialPruned()

        finally:
            # Rebind to None instead of exec("del ..."): exec cannot unbind the
            # locals of the enclosing function, so the objects would stay
            # referenced and the cleanup below could not free their memory.
            trainer = None
            model = None
            training_args = None
            train_ds = None
            val_ds = None
            # Collect reference cycles first so the cache flush that follows
            # can actually release their GPU memory.
            gc.collect()
            _cleanup_cuda()
            if os.path.exists(fold_output_dir):
                shutil.rmtree(fold_output_dir, ignore_errors=True)

        # Pruning: report the running mean as the intermediate value.
        trial.report(np.mean(fold_scores), fold_idx)
        if trial.should_prune():
            print(f"  Trial {trial.number} PRUNED nach Fold {fold_idx + 1}")
            raise optuna.TrialPruned()

    mean_f1 = np.mean(fold_scores)
    std_f1 = np.std(fold_scores)
    print(f"\n  -> Trial {trial.number}: F1 Macro = {mean_f1:.4f} +/- {std_f1:.4f}")

    return mean_f1


print("Objective Function definiert.")
print(f"Folds: {N_FOLDS}, Trials: {N_TRIALS}")
print(f"Effektive Batch Size: {EFFECTIVE_BATCH_SIZE}")

In [None]:
# ===== OPTUNA STUDY STARTEN =====
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from pathlib import Path

# SQLite storage on Google Drive so an interrupted study can be resumed.
db_dir = Path(pu.REPORTS_DIR)
db_dir.mkdir(parents=True, exist_ok=True)
storage_path = db_dir / "hpt_eurobert_210m_phase2.db"
storage_url = f"sqlite:///{storage_path}"

print(f"Optuna DB: {storage_path}")
print("  -> Bei Crash: Runtime neu starten, 'Run All' -> Trials werden fortgesetzt")

sampler = TPESampler(seed=OPTUNA_SEED, n_startup_trials=5)
pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=1)

# load_if_exists=True re-opens a study of the same name from the DB.
_study_options = {
    "study_name": STUDY_NAME,
    "direction": "maximize",
    "sampler": sampler,
    "pruner": pruner,
    "storage": storage_url,
    "load_if_exists": True,
}
study = optuna.create_study(**_study_options)

# Only run the trials that are still missing (fix for a Phase 1 bug).
# Only COMPLETE trials count towards N_TRIALS here, so trials that were pruned
# in an earlier session are made up for on resume.
n_existing = sum(1 for t in study.trials if t.state.name == "COMPLETE")
n_remaining = max(0, N_TRIALS - n_existing)

if n_existing:
    print(f"\n{n_existing} bereits abgeschlossene Trials gefunden.")
    print(f"Noch {n_remaining} Trials ausstehend.")

if n_remaining > 0:
    timer_hpt = pu.ExperimentTimer()
    with timer_hpt:
        study.optimize(
            objective,
            n_trials=n_remaining,
            timeout=OPTUNA_TIMEOUT,
            gc_after_trial=True,
            show_progress_bar=True,
        )
    print(f"\nOptuna HPT Phase 2 abgeschlossen: {timer_hpt.duration_formatted}")
else:
    print("Alle Trials bereits abgeschlossen — ueberspringe optimize().")

# Summary of the best trial found so far.
_separator = "=" * 60
print(f"\n{_separator}")
print(f"Beste Trial: {study.best_trial.number}")
print(f"Bester F1 Macro (CV Mean): {study.best_value:.4f}")
print("\nBeste Hyperparameter:")
for param_name, param_value in study.best_params.items():
    print(f"  {param_name}: {param_value}")
print(_separator)

In [None]:
# ===== ERGEBNISSE ANALYSIEREN =====

# All trials as a frame, best objective value first.
trials_df = (
    study.trials_dataframe(attrs=("number", "value", "params", "state", "duration"))
    .sort_values("value", ascending=False)
)

is_complete = trials_df["state"] == "COMPLETE"
is_pruned = trials_df["state"] == "PRUNED"
completed = trials_df.loc[is_complete].copy()
pruned = trials_df.loc[is_pruned].copy()

print(f"Trials: {len(completed)} abgeschlossen, {len(pruned)} gepruned")
print("\nTop 5 Trials:")
param_cols = [col for col in completed.columns if col.startswith("params_")]
top5_cols = ["number", "value", *param_cols]
print(completed[top5_cols].head().to_string(index=False))

# Spread of the objective over all completed trials.
completed_f1 = completed["value"]
print("\nStatistiken ueber alle abgeschlossenen Trials:")
print(f"  Mean F1:   {completed_f1.mean():.4f}")
print(f"  Std F1:    {completed_f1.std():.4f}")
print(f"  Min F1:    {completed_f1.min():.4f}")
print(f"  Max F1:    {completed_f1.max():.4f}")

# Comparison against the best value of Phase 1.
phase2_best_f1 = study.best_value
print("\n--- Vergleich mit Phase 1 ---")
print("  Phase 1 Best: 0.8393")
print(f"  Phase 2 Best: {phase2_best_f1:.4f}")
print(f"  Differenz:    {phase2_best_f1 - 0.8393:+.4f}")

In [None]:
# ===== OPTUNA VISUALISIERUNGEN =====
import optuna.visualization as vis

def _show_titled(figure, title):
    """Set *title* on a Plotly figure and display it."""
    figure.update_layout(title=title)
    figure.show()


# 1. Optimization history
fig_history = vis.plot_optimization_history(study)
_show_titled(fig_history, "Phase 2: Optimization History")

# 2. Parameter importances (any failure is reported instead of stopping the cell)
try:
    fig_importance = vis.plot_param_importances(study)
    _show_titled(fig_importance, "Phase 2: Parameter Importance (fANOVA)")
except Exception as err:
    print(f"Parameter Importance nicht verfuegbar: {err}")

# 3. Parallel coordinate plot
fig_parallel = vis.plot_parallel_coordinate(study)
_show_titled(fig_parallel, "Phase 2: Parallel Coordinate")

# 4. Slice plot (keeps its default title)
fig_slice = vis.plot_slice(study)
fig_slice.show()

# 5. Contour: learning rate vs. dropout (failure is reported, not raised)
try:
    fig_contour = vis.plot_contour(study, params=["learning_rate", "classifier_dropout"])
    _show_titled(fig_contour, "Phase 2: LR vs. Dropout")
except Exception as err:
    print(f"Contour Plot nicht verfuegbar: {err}")

print("Optuna-Visualisierungen erstellt.")

In [None]:
# ===== SUMMARY-VISUALISIERUNG (matplotlib) =====
import matplotlib.pyplot as plt
from pathlib import Path

import math


def _position_in_range(value, lo, hi, log_scale=False):
    """Map *value* from ``[lo, hi]`` to ``[0, 1]``; a zero-width range yields 0.5.

    With ``log_scale=True`` the position is computed on logarithms, which
    matches how the learning rate is sampled.
    """
    # Checked first so a degenerate range never divides by zero, including
    # for the log-scaled parameter.
    if hi == lo:
        return 0.5
    if log_scale:
        value, lo, hi = math.log(value), math.log(lo), math.log(hi)
    return (value - lo) / (hi - lo)


fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Trial F1 scores (sorted ascending, best trial highlighted)
completed_sorted = completed.sort_values("value", ascending=True).reset_index(drop=True)
ax = axes[0, 0]
# After reset_index the label returned by idxmax() is also the bar position.
best_idx = completed_sorted["value"].idxmax()
colors = ["#4CAF50" if i == best_idx else "#2196F3" for i in range(len(completed_sorted))]
ax.barh(range(len(completed_sorted)), completed_sorted["value"], color=colors)
ax.set_xlabel("F1 Macro (CV Mean)")
ax.set_ylabel("Trial (sortiert)")
ax.set_title("Alle Trials sortiert nach F1 Macro")
ax.axvline(x=study.best_value, color="red", linestyle="--", alpha=0.7,
           label=f"Best: {study.best_value:.4f}")
# Phase 1 reference line
ax.axvline(x=0.8393, color="orange", linestyle=":", alpha=0.7,
           label="Phase 1 Best: 0.8393")
ax.legend()
ax.grid(axis="x", alpha=0.3)

# 2. Learning rate vs. F1
ax = axes[0, 1]
scatter = ax.scatter(
    completed["params_learning_rate"], completed["value"],
    c=completed["value"], cmap="viridis", s=80, edgecolors="black", linewidth=0.5,
)
ax.set_xlabel("Learning Rate")
ax.set_ylabel("F1 Macro")
ax.set_title("Learning Rate vs. F1 Macro")
ax.set_xscale("log")
plt.colorbar(scatter, ax=ax, label="F1")
ax.grid(alpha=0.3)

# 3. Best hyperparameters, shown as their position inside the search range
ax = axes[1, 0]
continuous_params = ["learning_rate", "weight_decay", "warmup_ratio",
                     "label_smoothing_factor", "classifier_dropout"]
best_params = study.best_params
best_normalized = [
    _position_in_range(best_params[name], *HP_RANGES[name], log_scale=(name == "learning_rate"))
    for name in continuous_params
]
ax.barh(continuous_params, best_normalized, color="#FF9800")
ax.set_xlabel("Normalisierter Wert (0=min, 1=max)")
ax.set_title("Beste Hyperparameter (normalisiert)")
ax.set_xlim(0, 1)
# Annotate each bar with the raw (un-normalised) value.
for bar_pos, name in enumerate(continuous_params):
    ax.text(best_normalized[bar_pos] + 0.02, bar_pos, f"{best_params[name]:.4g}", va="center", fontsize=9)
ax.grid(axis="x", alpha=0.3)

# 4. Phase 1 vs. Phase 2 comparison
ax = axes[1, 1]
phase1_best = 0.8393
phase2_scores = completed["value"].sort_values(ascending=False).values
ax.plot(range(1, len(phase2_scores) + 1), phase2_scores, 'o-', color="#2196F3", label="Phase 2 Trials")
ax.axhline(y=phase1_best, color="orange", linestyle="--", linewidth=2, label=f"Phase 1 Best: {phase1_best:.4f}")
ax.axhline(y=study.best_value, color="green", linestyle="--", linewidth=2, label=f"Phase 2 Best: {study.best_value:.4f}")
ax.set_xlabel("Trial Rank")
ax.set_ylabel("F1 Macro")
ax.set_title("Phase 1 vs Phase 2 Performance")
ax.legend(fontsize=9)
ax.grid(alpha=0.3)

plt.suptitle(
    f"HPT Phase 2: EuroBERT-210M ({len(completed)} Trials, {N_FOLDS}-Fold CV)",
    fontsize=14, fontweight="bold",
)
plt.tight_layout()

# Save next to the other reports before showing the figure.
save_dir = Path(pu.REPORTS_DIR)
save_dir.mkdir(parents=True, exist_ok=True)
png_path = save_dir / "hpt_eurobert_210m_phase2_summary.png"
fig.savefig(str(png_path), dpi=150, bbox_inches="tight")
plt.show()
print(f"Plot gespeichert: {png_path}")

In [None]:
# ===== HPT REPORT GENERIEREN =====
import json
from datetime import datetime

def _md_row(left, right):
    """Format one row of a two-column Markdown table."""
    return f"| {left} | {right} |"


def _md_table(left_header, right_header, rows):
    """Return the lines of a two-column Markdown table (header, rule, rows)."""
    return [_md_row(left_header, right_header), "|---|---|"] + [
        _md_row(left, right) for left, right in rows
    ]


now = datetime.now()
report_name = f"{now.strftime('%d%m%y')}_{MODEL_SHORT_NAME}_hpt_phase2"

# The timer only exists when optimize() actually ran in this session.
if "timer_hpt" in globals():
    duration_str = timer_hpt.duration_formatted
else:
    duration_str = "N/A (aus DB geladen)"

# --- Markdown report ---
report_lines = [
    "# HPT Report: EuroBERT-210M Phase 2",
    f"**Generated:** {now.strftime('%Y-%m-%d %H:%M:%S')}",
    "",
    "---",
    "",
    "## Phase 1 -> Phase 2",
    "- Phase 1 Best F1: **0.8393** (Trial 11)",
    f"- Phase 2 Best F1: **{study.best_value:.4f}** (Trial {study.best_trial.number})",
    f"- Verbesserung: **{study.best_value - 0.8393:+.4f}**",
    "",
    "## Configuration",
]
report_lines += _md_table("Property", "Value", [
    ("Model", MODEL_ID),
    ("N Trials", N_TRIALS),
    ("N Folds", N_FOLDS),
    ("Completed Trials", len(completed)),
    ("Pruned Trials", len(pruned)),
    ("Duration", duration_str),
    ("GPU", pu.get_gpu_info()['gpu_name']),
    ("CV Pool Size", len(cv_pool_df)),
    ("Test Size (frozen)", len(test_df)),
    ("Effective Batch Size", EFFECTIVE_BATCH_SIZE),
])

report_lines += ["", "## Fixed Parameters (aus Phase 1)"]
report_lines += _md_table("Parameter", "Value", [
    ("lr_scheduler_type", FIXED_LR_SCHEDULER_TYPE),
    ("per_device_train_batch_size", FIXED_BATCH_SIZE_TRAIN),
    ("gradient_accumulation_steps", FIXED_GRAD_ACCUM),
    ("bf16", FIXED_BF16),
    ("fp16", FIXED_FP16),
    ("gradient_checkpointing", FIXED_GRADIENT_CHECKPOINTING),
    ("group_by_length", FIXED_GROUP_BY_LENGTH),
    ("optim", FIXED_OPTIM),
    ("early_stopping_patience", FIXED_EARLY_STOPPING_PATIENCE),
    ("max_length", MAX_LENGTH),
])

report_lines += ["", "## Eingeengte Suchbereiche (Phase 2)"]
report_lines += _md_table("Parameter", "Range", HP_RANGES.items())

report_lines += ["", "## Beste Hyperparameter"]
report_lines += _md_table("Parameter", "Value", [
    (f"**{name}**", f"**{value}**") for name, value in study.best_params.items()
])
# The values fixed in Phase 2 are listed alongside the tuned ones.
report_lines.append(_md_row("**lr_scheduler_type**", f"**{FIXED_LR_SCHEDULER_TYPE}** (fixiert)"))
report_lines.append(_md_row("**per_device_train_batch_size**", f"**{FIXED_BATCH_SIZE_TRAIN}** (fixiert)"))

report_lines += [
    "",
    f"**Best F1 Macro (CV Mean): {study.best_value:.4f}**",
    f"**Best Trial: {study.best_trial.number}**",
    "",
    "## Alle Trials",
    "",
]

# Full trial table: number, value, state and every sampled parameter.
table_cols = ["number", "value", "state"] + [c for c in trials_df.columns if c.startswith("params_")]
report_lines.append(trials_df[table_cols].to_markdown(index=False))
report_lines += [
    "",
    "---",
    "*Generated by eurobert_210_hpt_phase_2.ipynb*",
]

report_md = "\n".join(report_lines)
report_path = Path(pu.REPORTS_DIR) / f"{report_name}.md"
report_path.write_text(report_md, encoding="utf-8")

# --- JSON sidecar (machine-readable twin of the Markdown report) ---
_trial_records = []
for t in study.trials:
    _trial_records.append({
        "number": t.number,
        # Trials without a value (e.g. pruned before the first report) keep None.
        "value": round(t.value, 4) if t.value is not None else None,
        "params": t.params,
        "state": str(t.state.name),
        "duration_s": round(t.duration.total_seconds(), 1) if t.duration else None,
    })

json_data = {
    "report_id": report_name,
    "model_id": MODEL_ID,
    "phase": 2,
    "phase1_best_value": 0.8393,
    "n_trials": N_TRIALS,
    "n_folds": N_FOLDS,
    "best_value": round(study.best_value, 4),
    "best_trial_number": study.best_trial.number,
    "best_params": {
        **study.best_params,
        "lr_scheduler_type": FIXED_LR_SCHEDULER_TYPE,
        "per_device_train_batch_size": FIXED_BATCH_SIZE_TRAIN,
    },
    "all_trials": _trial_records,
    "search_ranges": {name: str(bounds) for name, bounds in HP_RANGES.items()},
    "fixed_params": {
        "lr_scheduler_type": FIXED_LR_SCHEDULER_TYPE,
        "per_device_train_batch_size": FIXED_BATCH_SIZE_TRAIN,
        "gradient_accumulation_steps": FIXED_GRAD_ACCUM,
        "bf16": FIXED_BF16,
        "fp16": FIXED_FP16,
        "gradient_checkpointing": FIXED_GRADIENT_CHECKPOINTING,
        "optim": FIXED_OPTIM,
        "early_stopping_patience": FIXED_EARLY_STOPPING_PATIENCE,
        "group_by_length": FIXED_GROUP_BY_LENGTH,
        "max_length": MAX_LENGTH,
        "effective_batch_size": EFFECTIVE_BATCH_SIZE,
    },
    "duration_formatted": duration_str,
    "gpu": pu.get_gpu_info(),
}
json_path = Path(pu.REPORTS_DIR) / f"{report_name}.json"
json_path.write_text(json.dumps(json_data, ensure_ascii=False, indent=2), encoding="utf-8")

print(f"Report: {report_path}")
print(f"JSON:   {json_path}")

In [None]:
# ===== FINALE PARAMETER FUER TRAINING =====
from datetime import datetime

# Copy-paste block with the final hyperparameters for the last training run.
_banner = "=" * 70
_best = study.best_params
_best_f1 = study.best_value

print(_banner)
print("  FINALE HYPERPARAMETER (kopieren fuer finales Training)")
print(_banner)
print()
print(f"# Aus HPT Phase 2 ({datetime.now().strftime('%Y-%m-%d')})")
print(f"# Best F1 Macro (CV Mean): {_best_f1:.4f}")
print(f"# Phase 1 Best: 0.8393 -> Phase 2 Best: {_best_f1:.4f} ({_best_f1 - 0.8393:+.4f})")
print(f"# Trial {study.best_trial.number}")
print()

# Tuned values interleaved with the two settings that were fixed for Phase 2;
# string settings are wrapped in double quotes so the lines are valid Python.
_tuned_assignments = (
    ("LEARNING_RATE", _best['learning_rate']),
    ("WEIGHT_DECAY", _best['weight_decay']),
    ("WARMUP_RATIO", _best['warmup_ratio']),
    ("LABEL_SMOOTHING_FACTOR", _best['label_smoothing_factor']),
    ("LR_SCHEDULER_TYPE", f'"{FIXED_LR_SCHEDULER_TYPE}"'),
    ("CLASSIFIER_DROPOUT", _best['classifier_dropout']),
    ("PER_DEVICE_TRAIN_BATCH_SIZE", FIXED_BATCH_SIZE_TRAIN),
    ("NUM_TRAIN_EPOCHS", _best['num_train_epochs']),
)
for _const_name, _const_value in _tuned_assignments:
    print(f"{_const_name} = {_const_value}")
print()

print("# Fixed parameters:")
_fixed_assignments = (
    ("BF16", FIXED_BF16),
    ("GRADIENT_CHECKPOINTING", FIXED_GRADIENT_CHECKPOINTING),
    ("OPTIM", f'"{FIXED_OPTIM}"'),
    ("EARLY_STOPPING_PATIENCE", FIXED_EARLY_STOPPING_PATIENCE),
    ("GROUP_BY_LENGTH", FIXED_GROUP_BY_LENGTH),
    ("MAX_LENGTH", MAX_LENGTH),
    ("EFFECTIVE_BATCH_SIZE", EFFECTIVE_BATCH_SIZE),
    ("GRADIENT_ACCUMULATION_STEPS", FIXED_GRAD_ACCUM),
)
for _const_name, _const_value in _fixed_assignments:
    print(f"{_const_name} = {_const_value}")
print()

for _next_step in (
    "# Naechster Schritt: Finales Training",
    "# -> Training mit diesen Parametern auf dem gesamten CV-Pool",
    "# -> Evaluation auf dem eingefrorenen Test-Set",
    "# -> Upload auf HuggingFace Hub",
):
    print(_next_step)
print(_banner)

In [None]:
# ===== CLEANUP + AUTO-SHUTDOWN =====
import shutil

# Remove temporary checkpoints but keep the TensorBoard logs.
if os.path.exists("/content/hpt_tmp"):
    for item in Path("/content/hpt_tmp").iterdir():
        if item.name == "tb_logs":
            continue
        if item.is_dir() and not item.is_symlink():
            shutil.rmtree(item, ignore_errors=True)
        else:
            # shutil.rmtree only removes directories; with ignore_errors=True a
            # plain file would silently survive, so remove it explicitly.
            try:
                item.unlink()
            except OSError as exc:
                # Best effort, like the directory removal above.
                print(f"Konnte {item} nicht loeschen: {exc}")
    print("Temporaere Checkpoint-Dateien geloescht.")
    print("TensorBoard Logs behalten: /content/hpt_tmp/tb_logs/")

if torch.cuda.is_available():
    # Collect garbage first so memory held by dead objects can be returned
    # to the driver by the cache flush that follows.
    gc.collect()
    torch.cuda.empty_cache()
    free_mem = torch.cuda.mem_get_info()[0] / 1e9
    print(f"GPU VRAM frei: {free_mem:.1f} GB")

print(f"\nErgebnisse auf Google Drive:")
print(f"  Report: {report_path}")
print(f"  JSON:   {json_path}")
print(f"  Plot:   {png_path}")
print(f"  DB:     {storage_path}")

# Release the Colab runtime now that all artifacts are on Drive.
print("\nRuntime wird beendet...")
from google.colab import runtime
runtime.unassign()