# Hyperparameter Tuning Phase 1: EuroBERT-210M
## Optuna + Stratified 3-Fold Cross-Validation

Systematische Hyperparameter-Suche fuer EuroBERT-210M auf dem deutschen
Nachrichtenklassifikations-Datensatz (13 Klassen, Bundestagswahl 2025).

**Strategie:**
- Optuna TPE-Sampler mit 3-Fold Stratified CV
- F1 Macro als Optimierungsmetrik
- Keine Modellgewichte gespeichert (nur beste Parameter)
- Ergebnisse als Report + Visualisierungen auf Google Drive
- SQLite-Backend fuer Crash-Recovery (Colab Pro)

**Voraussetzung:** GPU-Runtime (L4 empfohlen), `HF_TOKEN` in Colab Secrets hinterlegt.

In [None]:
# === SETUP ===
import os, sys

# Clone the repository on first run, otherwise update the existing checkout
REPO = "/content/news_articles_classification_thesis"
if not os.path.exists(REPO):
    !git clone https://github.com/ZorbeyOezcan/news_articles_classification_thesis.git {REPO}
else:
    !cd {REPO} && git pull -q

# Dependencies (+ optuna, plotly, kaleido for HPT)
!pip install -q transformers[sentencepiece] datasets huggingface_hub \
    scikit-learn matplotlib seaborn tqdm pandas accelerate evaluate \
    optuna plotly kaleido

# Mount Google Drive (persistent reports + Optuna DB)
from google.colab import drive
drive.mount("/content/drive", force_remount=False)

# Make pipeline_utils and hpt_utils importable: both directories are
# prepended to sys.path (HPT_DIR last, so it ends up in front).
PIPELINE_DIR = f"{REPO}/Python/classification_pipeline"
HPT_DIR = f"{REPO}/Python/classification_pipeline/hyper_parameter_tuning"
for _module_dir in (PIPELINE_DIR, HPT_DIR):
    # Skip directories that are already registered (e.g. cell re-run).
    if _module_dir not in sys.path:
        sys.path.insert(0, _module_dir)

# Import the project helper modules and reload them right away
# (presumably so that code updated by `git pull` is picked up in an
# already-running kernel -- confirm).
import importlib
import pipeline_utils as pu
importlib.reload(pu)
import hpt_utils as hu
importlib.reload(hu)

# Hugging Face login with the HF_TOKEN stored in the Colab secrets
from huggingface_hub import login
from google.colab import userdata
login(token=userdata.get("HF_TOKEN"))

# Auto-shutdown watchdog: caps the maximum runtime (safety net for Colab Pro).
# It fires after MAX_RUNTIME_HOURS in case the notebook hangs.
import threading, time as _time
MAX_RUNTIME_HOURS = 5  # safety limit (HPT is expected to finish in 2-3h)

def _auto_shutdown():
    """Terminate the Colab runtime after MAX_RUNTIME_HOURS as a safety net.

    Sleeps for the configured number of hours, then asks Colab to unassign
    the runtime. A failure to do so is printed instead of being swallowed
    silently; it is never re-raised because this runs in a background thread.
    """
    _time.sleep(MAX_RUNTIME_HOURS * 3600)
    print(f"\n[WATCHDOG] Max Runtime ({MAX_RUNTIME_HOURS}h) erreicht. Runtime wird beendet.")
    try:
        from google.colab import runtime
        runtime.unassign()
    except Exception as exc:
        # Best effort only, but make the failure visible: otherwise the
        # runtime keeps running (and billing) without any hint why.
        print(f"[WATCHDOG] Runtime konnte nicht beendet werden: {exc!r}")
# Daemon thread: it must not keep the interpreter alive on its own.
threading.Thread(target=_auto_shutdown, daemon=True).start()

print(f"Reports-Ordner: {pu.REPORTS_DIR}")
print(f"Setup abgeschlossen. Watchdog: Runtime wird nach max {MAX_RUNTIME_HOURS}h beendet.")

In [None]:
# ===== HPT CONFIGURATION =====
import torch
import numpy as np

# Model under test and tokenisation / reproducibility settings
MODEL_ID = "EuroBERT/EuroBERT-210m"
MODEL_SHORT_NAME = "eurobert_210m"
MAX_LENGTH = 2048
RANDOM_SEED = 42

# ----- Cross-Validation -----
N_FOLDS = 3  # k=3: safe for Politikverdruss (~9 samples in the CV pool)

# ----- Optuna -----
N_TRIALS = 15
STUDY_NAME = "eurobert_210m_hpt_phase1"
OPTUNA_SEED = 42
OPTUNA_TIMEOUT = None  # optional: max seconds (e.g. 10800 for 3h)

# ----- Database Management -----
# "new"      = create a new DB (auto-numbering if the name already exists)
# "continue" = load the existing DB and run the missing trials
DB_MODE = "new"

# ----- Fixed Training Parameters (changed vs. original) -----
# Gradient checkpointing OFF: EuroBERT's custom modeling code (trust_remote_code)
# causes loss=0 / grad_norm=nan with gradient_checkpointing on GPU.
# 210M is small enough for L4/A100 without checkpointing.
FIXED_GRADIENT_CHECKPOINTING = False
FIXED_LOGGING_STEPS = 10
FIXED_REPORT_TO = "tensorboard"
FIXED_DATALOADER_NUM_WORKERS = 4
FIXED_EARLY_STOPPING_PATIENCE = 3
FIXED_GROUP_BY_LENGTH = True

# Mixed precision, optimizer and eval batch size are derived from the GPU:
# - BF16 on Ampere or newer (L4, A100): wider value range, avoids FP16 overflow
# - FP16 on older GPUs (T4, compute capability 7.5): no BF16 support
# Without a GPU the notebook aborts right here.
if not torch.cuda.is_available():
    raise RuntimeError("HPT benoetigt eine GPU! Bitte Colab Runtime aendern.")

_gpu_cap = torch.cuda.get_device_capability()
_gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9

# Exactly one of the two precision flags is enabled.
FIXED_BF16 = _gpu_cap[0] >= 8
FIXED_FP16 = not FIXED_BF16

# Fused AdamW (CUDA only)
FIXED_OPTIM = "adamw_torch_fused"

# Eval batch size scales with the available VRAM (in GB).
FIXED_BATCH_SIZE_EVAL = 32 if _gpu_mem >= 40 else 16 if _gpu_mem >= 20 else 8

print(f"GPU: {torch.cuda.get_device_name(0)} ({_gpu_mem:.1f} GB, CC {_gpu_cap[0]}.{_gpu_cap[1]})")
print(f"  FP16={FIXED_FP16}, BF16={FIXED_BF16}, Eval Batch={FIXED_BATCH_SIZE_EVAL}")
print(f"  Gradient Checkpointing: {FIXED_GRADIENT_CHECKPOINTING}")

# ----- Hyperparameter Search Ranges -----
# batch_size [4, 8]: 16 runs out of memory on an L4 without gradient_checkpointing
# (attention matrices at seq_len=2048 need ~1.5 GB/layer). Effective BS=16 via grad_accum.
# Tuples are (low, high) bounds, lists are categorical choices.
HP_RANGES = {
    "learning_rate": (1e-5, 5e-5),              # log scale
    "weight_decay": (0.0, 0.1),
    "warmup_ratio": (0.0, 0.15),
    "label_smoothing_factor": (0.0, 0.1),
    "lr_scheduler_type": ["linear", "cosine"],
    "classifier_dropout": (0.0, 0.5),
    "per_device_train_batch_size": [4, 8],
    "num_train_epochs": (5, 15),                 # int range
}

# The effective batch size should stay at 16 -> grad_accum is adjusted per trial
EFFECTIVE_BATCH_SIZE = 16

# ----- Labels -----
# The list order defines the integer class ids used for label encoding.
ALL_LABELS = [
    "Klima / Energie", "Zuwanderung", "Renten", "Soziales Gef\u00e4lle",
    "AfD/Rechte", "Arbeitslosigkeit", "Wirtschaftslage", "Politikverdruss",
    "Gesundheitswesen, Pflege", "Kosten/L\u00f6hne/Preise",
    "Ukraine/Krieg/Russland", "Bundeswehr/Verteidigung", "Andere",
]

# Split configuration: number of test articles held out per class
TEST_PER_CLASS = 30

print(f"\nOptuna HPT: {N_TRIALS} Trials, {N_FOLDS}-Fold CV")
print(f"Modell: {MODEL_ID}")
print(f"Max Length: {MAX_LENGTH}")
print(f"Effektive Batch Size: {EFFECTIVE_BATCH_SIZE}")
print(f"DB Mode: {DB_MODE}")
print(f"Labels: {len(ALL_LABELS)} Klassen")

In [None]:
# ===== DATEN LADEN & CUSTOM SPLIT =====
# Test-Set wird EINMAL extrahiert und eingefroren.
# Die verbleibenden Daten = CV-Pool fuer k-Fold (kein fixer Train/Val-Split).

import pandas as pd
from datasets import load_dataset

# Seed NumPy's global RNG; the per-class shuffles below draw from it.
np.random.seed(RANDOM_SEED)

# Merge the published train and test splits into one pool of labelled articles.
ds = load_dataset(pu.DATASET_ID)
train_hf, test_hf = (ds[split_name].to_pandas() for split_name in ("train", "test"))
all_labelled = pd.concat([train_hf, test_hf], ignore_index=True)

print(f"Gesamtpool gelabelter Artikel: {len(all_labelled)}")
print(f"Klassen im Datensatz: {all_labelled['label'].nunique()}")
print()

# --- Test split (identical to the fine-tuning notebook) ---
# Per class: TEST_PER_CLASS articles go to the test set; classes with fewer
# than 60 articles give up half of them instead.
test_indices, rest_indices = [], []

for class_name in ALL_LABELS:
    class_rows = all_labelled.loc[all_labelled["label"] == class_name].index.tolist()
    n_available = len(class_rows)

    n_holdout = TEST_PER_CLASS
    if n_available < 60:
        n_holdout = n_available // 2
        print(f"  {class_name}: nur {n_available} Artikel -> {n_holdout} fuer Test (Haelfte)")

    # Shuffle in place, then cut: head -> test set, tail -> CV pool.
    np.random.shuffle(class_rows)
    test_indices += class_rows[:n_holdout]
    rest_indices += class_rows[n_holdout:]

test_df = all_labelled.loc[test_indices].reset_index(drop=True)
cv_pool_df = all_labelled.loc[rest_indices].reset_index(drop=True)

print(f"\nTest (eingefroren):  {len(test_df)} Artikel")
print(f"CV-Pool (fuer Folds): {len(cv_pool_df)} Artikel")

# Class distribution of the CV pool
print("\nCV-Pool Klassenverteilung:")
cv_dist = cv_pool_df["label"].value_counts().sort_index()
for class_name, n_articles in cv_dist.items():
    print(f"  {class_name}: {n_articles}")
print(f"  TOTAL: {len(cv_pool_df)}")

In [None]:
# ===== LABEL ENCODING =====
# Both lookup directions follow the position of each label in ALL_LABELS.
id2label = dict(enumerate(ALL_LABELS))
label2id = {name: class_id for class_id, name in enumerate(ALL_LABELS)}

# Attach the integer class id to both frames.
for _frame in (cv_pool_df, test_df):
    _frame["label_id"] = _frame["label"].map(label2id)

# Sanity check: a label missing from the mapping would show up as NaN.
assert not cv_pool_df["label_id"].isna().any(), "Unbekannte Labels im CV-Pool!"
assert not test_df["label_id"].isna().any(), "Unbekannte Labels im Test-Set!"

print("Label-Mapping:")
for name, class_id in label2id.items():
    print(f"  {class_id:>2}: {name}")
print(f"\nAnzahl Klassen: {len(ALL_LABELS)}")

In [None]:
# ===== TOKENIZER + EUROBERT ROPE FIX =====
from transformers import AutoTokenizer
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS

# Load the tokenizer published under MODEL_ID (trust_remote_code is enabled).
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# EuroBERT ROPE Fix (identisch mit Fine-Tuning Notebooks)
def _default_rope_init(config, device=None, **kwargs):
    base = getattr(config, "rope_theta", 10000.0)
    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, 1.0

# Replace the "default" entry of transformers' ROPE_INIT_FUNCTIONS registry
# with the local implementation defined in this cell.
ROPE_INIT_FUNCTIONS["default"] = _default_rope_init
print("ROPE_INIT_FUNCTIONS gepatcht.")

# Tokenisation helper (reused for every fold)
def tokenize_fn(examples):
    """Tokenize the ``"text"`` entries of a batch, truncated to MAX_LENGTH.

    No padding is requested here; only truncation is enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=MAX_LENGTH)
    return encoded

print(f"Tokenizer geladen: {MODEL_ID}")

In [None]:
# ===== OPTUNA OBJECTIVE MIT K-FOLD CV =====
import gc
import shutil
import optuna
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
)

# Collator and metric function shared by every Trainer built in the objective.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Extended compute_metrics (per-class scores + confusion matrix support)
compute_metrics = hu.make_compute_metrics(ALL_LABELS, id2label)

# Fold indices are computed once up front: every trial sees the same folds,
# which keeps the comparison between trials fair.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)
fold_indices = list(skf.split(cv_pool_df, cv_pool_df["label_id"]))

# Sanity check: report the rarest class in each validation fold.
print(f"Stratified {N_FOLDS}-Fold CV:")
for fold_no, (train_rows, val_rows) in enumerate(fold_indices, start=1):
    val_class_counts = cv_pool_df["label"].iloc[val_rows].value_counts()
    rarest_count = val_class_counts.min()
    rarest_class = val_class_counts.idxmin()
    print(f"  Fold {fold_no}: Train={len(train_rows)}, Val={len(val_rows)}, "
          f"min Val-Klasse: {rarest_class} ({rarest_count} Samples)")


def create_model(classifier_dropout):
    """Build a fresh classification model with the requested classifier dropout.

    The config of MODEL_ID is loaded, the label setup and dropout are written
    onto it, and the model is instantiated from that config.
    """
    cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
    # Applied in this exact order (num_labels first, then the label maps).
    config_overrides = {
        "num_labels": len(ALL_LABELS),
        "id2label": id2label,
        "label2id": label2id,
        "classifier_dropout": classifier_dropout,
    }
    for attr_name, attr_value in config_overrides.items():
        setattr(cfg, attr_name, attr_value)

    net = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID, config=cfg, ignore_mismatched_sizes=True, trust_remote_code=True
    )

    # Checkpointing is only switched on when configured AND a GPU is present.
    wants_checkpointing = FIXED_GRADIENT_CHECKPOINTING and torch.cuda.is_available()
    if wants_checkpointing:
        net.gradient_checkpointing_enable()
    return net


def _cleanup_cuda():
    """VRAM aufraeumen nach OOM oder normalem Fold-Ende."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()


def objective(trial):
    """Optuna objective: mean F1 Macro across the precomputed k folds.

    Samples one hyperparameter set from HP_RANGES, then trains and evaluates a
    fresh model on every fold in ``fold_indices``. Per-epoch metrics are
    collected via ``hu.HPTMetricsCallback``, whose ``nan_detected`` /
    ``lr_zero_detected`` flags are checked after training.

    Args:
        trial: The ``optuna`` trial used for sampling, reporting and for
            storing user attributes.

    Returns:
        Mean of the per-fold ``eval_f1_macro`` values.

    Raises:
        optuna.TrialPruned: If the callback flags NaN or LR=0, if the final
            evaluation has a NaN loss or F1 < 0.05, on a CUDA out-of-memory
            error, or when the study's pruner decides to stop the trial.
    """

    # --- Sample hyperparameters ---
    lr = trial.suggest_float("learning_rate", *HP_RANGES["learning_rate"], log=True)
    wd = trial.suggest_float("weight_decay", *HP_RANGES["weight_decay"])
    warmup = trial.suggest_float("warmup_ratio", *HP_RANGES["warmup_ratio"])
    label_smooth = trial.suggest_float("label_smoothing_factor", *HP_RANGES["label_smoothing_factor"])
    scheduler = trial.suggest_categorical("lr_scheduler_type", HP_RANGES["lr_scheduler_type"])
    dropout = trial.suggest_float("classifier_dropout", *HP_RANGES["classifier_dropout"])
    batch_size = trial.suggest_categorical("per_device_train_batch_size", HP_RANGES["per_device_train_batch_size"])
    epochs = trial.suggest_int("num_train_epochs", *HP_RANGES["num_train_epochs"])

    # Adjust gradient_accumulation_steps so the effective batch size stays at 16
    grad_accum = max(1, EFFECTIVE_BATCH_SIZE // batch_size)

    print(f"\n{'='*60}")
    print(f"Trial {trial.number}: lr={lr:.2e}, wd={wd:.3f}, warmup={warmup:.3f}, "
          f"ls={label_smooth:.3f}, sched={scheduler}, dropout={dropout:.3f}, "
          f"batch={batch_size}, grad_accum={grad_accum}, epochs={epochs}")
    print(f"{'='*60}")

    fold_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(fold_indices):
        print(f"  Fold {fold_idx + 1}/{N_FOLDS}...", end=" ", flush=True)

        # Start with clean VRAM
        _cleanup_cuda()

        # --- Build the fold datasets ---
        fold_train_df = cv_pool_df.iloc[train_idx]
        fold_val_df = cv_pool_df.iloc[val_idx]

        train_ds = Dataset.from_pandas(
            fold_train_df[["text", "label_id"]].rename(columns={"label_id": "labels"})
        )
        val_ds = Dataset.from_pandas(
            fold_val_df[["text", "label_id"]].rename(columns={"label_id": "labels"})
        )

        train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
        val_ds = val_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
        train_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
        val_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

        # --- Output / logging directories ---
        fold_output_dir = f"/content/hpt_tmp/trial_{trial.number}_fold_{fold_idx}"
        fold_logging_dir = f"/content/hpt_tmp/tb_logs/trial_{trial.number:03d}_fold_{fold_idx}"

        # --- Per-epoch metrics callback ---
        metrics_callback = hu.HPTMetricsCallback(
            trial_number=trial.number,
            fold_idx=fold_idx,
            all_labels=ALL_LABELS,
            id2label=id2label,
        )

        # --- TrainingArguments ---
        training_args = TrainingArguments(
            output_dir=fold_output_dir,
            num_train_epochs=epochs,
            learning_rate=lr,
            weight_decay=wd,
            warmup_ratio=warmup,
            label_smoothing_factor=label_smooth,
            lr_scheduler_type=scheduler,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=FIXED_BATCH_SIZE_EVAL,
            gradient_accumulation_steps=grad_accum,
            bf16=FIXED_BF16,
            fp16=FIXED_FP16,
            gradient_checkpointing=FIXED_GRADIENT_CHECKPOINTING,
            optim=FIXED_OPTIM,
            group_by_length=FIXED_GROUP_BY_LENGTH,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="f1_macro",
            greater_is_better=True,
            logging_strategy="steps",
            logging_steps=FIXED_LOGGING_STEPS,
            logging_dir=fold_logging_dir,
            report_to=FIXED_REPORT_TO,
            seed=RANDOM_SEED + fold_idx,
            dataloader_num_workers=FIXED_DATALOADER_NUM_WORKERS,
            dataloader_pin_memory=True,
            disable_tqdm=False,
        )

        # Bind both names before the try block so the cleanup in `finally`
        # works even when model or trainer construction fails, and so the
        # previous fold's objects are not kept alive through these names.
        model = None
        trainer = None

        try:
            # --- Model + Trainer ---
            model = create_model(dropout)

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_ds,
                eval_dataset=val_ds,
                data_collator=data_collator,
                compute_metrics=compute_metrics,
                callbacks=[
                    EarlyStoppingCallback(early_stopping_patience=FIXED_EARLY_STOPPING_PATIENCE),
                    metrics_callback,
                ],
            )

            # --- Training ---
            trainer.train()

            # --- Check callback for NaN/LR=0 detection ---
            if metrics_callback.nan_detected or metrics_callback.lr_zero_detected:
                reason = "NaN" if metrics_callback.nan_detected else "LR=0"
                print(f"{reason} detected — pruning Trial {trial.number}")
                # Store partial fold data for analysis
                hu.store_fold_metrics_partial(trial, fold_idx, metrics_callback, ALL_LABELS)
                trial.set_user_attr("nan_trial", True)
                trial.set_user_attr("nan_reason", reason)
                trial.set_user_attr("nan_fold_idx", fold_idx)
                raise optuna.TrialPruned()

            # --- Evaluation (best model thanks to load_best_model_at_end) ---
            eval_result = trainer.evaluate()
            fold_f1 = eval_result["eval_f1_macro"]

            # Safety net: NaN in final eval
            if np.isnan(eval_result.get("eval_loss", 0)) or fold_f1 < 0.05:
                print(f"NaN/collapse in final eval (F1={fold_f1:.4f}) — pruning")
                hu.store_fold_metrics(trial, fold_idx, metrics_callback, eval_result, ALL_LABELS, id2label)
                trial.set_user_attr("nan_trial", True)
                trial.set_user_attr("nan_reason", "eval_nan")
                trial.set_user_attr("nan_fold_idx", fold_idx)
                raise optuna.TrialPruned()

            # Store fold metrics (per-class scores, confusion matrix, epoch history)
            hu.store_fold_metrics(trial, fold_idx, metrics_callback, eval_result, ALL_LABELS, id2label)

            fold_scores.append(fold_f1)
            print(f"F1 Macro = {fold_f1:.4f}")

        except torch.cuda.OutOfMemoryError:
            print(f"\n  OOM in Trial {trial.number}, Fold {fold_idx + 1}! Ueberspringe Trial.")
            _cleanup_cuda()
            if os.path.exists(fold_output_dir):
                shutil.rmtree(fold_output_dir, ignore_errors=True)
            raise optuna.TrialPruned()

        finally:
            # --- Cleanup (always runs) ---
            # Rebind the heavy objects to None so they become collectable
            # BEFORE the VRAM cleanup. exec("del <name>") cannot unbind
            # function locals, so it would leave model and trainer referenced
            # here and while the next fold's model is being loaded.
            trainer = None
            model = None
            training_args = None
            train_ds = None
            val_ds = None
            _cleanup_cuda()
            if os.path.exists(fold_output_dir):
                shutil.rmtree(fold_output_dir, ignore_errors=True)

        # --- Pruning: report the intermediate result ---
        trial.report(np.mean(fold_scores), fold_idx)
        if trial.should_prune():
            print(f"  Trial {trial.number} PRUNED nach Fold {fold_idx + 1}")
            raise optuna.TrialPruned()

    # Store aggregated trial summary
    hu.store_trial_summary(trial, N_FOLDS, ALL_LABELS)

    mean_f1 = np.mean(fold_scores)
    std_f1 = np.std(fold_scores)
    print(f"\n  -> Trial {trial.number}: F1 Macro = {mean_f1:.4f} +/- {std_f1:.4f}")

    return mean_f1


# Confirm the cell ran and echo the CV/HPT settings the objective will use.
print("Objective Function definiert.")
print(f"Folds: {N_FOLDS}, Trials: {N_TRIALS}")
print(f"Effektive Batch Size: {EFFECTIVE_BATCH_SIZE}")

In [None]:
# ===== OPTUNA STUDY STARTEN =====
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from pathlib import Path

# DB management: resolve the storage location according to DB_MODE
storage_url, storage_path = hu.resolve_db_path(
    db_mode=DB_MODE, study_name=STUDY_NAME, base_dir=Path(pu.REPORTS_DIR)
)

print(f"Optuna DB: {storage_path}")
print(f"  -> Bei Crash: DB_MODE='continue' setzen, 'Run All' -> Trials werden fortgesetzt")

# TPE sampling with median pruning, 5 startup trials each
sampler = TPESampler(seed=OPTUNA_SEED, n_startup_trials=5)
pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=1)

study, n_remaining, n_existing = hu.setup_study(
    storage_url=storage_url,
    study_name=STUDY_NAME,
    db_mode=DB_MODE,
    n_trials_target=N_TRIALS,
    sampler=sampler,
    pruner=pruner,
)

# Only optimise when trials are still outstanding; the run is timed.
if n_remaining != 0:
    timer_hpt = pu.ExperimentTimer()
    with timer_hpt:
        n_valid = hu.run_hpt_with_nan_exclusion(
            study=study,
            objective=objective,
            n_valid_target=n_remaining,
            timeout=OPTUNA_TIMEOUT,
        )
    print(f"\nOptuna HPT abgeschlossen: {timer_hpt.duration_formatted}")
else:
    print("Alle Trials bereits abgeschlossen — ueberspringe optimize().")

# Summary of the best trial found so far
print(f"\n{'='*60}")
print(f"Beste Trial: {study.best_trial.number}")
print(f"Bester F1 Macro (CV Mean): {study.best_value:.4f}")
print(f"\nBeste Hyperparameter:")
for param_name, param_value in study.best_params.items():
    print(f"  {param_name}: {param_value}")
print(f"{'='*60}")

In [None]:
# ===== OPTUNA QUICK SANITY CHECK (minimal — full analysis in analyse_hyperparameter_tuning.ipynb) =====
import optuna.visualization as vis

# 1. Optimization history
fig_history = vis.plot_optimization_history(study)
fig_history.update_layout(title="Optimization History: F1 Macro ueber Trials")
fig_history.show()

# 2. Parameter importances (fANOVA)
# Best effort: any failure while building or showing this plot is reported
# and does not stop the notebook.
try:
    fig_importance = vis.plot_param_importances(study)
    fig_importance.update_layout(title="Parameter Importance (fANOVA)")
    fig_importance.show()
except Exception as e:
    print(f"Parameter Importance nicht verfuegbar: {e}")

print("Sanity-Check Plots erstellt.")
print("Fuer vollstaendige Analyse: analyse_hyperparameter_tuning.ipynb lokal ausfuehren.")

In [None]:
# ===== QUICK RESULTS OVERVIEW =====
# Detaillierte Visualisierungen: analyse_hyperparameter_tuning.ipynb

# All trials as a table, best objective value first.
trials_df = (
    study.trials_dataframe(attrs=("number", "value", "params", "state", "duration"))
    .sort_values("value", ascending=False)
)

# Split by trial state.
trial_state = trials_df["state"]
completed = trials_df.loc[trial_state == "COMPLETE"].copy()
pruned = trials_df.loc[trial_state == "PRUNED"].copy()

print(f"Trials: {len(completed)} abgeschlossen, {len(pruned)} gepruned")
print(f"\nTop 5 Trials:")
param_cols = [col for col in completed.columns if col.startswith("params_")]
top5_cols = ["number", "value", *param_cols]
print(completed[top5_cols].head().to_string(index=False))

# Summary statistics over the objective values of the completed trials.
completed_f1 = completed["value"]
print(f"\nStatistiken ueber alle abgeschlossenen Trials:")
print(f"  Mean F1:   {completed_f1.mean():.4f}")
print(f"  Std F1:    {completed_f1.std():.4f}")
print(f"  Min F1:    {completed_f1.min():.4f}")
print(f"  Max F1:    {completed_f1.max():.4f}")

In [None]:
# ===== HPT REPORT GENERIEREN =====
import json
from datetime import datetime

# Report name: <ddmmyy>_<model short name>_hpt_phase1
now = datetime.now()
report_name = f"{now.strftime('%d%m%y')}_{MODEL_SHORT_NAME}_hpt_phase1"

# Timer duration (only available if optimize() ran in this session)
duration_str = timer_hpt.duration_formatted if 'timer_hpt' in dir() else "N/A (aus DB geladen)"

# Query the GPU info once and reuse it for the Markdown report and the JSON sidecar.
gpu_info = pu.get_gpu_info()

# --- Markdown report ---
report_lines = [
    f"# HPT Report: EuroBERT-210M Phase 1",
    f"**Generated:** {now.strftime('%Y-%m-%d %H:%M:%S')}",
    "",
    "---",
    "",
    "## Configuration",
    "| Property | Value |",
    "|---|---|",
    f"| Model | {MODEL_ID} |",
    f"| N Trials | {N_TRIALS} |",
    f"| N Folds | {N_FOLDS} |",
    f"| Completed Trials | {len(completed)} |",
    f"| Pruned Trials | {len(pruned)} |",
    f"| Duration | {duration_str} |",
    f"| GPU | {gpu_info['gpu_name']} |",
    f"| CV Pool Size | {len(cv_pool_df)} |",
    f"| Test Size (frozen) | {len(test_df)} |",
    f"| Effective Batch Size | {EFFECTIVE_BATCH_SIZE} |",
    f"| DB Mode | {DB_MODE} |",
    f"| DB Path | {storage_path} |",
    "",
    "## Fixed Parameters",
    "| Parameter | Value |",
    "|---|---|",
    f"| bf16 | {FIXED_BF16} |",
    f"| fp16 | {FIXED_FP16} |",
    f"| gradient_checkpointing | {FIXED_GRADIENT_CHECKPOINTING} |",
    f"| group_by_length | {FIXED_GROUP_BY_LENGTH} |",
    f"| optim | {FIXED_OPTIM} |",
    f"| early_stopping_patience | {FIXED_EARLY_STOPPING_PATIENCE} |",
    f"| logging_steps | {FIXED_LOGGING_STEPS} |",
    f"| max_length | {MAX_LENGTH} |",
    "",
    "## Search Ranges",
    "| Parameter | Range |",
    "|---|---|",
]
for param, range_val in HP_RANGES.items():
    report_lines.append(f"| {param} | {range_val} |")

report_lines += [
    "",
    f"**Best F1 Macro (CV Mean): {study.best_value:.4f}**",
    f"**Best Trial: {study.best_trial.number}**",
    "",
    "## Alle Trials",
    "",
]

# Table of all trials
table_cols = ["number", "value", "state"] + [c for c in trials_df.columns if c.startswith("params_")]
report_lines.append(trials_df[table_cols].to_markdown(index=False))
report_lines += [
    "",
    "---",
    "*Best parameter extraction: see analyse_hyperparameter_tuning.ipynb*",
    "*Generated by eurobert_210_hpt_phase_1.ipynb*",
]

report_md = "\n".join(report_lines)
report_path = Path(pu.REPORTS_DIR) / f"{report_name}.md"
report_path.write_text(report_md, encoding="utf-8")

# --- JSON sidecar ---
json_data = {
    "report_id": report_name,
    "model_id": MODEL_ID,
    "n_trials": N_TRIALS,
    "n_folds": N_FOLDS,
    "best_value": round(study.best_value, 4),
    "best_trial_number": study.best_trial.number,
    "best_params": study.best_params,
    "all_trials": [
        {
            "number": t.number,
            "value": round(t.value, 4) if t.value is not None else None,
            "params": t.params,
            "state": str(t.state.name),
            "duration_s": round(t.duration.total_seconds(), 1) if t.duration else None,
        }
        for t in study.trials
    ],
    "search_ranges": {k: str(v) for k, v in HP_RANGES.items()},
    "fixed_params": {
        "bf16": FIXED_BF16,
        "fp16": FIXED_FP16,
        "gradient_checkpointing": FIXED_GRADIENT_CHECKPOINTING,
        "optim": FIXED_OPTIM,
        "early_stopping_patience": FIXED_EARLY_STOPPING_PATIENCE,
        "group_by_length": FIXED_GROUP_BY_LENGTH,
        "max_length": MAX_LENGTH,
        "effective_batch_size": EFFECTIVE_BATCH_SIZE,
    },
    "duration_formatted": duration_str,
    "gpu": gpu_info,
    "db_mode": DB_MODE,
    # json.dumps cannot serialise pathlib.Path objects. The return type of
    # hu.resolve_db_path is not visible here, so coerce defensively; for a
    # plain string this is a no-op.
    "db_path": str(storage_path),
}
json_path = Path(pu.REPORTS_DIR) / f"{report_name}.json"
json_path.write_text(json.dumps(json_data, ensure_ascii=False, indent=2), encoding="utf-8")

print(f"Report: {report_path}")
print(f"JSON:   {json_path}")

In [None]:
# ===== NOTE: PARAMETER EXTRACTION =====
# The best hyperparameters are NOT extracted here.
# -> Use analyse_hyperparameter_tuning.ipynb locally for:
#    - full analysis (per-epoch, per-class, confusion matrices)
#    - multi-DB comparison
#    - all Optuna visualisations
#    - copy-paste-ready best hyperparameters
print("Best parameter extraction: see analyse_hyperparameter_tuning.ipynb")
print(f"DB location: {storage_path}")

In [None]:
# ===== CLEANUP + AUTO-SHUTDOWN =====
import shutil

# Delete temporary HPT files (checkpoints etc.)
if os.path.exists("/content/hpt_tmp"):
    # Keep the TensorBoard logs (on local disk, not on Drive);
    # remove only the checkpoint folders.
    for item in Path("/content/hpt_tmp").iterdir():
        if item.name != "tb_logs":
            shutil.rmtree(item, ignore_errors=True)
    print("Temporaere Checkpoint-Dateien geloescht.")
    print("TensorBoard Logs behalten: /content/hpt_tmp/tb_logs/")

# Release GPU memory: collect garbage first so that memory held by
# unreachable objects can actually be handed back by empty_cache().
if torch.cuda.is_available():
    gc.collect()
    torch.cuda.empty_cache()
    free_mem = torch.cuda.mem_get_info()[0] / 1e9
    print(f"GPU VRAM frei: {free_mem:.1f} GB")

print(f"\nErgebnisse auf Google Drive:")
print(f"  Report: {report_path}")
print(f"  JSON:   {json_path}")
# png_path is not assigned in any visible cell of this notebook. Printing it
# unconditionally raised a NameError, which stopped the cell before the
# runtime shutdown below was reached.
if "png_path" in globals():
    print(f"  Plot:   {png_path}")
print(f"  DB:     {storage_path}")

# Shut the runtime down (Colab Pro -- saves cost)
print("\nRuntime wird beendet...")
from google.colab import runtime
runtime.unassign()