# Fine-Tuning Classification: EuroBERT-210m
## Model: EuroBERT/EuroBERT-210m

Single-Label-Klassifikation (13 Klassen) mit der Hugging Face Trainer API.
Fine-Tuning auf gelabelten deutschen Nachrichtenartikeln (Bundestagswahl 2025).

**Voraussetzung:** GPU-Runtime aktiviert (T4 / L4), `HF_TOKEN` in Colab Secrets hinterlegt.

In [1]:
# === SETUP (identical in every notebook) ===
# Prepares a Colab runtime: fetches the thesis repository, installs the Python
# dependencies, mounts Google Drive, makes `pipeline_utils` importable as `pu`
# and logs in to the Hugging Face Hub.
import os, sys

# Clone the repository on first run, otherwise pull the latest commits.
REPO = "/content/news_articles_classification_thesis"
if not os.path.exists(REPO):
    !git clone https://github.com/ZorbeyOezcan/news_articles_classification_thesis.git {REPO}
else:
    !cd {REPO} && git pull -q

# Dependencies (installed without version pins)
!pip install -q transformers[sentencepiece] datasets huggingface_hub scikit-learn matplotlib seaborn tqdm pandas accelerate evaluate

# Mount Google Drive (persistent reports)
from google.colab import drive
drive.mount("/content/drive", force_remount=False)

# Make pipeline_utils importable by putting its folder first on sys.path
PIPELINE_DIR = f"{REPO}/Python/classification_pipeline"
if PIPELINE_DIR not in sys.path:
    sys.path.insert(0, PIPELINE_DIR)

# Import the helper module and reload it immediately; presumably so that a
# re-run of this cell picks up changes fetched by `git pull` — TODO confirm.
import importlib
import pipeline_utils as pu
importlib.reload(pu)

# Hugging Face login with the token stored in the Colab secret "HF_TOKEN"
from huggingface_hub import login
from google.colab import userdata
login(token=userdata.get("HF_TOKEN"))

print(f"Reports-Ordner: {pu.REPORTS_DIR}")
print("Setup abgeschlossen.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reports-Ordner: /content/drive/MyDrive/thesis_reports/performance_reports
Setup abgeschlossen.


In [None]:
# ===== MODEL CONFIG =====
# Central place for everything a reader might tune: model identity,
# hardware-dependent batch sizes, optimisation hyperparameters, split sizes,
# the label list and the metadata block written into the report.
MODEL_ID = "EuroBERT/EuroBERT-210m"
MODEL_SHORT_NAME = "eurobert_210m"
MODEL_TYPE = "fine-tuned"

# Maximum sequence length in tokens used for truncation
MAX_LENGTH = 2048

# Batch sizes depend on the detected GPU memory. Each tier is a
# (train batch, eval batch, gradient accumulation steps) triple; in every tier
# train batch * accumulation steps equals 16, so the effective batch size does
# not change with the hardware.
import torch

_has_gpu = torch.cuda.is_available()
if not _has_gpu:
    _batch_plan = (2, 8, 8)           # CPU: same values as the smallest GPU tier
else:
    _gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    if _gpu_mem >= 40:                # A100 (40/80 GB)
        _batch_plan = (8, 32, 2)
    elif _gpu_mem >= 20:              # L4 (24 GB)
        _batch_plan = (4, 16, 4)
    else:                             # T4 (16 GB)
        _batch_plan = (2, 8, 8)
BATCH_SIZE_TRAIN, BATCH_SIZE_EVAL, GRADIENT_ACCUMULATION_STEPS = _batch_plan

if _has_gpu:
    print(f"GPU erkannt: {torch.cuda.get_device_name(0)} ({_gpu_mem:.1f} GB)")
    print(f"  -> Batch Size Train: {BATCH_SIZE_TRAIN}, Eval: {BATCH_SIZE_EVAL}, Grad Accum: {GRADIENT_ACCUMULATION_STEPS}")
else:
    print("WARNUNG: Keine GPU erkannt! Training wird SEHR langsam sein.")
    print("  -> In Colab: Runtime > Change runtime type > T4 GPU")

# Optimisation hyperparameters
NUM_EPOCHS = 8
LEARNING_RATE = 2e-5
WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01
# Mixed-precision default. The trainer cell picks fp16/bf16 itself from the GPU
# compute capability and does not read this constant.
FP16 = True

# Split configuration
TEST_PER_CLASS = 30   # fixed number of test articles per class
VAL_FRACTION = 0.2    # share of the remaining articles used for validation

RANDOM_SEED = 42

# Label list (must match the dataset labels exactly); the list position of a
# label becomes its class id in the label-encoding cell.
ALL_LABELS = [
    "Klima / Energie",
    "Zuwanderung",
    "Renten",
    "Soziales Gefälle",
    "AfD/Rechte",
    "Arbeitslosigkeit",
    "Wirtschaftslage",
    "Politikverdruss",
    "Gesundheitswesen, Pflege",
    "Kosten/Löhne/Preise",
    "Ukraine/Krieg/Russland",
    "Bundeswehr/Verteidigung",
    "Andere",
]

# ===== MODEL INFO (for the report) =====
MODEL_INFO = {
    "huggingface_id": MODEL_ID,
    "language": "Multilingual (inkl. Deutsch)",
    "max_tokens": MAX_LENGTH,
    "parameters": "210M",
    "notes": "EuroBERT-210m, fine-tuned for single-label classification. Mixed Precision.",
}

# Short recap of the chosen configuration
print(f"\nModell: {MODEL_ID}")
print(f"Max Length: {MAX_LENGTH}")
print(f"Effektive Batch Size: {BATCH_SIZE_TRAIN * GRADIENT_ACCUMULATION_STEPS}")
print(f"Epochen: {NUM_EPOCHS}")
print(f"Labels: {len(ALL_LABELS)} Klassen")

In [3]:
# ===== LOAD DATA & CUSTOM SPLIT =====
# Custom split logic: a fixed number of test articles per class is drawn first,
# the remaining articles are split into train/validation with stratification.
# The published train and test splits of the dataset are merged into one pool
# before re-splitting.

import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Seeds the global NumPy RNG used by the per-class shuffles below.
np.random.seed(RANDOM_SEED)

# Load the dataset
ds = load_dataset(pu.DATASET_ID)

# Combine train + test into one pool of labelled articles
train_hf = ds["train"].to_pandas()
test_hf = ds["test"].to_pandas()
all_labelled = pd.concat([train_hf, test_hf], ignore_index=True)

print(f"Gesamtpool gelabelter Artikel: {len(all_labelled)}")
print(f"Klassen im Datensatz: {all_labelled['label'].nunique()}")
print()

# The split loop only visits labels listed in ALL_LABELS, so articles with any
# other label would vanish from every split without notice. Make that visible.
unknown_labels = sorted(
    set(all_labelled["label"].dropna().unique()) - set(ALL_LABELS), key=str
)
if unknown_labels:
    n_unknown = int(all_labelled["label"].isin(unknown_labels).sum())
    print(f"WARNUNG: {n_unknown} Artikel mit Labels ausserhalb von ALL_LABELS werden ignoriert: {unknown_labels}")

# --- Step 1: test split (fixed size per class) ---
# A class needs at least twice the test quota to give up the full quota;
# smaller classes contribute half of their articles instead. Deriving the
# threshold from TEST_PER_CLASS keeps both values in sync when it is changed.
min_for_full_test = 2 * TEST_PER_CLASS

test_indices = []
rest_indices = []

for label in ALL_LABELS:
    label_mask = all_labelled["label"] == label
    label_indices = all_labelled[label_mask].index.tolist()
    n_total = len(label_indices)

    if n_total < min_for_full_test:
        n_test = n_total // 2
        print(f"  {label}: nur {n_total} Artikel -> {n_test} fuer Test (Haelfte)")
    else:
        n_test = TEST_PER_CLASS

    # In-place shuffle with the global RNG; the order of these calls determines
    # the split, so it must stay aligned with the order of ALL_LABELS.
    np.random.shuffle(label_indices)
    test_indices.extend(label_indices[:n_test])
    rest_indices.extend(label_indices[n_test:])

test_df = all_labelled.loc[test_indices].reset_index(drop=True)
rest_df = all_labelled.loc[rest_indices].reset_index(drop=True)

print(f"\nTest-Split: {len(test_df)} Artikel")
print(f"Verbleibend fuer Train/Val: {len(rest_df)} Artikel")

# --- Step 2: train/validation split (stratified) ---
# Classes with fewer than 2 remaining articles cannot be stratified and go
# to train completely.
class_counts = rest_df["label"].value_counts()
small_classes = class_counts[class_counts < 2].index.tolist()

if small_classes:
    print(f"\nKlassen mit <2 Artikeln (komplett in Train): {small_classes}")

# Slicing with the mask also covers the "no small classes" case: train_small is
# then an empty frame that keeps the column dtypes of rest_df, which avoids
# concatenating an untyped placeholder DataFrame further down.
small_mask = rest_df["label"].isin(small_classes)
train_small = rest_df[small_mask]
rest_for_split = rest_df[~small_mask]

train_main, val_df = train_test_split(
    rest_for_split,
    test_size=VAL_FRACTION,
    stratify=rest_for_split["label"],
    random_state=RANDOM_SEED,
)

train_df = pd.concat([train_main, train_small], ignore_index=True)
val_df = val_df.reset_index(drop=True)

print(f"\n{'='*50}")
print(f"  Train:      {len(train_df):>5} Artikel")
print(f"  Validation: {len(val_df):>5} Artikel")
print(f"  Test:       {len(test_df):>5} Artikel")
print(f"  Gesamt:     {len(train_df) + len(val_df) + len(test_df):>5} Artikel")
print(f"{'='*50}")

# Class distribution per split
split_overview = pd.DataFrame({
    "Train": train_df["label"].value_counts(),
    "Val": val_df["label"].value_counts(),
    "Test": test_df["label"].value_counts(),
}).fillna(0).astype(int)
split_overview["Gesamt"] = split_overview.sum(axis=1)
split_overview.loc["TOTAL"] = split_overview.sum()
print("\nKlassenverteilung:")
print(split_overview.to_string())

# Split configuration recorded in the report
split_config = {
    "dataset_id": pu.DATASET_ID,
    "split_mode": "custom_finetune",
    "test_per_class": TEST_PER_CLASS,
    "val_fraction": VAL_FRACTION,
    "random_seed": RANDOM_SEED,
    "train_size": len(train_df),
    "eval_size": len(val_df),
    "test_size": len(test_df),
    "raw_size": 0,
}

Gesamtpool gelabelter Artikel: 1921
Klassen im Datensatz: 13

  Politikverdruss: nur 18 Artikel -> 9 fuer Test (Haelfte)
  Kosten/Löhne/Preise: nur 59 Artikel -> 29 fuer Test (Haelfte)

Test-Split: 368 Artikel
Verbleibend fuer Train/Val: 1553 Artikel

  Train:       1242 Artikel
  Validation:   311 Artikel
  Test:         368 Artikel
  Gesamt:      1921 Artikel

Klassenverteilung:
                          Train  Val  Test  Gesamt
label                                             
AfD/Rechte                  136   34    30     200
Andere                      136   34    30     200
Arbeitslosigkeit             61   16    30     107
Bundeswehr/Verteidigung     117   29    30     176
Gesundheitswesen, Pflege     89   22    30     141
Klima / Energie             136   34    30     200
Kosten/Löhne/Preise          24    6    29      59
Politikverdruss               7    2     9      18
Renten                       92   23    30     145
Soziales Gefälle             32    8    30      70
Ukra

In [4]:
# ===== LABEL ENCODING =====
# Class ids are the positions of the labels in ALL_LABELS.
id2label = dict(enumerate(ALL_LABELS))
label2id = {name: class_id for class_id, name in id2label.items()}

# Add the numeric label_id column to every split (the frames are modified in place).
for split_df in (train_df, val_df, test_df):
    split_df["label_id"] = split_df["label"].map(label2id)

# Sanity check: a label missing from the mapping shows up as NaN in label_id.
for split_name, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    unmapped = split_df["label_id"].isna()
    n_missing = unmapped.sum()
    if n_missing > 0:
        print(f"WARNUNG: {n_missing} unbekannte Labels in {split_name}!")
        print(split_df[unmapped]["label"].unique())

# Print the mapping for reference
print("Label-Mapping:")
print("-" * 40)
for name, class_id in label2id.items():
    print(f"  {class_id:>2}: {name}")
print(f"\nAnzahl Klassen: {len(ALL_LABELS)}")

Label-Mapping:
----------------------------------------
   0: Klima / Energie
   1: Zuwanderung
   2: Renten
   3: Soziales Gefälle
   4: AfD/Rechte
   5: Arbeitslosigkeit
   6: Wirtschaftslage
   7: Politikverdruss
   8: Gesundheitswesen, Pflege
   9: Kosten/Löhne/Preise
  10: Ukraine/Krieg/Russland
  11: Bundeswehr/Verteidigung
  12: Andere

Anzahl Klassen: 13


In [5]:
# ===== TOKENIZATION =====
from transformers import AutoTokenizer
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)


def tokenize_fn(examples):
    """Tokenize a batch of examples, truncating each text to MAX_LENGTH tokens.

    No padding is applied here; batches are padded dynamically by
    DataCollatorWithPadding in the trainer cell.
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)


def _frame_to_dataset(frame):
    """Build a Hugging Face Dataset with the columns `text` and `labels`."""
    return Dataset.from_pandas(frame[["text", "label_id"]].rename(columns={"label_id": "labels"}))


def _tokenize_split(dataset):
    """Tokenize a dataset in batches and drop the raw `text` column."""
    return dataset.map(tokenize_fn, batched=True, remove_columns=["text"])


# DataFrames -> Hugging Face Datasets
train_dataset = _frame_to_dataset(train_df)
val_dataset = _frame_to_dataset(val_df)
test_dataset = _frame_to_dataset(test_df)

print("Tokenisiere Trainings-Daten...")
train_dataset = _tokenize_split(train_dataset)
print("Tokenisiere Validierungs-Daten...")
val_dataset = _tokenize_split(val_dataset)
print("Tokenisiere Test-Daten...")
test_dataset = _tokenize_split(test_dataset)

# Return torch tensors for the columns the model consumes
_TORCH_COLUMNS = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format("torch", columns=_TORCH_COLUMNS)

# Tokenize the first training article as a quick plausibility check
example_text = train_df["text"].iloc[0]
example_tokens = tokenizer(example_text, max_length=MAX_LENGTH, truncation=True)
print(f"\nBeispiel-Tokenisierung:")
print(f"  Textlaenge (Zeichen): {len(example_text)}")
print(f"  Token-Anzahl:         {len(example_tokens['input_ids'])}")
print(f"  Max Length:           {MAX_LENGTH}")
print(f"\nDataset-Groessen:")
print(f"  Train:      {len(train_dataset)}")
print(f"  Validation: {len(val_dataset)}")
print(f"  Test:       {len(test_dataset)}")

Tokenisiere Trainings-Daten...


Map:   0%|          | 0/1242 [00:00<?, ? examples/s]

Tokenisiere Validierungs-Daten...


Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Tokenisiere Test-Daten...


Map:   0%|          | 0/368 [00:00<?, ? examples/s]


Beispiel-Tokenisierung:
  Textlaenge (Zeichen): 2947
  Token-Anzahl:         886
  Max Length:           2048

Dataset-Groessen:
  Train:      1242
  Validation: 311
  Test:       368


In [None]:
# ===== MODELL INITIALISIEREN =====
import torch
from transformers import AutoModelForSequenceClassification, AutoConfig

# EuroBERT-Fix: Das custom modeling_eurobert.py setzt rope_type="default" wenn
# config.rope_scaling null ist.
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS

# Robust stand-in for the "default" RoPE init function. It is registered
# unconditionally further down so that a re-run of this cell (e.g. after an
# error) always ends up with this version.
def _default_rope_init(config, device=None, **kwargs):
    """Compute the inverse RoPE frequencies from a model config.

    Args:
        config: Model config. Reads `hidden_size` and `num_attention_heads`,
            and optionally `rope_theta`, `rope_parameters`, `head_dim` and
            `partial_rotary_factor`.
        device: Device the frequency tensor is moved to (None keeps it where
            it was created).
        **kwargs: Accepted and ignored, so extra arguments passed by the
            caller do not raise.

    Returns:
        Tuple `(inv_freq, 1.0)`; `inv_freq` is a float tensor with one entry
        per pair of rotary dimensions. The constant 1.0 is presumably the
        attention scaling factor expected by transformers' RoPE init
        interface — confirm against `modeling_rope_utils`.
    """
    # Prefer the explicit attribute. Newer transformers releases may keep the
    # value inside a `rope_parameters` dict instead — NOTE(review): confirm
    # for the installed version.
    base = getattr(config, "rope_theta", None)
    if base is None:
        rope_parameters = getattr(config, "rope_parameters", None)
        if isinstance(rope_parameters, dict):
            base = rope_parameters.get("rope_theta")
    if base is None:
        # A wrong base changes the positional encoding relative to pretraining
        # without raising any error, so the fallback must not be silent.
        import warnings

        warnings.warn(
            "rope_theta nicht in der Config gefunden -> Fallback auf 10000.0. "
            "Bitte pruefen, ob das zum vortrainierten Modell passt.",
            stacklevel=2,
        )
        base = 10000.0

    # Both attributes are optional and may be present but set to None.
    partial_rotary_factor = getattr(config, "partial_rotary_factor", None)
    if partial_rotary_factor is None:
        partial_rotary_factor = 1.0
    head_dim = getattr(config, "head_dim", None)
    if head_dim is None:
        head_dim = config.hidden_size // config.num_attention_heads

    dim = int(head_dim * partial_rotary_factor)
    exponents = torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim
    inv_freq = 1.0 / (base ** exponents)
    return inv_freq, 1.0

# Register the patched initializer under the "default" key (overwrites any
# existing entry).
ROPE_INIT_FUNCTIONS["default"] = _default_rope_init
print("ROPE_INIT_FUNCTIONS gepatcht: 'default' gesetzt/aktualisiert.")

# Load the config (trust_remote_code is needed for the custom EuroBertConfig)
# and attach the classification head metadata. num_labels is assigned before
# the two label maps, as in the original order.
config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
config.num_labels = len(ALL_LABELS)
config.id2label = id2label
config.label2id = label2id

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    config=config,
    ignore_mismatched_sizes=True,
)

# Count parameters: (element count, trainable?) per parameter tensor
param_sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)

# Move the model to the GPU when one is available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

print(f"Modell: {MODEL_ID}")
print(f"Parameter gesamt:     {total_params:>12,}")
print(f"Parameter trainierbar:{trainable_params:>12,}")
print(f"Device: {device}")

gpu_info = pu.get_gpu_info()
print(f"GPU: {gpu_info['gpu_name']} ({gpu_info['gpu_vram_gb']} GB)")

In [7]:
# ===== METRICS-FUNKTION =====
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

def compute_metrics(eval_pred):
    """Compute the evaluation metrics reported by the Trainer.

    Args:
        eval_pred: Pair `(logits, labels)`; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        Dict with accuracy, balanced accuracy and macro/micro averaged F1,
        precision and recall. Undefined scores (division by zero) count as 0.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    def averaged(scorer, mode):
        # Shared call shape for the averaged sklearn scorers.
        return scorer(labels, preds, average=mode, zero_division=0)

    scores = {}
    scores["accuracy"] = accuracy_score(labels, preds)
    scores["balanced_accuracy"] = balanced_accuracy_score(labels, preds)
    scores["f1_macro"] = averaged(f1_score, "macro")
    scores["f1_micro"] = averaged(f1_score, "micro")
    scores["precision_macro"] = averaged(precision_score, "macro")
    scores["precision_micro"] = averaged(precision_score, "micro")
    scores["recall_macro"] = averaged(recall_score, "macro")
    scores["recall_micro"] = averaged(recall_score, "micro")
    return scores

print("compute_metrics definiert.")

compute_metrics definiert.


In [None]:
# ===== SET UP TRAINER =====
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding
import torch

OUTPUT_DIR = "/content/eurobert_finetune_output"

has_cuda = torch.cuda.is_available()

# Mixed precision: BF16 for compute capability >= 8 (Ampere or newer, e.g.
# L4/A100), FP16 for older GPUs (T4 = 7.5), none without a GPU.
use_fp16 = False
use_bf16 = False
if not has_cuda:
    print("WARNUNG: Keine GPU gefunden. Kein Mixed Precision moeglich.")
else:
    gpu_cap = torch.cuda.get_device_capability()
    use_bf16 = gpu_cap[0] >= 8
    use_fp16 = not use_bf16
    precision_name = "BF16" if use_bf16 else "FP16"
    print(f"GPU Compute Capability {gpu_cap[0]}.{gpu_cap[1]} -> {precision_name} aktiviert")

# Gradient checkpointing and the fused AdamW optimizer are only used on CUDA.
use_grad_ckpt = has_cuda
use_fused = has_cuda
optim_name = "adamw_torch" if not use_fused else "adamw_torch_fused"

# Bring the model in line with the chosen checkpointing mode; it may still be
# enabled on the model object from an earlier run of this cell.
if use_grad_ckpt:
    model.gradient_checkpointing_enable()
else:
    if hasattr(model, "gradient_checkpointing_disable"):
        model.gradient_checkpointing_disable()
    if hasattr(model, "gradient_checkpointing"):
        model.gradient_checkpointing = False

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    seed=RANDOM_SEED,
    report_to="none",
    # optimisation
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    optim=optim_name,
    # batching and memory
    per_device_train_batch_size=BATCH_SIZE_TRAIN,
    per_device_eval_batch_size=BATCH_SIZE_EVAL,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    gradient_checkpointing=use_grad_ckpt,
    fp16=use_fp16,
    bf16=use_bf16,
    # evaluate and checkpoint once per epoch, keep the best macro-F1 model
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # logging
    logging_strategy="steps",
    logging_steps=50,
    # data loading
    dataloader_num_workers=4 if has_cuda else 0,
    dataloader_pin_memory=has_cuda,
)

# Pads every batch to its longest sequence (tokenization applied no padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

print("\nTrainer konfiguriert.")
print(f"  Output:     {OUTPUT_DIR}")
print(f"  Epochen:    {NUM_EPOCHS}")
print(f"  LR:         {LEARNING_RATE}")
print(f"  Batch Size: {BATCH_SIZE_TRAIN} x {GRADIENT_ACCUMULATION_STEPS} = {BATCH_SIZE_TRAIN * GRADIENT_ACCUMULATION_STEPS} effektiv")
print(f"  Early Stop: patience=2, metric=f1_macro")
print(f"  Gradient Checkpointing: {use_grad_ckpt}")
print(f"  FP16: {use_fp16} | BF16: {use_bf16}")
print(f"  Optimizer:  {optim_name}")
print(f"  DataLoader Workers: {training_args.dataloader_num_workers}")

In [None]:
# ===== START TRAINING =====
# The whole training run is wrapped in the pipeline timer so that the duration
# is available to the report and summary cells.
timer = pu.ExperimentTimer()
with timer:
    train_result = trainer.train()

print(f"\nTraining abgeschlossen: {timer.duration_formatted}")
print("\nTraining-Metriken:")
for metric_name, metric_value in train_result.metrics.items():
    print(f"  {metric_name}: {metric_value}")



Epoch,Training Loss,Validation Loss


In [None]:
# ===== PLOT TRAINING HISTORY =====
import matplotlib.pyplot as plt

log_history = trainer.state.log_history

# Training loss comes from the step logs (entries that carry a "loss" key)
train_logs = [entry for entry in log_history if "loss" in entry]
train_steps = [entry["step"] for entry in train_logs]
train_losses = [entry["loss"] for entry in train_logs]

# Evaluation metrics come from the per-epoch logs (entries with "eval_loss")
eval_logs = [entry for entry in log_history if "eval_loss" in entry]


def _eval_series(key):
    """Return one value per evaluation log for `key`, 0 where it is missing."""
    return [entry.get(key, 0) for entry in eval_logs]


eval_epochs = [entry["epoch"] for entry in eval_logs]
eval_losses = [entry["eval_loss"] for entry in eval_logs]
eval_f1_macro = _eval_series("eval_f1_macro")
eval_accuracy = _eval_series("eval_accuracy")
eval_balanced_acc = _eval_series("eval_balanced_accuracy")
eval_precision_macro = _eval_series("eval_precision_macro")
eval_recall_macro = _eval_series("eval_recall_macro")
# Step at which each evaluation was logged, to share the x axis with the train loss
eval_steps_approx = _eval_series("step")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))

# Plot 1: loss over steps
ax1.plot(train_steps, train_losses, alpha=0.4, label="Train Loss (Steps)", color="steelblue")
ax1.plot(eval_steps_approx, eval_losses, "o-", label="Eval Loss (Epoch)", color="orangered", linewidth=2)
ax1.set(xlabel="Steps", ylabel="Loss", title="Training vs. Eval Loss")
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot 2: evaluation metrics over epochs
metric_curves = [
    (eval_f1_macro, "o-", "F1 Macro", {"linewidth": 2}),
    (eval_accuracy, "s-", "Accuracy", {"linewidth": 2}),
    (eval_balanced_acc, "^-", "Balanced Accuracy", {"linewidth": 2}),
    (eval_precision_macro, "d-", "Precision Macro", {"linewidth": 1.5, "alpha": 0.7}),
    (eval_recall_macro, "v-", "Recall Macro", {"linewidth": 1.5, "alpha": 0.7}),
]
for curve_values, curve_marker, curve_label, line_kwargs in metric_curves:
    ax2.plot(eval_epochs, curve_values, curve_marker, label=curve_label, **line_kwargs)
ax2.set(xlabel="Epoch", ylabel="Score", title="Eval-Metriken pro Epoch")
ax2.legend(loc="lower right")
ax2.grid(True, alpha=0.3)
ax2.set_ylim(0, 1)

plt.suptitle("EuroBERT-210m Fine-Tuning Verlauf", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.show()

# The same numbers as a table
print("\nEval-Metriken pro Epoch:")
print(f"{'Epoch':>6} {'Loss':>8} {'F1 Macro':>10} {'Accuracy':>10} {'Bal. Acc.':>10}")
print("-" * 50)
for entry in eval_logs:
    print(f"{entry['epoch']:>6.0f} {entry['eval_loss']:>8.4f} {entry.get('eval_f1_macro', 0):>10.4f} {entry.get('eval_accuracy', 0):>10.4f} {entry.get('eval_balanced_accuracy', 0):>10.4f}")

In [None]:
# ===== EVALUATION ON THE TEST SET =====
print("Evaluation auf Test-Set mit bestem Modell...")


def _ids_to_labels(class_ids):
    """Map numeric class ids back to their label names via id2label."""
    return [id2label[class_id] for class_id in class_ids]


test_preds = trainer.predict(test_dataset)
pred_ids = np.argmax(test_preds.predictions, axis=-1)
pred_labels = _ids_to_labels(pred_ids)
true_labels = _ids_to_labels(test_preds.label_ids)

# Standardised evaluation via pipeline_utils
metrics = pu.evaluate(true_labels, pred_labels, labels=ALL_LABELS, experiment_name="test")
pu.print_metrics(metrics, "Fine-Tuned EuroBERT-210m — Test Split")

In [None]:
# ===== CONFUSION MATRIX =====
# Drawn by pipeline_utils from the test-set metrics computed in the previous cell.
pu.plot_confusion_matrix(metrics, title="Fine-Tuned EuroBERT-210m (Test)")

In [None]:
# ===== PER-CLASS METRICS BARPLOT =====
import matplotlib.pyplot as plt
import numpy as np

# One row per class, sorted so the class with the lowest F1 comes first
pc_df = metrics["per_class_df"].sort_values("F1", ascending=True)

fig, ax = plt.subplots(figsize=(12, 8))

y_pos = np.arange(len(pc_df))
bar_height = 0.25

# Three bars per class, stacked around the class tick: (column, offset, colour)
bar_specs = [
    ("Precision", -bar_height, "#2196F3"),
    ("Recall", 0, "#FF9800"),
    ("F1", bar_height, "#4CAF50"),
]
for column, offset, colour in bar_specs:
    ax.barh(y_pos + offset, pc_df[column], bar_height, label=column, color=colour, alpha=0.85)

ax.set_yticks(y_pos)
ax.set_yticklabels(pc_df["Label"])
ax.set_xlabel("Score")
ax.set_title("Per-Class Metrics: Fine-Tuned EuroBERT-210m", fontsize=13, fontweight="bold")
ax.legend(loc="lower right")
ax.set_xlim(0, 1.05)
ax.grid(axis="x", alpha=0.3)

# Write the F1 value next to each F1 bar
for bar_y, f1_value in zip(y_pos + bar_height, pc_df["F1"]):
    ax.text(f1_value + 0.01, bar_y, f"{f1_value:.2f}", va="center", fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# ===== GENERATE REPORT =====

# Mixed-precision mode that was actually handed to the Trainer. The trainer
# cell picks BF16, FP16 or no mixed precision depending on the GPU, so the
# config-cell constant FP16 does not describe the real run.
if training_args.bf16:
    precision_mode = "BF16"
elif training_args.fp16:
    precision_mode = "FP16"
else:
    precision_mode = "FP32"

# Compare against None explicitly: a best metric of 0.0 is a valid value.
best_metric = trainer.state.best_metric

# Training parameters recorded in the report
training_params = {
    "num_epochs": NUM_EPOCHS,
    "learning_rate": LEARNING_RATE,
    "batch_size_train": BATCH_SIZE_TRAIN,
    "batch_size_eval": BATCH_SIZE_EVAL,
    "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
    "effective_batch_size": BATCH_SIZE_TRAIN * GRADIENT_ACCUMULATION_STEPS,
    "warmup_ratio": WARMUP_RATIO,
    "weight_decay": WEIGHT_DECAY,
    "max_length": MAX_LENGTH,
    "fp16": bool(training_args.fp16),
    "bf16": bool(training_args.bf16),
    "early_stopping_patience": 2,
    "best_checkpoint": trainer.state.best_model_checkpoint,
    "best_metric": round(best_metric, 4) if best_metric is not None else None,
}

# Extract the model config by hand (per the original note, extract_model_config
# expects a pipeline object). Only the fields present in the config are copied.
config_dict = model.config.to_dict()
model_config = {}
for field in ["architectures", "model_type", "hidden_size", "num_hidden_layers",
              "num_attention_heads", "vocab_size", "max_position_embeddings"]:
    if field in config_dict:
        value = config_dict[field]
        # A list of architectures is flattened to a single string.
        if field == "architectures" and isinstance(value, list):
            value = value[0] if len(value) == 1 else ", ".join(value)
        model_config[field] = value

# Train/validation ratio in percent, derived from VAL_FRACTION
val_percent = round(VAL_FRACTION * 100)
train_percent = 100 - val_percent

report_path = pu.generate_report(
    model_name=f"{MODEL_SHORT_NAME}_finetune",
    model_type=MODEL_TYPE,
    metrics=metrics,
    timer=timer,
    model_info=MODEL_INFO,
    candidate_labels=ALL_LABELS,
    hypothesis_template=None,
    split_config=split_config,
    label_mapping={l: l for l in ALL_LABELS},
    model_config=model_config,
    training_params=training_params,
    experiment_notes=(
        f"Fine-Tuned EuroBERT-210m auf {len(train_df)} Trainingsartikeln. "
        f"Max Length {MAX_LENGTH}, {precision_mode}, EarlyStoppingCallback(patience=2). "
        f"Custom Split: {TEST_PER_CLASS} Test/Klasse, Rest {train_percent}/{val_percent} Train/Val."
    ),
)

print(f"\nReport gespeichert: {report_path}")

In [None]:
# ===== UPLOAD MODEL TO HUGGING FACE (optional) =====
UPLOAD = False  # set to True to upload

if not UPLOAD:
    print("Upload uebersprungen (UPLOAD = False).")
    print("Setze UPLOAD = True und fuehre die Zelle erneut aus, um das Modell hochzuladen.")
else:
    # Push the trained model and tokenizer to a private Hub repository
    REPO_NAME = "Zorryy/eurobert-210m-news-classifier-v1"
    url = pu.upload_model_to_hub(
        model=trainer.model,
        tokenizer=tokenizer,
        repo_name=REPO_NAME,
        private=True,
        training_params=training_params,
    )
    print(f"Model uploaded: {url}")

In [None]:
# ===== SUMMARY =====
# Collect the key facts of the run and print them as one framed block.
separator = "=" * 70
summary_lines = [
    separator,
    f"  Model:           {MODEL_ID}",
    f"  Type:            {MODEL_TYPE}",
    f"  Train:           {len(train_df)} Artikel",
    f"  Validation:      {len(val_df)} Artikel",
    f"  Test:            {len(test_df)} Artikel",
    f"  Epochen:         {NUM_EPOCHS} (best: {trainer.state.best_model_checkpoint})",
    f"  F1 Macro:        {metrics['f1_macro']:.4f}",
    f"  F1 Weighted:     {metrics['f1_weighted']:.4f}",
    f"  Accuracy:        {metrics['accuracy']:.4f}",
    f"  Precision Macro: {metrics['precision_macro']:.4f}",
    f"  Recall Macro:    {metrics['recall_macro']:.4f}",
    f"  Dauer:           {timer.duration_formatted}",
    f"  Report:          {report_path}",
    separator,
]
print("\n".join(summary_lines))