# Fine-Tuning: newsBERT_euro_210m
## EuroBERT-210M mit optimierten HPT-Parametern

Trainiert EuroBERT-210M mit den besten Hyperparametern aus Phase 1 des Hyperparameter-Tunings (HPT)
und pusht das Modell als `Zorryy/newsBERT_euro_210m` auf den Hugging Face Hub.

**Voraussetzung:** GPU-Runtime (L4 empfohlen), `HF_TOKEN` in Colab Secrets.

In [None]:
# === SETUP ===
# Prepares the Colab runtime: fetches the project repository, installs the
# Python dependencies, mounts Google Drive, imports the shared pipeline
# helpers and logs in to the Hugging Face Hub.
import os, sys

# Local checkout path of the thesis repository.
REPO = "/content/news_articles_classification_thesis"
if not os.path.exists(REPO):
    # No checkout yet: clone the repository into REPO.
    !git clone https://github.com/ZorbeyOezcan/news_articles_classification_thesis.git {REPO}
else:
    # Checkout exists: quietly pull the latest changes.
    !cd {REPO} && git pull -q

# Install the libraries used by this notebook (quiet mode).
!pip install -q transformers[sentencepiece] datasets huggingface_hub \
    scikit-learn matplotlib seaborn tqdm pandas accelerate evaluate

from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount("/content/drive", force_remount=False)

# Make the repository's pipeline directory importable.
PIPELINE_DIR = f"{REPO}/Python/classification_pipeline"
if PIPELINE_DIR not in sys.path:
    sys.path.insert(0, PIPELINE_DIR)

import importlib
import pipeline_utils as pu
# Reload so that a re-run of this cell picks up the current module source
# instead of a previously imported copy.
importlib.reload(pu)

from huggingface_hub import login
from google.colab import userdata
# Authenticate with the token stored under the Colab secret "HF_TOKEN".
login(token=userdata.get("HF_TOKEN"))

print(f"Reports-Ordner: {pu.REPORTS_DIR}")
print("Setup abgeschlossen.")

In [None]:
# === MODEL & HPT CONFIG ===
# Central configuration: model identifiers, the tuned hyperparameters and the
# fixed settings for the data split and training.
import torch
import numpy as np

MODEL_ID = "EuroBERT/EuroBERT-210m"  # base checkpoint that gets fine-tuned
MODEL_SHORT_NAME = "newsBERT_euro_210m"
MODEL_TYPE = "fine-tuned"
REPO_NAME = "Zorryy/newsBERT_euro_210m"  # Hugging Face repo

# ----- Best HPT parameters (phase 1) -----
LEARNING_RATE = 3.76e-05
LR_SCHEDULER_TYPE = "linear"
NUM_EPOCHS = 13
BATCH_SIZE_TRAIN = 4
WARMUP_RATIO = 0.0880168
WEIGHT_DECAY = 0.0439249
LABEL_SMOOTHING = 0.0320202

# ----- Fixed parameters -----
MAX_LENGTH = 2048  # tokenizer truncation limit (max_length)
EFFECTIVE_BATCH_SIZE = 16  # target: train batch size x gradient accumulation steps
# Integer division: assumes BATCH_SIZE_TRAIN divides EFFECTIVE_BATCH_SIZE evenly.
GRADIENT_ACCUMULATION_STEPS = EFFECTIVE_BATCH_SIZE // BATCH_SIZE_TRAIN  # = 4
RANDOM_SEED = 42
TEST_PER_CLASS = 30  # test articles drawn per class in the custom split
VAL_FRACTION = 0.2  # share of the non-test data held out for validation
EARLY_STOPPING_PATIENCE = 3  # passed to EarlyStoppingCallback

# ----- GPU-adaptive settings -----
# Derives precision flags and the eval batch size from the attached GPU.
if not torch.cuda.is_available():
    raise RuntimeError("GPU benoetigt! Bitte Colab Runtime aendern.")

# Compute capability of the current device and total memory of device 0,
# the latter scaled from bytes by 1e9.
_gpu_cap = torch.cuda.get_device_capability()
_gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9

# Mixed precision: BF16 from compute capability 8 upwards (Ampere+, e.g.
# L4/A100), FP16 otherwise (e.g. T4). Exactly one of the two flags is True.
USE_BF16 = _gpu_cap[0] >= 8
USE_FP16 = not USE_BF16

# Eval batch size by memory tier; the first threshold that is met wins and
# 8 is the fallback for small GPUs.
BATCH_SIZE_EVAL = next(
    (batch for min_gb, batch in ((40, 32), (20, 16)) if _gpu_mem >= min_gb),
    8,
)

OPTIM = "adamw_torch_fused"

# Labels
# Target classes; the position in this list is the integer class id.
ALL_LABELS = [
    "Klima / Energie",
    "Zuwanderung",
    "Renten",
    "Soziales Gef\u00e4lle",
    "AfD/Rechte",
    "Arbeitslosigkeit",
    "Wirtschaftslage",
    "Politikverdruss",
    "Gesundheitswesen, Pflege",
    "Kosten/L\u00f6hne/Preise",
    "Ukraine/Krieg/Russland",
    "Bundeswehr/Verteidigung",
    "Andere",
]

# Model description that is handed to the report generator.
MODEL_INFO = dict(
    huggingface_id=MODEL_ID,
    language="Multilingual (inkl. Deutsch)",
    max_tokens=MAX_LENGTH,
    parameters="210M",
    notes="EuroBERT-210m fine-tuned mit HPT-optimierten Parametern. Pushed als newsBERT_euro_210m.",
)

# Console overview of the detected hardware and the active hyperparameters.
print(f"GPU: {torch.cuda.get_device_name(0)} ({_gpu_mem:.1f} GB, CC {_gpu_cap[0]}.{_gpu_cap[1]})")
print(f"  BF16={USE_BF16}, FP16={USE_FP16}, Eval Batch={BATCH_SIZE_EVAL}")
print("\nHPT-Parameter:")
for _caption, _value in (
    ("LR:", LEARNING_RATE),
    ("Scheduler:", LR_SCHEDULER_TYPE),
    ("Epochs:", NUM_EPOCHS),
    ("Batch (train):", BATCH_SIZE_TRAIN),
    ("Grad Accum:", GRADIENT_ACCUMULATION_STEPS),
    ("Effektive BS:", EFFECTIVE_BATCH_SIZE),
    ("Warmup Ratio:", WARMUP_RATIO),
    ("Weight Decay:", WEIGHT_DECAY),
    ("Label Smoothing:", LABEL_SMOOTHING),
    ("Early Stopping:", f"patience={EARLY_STOPPING_PATIENCE}"),
):
    # Captions are left-aligned in a 17-character column, as in the table layout.
    print(f"  {_caption:<17}{_value}")

In [None]:
# === LOAD DATA & CUSTOM SPLIT ===
# Pools the dataset's labelled "train" and "test" splits and re-splits them
# into a per-class test set plus a stratified train/validation split.
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Seeds NumPy's global RNG so the per-class shuffles below are reproducible.
np.random.seed(RANDOM_SEED)

ds = load_dataset(pu.DATASET_ID)
train_hf = ds["train"].to_pandas()
test_hf = ds["test"].to_pandas()
all_labelled = pd.concat([train_hf, test_hf], ignore_index=True)

print(f"Gesamtpool gelabelter Artikel: {len(all_labelled)}")
print(f"Klassen: {all_labelled['label'].nunique()}\n")

# Rows whose label is not in ALL_LABELS never enter any split below; report
# them instead of dropping them silently.
n_unknown = int((~all_labelled["label"].isin(ALL_LABELS)).sum())
if n_unknown:
    print(f"  Warnung: {n_unknown} Artikel mit Labels ausserhalb von ALL_LABELS werden ignoriert.")

# Test split (fixed number of articles per class)
test_indices = []
rest_indices = []

# A class needs at least twice TEST_PER_CLASS articles to give up the full
# test quota; smaller classes contribute half of their articles instead.
min_for_full_test = 2 * TEST_PER_CLASS

for label in ALL_LABELS:
    label_mask = all_labelled["label"] == label
    label_indices = all_labelled[label_mask].index.tolist()
    n_total = len(label_indices)

    if n_total < min_for_full_test:
        n_test = n_total // 2
        print(f"  {label}: nur {n_total} Artikel -> {n_test} fuer Test")
    else:
        n_test = TEST_PER_CLASS

    # In-place shuffle with the seeded global RNG; the first n_test indices
    # go to the test set, the remainder to the train/val pool.
    np.random.shuffle(label_indices)
    test_indices.extend(label_indices[:n_test])
    rest_indices.extend(label_indices[n_test:])

test_df = all_labelled.loc[test_indices].reset_index(drop=True)
rest_df = all_labelled.loc[rest_indices].reset_index(drop=True)

# Train/val split (stratified)
# Classes with fewer than 2 remaining articles cannot be stratified; they are
# kept out of the split and added to the training set afterwards.
class_counts = rest_df["label"].value_counts()
small_classes = class_counts[class_counts < 2].index.tolist()

# A boolean mask yields a correctly typed (possibly empty) frame, unlike an
# untyped pd.DataFrame(columns=...) placeholder.
small_mask = rest_df["label"].isin(small_classes)
train_small = rest_df[small_mask]
rest_for_split = rest_df[~small_mask]

train_main, val_df = train_test_split(
    rest_for_split, test_size=VAL_FRACTION,
    stratify=rest_for_split["label"], random_state=RANDOM_SEED,
)
# Concatenate only non-empty parts so an empty frame cannot influence the
# resulting dtypes.
train_parts = [train_main] if train_small.empty else [train_main, train_small]
train_df = pd.concat(train_parts, ignore_index=True)
val_df = val_df.reset_index(drop=True)

print(f"\n{'='*50}")
print(f"  Train:      {len(train_df):>5}")
print(f"  Validation: {len(val_df):>5}")
print(f"  Test:       {len(test_df):>5}")
print(f"  Gesamt:     {len(train_df) + len(val_df) + len(test_df):>5}")
print(f"{'='*50}")

# Split description that is handed to the report generator.
split_config = {
    "dataset_id": pu.DATASET_ID,
    "split_mode": "custom_finetune",
    "test_per_class": TEST_PER_CLASS,
    "val_fraction": VAL_FRACTION,
    "random_seed": RANDOM_SEED,
    "train_size": len(train_df),
    "eval_size": len(val_df),
    "test_size": len(test_df),
}

In [None]:
# === LABEL ENCODING ===
# Builds both directions of the label <-> integer id mapping from the order of
# ALL_LABELS and attaches a "label_id" column to every split.
id2label = dict(enumerate(ALL_LABELS))
label2id = {label: idx for idx, label in id2label.items()}

_splits = (("Train", train_df), ("Val", val_df), ("Test", test_df))

for _, _df in _splits:
    _df["label_id"] = _df["label"].map(label2id)

# Labels missing from the mapping become NaN in "label_id"; fail loudly then.
for name, _df in _splits:
    assert _df["label_id"].notna().all(), f"Unbekannte Labels in {name}!"

print("Label-Mapping:")
for label, idx in label2id.items():
    print(f"  {idx:>2}: {label}")

In [None]:
# === TOKENIZATION ===
from transformers import AutoTokenizer
from datasets import Dataset

# Tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

def tokenize_fn(examples):
    """Tokenize the "text" field of a batch, truncated to MAX_LENGTH.

    No padding is requested here; sequences keep their individual length.
    Returns whatever the tokenizer call returns for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=MAX_LENGTH)
    return encoded

def _build_torch_dataset(frame):
    """Turn a split DataFrame into a tokenized dataset in torch format.

    Keeps "text" and "label_id", renames the latter to "labels", tokenizes in
    batches (dropping the raw text) and exposes the three model columns as
    torch tensors.
    """
    model_input = frame[["text", "label_id"]].rename(columns={"label_id": "labels"})
    encoded = Dataset.from_pandas(model_input).map(
        tokenize_fn, batched=True, remove_columns=["text"]
    )
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return encoded

print("Tokenisiere...")
train_dataset = _build_torch_dataset(train_df)
val_dataset = _build_torch_dataset(val_df)
test_dataset = _build_torch_dataset(test_df)

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")

In [None]:
# === MODELL INITIALISIEREN ===
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoConfig
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS

# EuroBERT RoPE fix
def _default_rope_init(config, device=None, **kwargs):
    """Compute the inverse frequencies for plain (unscaled) rotary embeddings.

    Args:
        config: Model config. Reads ``rope_theta`` (default 10000.0),
            ``partial_rotary_factor`` (default 1.0) and ``head_dim``. If
            ``head_dim`` is missing or None it is derived as
            ``hidden_size // num_attention_heads``.
        device: Device the frequency tensor is moved to (None keeps it where
            it was created).
        **kwargs: Accepted and ignored, so callers may pass extra arguments.

    Returns:
        Tuple ``(inv_freq, 1.0)``: a float tensor with ``dim // 2`` entries,
        where ``dim = int(head_dim * partial_rotary_factor)``, and the
        constant factor 1.0.
    """
    base = getattr(config, "rope_theta", 10000.0)
    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)

    # Resolve head_dim lazily: hidden_size/num_attention_heads are only needed
    # as a fallback, and an explicit head_dim=None also triggers the fallback.
    head_dim = getattr(config, "head_dim", None)
    if head_dim is None:
        head_dim = config.hidden_size // config.num_attention_heads

    dim = int(head_dim * partial_rotary_factor)
    # Exponents 0, 2/dim, 4/dim, ... computed from an integer range.
    exponents = torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim
    inv_freq = 1.0 / (base ** exponents)
    return inv_freq, 1.0

# Register the function above under the "default" key of transformers' RoPE
# init table; an existing entry for that key is replaced.
ROPE_INIT_FUNCTIONS["default"] = _default_rope_init

# Load the model
# Seed torch's RNG first so the random initialisation below is reproducible.
torch.manual_seed(RANDOM_SEED)

# Configure the classification head: one output per label plus both
# id <-> label mappings.
config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
config.num_labels = len(ALL_LABELS)
config.id2label = id2label
config.label2id = label2id

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID, config=config,
    ignore_mismatched_sizes=True, trust_remote_code=True,
)

# Stabilise the classifier head (smaller init -> prevents logit overflow in BF16)
# named_modules() yields dotted paths, so only top-level modules named exactly
# "dense" or "classifier" are re-initialised here.
for name, module in model.named_modules():
    if name in ("dense", "classifier") and isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, mean=0.0, std=0.002)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

device = torch.device("cuda")
model = model.to(device)

# Total parameter count over all parameters, trainable or not.
total_params = sum(p.numel() for p in model.parameters())
print(f"Modell: {MODEL_ID}")
print(f"Parameter: {total_params:,}")
print(f"Device: {device}")
print(f"GPU: {pu.get_gpu_info()['gpu_name']}")

In [None]:
# === METRICS ===
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score,
    f1_score, precision_score, recall_score,
)

def compute_metrics(eval_pred):
    """Compute classification metrics from a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last logits axis. Returns a
    dict with accuracy, balanced accuracy, macro and weighted F1, and macro
    precision/recall. Averaged scores use ``zero_division=0``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    def _averaged(score_fn, average):
        # Shared call shape of the averaged sklearn scores.
        return score_fn(labels, preds, average=average, zero_division=0)

    return {
        "accuracy": accuracy_score(labels, preds),
        "balanced_accuracy": balanced_accuracy_score(labels, preds),
        "f1_macro": _averaged(f1_score, "macro"),
        "f1_weighted": _averaged(f1_score, "weighted"),
        "precision_macro": _averaged(precision_score, "macro"),
        "recall_macro": _averaged(recall_score, "macro"),
    }

print("compute_metrics definiert.")

In [None]:
# === TRAINER ===
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding

OUTPUT_DIR = "/content/newsbert_finetune_output"

# All TrainingArguments in one mapping, grouped by concern.
_training_settings = dict(
    output_dir=OUTPUT_DIR,
    # Optimisation
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    per_device_train_batch_size=BATCH_SIZE_TRAIN,
    per_device_eval_batch_size=BATCH_SIZE_EVAL,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    label_smoothing_factor=LABEL_SMOOTHING,
    # Precision / memory
    fp16=USE_FP16,
    bf16=USE_BF16,
    gradient_checkpointing=False,  # EuroBERT custom modeling: loss=0 with grad ckpt
    optim=OPTIM,
    group_by_length=True,
    # Evaluation, checkpointing, logging
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    save_total_limit=3,
    report_to="none",
    # Reproducibility and data loading
    seed=RANDOM_SEED,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
)
training_args = TrainingArguments(**_training_settings)

_collator = DataCollatorWithPadding(tokenizer=tokenizer)
_early_stopping = EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=_collator,
    compute_metrics=compute_metrics,
    callbacks=[_early_stopping],
)

# Console overview of the effective trainer configuration.
print("\n".join([
    "Trainer konfiguriert.",
    f"  Epochs:       {NUM_EPOCHS}",
    f"  LR:           {LEARNING_RATE} ({LR_SCHEDULER_TYPE})",
    f"  Batch:        {BATCH_SIZE_TRAIN} x {GRADIENT_ACCUMULATION_STEPS} = {EFFECTIVE_BATCH_SIZE} effektiv",
    f"  Warmup:       {WARMUP_RATIO}",
    f"  Weight Decay: {WEIGHT_DECAY}",
    f"  Label Smooth: {LABEL_SMOOTHING}",
    f"  BF16={USE_BF16}, FP16={USE_FP16}",
    f"  Early Stop:   patience={EARLY_STOPPING_PATIENCE}",
]))

In [None]:
# === TRAINING ===
# Runs the fine-tuning inside the pipeline's timer context and prints the
# metrics returned by the trainer.
timer = pu.ExperimentTimer()
with timer:
    train_result = trainer.train()

print(f"\nTraining abgeschlossen: {timer.duration_formatted}")
print("\nTraining-Metriken:")
for key in train_result.metrics:
    val = train_result.metrics[key]
    print(f"  {key}: {val}")

In [None]:
# === TRAINING HISTORY ===
# Plots train/eval loss over steps and the eval metrics over epochs, then
# prints the per-epoch eval table.
import matplotlib.pyplot as plt

log_history = trainer.state.log_history

# Training log entries carry "loss", evaluation entries carry "eval_loss".
_train_logs = [entry for entry in log_history if "loss" in entry]
train_steps = [entry["step"] for entry in _train_logs]
train_losses = [entry["loss"] for entry in _train_logs]

eval_logs = [entry for entry in log_history if "eval_loss" in entry]
eval_epochs, eval_steps, eval_losses, eval_f1, eval_acc = [], [], [], [], []
for entry in eval_logs:
    eval_epochs.append(entry["epoch"])
    eval_steps.append(entry.get("step", 0))
    eval_losses.append(entry["eval_loss"])
    # Metrics missing from a log entry are shown as 0.
    eval_f1.append(entry.get("eval_f1_macro", 0))
    eval_acc.append(entry.get("eval_accuracy", 0))

fig, (loss_ax, metric_ax) = plt.subplots(1, 2, figsize=(18, 6))

# Left panel: loss curves over optimisation steps.
loss_ax.plot(train_steps, train_losses, alpha=0.4, label="Train Loss", color="steelblue")
loss_ax.plot(eval_steps, eval_losses, "o-", label="Eval Loss", color="orangered", linewidth=2)
loss_ax.set_xlabel("Steps")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training vs. Eval Loss")
loss_ax.legend()
loss_ax.grid(True, alpha=0.3)

# Right panel: eval metrics per epoch (plot order fixes the default colours).
metric_ax.plot(eval_epochs, eval_f1, "o-", label="F1 Macro", linewidth=2)
metric_ax.plot(eval_epochs, eval_acc, "s-", label="Accuracy", linewidth=2)
metric_ax.set_xlabel("Epoch")
metric_ax.set_ylabel("Score")
metric_ax.set_title("Eval-Metriken pro Epoch")
metric_ax.legend(loc="lower right")
metric_ax.grid(True, alpha=0.3)
metric_ax.set_ylim(0, 1)

plt.suptitle("newsBERT_euro_210m Fine-Tuning Verlauf", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.show()

print("\nEval-Metriken pro Epoch:")
print(f"{'Epoch':>6} {'Loss':>8} {'F1 Macro':>10} {'Accuracy':>10}")
print("-" * 40)
for _epoch, _loss, _f1, _acc in zip(eval_epochs, eval_losses, eval_f1, eval_acc):
    print(f"{_epoch:>6.0f} {_loss:>8.4f} {_f1:>10.4f} {_acc:>10.4f}")

In [None]:
# === EVALUATION ON TEST SET ===
# Predicts on the held-out test set with the trainer's current model, maps
# class ids back to label names and computes the metrics via the pipeline
# helper.
print("Evaluation auf Test-Set mit bestem Modell...")

test_preds = trainer.predict(test_dataset)
# Predicted class = argmax over the last axis of the predictions.
pred_ids = np.argmax(test_preds.predictions, axis=-1)
pred_labels = [id2label[i] for i in pred_ids]
true_labels = [id2label[i] for i in test_preds.label_ids]

metrics = pu.evaluate(
    true_labels, pred_labels,
    labels=ALL_LABELS, experiment_name="test",
)
# The title previously contained a mis-decoded em dash; the escape keeps the
# source ASCII-only, like the label strings elsewhere in this notebook.
pu.print_metrics(metrics, "newsBERT_euro_210m \u2014 Test Split")

In [None]:
# === CONFUSION MATRIX ===
# Hands the test-set metrics to the pipeline's confusion-matrix plot helper.
pu.plot_confusion_matrix(metrics, title="newsBERT_euro_210m (Test)")

In [None]:
# === PER-CLASS METRICS ===
# Grouped horizontal bars (precision, recall, F1) per class, sorted by F1.
pc_df = metrics["per_class_df"].copy().sort_values("F1", ascending=True)

fig, ax = plt.subplots(figsize=(12, 8))
y_pos = np.arange(len(pc_df))
bar_h = 0.25

# One bar series per metric, shifted vertically around the class position.
for _shift, _column, _colour in (
    (-bar_h, "Precision", "#2196F3"),
    (0, "Recall", "#FF9800"),
    (bar_h, "F1", "#4CAF50"),
):
    ax.barh(y_pos + _shift, pc_df[_column], bar_h, label=_column, color=_colour, alpha=0.85)

ax.set_yticks(y_pos)
ax.set_yticklabels(pc_df["Label"])
ax.set_xlabel("Score")
ax.set_title("Per-Class Metrics: newsBERT_euro_210m", fontsize=13, fontweight="bold")
ax.legend(loc="lower right")
ax.set_xlim(0, 1.05)
ax.grid(axis="x", alpha=0.3)

# Annotate each F1 bar with its value just right of the bar end.
for _y, _f1 in zip(y_pos, pc_df["F1"]):
    ax.text(_f1 + 0.01, _y + bar_h, f"{_f1:.2f}", va="center", fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# === REPORT ===
# Collects the training parameters and a subset of the model config, then
# writes the experiment report via the pipeline helper.

# Compare against None explicitly: a best metric of exactly 0.0 is a valid
# value and must not be reported as missing.
_best_metric = trainer.state.best_metric

training_params = {
    "num_epochs": NUM_EPOCHS,
    "learning_rate": LEARNING_RATE,
    "lr_scheduler_type": LR_SCHEDULER_TYPE,
    "batch_size_train": BATCH_SIZE_TRAIN,
    "batch_size_eval": BATCH_SIZE_EVAL,
    "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
    "effective_batch_size": EFFECTIVE_BATCH_SIZE,
    "warmup_ratio": WARMUP_RATIO,
    "weight_decay": WEIGHT_DECAY,
    "label_smoothing_factor": LABEL_SMOOTHING,
    "max_length": MAX_LENGTH,
    "bf16": USE_BF16,
    "fp16": USE_FP16,
    "early_stopping_patience": EARLY_STOPPING_PATIENCE,
    "best_checkpoint": trainer.state.best_model_checkpoint,
    "best_metric": round(_best_metric, 4) if _best_metric is not None else None,
}

# Copy only the architecture fields of interest; fields absent from the
# config are skipped.
config_dict = model.config.to_dict()
model_config = {}
for field in ["architectures", "model_type", "hidden_size", "num_hidden_layers",
              "num_attention_heads", "vocab_size", "max_position_embeddings"]:
    if field in config_dict:
        val = config_dict[field]
        # "architectures" is a list: unwrap a single entry, join several.
        if field == "architectures" and isinstance(val, list):
            val = val[0] if len(val) == 1 else ", ".join(val)
        model_config[field] = val

report_path = pu.generate_report(
    model_name=MODEL_SHORT_NAME,
    model_type=MODEL_TYPE,
    metrics=metrics,
    timer=timer,
    model_info=MODEL_INFO,
    candidate_labels=ALL_LABELS,
    split_config=split_config,
    # Identity mapping: every label maps to itself.
    label_mapping={l: l for l in ALL_LABELS},
    model_config=model_config,
    training_params=training_params,
    experiment_notes=(
        f"newsBERT_euro_210m: EuroBERT-210M fine-tuned mit HPT-optimierten Parametern. "
        f"LR={LEARNING_RATE}, Epochs={NUM_EPOCHS}, Label Smoothing={LABEL_SMOOTHING}. "
        f"Custom Split: {TEST_PER_CLASS} Test/Klasse, Rest {int((1-VAL_FRACTION)*100)}/{int(VAL_FRACTION*100)} Train/Val."
    ),
)
print(f"Report gespeichert: {report_path}")

In [None]:
# === PUSH MODEL TO HUGGING FACE ===
# Hands the trainer's current model and the tokenizer to the pipeline's upload
# helper, targeting REPO_NAME as a private repo. The helper's return value is
# printed as the model URL.
url = pu.upload_model_to_hub(
    model=trainer.model,
    tokenizer=tokenizer,
    repo_name=REPO_NAME,
    private=True,
    training_params=training_params,
)
print(f"\nModell hochgeladen: {url}")

In [None]:
# === SUMMARY ===
# Final overview of the run: data sizes, test metrics, duration and output
# locations, framed by two separator lines.
_summary_rows = (
    ("Model:", REPO_NAME),
    ("Base:", MODEL_ID),
    ("Train:", f"{len(train_df)} Artikel"),
    ("Validation:", f"{len(val_df)} Artikel"),
    ("Test:", f"{len(test_df)} Artikel"),
    ("Epochs:", f"{NUM_EPOCHS} (best: {trainer.state.best_model_checkpoint})"),
    ("F1 Macro:", f"{metrics['f1_macro']:.4f}"),
    ("F1 Weighted:", f"{metrics['f1_weighted']:.4f}"),
    ("Accuracy:", f"{metrics['accuracy']:.4f}"),
    ("Precision Macro:", f"{metrics['precision_macro']:.4f}"),
    ("Recall Macro:", f"{metrics['recall_macro']:.4f}"),
    ("Dauer:", timer.duration_formatted),
    ("Report:", report_path),
    ("HuggingFace:", url),
)

_separator = "=" * 70
print(_separator)
for _caption, _value in _summary_rows:
    # Captions are left-aligned in a 17-character column.
    print(f"  {_caption:<17}{_value}")
print(_separator)

In [None]:
# === CLEANUP ===
# Removes the local checkpoint directory and releases cached GPU memory.
import gc, shutil

if os.path.exists(OUTPUT_DIR):
    # Best effort: errors while deleting are ignored.
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    print("Checkpoint-Dateien geloescht.")

if torch.cuda.is_available():
    # Collect garbage first so that tensors which are only kept alive by
    # reference cycles are freed, then hand the now-unused cached blocks back
    # to the driver. The reverse order would leave that memory in the cache.
    gc.collect()
    torch.cuda.empty_cache()
    free_mem = torch.cuda.mem_get_info()[0] / 1e9
    print(f"GPU VRAM frei: {free_mem:.1f} GB")

print("\nFertig. Runtime kann beendet werden.")