# Cleanlab Label Quality Analyse
## K-Fold CV mit EuroBERT-210M → Out-of-Fold Predictions → Cleanlab

Dieses Notebook fuehrt Option 1 (Confident Learning) durch:
1. 3-Fold Stratified CV mit EuroBERT-210M + HPT-Params
2. Out-of-Fold `pred_probs` sammeln (jedes Sample wird genau einmal vorhergesagt, und zwar von dem Fold-Modell, das es nicht im Training gesehen hat)
3. Cleanlab: `find_label_issues()`, `health_summary()`, `get_label_quality_scores()`
4. Verdaechtige Samples inspizieren

**Voraussetzung:** GPU-Runtime (L4 empfohlen), `HF_TOKEN` in Colab Secrets.

In [None]:
# === SETUP ===
import os, sys

# Local checkout location of the thesis repository inside the Colab VM.
REPO = "/content/news_articles_classification_thesis"
# Clone on first run, otherwise pull the latest changes into the existing checkout.
if not os.path.exists(REPO):
    !git clone https://github.com/ZorbeyOezcan/news_articles_classification_thesis.git {REPO}
else:
    !cd {REPO} && git pull -q

# Install the Python packages used by the cells of this notebook.
!pip install -q transformers[sentencepiece] datasets huggingface_hub \
    scikit-learn matplotlib seaborn tqdm pandas accelerate evaluate cleanlab

# Mount Google Drive; the report cell writes its output below /content/drive.
from google.colab import drive
drive.mount("/content/drive", force_remount=False)

# Put the repository's pipeline directory on sys.path (once) so that
# pipeline_utils can be imported.
PIPELINE_DIR = f"{REPO}/Python/classification_pipeline"
if PIPELINE_DIR not in sys.path:
    sys.path.insert(0, PIPELINE_DIR)

# Reload after import so that re-running this cell picks up the module as it
# currently is on disk (e.g. after the git pull above).
import importlib
import pipeline_utils as pu
importlib.reload(pu)

# Log in to the Hugging Face Hub with the HF_TOKEN stored in Colab's user secrets.
from huggingface_hub import login
from google.colab import userdata
login(token=userdata.get("HF_TOKEN"))

print("Setup abgeschlossen.")

In [None]:
# === KONFIGURATION ===
import torch
import numpy as np

# --- Model and cross-validation setup ---------------------------------------
MODEL_ID = "EuroBERT/EuroBERT-210m"
MAX_LENGTH = 2048
RANDOM_SEED = 42
N_FOLDS = 3

# --- HPT-optimised hyperparameters -------------------------------------------
# (per the original note: identical to train_newsbert_euro_210m.ipynb)
LEARNING_RATE = 3.76e-05
LR_SCHEDULER_TYPE = "linear"
NUM_EPOCHS = 13
BATCH_SIZE_TRAIN = 4
WARMUP_RATIO = 0.0880168
WEIGHT_DECAY = 0.0439249
LABEL_SMOOTHING = 0.0320202
EFFECTIVE_BATCH_SIZE = 16
# Accumulate gradients so that the per-device batch reaches the effective batch size.
GRADIENT_ACCUMULATION_STEPS = EFFECTIVE_BATCH_SIZE // BATCH_SIZE_TRAIN
EARLY_STOPPING_PATIENCE = 3

# --- Label space: list position is the integer class id ----------------------
ALL_LABELS = [
    "Klima / Energie",
    "Zuwanderung",
    "Renten",
    "Soziales Gef\u00e4lle",
    "AfD/Rechte",
    "Arbeitslosigkeit",
    "Wirtschaftslage",
    "Politikverdruss",
    "Gesundheitswesen, Pflege",
    "Kosten/L\u00f6hne/Preise",
    "Ukraine/Krieg/Russland",
    "Bundeswehr/Verteidigung",
    "Andere",
]

id2label = dict(enumerate(ALL_LABELS))
label2id = {name: class_id for class_id, name in id2label.items()}

# --- GPU detection -----------------------------------------------------------
# The notebook refuses to continue without CUDA.
if not torch.cuda.is_available():
    raise RuntimeError("GPU benoetigt!")

_gpu_cap = torch.cuda.get_device_capability()
_gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9

# Use bf16 on compute capability >= 8, otherwise fall back to fp16.
USE_BF16 = _gpu_cap[0] >= 8
USE_FP16 = not USE_BF16

# Scale the evaluation batch size with the reported device memory (in 1e9 bytes).
if _gpu_mem >= 40:
    BATCH_SIZE_EVAL = 32
elif _gpu_mem >= 20:
    BATCH_SIZE_EVAL = 16
else:
    BATCH_SIZE_EVAL = 8

print(f"GPU: {torch.cuda.get_device_name(0)} ({_gpu_mem:.1f} GB)")
print(f"BF16={USE_BF16}, Eval Batch={BATCH_SIZE_EVAL}")
print(f"{N_FOLDS}-Fold CV mit {len(ALL_LABELS)} Klassen")

In [None]:
# === DATEN LADEN ===
import pandas as pd
from datasets import load_dataset

# Seed NumPy's global RNG.
np.random.seed(RANDOM_SEED)

# Pool the dataset's "train" and "test" splits into one labelled frame with a
# fresh 0..n-1 index; the cross-validation below re-splits this pool itself.
ds = load_dataset(pu.DATASET_ID)
train_hf = ds["train"].to_pandas()
test_hf = ds["test"].to_pandas()
all_labelled = pd.concat([train_hf, test_hf], ignore_index=True)

# Map label names to integer class ids. Labels that are missing from label2id
# map to NaN; report them by name instead of failing with a bare assertion.
all_labelled["label_id"] = all_labelled["label"].map(label2id)
_unmapped_rows = all_labelled["label_id"].isna()
if _unmapped_rows.any():
    _unknown_labels = sorted(all_labelled.loc[_unmapped_rows, "label"].astype(str).unique())
    raise ValueError(
        f"{int(_unmapped_rows.sum())} Artikel mit unbekanntem Label "
        f"(nicht in ALL_LABELS): {_unknown_labels}"
    )

print(f"Gesamtpool: {len(all_labelled)} Artikel, {all_labelled['label'].nunique()} Klassen")
print(all_labelled["label"].value_counts().to_string())

In [None]:
# === TOKENIZER + ROPE FIX ===
from transformers import AutoTokenizer
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS

# Load the tokenizer that belongs to MODEL_ID. trust_remote_code=True is passed
# here as it is for the config and model loaded later in this notebook.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

def _default_rope_init(config, device=None, **kwargs):
    """Compute inverse frequencies for plain (unscaled) rotary position embeddings.

    Args:
        config: Model configuration object. Reads ``rope_theta`` (default
            10000.0), ``partial_rotary_factor`` (default 1.0) and ``head_dim``.
            When ``head_dim`` is absent or ``None`` it is derived as
            ``hidden_size // num_attention_heads``; those two attributes are
            only required in that case.
        device: Device on which the frequency tensor is placed (``None``
            leaves it on the default device).
        **kwargs: Accepted and ignored.

    Returns:
        A tuple ``(inv_freq, attention_factor)``: ``inv_freq`` is a float
        tensor with one entry per even index in ``range(0, dim, 2)`` where
        ``dim = int(head_dim * partial_rotary_factor)``; ``attention_factor``
        is always ``1.0``.
    """
    base = getattr(config, "rope_theta", 10000.0)
    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)

    # Resolve head_dim lazily: a config may define the attribute but leave it
    # as None, and a config that sets head_dim need not expose hidden_size.
    head_dim = getattr(config, "head_dim", None)
    if head_dim is None:
        head_dim = config.hidden_size // config.num_attention_heads

    dim = int(head_dim * partial_rotary_factor)
    exponents = torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim
    inv_freq = 1.0 / (base ** exponents)
    return inv_freq, 1.0

# Register _default_rope_init under the "default" key, overwriting any entry
# that is already present in the registry.
# NOTE(review): presumably needed because the installed transformers version
# has no "default" RoPE initializer that the EuroBERT code looks up - confirm.
ROPE_INIT_FUNCTIONS["default"] = _default_rope_init

def tokenize_fn(examples):
    """Tokenize the ``"text"`` entries of *examples*, truncated to ``MAX_LENGTH``.

    Intended for ``Dataset.map(..., batched=True)``; no padding argument is
    passed to the tokenizer here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=MAX_LENGTH)

# Status message: reached only if the tokenizer loaded and the RoPE entry was registered.
print("Tokenizer + RoPE Fix OK.")

In [None]:
# === K-FOLD CV: OUT-OF-FOLD PREDICTIONS SAMMELN ===
import gc
import shutil
import torch.nn as nn
from scipy.special import softmax
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification, AutoConfig,
    TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding,
)

# Batch collator built from the tokenizer; the same instance is handed to the
# Trainer of every fold.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Return the macro-averaged F1 score for a ``(logits, labels)`` pair.

    The predicted class of each sample is the arg-max over the last logits
    axis. The result is a dict with the single key ``"f1_macro"``, which is
    the name the training arguments use for best-model selection.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    macro_f1 = f1_score(labels, predicted_ids, average="macro", zero_division=0)
    return {"f1_macro": macro_f1}

def create_model(seed=42):
    """Build a fresh sequence-classification model for one fold.

    Seeds torch's RNG, loads the ``MODEL_ID`` config and weights with the
    notebook's label mapping, and re-initialises the top-level ``dense`` and
    ``classifier`` linear layers (weights ~ N(0, 0.002), biases zero).

    Args:
        seed: Seed passed to ``torch.manual_seed`` before anything is created.

    Returns:
        The loaded model with its re-initialised head layers.
    """
    torch.manual_seed(seed)

    # num_labels is set before the explicit label maps, as in the original order.
    cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
    cfg.num_labels = len(ALL_LABELS)
    cfg.id2label = id2label
    cfg.label2id = label2id

    net = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        config=cfg,
        ignore_mismatched_sizes=True,
        trust_remote_code=True,
    )

    # Only modules whose full name is exactly "dense" or "classifier" are touched.
    head_names = ("dense", "classifier")
    for module_name, layer in net.named_modules():
        if module_name not in head_names or not isinstance(layer, nn.Linear):
            continue
        nn.init.normal_(layer.weight, mean=0.0, std=0.002)
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)

    return net

# Stratified K-fold split over the pooled data. The splits are materialised as
# a list so that the (train_idx, val_idx) pairs are fixed before training starts.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)
fold_indices = list(skf.split(all_labelled, all_labelled["label_id"]))

# Out-of-fold buffers, indexed by row position in all_labelled.
# -1 in oof_predicted marks "not predicted yet" and is checked after the loop.
oof_pred_probs = np.zeros((len(all_labelled), len(ALL_LABELS)))
oof_predicted = np.full(len(all_labelled), -1, dtype=int)

print(f"Starte {N_FOLDS}-Fold CV...\n")

for fold_idx, (train_idx, val_idx) in enumerate(fold_indices):
    print(f"{'='*60}")
    print(f"Fold {fold_idx + 1}/{N_FOLDS}: Train={len(train_idx)}, Val={len(val_idx)}")
    print(f"{'='*60}")

    # Build the fold's datasets. preserve_index=False keeps the pandas index of
    # the iloc subset from being added as an extra "__index_level_0__" column.
    fold_train_df = all_labelled.iloc[train_idx]
    fold_val_df = all_labelled.iloc[val_idx]

    train_ds = Dataset.from_pandas(
        fold_train_df[["text", "label_id"]].rename(columns={"label_id": "labels"}),
        preserve_index=False,
    )
    val_ds = Dataset.from_pandas(
        fold_val_df[["text", "label_id"]].rename(columns={"label_id": "labels"}),
        preserve_index=False,
    )

    train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
    val_ds = val_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
    train_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    val_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # Fresh model per fold, with a fold-specific seed.
    model = create_model(seed=RANDOM_SEED + fold_idx)
    model = model.to("cuda")

    fold_output_dir = f"/content/cleanlab_cv_tmp/fold_{fold_idx}"

    training_args = TrainingArguments(
        output_dir=fold_output_dir,
        num_train_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        lr_scheduler_type=LR_SCHEDULER_TYPE,
        per_device_train_batch_size=BATCH_SIZE_TRAIN,
        per_device_eval_batch_size=BATCH_SIZE_EVAL,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_ratio=WARMUP_RATIO,
        weight_decay=WEIGHT_DECAY,
        label_smoothing_factor=LABEL_SMOOTHING,
        fp16=USE_FP16,
        bf16=USE_BF16,
        gradient_checkpointing=False,
        optim="adamw_torch_fused",
        group_by_length=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        logging_strategy="steps",
        logging_steps=25,
        report_to="none",
        seed=RANDOM_SEED + fold_idx,
        dataloader_num_workers=4,
        dataloader_pin_memory=True,
    )

    # NOTE(review): val_ds is used for early stopping and best-checkpoint
    # selection AND is the fold that gets predicted below, so the out-of-fold
    # probabilities are not fully independent of the held-out labels. If that
    # matters for the cleanlab analysis, carve an inner validation split out of
    # train_idx and use it as eval_dataset instead.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)],
    )

    # Training
    trainer.train()

    # Out-of-fold predictions with the model the trainer holds after training
    # (load_best_model_at_end=True is set above).
    preds = trainer.predict(val_ds)
    # predictions may be a tuple if the model returns more than the logits;
    # in that case the logits are taken to be the first element.
    fold_logits = preds.predictions[0] if isinstance(preds.predictions, tuple) else preds.predictions
    fold_probs = softmax(fold_logits, axis=-1)
    fold_preds = np.argmax(fold_probs, axis=-1)

    fold_f1 = f1_score(preds.label_ids, fold_preds, average="macro", zero_division=0)
    print(f"  Fold {fold_idx + 1} F1 Macro: {fold_f1:.4f}")

    # Store at the original row positions of this fold's validation samples.
    oof_pred_probs[val_idx] = fold_probs
    oof_predicted[val_idx] = fold_preds

    # Cleanup: drop the references first, collect garbage (including reference
    # cycles), and only then release the CUDA cache so the freed blocks are
    # actually returned before the next fold allocates its model.
    del trainer, model, training_args, train_ds, val_ds
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # ignore_errors=True also covers the case that the directory does not exist.
    shutil.rmtree(fold_output_dir, ignore_errors=True)

# Every sample must have been in exactly one validation fold; a remaining -1
# would otherwise only surface later as a KeyError in id2label.
_n_unpredicted = int((oof_predicted < 0).sum())
if _n_unpredicted:
    raise RuntimeError(f"{_n_unpredicted} Samples ohne Out-of-Fold Prediction.")

# Overall macro F1 across all out-of-fold predictions.
overall_f1 = f1_score(all_labelled["label_id"].values, oof_predicted, average="macro", zero_division=0)
print(f"\nGesamt OOF F1 Macro: {overall_f1:.4f}")
print("Out-of-Fold Predictions gesammelt.")

In [None]:
# === CLEANLAB: LABEL ISSUES FINDEN ===
from cleanlab.filter import find_label_issues
from cleanlab.rank import get_label_quality_scores
from cleanlab.dataset import health_summary

# Given labels as an integer array, aligned with the rows of oof_pred_probs.
labels = all_labelled["label_id"].values

# Per-sample label quality scores
# (per the original note: 0 = very suspicious, 1 = confidently correct).
quality_scores = get_label_quality_scores(labels, oof_pred_probs)
all_labelled["label_quality_score"] = quality_scores

# Because return_indices_ranked_by is set, find_label_issues returns the
# positional indices of the flagged samples (ranked), not a boolean mask.
issue_indices = find_label_issues(
    labels=labels,
    pred_probs=oof_pred_probs,
    return_indices_ranked_by="self_confidence",
)
# Backward-compatible alias: the original cell exposed these indices as "issue_mask".
issue_mask = issue_indices

print(f"Cleanlab hat {len(issue_indices)} potenzielle Label-Fehler gefunden.")
print(f"Das sind {len(issue_indices) / len(all_labelled) * 100:.1f}% aller gelabelten Daten.\n")

# Build the flag positionally so it does not depend on the DataFrame's index
# labels happening to equal the row positions.
_is_issue = np.zeros(len(all_labelled), dtype=bool)
_is_issue[issue_indices] = True
all_labelled["is_label_issue"] = _is_issue

# Human-readable out-of-fold prediction for every sample.
all_labelled["predicted_label"] = [id2label[int(i)] for i in oof_predicted]

In [None]:
# === CLEANLAB: HEALTH SUMMARY ===
# Heading and separator, then cleanlab's dataset-level summary for the given
# labels and the out-of-fold probabilities.
print("Dataset Health Summary:", "=" * 60, sep="\n")
health = health_summary(labels, oof_pred_probs, class_names=ALL_LABELS)
# Bare expression so that the notebook renders the returned object.
health

In [None]:
# === LABEL ISSUES PRO KLASSE ===
# Per-class table: sample count, flagged count, flagged share, mean quality score.
# Header and rows share the same column widths (35 | 5 | 6 | 6 | 10, separated
# by two spaces) so that the values line up under their headings.
print("Label Issues pro Klasse:")
print(f"{'Klasse':35s}  {'Total':>5s}  {'Issues':>6s}  {'%':>6s}  {'Mean Score':>10s}")
print("-" * 70)

# Iterate ALL_LABELS (not the observed labels) so that every class gets a row,
# in a fixed order, even if it has no samples.
for lbl in ALL_LABELS:
    mask = all_labelled["label"] == lbl
    n_total = int(mask.sum())
    n_issues = int(all_labelled.loc[mask, "is_label_issue"].sum())
    mean_score = all_labelled.loc[mask, "label_quality_score"].mean()
    pct = n_issues / n_total * 100 if n_total > 0 else 0.0
    print(f"{lbl:35s}  {n_total:5d}  {n_issues:6d}  {pct:5.1f}%  {mean_score:10.3f}")

In [None]:
# === CONFUSION: GIVEN LABEL vs. PREDICTED LABEL ===
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Restrict to the samples that were flagged as label issues.
issue_df = all_labelled[all_labelled["is_label_issue"]].copy()

if issue_df.empty:
    print("Keine Label Issues gefunden.")
else:
    # Text table including row/column totals.
    cm = pd.crosstab(
        issue_df["label"],
        issue_df["predicted_label"],
        margins=True,
    )
    print("Confusion: Given Label (Zeilen) vs. Predicted Label (Spalten)")
    print("Nur fuer als Issue identifizierte Samples:\n")
    print(cm.to_string())

    # Heatmap of the same cross-tabulation, without the totals.
    cm_numeric = pd.crosstab(issue_df["label"], issue_df["predicted_label"])
    fig, ax = plt.subplots(figsize=(14, 10))
    sns.heatmap(cm_numeric, annot=True, fmt="d", cmap="YlOrRd", ax=ax)
    ax.set_title("Label Issues: Given vs. Predicted Label", fontsize=13)
    ax.set(xlabel="Predicted Label", ylabel="Given Label")
    plt.tight_layout()
    plt.show()

In [None]:
# === LABEL QUALITY SCORE VERTEILUNG ===
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# Computed once; used for the marker line, its legend entry and the statistics below.
median_score = np.median(quality_scores)

# Left: histogram of all label quality scores with the median marked.
ax1.hist(quality_scores, bins=50, edgecolor="black", alpha=0.7, color="steelblue")
ax1.axvline(x=median_score, color="red", linestyle="--", label=f"Median: {median_score:.3f}")
ax1.set_xlabel("Label Quality Score")
ax1.set_ylabel("Anzahl")
ax1.set_title("Verteilung Label Quality Scores")
ax1.legend()

# Right: one box per class. The frame is built directly from the label column
# and the score array (both aligned with the rows of all_labelled) instead of
# appending one dict per sample; the boxplot groups by label itself.
score_df = pd.DataFrame({
    "label": all_labelled["label"].to_numpy(),
    "score": np.asarray(quality_scores),
})
score_df.boxplot(column="score", by="label", ax=ax2, rot=45, grid=False)
ax2.set_title("Label Quality Score pro Klasse")
ax2.set_xlabel("")
ax2.set_ylabel("Quality Score")
# Clear the figure-level title that the grouped boxplot call sets.
fig.suptitle("")

plt.tight_layout()
plt.show()

print(f"Score Statistiken:")
print(f"  Mean:   {quality_scores.mean():.3f}")
print(f"  Median: {median_score:.3f}")
print(f"  Min:    {quality_scores.min():.3f}")
print(f"  <0.5:   {(quality_scores < 0.5).sum()} Artikel")

In [None]:
# === TOP VERDAECHTIGE SAMPLES INSPIZIEREN ===
# Flagged samples, lowest quality score first.
issue_df_sorted = all_labelled[all_labelled["is_label_issue"]].sort_values(
    "label_quality_score", ascending=True
)

N_SHOW = 30  # number of samples to display

print(f"Top {min(N_SHOW, len(issue_df_sorted))} verdaechtigste Label Issues:\n")

for i, (_, row) in enumerate(issue_df_sorted.head(N_SHOW).iterrows()):
    print(f"{'='*70}")
    print(f"[{i+1}]  Score: {row['label_quality_score']:.3f}")
    print(f"  Given Label:     {row['label']}")
    print(f"  Predicted Label: {row['predicted_label']}")
    # Metadata columns are read with a fallback, the same way 'domain' already
    # was, so a dataset without 'id' or 'headline' does not abort the listing.
    print(f"  ID: {row.get('id', 'N/A')}  |  Domain: {row.get('domain', 'N/A')}")
    print(f"  Headline: {row.get('headline', 'N/A')}")
    print(f"  Text: {str(row['text'])[:200]}...")
    print()

In [None]:
# === EINZELNE KLASSE INSPIZIEREN ===
INSPECT_CLASS = "Andere"  # <-- change here

# A misspelled class name would otherwise just report "0 issues".
if INSPECT_CLASS not in ALL_LABELS:
    raise ValueError(f"Unbekannte Klasse '{INSPECT_CLASS}'. Erlaubt: {ALL_LABELS}")

# Flagged samples whose given label is INSPECT_CLASS, lowest score first.
class_issues = all_labelled[
    (all_labelled["label"] == INSPECT_CLASS) &
    (all_labelled["is_label_issue"])
].sort_values("label_quality_score")

print(f"Label Issues in '{INSPECT_CLASS}': {len(class_issues)}\n")

for i, (_, row) in enumerate(class_issues.iterrows()):
    print(f"[{i+1}] Score={row['label_quality_score']:.3f}  -> Predicted: {row['predicted_label']}")
    # 'headline' is read with a fallback so a missing column does not abort the listing.
    print(f"    Headline: {row.get('headline', 'N/A')}")
    print(f"    Text: {str(row['text'])[:150]}...")
    print()

In [None]:
# === REPORT SPEICHERN ===
# Target folder on the mounted Drive; created if it does not exist yet.
OUTPUT_DIR = "/content/drive/MyDrive/thesis_reports/cleanlab_analysis"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1) Full frame, including the columns added by the cleanlab cells.
full_path = os.path.join(OUTPUT_DIR, "labeled_with_cleanlab_scores.csv")
all_labelled.to_csv(full_path, index=False, encoding="utf-8")
print(f"Vollstaendiger DataFrame: {full_path}")

# 2) Flagged samples only, lowest score first, with a 200-character text preview.
issues_path = os.path.join(OUTPUT_DIR, "label_issues.csv")
issue_export = all_labelled[all_labelled["is_label_issue"]].sort_values("label_quality_score")
issue_export = issue_export.assign(text_preview=issue_export["text"].str[:200] + "...")
# Export only those of the wanted columns that the frame actually has.
export_cols = [
    col
    for col in (
        "id", "label", "predicted_label", "label_quality_score",
        "headline", "text_preview", "domain",
    )
    if col in issue_export.columns
]
issue_export[export_cols].to_csv(issues_path, index=False, encoding="utf-8")
print(f"Label Issues:            {issues_path} ({len(issue_export)} Artikel)")

# 3) Raw out-of-fold probabilities for later analysis.
oof_path = os.path.join(OUTPUT_DIR, "oof_pred_probs.npy")
np.save(oof_path, oof_pred_probs)
print(f"OOF Predictions:         {oof_path}")

In [None]:
# === SUMMARY ===
# Collect the summary lines first, then emit them in one print call.
_rule = "=" * 70
_summary_lines = [
    _rule,
    "CLEANLAB ANALYSE ZUSAMMENFASSUNG",
    _rule,
    f"  Gesamte Artikel:        {len(all_labelled)}",
    f"  OOF F1 Macro:           {overall_f1:.4f}",
    f"  Label Issues gefunden:  {all_labelled['is_label_issue'].sum()}",
    f"  Anteil Issues:          {all_labelled['is_label_issue'].mean() * 100:.1f}%",
    f"  Mean Quality Score:     {quality_scores.mean():.3f}",
    f"  Scores < 0.5:           {(quality_scores < 0.5).sum()}",
    f"\nErgebnisse gespeichert in: {OUTPUT_DIR}",
    _rule,
]
print("\n".join(_summary_lines))