# Init Data

Lädt den Datensatz von HuggingFace, übernimmt den festen Test-Split und teilt die Trainingsdaten in Train und Eval auf.
Nach Ausführung sind die Daten für alle `classify_*.ipynb` Notebooks verfügbar.

**Voraussetzung:** `init_colab.ipynb` muss vorher ausgeführt worden sein.

In [None]:
# Make the pipeline directory importable (in case init_colab did not already do it)
import sys

PIPELINE_DIR = "/content/news_articles_classification_thesis/Python/classification_pipeline"

# Prepend only once so re-running the cell does not pile up duplicate entries.
already_on_path = PIPELINE_DIR in sys.path
if not already_on_path:
    sys.path.insert(0, PIPELINE_DIR)

In [None]:
# ===== SPLIT CONFIGURATION =====

# Mode: "percentage" or "absolute"
SPLIT_MODE = "percentage"

# Percentage mode: fraction of the train data that is held out for eval
EVAL_FRACTION = 0.2

# Absolute mode: target number of articles per class for eval, capped by the
# class size (only used when SPLIT_MODE = "absolute")
EVAL_PER_CLASS = 10

# Random seed for reproducible splits
RANDOM_SEED = 42

# ===== DATA LOADING =====

# True  = load all data incl. ~259k unlabeled raw articles (needs more RAM)
# False = load only the labelled data (train + test) — faster, less memory
LOAD_RAW = False

In [None]:
# ===== LABEL MAPPING =====
# Left:  original label as it appears in the dataset
# Right: new name (default = same as the original)
#
# Example: to rename "Ukraine/Krieg/Russland":
#   "Ukraine/Krieg/Russland": "Ukraine-Krieg",

LABEL_MAPPING = {
    "Klima / Energie": "Klima / Energie",
    "Zuwanderung": "Zuwanderung",
    "Renten": "Renten",
    "Soziales Gefälle": "Soziales Gefälle",
    "AfD/Rechte": "AfD/Rechte",
    "Arbeitslosigkeit": "Arbeitslosigkeit",
    "Wirtschaftslage": "Wirtschaftslage",
    "Politikverdruss": "Politikverdruss",
    "Gesundheitswesen, Pflege": "Gesundheitswesen, Pflege",
    "Kosten/Löhne/Preise": "Kosten/Löhne/Preise",
    "Ukraine/Krieg/Russland": "Ukraine/Krieg/Russland",
    "Bundeswehr/Verteidigung": "Bundeswehr/Verteidigung",
    "Andere": "Andere",
}

In [None]:
# HuggingFace authentication
from getpass import getpass

from huggingface_hub import login

try:
    # Preferred path: read the token via google.colab's userdata store.
    from google.colab import userdata
    hf_token = userdata.get("HF_TOKEN")
except Exception:
    # Deliberately broad best-effort fallback: any failure above (not running
    # on Colab, lookup of "HF_TOKEN" failing, ...) leads to a manual prompt.
    # getpass instead of input() so the secret token is not echoed into the
    # cell output, which would otherwise be saved along with the notebook.
    hf_token = getpass("HuggingFace Token eingeben: ").strip()

login(token=hf_token)
print("HuggingFace authentifiziert.")

In [None]:
# Load the dataset from HuggingFace
import pandas as pd
from datasets import load_dataset

DATASET_ID = "Zorryy/news_articles_2025_elections_germany"

ds = load_dataset(DATASET_ID, token=hf_token)

# Convert the labelled splits to DataFrames; the train split is divided
# into train + eval further down, the test split is used as-is.
train_full_df = ds["train"].to_pandas()
test_df = ds["test"].to_pandas()

# The unlabeled raw split is only converted on request (LOAD_RAW).
raw_df = ds["raw"].to_pandas() if LOAD_RAW else None
if raw_df is None:
    print("Raw-Daten nicht geladen (LOAD_RAW = False)")
else:
    print(f"Raw:   {len(raw_df):>7} Artikel (unlabeled) geladen")

print(f"Test:  {len(test_df):>7} Artikel (FROZEN)")
print(f"Train: {len(train_full_df):>7} Artikel (wird in Train + Eval aufgeteilt)")

In [None]:
# Label-Mapping anwenden
def apply_label_mapping(df, mapping):
    """Rename the values of the "label" column according to ``mapping``.

    Args:
        df: DataFrame to process. If it has no "label" column it is
            returned unchanged (the very same object).
        mapping: Dict of original label -> new label.

    Returns:
        A copy of ``df`` whose "label" values are replaced via ``mapping``.
        Labels without a mapping entry keep their original value; a warning
        listing them is printed.
    """
    if "label" not in df.columns:
        return df

    # Report every label present in the data that the mapping does not cover.
    unmapped = set(df["label"].unique()) - set(mapping.keys())
    if unmapped:
        print(f"  WARNUNG: Labels im Datensatz ohne Mapping: {unmapped}")

    renamed = df.copy()
    labels = renamed["label"]
    # map() yields NaN for unmapped labels; fillna() restores the originals.
    renamed["label"] = labels.map(mapping).fillna(labels)
    return renamed

# Rename labels in every loaded split. apply_label_mapping returns a frame
# without a "label" column unchanged, so raw_df only needs the None check.
test_df = apply_label_mapping(test_df, LABEL_MAPPING)
train_full_df = apply_label_mapping(train_full_df, LABEL_MAPPING)
if raw_df is not None:
    raw_df = apply_label_mapping(raw_df, LABEL_MAPPING)

# Report which labels were actually renamed (identity entries are skipped)
remapped = {}
for orig, new in LABEL_MAPPING.items():
    if orig != new:
        remapped[orig] = new

if not remapped:
    print("Keine Labels umbenannt (alle Defaults).")
else:
    print("Label-Mapping Änderungen:")
    for orig, new in remapped.items():
        print(f"  {orig} → {new}")

In [None]:
# Train/Eval split
#
# Divides train_full_df into train_df and eval_df according to SPLIT_MODE:
#   "percentage": stratified split, EVAL_FRACTION of the data goes to eval.
#   "absolute":   up to EVAL_PER_CLASS randomly sampled articles per class go
#                 to eval, but at least one article per class stays in train.
# Any other SPLIT_MODE raises ValueError.
from sklearn.model_selection import train_test_split

if SPLIT_MODE == "percentage":
    # Classes with <2 articles cannot be split in a stratified way, so they
    # are kept out of the split and go entirely into train.
    class_counts = train_full_df["label"].value_counts()
    small_classes = class_counts[class_counts < 2].index.tolist()
    mask_small = train_full_df["label"].isin(small_classes)
    splittable_df = train_full_df[~mask_small]

    if small_classes:
        print(f"Klassen mit <2 Artikeln (gehen komplett in Train): {small_classes}")

    train_df, eval_df = train_test_split(
        splittable_df,
        test_size=EVAL_FRACTION,
        stratify=splittable_df["label"],
        random_state=RANDOM_SEED,
    )
    if small_classes:
        train_df = pd.concat([train_df, train_full_df[mask_small]])
elif SPLIT_MODE == "absolute":
    if EVAL_PER_CLASS < 1:
        # Fail early with a clear message instead of an opaque pd.concat error.
        raise ValueError(f"EVAL_PER_CLASS muss >= 1 sein, ist aber {EVAL_PER_CLASS}.")

    eval_parts = []
    train_parts = []
    for label in train_full_df["label"].unique():
        class_df = train_full_df[train_full_df["label"] == label]
        # Cap the eval sample so that at least one article of the class
        # remains in train; otherwise a class no larger than EVAL_PER_CLASS
        # would end up in eval only, with nothing left to train on.
        n = min(EVAL_PER_CLASS, len(class_df) - 1)
        if n < 1:
            print(f"  {label}: nur {len(class_df)} Artikel -> komplett in Train")
            train_parts.append(class_df)
            continue
        if n < EVAL_PER_CLASS:
            print(f"  {label}: nur {len(class_df)} Artikel -> {n} in Eval, Rest in Train")
        eval_sample = class_df.sample(n=n, random_state=RANDOM_SEED)
        eval_parts.append(eval_sample)
        train_parts.append(class_df.drop(eval_sample.index))

    # pd.concat raises on an empty list; fall back to an empty frame that
    # still carries the columns of the source data.
    empty_df = train_full_df.iloc[0:0]
    eval_df = pd.concat(eval_parts) if eval_parts else empty_df
    train_df = pd.concat(train_parts) if train_parts else empty_df
else:
    raise ValueError(f"Ungültiger SPLIT_MODE: {SPLIT_MODE}. Nutze 'percentage' oder 'absolute'.")

# Both modes leave the original row labels behind; renumber from 0.
train_df = train_df.reset_index(drop=True)
eval_df = eval_df.reset_index(drop=True)

print(f"\nTrain: {len(train_df):>6} Artikel")
print(f"Eval:  {len(eval_df):>6} Artikel")
print(f"Test:  {len(test_df):>6} Artikel (FROZEN)")

In [None]:
# Store the splits in the runtime cache
import importlib
import pipeline_utils as pu

# Re-execute the module so the current version on disk is the one in use.
importlib.reload(pu)

# Summary of how the splits were produced; the setting that does not belong
# to the active SPLIT_MODE is recorded as None.
split_config = dict(
    dataset_id=DATASET_ID,
    split_mode=SPLIT_MODE,
    eval_fraction=None if SPLIT_MODE != "percentage" else EVAL_FRACTION,
    eval_per_class=None if SPLIT_MODE != "absolute" else EVAL_PER_CLASS,
    random_seed=RANDOM_SEED,
    load_raw=LOAD_RAW,
    train_size=len(train_df),
    eval_size=len(eval_df),
    test_size=len(test_df),
    raw_size=0 if raw_df is None else len(raw_df),
)

pu.set_runtime_data(train_df, eval_df, test_df, raw_df, split_config, LABEL_MAPPING)

print("\nDaten im Runtime-Cache gespeichert.")
print("Verfügbar via: pipeline_utils.get_runtime_data()")

# Per-class distribution of each split
for name, split_df in zip(("Train", "Eval", "Test"), (train_df, eval_df, test_df)):
    print(f"\n{name} Verteilung ({len(split_df)} Artikel):")
    print(split_df["label"].value_counts().to_string())