In [None]:
"""
Feinabstimmung von XLM-RoBERTa-large für Stance Detection mit klassen-gewichteter Verlustfunktion.
Bewertet Macro-F1 auf dem Validierungsdatensatz.
"""

# Bibliotheken importieren
import logging, inspect
from pathlib import Path
from collections import Counter

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

# basicConfig() does nothing when the root logger already has handlers, which
# is commonly the case inside notebook runtimes (IPython/Colab). force=True
# removes those handlers first so the INFO messages logged by this notebook
# actually show up in the cell output.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

# Configuration: directory holding the input CSV files, name of the base
# checkpoint to fine-tune, and the directory checkpoints and the final model
# are written to.
DATA_PATH = Path("/content/")
MODEL_NAME = "xlm-roberta-large"
SAVE_DIR = Path("/content/Output/xlm-roberta-large-finetuned")
# Create the output directory (including missing parents); no error if it
# already exists, so the cell can be re-run.
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Stance label name -> integer class id, and the inverse lookup.
label2id = {
    "Zustimmung": 0,
    "Neutral": 1,
    "Ablehnung": 2,
}
id2label = dict(zip(label2id.values(), label2id.keys()))

# CSV loader
def load_flat_dataset(fname):
    """Read a semicolon-separated CSV from ``DATA_PATH`` into a ``Dataset``.

    The file must provide a ``text`` and a ``label`` column. Processing steps:

    1. Rows containing any missing value are dropped.
    2. ``text`` is cast to ``str``.
    3. ``label`` is translated to its integer class id via ``label2id``.
    4. Rows whose label is not a key of ``label2id`` are dropped (the mapping
       turns them into NaN, which would otherwise reach the loss function as
       float labels) and the number of discarded rows is printed.
    5. The index is reset so the frame converts without an extra index column.

    Args:
        fname: File name relative to ``DATA_PATH``.

    Returns:
        The result of ``Dataset.from_pandas`` on the cleaned frame; ``label``
        is an integer column.
    """
    df = pd.read_csv(DATA_PATH / fname, sep=";", encoding="utf-8-sig").dropna()

    # "utf-8-sig" already strips a leading BOM; this only matters if a second
    # BOM is still glued to the first header. The escape sequence keeps the
    # otherwise invisible character readable in the source.
    if "\ufefftext" in df.columns:
        df = df.rename(columns={"\ufefftext": "text"})

    df["text"] = df["text"].astype(str)
    df["label"] = df["label"].map(label2id)

    # Labels outside label2id were mapped to NaN: discard them explicitly.
    unknown = df["label"].isna()
    if unknown.any():
        print(f"{fname}: {int(unknown.sum())} Zeilen mit unbekanntem Label verworfen")
        df = df[~unknown]
    df = df.assign(label=df["label"].astype("int64")).reset_index(drop=True)

    print(f"{fname} geladen: {len(df)} Zeilen")
    if len(df):
        # iloc[0] would raise on an empty frame.
        print("Beispiel:", df.iloc[0]["text"])
    print("Typen:", df["text"].apply(type).value_counts())
    return Dataset.from_pandas(df)

# Tokenizer and data collator
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tok_fn(batch):
    """Tokenize a batch of examples; intended for ``Dataset.map(batched=True)``.

    Texts are truncated to at most 256 tokens and deliberately left unpadded.
    Padding here would stretch every example to the longest one in its map
    batch and store those pad tokens in the dataset; ``collator`` below pads
    each mini-batch to its own longest sequence instead.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to encode.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["text"], truncation=True, max_length=256)

# Pads each mini-batch dynamically at collation time.
collator = DataCollatorWithPadding(tokenizer)

# Load training and validation data
train_ds = load_flat_dataset("train.csv")
val_ds   = load_flat_dataset("val.csv")

train_ds = train_ds.map(tok_fn, batched=True)
val_ds   = val_ds.map(tok_fn, batched=True)

# Initialise the model. The number of classes is derived from the label map
# so the model head and the class weights below cannot drift apart.
num_labels = len(label2id)

# The classification head is freshly (randomly) initialised by from_pretrained;
# seed torch first so that this initialisation is repeatable across runs.
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# Class weights: inverse class frequency, N_total / (count_c * num_labels),
# so a perfectly balanced training set yields a weight of 1.0 per class.
counts  = Counter(train_ds["label"])
missing = [id2label[i] for i in range(num_labels) if counts[i] == 0]
if missing:
    # Counter returns 0 for absent keys; fail with a clear message instead of
    # a ZeroDivisionError in the weight formula.
    raise ValueError(f"Keine Trainingsbeispiele für Klasse(n): {missing}")
N_total = sum(counts.values())
weights = torch.tensor(
    [N_total / (counts[i] * num_labels) for i in range(num_labels)],
    dtype=torch.float,
)
logger.info("Klassenverteilung %s → Gewichte %s", counts, weights)

# Training arguments
# The keyword that selects the evaluation schedule is looked up in the
# signature of the installed TrainingArguments: "evaluation_strategy" if it is
# accepted, otherwise "eval_strategy".
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
if "evaluation_strategy" in _ta_params:
    kw = "evaluation_strategy"
else:
    kw = "eval_strategy"

training_kwargs = dict(
    output_dir=str(SAVE_DIR),
    # Checkpoint and log schedule.
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    # Optimisation: 8 examples per device x 4 accumulation steps per update.
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    weight_decay=0.1,
    warmup_ratio=0.1,
    fp16=True,
    # Reload the checkpoint with the best validation macro-F1 after training.
    load_best_model_at_end=True,
    metric_for_best_model="eval_macro_f1",
    logging_dir=str(SAVE_DIR / "logs"),
    report_to="none",
)
# Evaluate once per epoch, matching the save schedule above.
training_kwargs[kw] = "epoch"

training_args = TrainingArguments(**training_kwargs)

# Metrics
def metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax over the last axis of ``logits``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    results = {}
    results["accuracy"] = accuracy_score(gold, predicted)
    results["macro_f1"] = f1_score(gold, predicted, average="macro")
    return results

# Trainer with class-weighted loss
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on ``inputs`` and return the class-weighted loss.

        Args:
            model: Model to call; its ``device`` is where the labels are moved.
            inputs: Batch dict; the ``"labels"`` entry is removed from it and
                the remaining entries are passed to the model as keywords.
            return_outputs: If true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Accepted and ignored.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        targets = inputs.pop("labels").to(model.device)
        model_out = model(**inputs)
        logits = model_out.logits
        # Move the class weights to wherever the logits live before use.
        loss = nn.functional.cross_entropy(
            logits, targets, weight=weights.to(logits.device)
        )
        if return_outputs:
            return (loss, model_out)
        return loss

# Start training
# Trainer's `tokenizer=` argument is deprecated in favour of
# `processing_class=` in newer transformers releases, while older releases
# only know `tokenizer`. Pick whichever the installed Trainer accepts, the
# same way the evaluation-strategy keyword is chosen for TrainingArguments.
tok_kw = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator,
    compute_metrics=metrics,
    **{tok_kw: tokenizer},
)

trainer.train()

# Save the best model. load_best_model_at_end=True in the training arguments
# means the trainer holds the best checkpoint's weights at this point.
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
logger.info("Bestes Modell gespeichert in %s", SAVE_DIR)



📄 train.csv geladen: 3150 Zeilen
🔍 Beispiel: Ich kenne mehrere, große und langjährige Papierfabriken, die von Ausländischen Investoren aufgekauft wurden und nun alle Abgewirtschaftet und geschlossen sind. Im besten Deutschland aller Zeiten.
🧪 Typen: text
<class 'str'>    3150
Name: count, dtype: int64
📄 val.csv geladen: 675 Zeilen
🔍 Beispiel: Der Ball ist rund und ein Spiel dauert 90 Minuten.
🧪 Typen: text
<class 'str'>    675
Name: count, dtype: int64


Map:   0%|          | 0/3150 [00:00<?, ? examples/s]

Map:   0%|          | 0/675 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,1.1032,0.931196,0.632593,0.613992
2,0.8982,0.769282,0.682963,0.650535
3,0.7217,0.768051,0.718519,0.679516
4,0.5176,0.878861,0.711111,0.671088
5,0.3822,0.908747,0.708148,0.667665


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


In [None]:
from huggingface_hub import login, create_repo
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from pathlib import Path

# Read the Hugging Face token from the Colab secrets store
from google.colab import userdata
HF_TOKEN = userdata.get("HF_TOKEN")  # must be added to the secrets beforehand

if HF_TOKEN is None:
    raise ValueError("Hugging Face Token nicht gefunden. Bitte unter 'Notebook > Secrets' als 'HF_TOKEN' hinzufügen.")

# Local model directory and target repository on the Hub
MODEL_DIR = Path("/content/Output/xlm-roberta-large-finetuned")
REPO_NAME = "xlm-roberta-large-stance-finetuned"
HF_USERNAME = "YangZexi"
REPO_ID = "/".join((HF_USERNAME, REPO_NAME))

# Authenticate, then make sure the repository exists (no error if it already does)
login(token=HF_TOKEN)
create_repo(repo_id=REPO_ID, exist_ok=True)

# Load the fine-tuned model and its tokenizer from disk
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# Upload both to the Hub: model first, then tokenizer
for artifact in (model, tokenizer):
    artifact.push_to_hub(REPO_ID)

print(f"Erfolgreich hochgeladen: https://huggingface.co/{REPO_ID}")


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmp1pf9cc1s/model.safetensors    :   0%|          |  554kB / 2.24GB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpw6feby54/tokenizer.json       :   1%|1         |  178kB / 17.1MB            

  ...tmpw6feby54/sentencepiece.bpe.model: 100%|##########| 5.07MB / 5.07MB            

✅ Erfolgreich hochgeladen: https://huggingface.co/YangZexi/xlm-roberta-large-stance-finetuned


In [None]:
# Evaluation-Skript – XLM-RoBERTa-Large (Klassifikator)
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report

# Paths
MODEL_DIR = Path("/content/Output/xlm-roberta-large-finetuned")  # fine-tuned model directory
TEST_PATH = Path("/content/test.csv")

# Label mapping
LABEL_MAP = {"Zustimmung": 0, "Neutral": 1, "Ablehnung": 2}
ID2LABEL  = {v: k for k, v in LABEL_MAP.items()}

# Load model and tokenizer. Both come from the fine-tuned directory (the
# training cell saves the tokenizer there next to the model), so evaluation
# uses exactly the artefacts that were trained and needs no Hub download.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

# Load test data. Keep only rows with a known label and a non-missing text,
# and cast the text to str as the training loader does; a NaN text would
# otherwise reach the tokenizer as a float.
df = pd.read_csv(TEST_PATH, sep=";", encoding="utf-8-sig")
df = df[df["label"].isin(LABEL_MAP.keys())].dropna(subset=["text"]).copy()
df["text"] = df["text"].astype(str)
df["label_id"] = df["label"].map(LABEL_MAP)

# Prediction function
def predict(texts, batch_size=16):
    """Predict a class id for every text using the module-level model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Any iterable of strings (list, tuple, pandas Series, ...). It
            is materialised into a list first, so batching is purely
            positional and does not depend on the container's own slicing or
            on a ``tolist`` method.
        batch_size: Number of texts encoded and scored per forward pass.

    Returns:
        List of plain ``int`` class ids, one per input text, in input order.
        Empty input yields an empty list.
    """
    texts = list(texts)
    preds = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        ).to(device)
        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = model(**inputs).logits
        preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    return preds

# Score the test texts and attach both the class id and its label name
df["pred_id"] = predict(df["text"])
df["pred"] = df["pred_id"].map(ID2LABEL)

# Print the per-class classification report (classes listed in LABEL_MAP order)
print("\nKlassifikationsbericht:")
report = classification_report(df["label"], df["pred"], labels=list(LABEL_MAP))
print(report)

# Write the predictions next to the model directory
OUT_PATH = MODEL_DIR.with_name(f"{MODEL_DIR.name}_predictions.csv")
df.to_csv(OUT_PATH, sep=";", index=False, encoding="utf-8-sig")
print(f"\nVorhersagen gespeichert unter: {OUT_PATH}")


  return forward_call(*args, **kwargs)



🔢 Classification Report:
              precision    recall  f1-score   support

  Zustimmung       0.67      0.62      0.64       149
     Neutral       0.53      0.68      0.59       143
   Ablehnung       0.83      0.77      0.80       383

    accuracy                           0.72       675
   macro avg       0.68      0.69      0.68       675
weighted avg       0.73      0.72      0.72       675


📂 Predictions saved to: /content/Output/xlm-roberta-large-finetuned_predictions.csv
