In [None]:
# Test Evaluation Script – mT5-XL (HF-LoRA, no Prompt)
import torch, json
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import (
    classification_report, accuracy_score, f1_score, precision_score,
    recall_score, confusion_matrix
)
from peft import PeftModel, PeftConfig
from time import time
from huggingface_hub import login
from google.colab import userdata, files
import shutil

# Log in to the Hugging Face Hub using the "HF_TOKEN" entry from Colab's user data.
login(token=userdata.get("HF_TOKEN"))

# Configuration
HF_REPO_ID = "YangZexi/mt5-xl-stance-lora"  # Hub repo the PEFT config and adapter are loaded from
MODEL_ID   = "mt5-xl"                       # short name used in output file names and the metrics
TEST_PATH  = Path("/content/test3.csv")
MAX_LEN    = 160                            # max_length passed to the tokenizer in predict_label

# Maps each stance label to the short token that predict_label searches for
# in the generated output.
LABEL_TOKEN_MAP = {
    "Zustimmung": "Zu",
    "Ablehnung":  "Ab",
    "Neutral":    "Ne",
}
LABELS = list(LABEL_TOKEN_MAP.keys())

# Load model & tokenizer: the base checkpoint is taken from the adapter's PEFT
# config, the adapter is attached, and merge_and_unload() then replaces the
# PEFT wrapper with the merged model.
peft_config = PeftConfig.from_pretrained(HF_REPO_ID)
base_model = AutoModelForSeq2SeqLM.from_pretrained(peft_config.base_model_name_or_path, device_map="auto")
model = PeftModel.from_pretrained(base_model, HF_REPO_ID)
model = model.merge_and_unload()
model.eval()

# The tokenizer comes from the base checkpoint, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

# Load test data (semicolon-separated; "utf-8-sig" strips a leading BOM if present).
df = pd.read_csv(TEST_PATH, sep=";", encoding="utf-8-sig")
# Keep only rows whose label is one of the known stance labels.
df = df[df["label"].isin(LABELS)].copy()
# Model input is "<name>; <text>".
# NOTE(review): a missing `name` or `text` value would make this NaN and the
# tokenizer call would then fail — confirm the CSV has no gaps in these columns.
df["input_text"] = df["name"] + "; " + df["text"]

# Vorhersagefunktion (kein Prompt)
def predict_label(text: str) -> str:
    """Generate a short output for ``text`` and map it back to a stance label.

    The text is passed to the model as-is (no prompt template). At most two
    new tokens are generated and the decoded output is compared
    case-insensitively against the tokens in ``LABEL_TOKEN_MAP``.

    Args:
        text: Input string to classify.

    Returns:
        The key of ``LABEL_TOKEN_MAP`` whose token matches the generated
        output, or ``"Unklar"`` when no token matches.
    """
    # A single sequence needs no padding; truncation caps it at MAX_LEN tokens.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LEN,
    ).to(model.device)

    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=2)

    # Normalise once instead of lowercasing inside the loop.
    decoded = tokenizer.decode(out[0], skip_special_tokens=True).strip().lower()
    label_tokens = {label: token.lower() for label, token in LABEL_TOKEN_MAP.items()}

    # Prefer the token the output starts with: a plain substring test would
    # resolve an output containing several tokens by dict order, not by what
    # the model actually emitted first.
    for label, token in label_tokens.items():
        if decoded.startswith(token):
            return label

    # Fall back to the original, looser substring match so that every output
    # that was mapped to a label before is still mapped to one.
    for label, token in label_tokens.items():
        if token in decoded:
            return label
    return "Unklar"

# Predictions with a progress bar; the whole pass is timed.
start = time()
tqdm.pandas(desc="Predicting")
df["pred"] = df["input_text"].progress_apply(predict_label)
duration = time() - start

# Metrics
y_true, y_pred = df["label"], df["pred"]
report = classification_report(y_true, y_pred, labels=LABELS, output_dict=True)
conf_mat = confusion_matrix(y_true, y_pred, labels=LABELS)
per_class_f1 = f1_score(y_true, y_pred, labels=LABELS, average=None)

# predict_label can return "Unklar", which is not in LABELS. Without
# labels=LABELS the averaged scores would treat it as an extra, all-zero
# class and disagree with the per-class scores and the printed report.
# With labels=LABELS such predictions still count as misses for the true class.
metrics = {
    "model": MODEL_ID,
    "accuracy": accuracy_score(y_true, y_pred),
    "macro_f1": f1_score(y_true, y_pred, labels=LABELS, average="macro"),
    "weighted_f1": f1_score(y_true, y_pred, labels=LABELS, average="weighted"),
    "precision_macro": precision_score(y_true, y_pred, labels=LABELS, average="macro"),
    "recall_macro": recall_score(y_true, y_pred, labels=LABELS, average="macro"),
    "per_class_f1": dict(zip(LABELS, per_class_f1.tolist())),
    "confusion_matrix": conf_mat.tolist(),
    # Wall-clock seconds per sample, scaled to 1000 samples.
    "inference_time_per_1000": duration / len(df) * 1000,
}

# Save predictions and metrics
df_out = Path(f"/content/{MODEL_ID}_preds.csv")
metrics_json = Path(f"/content/{MODEL_ID}_metrics.json")
metrics_md = Path(f"/content/{MODEL_ID}_metrics.md")

df.to_csv(df_out, sep=";", index=False, encoding="utf-8-sig")
with open(metrics_json, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2, ensure_ascii=False)

# Markdown table of the scalar metrics (nested dict/list entries are skipped).
table_rows = ["| Metric | Value |", "|---|---|"]
for k, v in metrics.items():
    if isinstance(v, (dict, list)):
        continue
    cell = f"{v:.4f}" if isinstance(v, (int, float)) else f"{v}"
    table_rows.append(f"| {k} | {cell} |")
markdown_table = "\n".join(table_rows) + "\n"

with open(metrics_md, "w", encoding="utf-8") as f:
    f.write(markdown_table)

# Summary
print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=LABELS))
print(f"\nPredictions saved to: {df_out}")
print(f"Metrics saved to: {metrics_json}")

# Target folder named after the model
DOWNLOAD_DIR = Path(f"/content/{MODEL_ID}_downloads")
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Copy the result files into the target folder (the originals stay in place).
artifacts = (df_out, metrics_json, metrics_md)
for artifact in artifacts:
    shutil.copy(artifact, DOWNLOAD_DIR / artifact.name)

# Trigger one browser download per file; the path is passed as a plain string.
print(f"\nStarte Downloads aus {DOWNLOAD_DIR} ...")
for artifact in artifacts:
    files.download(str(DOWNLOAD_DIR / artifact.name))

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/15.0G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/15.0G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/75.5M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
🔍 Predicting: 100%|██████████| 675/675 [01:23<00:00,  8.07it/s]



🔢 Classification Report:
              precision    recall  f1-score   support

  Zustimmung       0.66      0.69      0.68       150
   Ablehnung       0.76      0.90      0.82       387
     Neutral       0.68      0.30      0.42       138

    accuracy                           0.73       675
   macro avg       0.70      0.63      0.64       675
weighted avg       0.72      0.73      0.71       675


📂 Predictions saved to: /content/mt5-xl_preds.csv
📊 Metrics saved to: /content/mt5-xl_metrics.json

📥 Starte Downloads aus /content/mt5-xl_downloads ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Evaluation Script – HF-Hub XLM-RoBERTa-Large (Classifier + Metriken + Export)
import torch, json
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import (
    classification_report, accuracy_score, f1_score, precision_score,
    recall_score, confusion_matrix
)
from time import time
from huggingface_hub import login
from google.colab import userdata, files
import shutil

# Login (needed in case the model is private); the token is read from the
# "HF_TOKEN" entry of Colab's user data.
login(token=userdata.get("HF_TOKEN"))

# Configuration
HF_MODEL_ID = "YangZexi/xlm-roberta-large-stance-finetuned"  # Hub checkpoint to evaluate
MODEL_ID    = "xlm-roberta-large"                            # short name used in output file names
TEST_PATH   = Path("/content/test3.csv")
MAX_LEN     = 256                                            # max_length passed to the tokenizer in predict

# Label <-> class-id mapping used to decode the argmax of the logits.
# NOTE(review): presumably this matches the class order the checkpoint was
# fine-tuned with — verify against the model's config.
LABEL_MAP = {"Zustimmung": 0, "Neutral": 1, "Ablehnung": 2}
ID2LABEL  = {v: k for k, v in LABEL_MAP.items()}
LABELS    = list(LABEL_MAP.keys())

# Load model & tokenizer, move the model to the GPU when one is available,
# and switch it to eval mode.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL_ID)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

# Load test data (semicolon-separated; "utf-8-sig" strips a leading BOM if present).
df = pd.read_csv(TEST_PATH, sep=";", encoding="utf-8-sig")
# Keep only rows whose label is one of the known stance labels.
df = df[df["label"].isin(LABELS)].copy()
df["label_id"] = df["label"].map(LABEL_MAP)

# Vorhersagefunktion
def predict(texts, batch_size=16):
    """Classify ``texts`` in mini-batches and return the predicted class ids.

    Args:
        texts: Iterable of input strings, e.g. a pandas Series or a list.
        batch_size: Number of texts tokenized and passed through the model
            per forward pass.

    Returns:
        A list with one predicted class id (argmax over the logits) per
        input text, in input order.
    """
    # Materialise once: slicing a list is always positional, and it lets the
    # function accept any iterable rather than only objects that offer
    # ``.tolist()`` and positional ``[i:j]`` slicing.
    texts = list(texts)
    preds = []
    for offset in range(0, len(texts), batch_size):
        batch = texts[offset:offset + batch_size]
        # Pad to the longest text in the batch; truncate at MAX_LEN tokens.
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt").to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        # Plain Python ints instead of NumPy scalars.
        preds.extend(torch.argmax(logits, dim=-1).tolist())
    return preds

# Compute predictions. The timer stops right after the model pass so the
# id -> label mapping below is not counted as inference time.
# NOTE(review): only the `text` column is fed to the model here, whereas the
# mT5 cell feeds "<name>; <text>" — confirm this matches how the model was trained.
start = time()
tqdm.pandas(desc="Predicting")  # registers progress_apply only; predict() itself shows no bar
df["pred_id"] = predict(df["text"])
duration = time() - start
df["pred"] = df["pred_id"].map(ID2LABEL)

# Compute metrics
y_true, y_pred = df["label"], df["pred"]
report = classification_report(y_true, y_pred, labels=LABELS, output_dict=True)
conf_mat = confusion_matrix(y_true, y_pred, labels=LABELS)
per_class_f1 = f1_score(y_true, y_pred, labels=LABELS, average=None)

# All averaged scores use the same explicit label set as the per-class
# scores and the printed report, so the exported numbers always agree.
metrics = {
    "model": MODEL_ID,
    "accuracy": accuracy_score(y_true, y_pred),
    "macro_f1": f1_score(y_true, y_pred, labels=LABELS, average="macro"),
    "weighted_f1": f1_score(y_true, y_pred, labels=LABELS, average="weighted"),
    "precision_macro": precision_score(y_true, y_pred, labels=LABELS, average="macro"),
    "recall_macro": recall_score(y_true, y_pred, labels=LABELS, average="macro"),
    "per_class_f1": dict(zip(LABELS, per_class_f1.tolist())),
    "confusion_matrix": conf_mat.tolist(),
    # Wall-clock seconds per sample, scaled to 1000 samples.
    "inference_time_per_1000": duration / len(df) * 1000,
}

# Save predictions and metrics
df_out = Path(f"/content/{MODEL_ID}_preds.csv")
metrics_json = Path(f"/content/{MODEL_ID}_metrics.json")
metrics_md = Path(f"/content/{MODEL_ID}_metrics.md")

df.to_csv(df_out, sep=";", index=False, encoding="utf-8-sig")
with open(metrics_json, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2, ensure_ascii=False)

# Markdown table of the scalar metrics (nested dict/list entries are skipped).
table_rows = ["| Metric | Value |", "|---|---|"]
for k, v in metrics.items():
    if isinstance(v, (dict, list)):
        continue
    cell = f"{v:.4f}" if isinstance(v, (int, float)) else f"{v}"
    table_rows.append(f"| {k} | {cell} |")
markdown_table = "\n".join(table_rows) + "\n"

with open(metrics_md, "w", encoding="utf-8") as f:
    f.write(markdown_table)

# Summary
print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=LABELS))
print(f"\nPredictions saved to: {df_out}")
print(f"Metrics saved to: {metrics_json}")

# Target folder named after the model
DOWNLOAD_DIR = Path(f"/content/{MODEL_ID}_downloads")
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Copy the result files into the target folder (the originals stay in place).
artifacts = (df_out, metrics_json, metrics_md)
for artifact in artifacts:
    shutil.copy(artifact, DOWNLOAD_DIR / artifact.name)

# Trigger one browser download per file; the path is passed as a plain string.
print(f"\nStarte Downloads aus {DOWNLOAD_DIR} ...")
for artifact in artifacts:
    files.download(str(DOWNLOAD_DIR / artifact.name))

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]


🔢 Classification Report:
              precision    recall  f1-score   support

  Zustimmung       0.71      0.65      0.68       150
     Neutral       0.55      0.74      0.63       138
   Ablehnung       0.85      0.78      0.82       387

    accuracy                           0.74       675
   macro avg       0.71      0.72      0.71       675
weighted avg       0.76      0.74      0.75       675


📂 Predictions saved to: /content/xlm-roberta-large_preds.csv
📊 Metrics saved to: /content/xlm-roberta-large_metrics.json

📥 Starte Downloads aus /content/xlm-roberta-large_downloads ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Evaluation Script – HF-Hub GBERT-Large (Classifier + Metriken + Export)
import torch, json
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import (
    classification_report, accuracy_score, f1_score, precision_score,
    recall_score, confusion_matrix
)
from time import time
from huggingface_hub import login
from google.colab import userdata, files
import shutil

# Login (only required for private models); the token is read from the
# "HF_TOKEN" entry of Colab's user data.
login(token=userdata.get("HF_TOKEN"))

# Configuration
HF_MODEL_ID = "YangZexi/gbert-large-stance-finetuned"  # Hub checkpoint to evaluate
MODEL_ID    = "gbert-large"                            # short name used in output file names
TEST_PATH   = Path("/content/test3.csv")
MAX_LEN     = 256                                      # max_length passed to the tokenizer in predict

# Label <-> class-id mapping used to decode the argmax of the logits.
# NOTE(review): presumably this matches the class order the checkpoint was
# fine-tuned with — verify against the model's config.
LABEL_MAP = {"Zustimmung": 0, "Neutral": 1, "Ablehnung": 2}
ID2LABEL  = {v: k for k, v in LABEL_MAP.items()}
LABELS    = list(LABEL_MAP.keys())

# Load model & tokenizer, move the model to the GPU when one is available,
# and switch it to eval mode.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL_ID)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

# Load test data (semicolon-separated; "utf-8-sig" strips a leading BOM if present).
df = pd.read_csv(TEST_PATH, sep=";", encoding="utf-8-sig")
# Keep only rows whose label is one of the known stance labels.
df = df[df["label"].isin(LABELS)].copy()
df["label_id"] = df["label"].map(LABEL_MAP)

# Vorhersagefunktion
def predict(texts, batch_size=16):
    """Classify ``texts`` in mini-batches and return the predicted class ids.

    Args:
        texts: Iterable of input strings, e.g. a pandas Series or a list.
        batch_size: Number of texts tokenized and passed through the model
            per forward pass.

    Returns:
        A list with one predicted class id (argmax over the logits) per
        input text, in input order.
    """
    # Materialise once: slicing a list is always positional, and it lets the
    # function accept any iterable rather than only objects that offer
    # ``.tolist()`` and positional ``[i:j]`` slicing.
    texts = list(texts)
    preds = []
    for offset in range(0, len(texts), batch_size):
        batch = texts[offset:offset + batch_size]
        # Pad to the longest text in the batch; truncate at MAX_LEN tokens.
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt").to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        # Plain Python ints instead of NumPy scalars.
        preds.extend(torch.argmax(logits, dim=-1).tolist())
    return preds

# Compute predictions. The timer stops right after the model pass so the
# id -> label mapping below is not counted as inference time.
# NOTE(review): only the `text` column is fed to the model here, whereas the
# mT5 cell feeds "<name>; <text>" — confirm this matches how the model was trained.
start = time()
tqdm.pandas(desc="Predicting")  # registers progress_apply only; predict() itself shows no bar
df["pred_id"] = predict(df["text"])
duration = time() - start
df["pred"] = df["pred_id"].map(ID2LABEL)

# Compute metrics
y_true, y_pred = df["label"], df["pred"]
report = classification_report(y_true, y_pred, labels=LABELS, output_dict=True)
conf_mat = confusion_matrix(y_true, y_pred, labels=LABELS)
per_class_f1 = f1_score(y_true, y_pred, labels=LABELS, average=None)

# All averaged scores use the same explicit label set as the per-class
# scores and the printed report, so the exported numbers always agree.
metrics = {
    "model": MODEL_ID,
    "accuracy": accuracy_score(y_true, y_pred),
    "macro_f1": f1_score(y_true, y_pred, labels=LABELS, average="macro"),
    "weighted_f1": f1_score(y_true, y_pred, labels=LABELS, average="weighted"),
    "precision_macro": precision_score(y_true, y_pred, labels=LABELS, average="macro"),
    "recall_macro": recall_score(y_true, y_pred, labels=LABELS, average="macro"),
    "per_class_f1": dict(zip(LABELS, per_class_f1.tolist())),
    "confusion_matrix": conf_mat.tolist(),
    # Wall-clock seconds per sample, scaled to 1000 samples.
    "inference_time_per_1000": duration / len(df) * 1000,
}

# Save predictions and metrics
df_out = Path(f"/content/{MODEL_ID}_preds.csv")
metrics_json = Path(f"/content/{MODEL_ID}_metrics.json")
metrics_md = Path(f"/content/{MODEL_ID}_metrics.md")

df.to_csv(df_out, sep=";", index=False, encoding="utf-8-sig")
with open(metrics_json, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2, ensure_ascii=False)

# Markdown table of the scalar metrics (nested dict/list entries are skipped).
table_rows = ["| Metric | Value |", "|---|---|"]
for k, v in metrics.items():
    if isinstance(v, (dict, list)):
        continue
    cell = f"{v:.4f}" if isinstance(v, (int, float)) else f"{v}"
    table_rows.append(f"| {k} | {cell} |")
markdown_table = "\n".join(table_rows) + "\n"

with open(metrics_md, "w", encoding="utf-8") as f:
    f.write(markdown_table)

# Summary
print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=LABELS))
print(f"\nPredictions saved to: {df_out}")
print(f"Metrics saved to: {metrics_json}")

# Target folder named after the model
DOWNLOAD_DIR = Path(f"/content/{MODEL_ID}_downloads")
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Copy the result files into the target folder (the originals stay in place).
artifacts = (df_out, metrics_json, metrics_md)
for artifact in artifacts:
    shutil.copy(artifact, DOWNLOAD_DIR / artifact.name)

# Trigger one browser download per file; the path is passed as a plain string.
print(f"\nStarte Downloads aus {DOWNLOAD_DIR} ...")
for artifact in artifacts:
    files.download(str(DOWNLOAD_DIR / artifact.name))

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]


🔢 Classification Report:
              precision    recall  f1-score   support

  Zustimmung       0.72      0.69      0.70       150
     Neutral       0.62      0.64      0.63       138
   Ablehnung       0.84      0.84      0.84       387

    accuracy                           0.76       675
   macro avg       0.72      0.72      0.72       675
weighted avg       0.77      0.76      0.76       675


📂 Predictions saved to: /content/gbert-large_preds.csv
📊 Metrics saved to: /content/gbert-large_metrics.json

📥 Starte Downloads aus /content/gbert-large_downloads ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>