In [None]:
# Setup: Umgebung prüfen & Installation (Colab Kompatibel)
import importlib, logging, os, sys, subprocess
from packaging import version
from pathlib import Path

# Named logger for the setup helpers; the root logger prints "<LEVEL>: <message>" at INFO and above.
logger = logging.getLogger("setup")
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)

def ensure_min(pkg: str, min_ver: str):
    """Install ``pkg`` with pip unless version ``min_ver`` or newer is already present.

    Args:
        pkg: Distribution name as understood by pip.
        min_ver: Lowest acceptable version string.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a non-zero status.
    """
    # `import importlib` alone does not guarantee that the `metadata` submodule
    # is loaded, so import it explicitly instead of relying on some other
    # package having imported it first.
    from importlib import metadata

    try:
        cur_ver = metadata.version(pkg)
    except metadata.PackageNotFoundError:
        cur_ver = None  # not installed at all

    if cur_ver is not None and version.parse(cur_ver) >= version.parse(min_ver):
        logger.info(f"{pkg} >= {min_ver} ist bereits installiert.")
        return

    # Missing or too old: make the (potentially slow) install visible in the log.
    logger.info(f"Installiere {pkg}>={min_ver} (gefunden: {cur_ver or 'nicht installiert'}).")
    subprocess.run([sys.executable, "-m", "pip", "install", f"{pkg}>={min_ver}"], check=True)

# Minimum versions required by this notebook.
# transformers must be >= 4.41.0: the TrainingArguments call further down uses
# `eval_strategy=`, which older releases only know as `evaluation_strategy=`.
ensure_min("transformers", "4.41.0")
ensure_min("peft", "0.10.0")
ensure_min("datasets", "2.19.0")
ensure_min("bitsandbytes", "0.43.1")
ensure_min("accelerate", "0.27.2")

# Imports
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType

# Configuration: base checkpoint, data root and output directory
DATA_PATH = Path("/content/")
MODEL_NAME = "google/mt5-xl"
SAVE_DIR = DATA_PATH.joinpath("Output", "mt5-xl-finetuned-noprompt")
# Create the output directory (and any missing parents) up front.
SAVE_DIR.mkdir(exist_ok=True, parents=True)

# Full label name (as found in the CSV `label` column) -> short target string
# the model is trained to generate.
LABEL_TOKEN_MAP = dict(
    Zustimmung="Zu",
    Ablehnung="Ab",
    Neutral="Ne",
)

# Load and format a dataset split
def load_dataset(path: Path) -> Dataset:
    """Read a semicolon-separated CSV and build the seq2seq text columns.

    Rows whose ``label`` is not a key of ``LABEL_TOKEN_MAP`` are discarded.
    Rows with a missing ``name`` or ``text`` are discarded as well, because
    their concatenated input would be NaN and could not be tokenized later.

    Args:
        path: CSV file containing at least the columns ``name``, ``text`` and
            ``label``.

    Returns:
        A ``Dataset`` with exactly the columns ``input_text``
        (``"<name>; <text>"``) and ``target_text`` (the original label).

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    df = pd.read_csv(path, sep=";", encoding="utf-8-sig")
    print("\nCSV-Spalten:", df.columns.tolist())

    # Fail early with a message that names the file and the missing columns.
    missing = {"name", "text", "label"} - set(df.columns)
    if missing:
        raise KeyError(f"{path}: fehlende Spalten {sorted(missing)}")

    df = df[df["label"].isin(LABEL_TOKEN_MAP.keys())]

    # Drop incomplete rows instead of propagating NaN into `input_text`.
    complete = df.dropna(subset=["name", "text"])
    if len(complete) < len(df):
        print(f"{len(df) - len(complete)} Zeilen ohne 'name' oder 'text' verworfen.")
    df = complete.copy()

    # astype(str) keeps the concatenation working if a column was parsed as numeric.
    df["input_text"] = df["name"].astype(str) + "; " + df["text"].astype(str)
    df["target_text"] = df["label"]

    # preserve_index=False: after filtering, the index is no longer a plain
    # range and would otherwise be carried into the Dataset as an extra column.
    return Dataset.from_pandas(df[["input_text", "target_text"]], preserve_index=False)

train_ds = load_dataset(DATA_PATH / "train.csv")
val_ds = load_dataset(DATA_PATH / "val.csv")
# Evaluate on a fixed, seeded random subset of at most 200 rows. Capping with
# len(val_ds) avoids an out-of-range selection when the validation split is
# smaller than 200 rows.
val_ds = val_ds.shuffle(seed=42).select(range(min(200, len(val_ds))))

# Tokenizer & model
# Quantization settings handed to from_pretrained below: 4-bit loading with the
# "nf4" quant type, double quantization enabled, bfloat16 as compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Tokenizer and base model both come from MODEL_NAME; the model is loaded with
# the quantization config above and device_map="auto".
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# LoRA settings for a seq2seq LM: rank 32, alpha 64, dropout 0.05, no bias
# training, adapters attached to the modules named "q" and "v".
# NOTE(review): presumably these are the attention query/value projections —
# confirm against the model's module names.
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
)
# Wrap the base model with the LoRA config; `model` refers to the wrapped model from here on.
model = get_peft_model(model, peft_config)
# NOTE(review): the next three calls appear to prepare the wrapped model for
# gradient checkpointing (inputs require grad, generation cache off,
# checkpointing on). Their relative order may matter — confirm before
# reordering, and check whether peft's k-bit preparation helper should be used
# instead.
model.enable_input_require_grads()
model.config.use_cache = False
model.gradient_checkpointing_enable()

# Tokenization (no prompt)
def tokenize(example):
    """Encode one example into model inputs plus loss-masked labels.

    The input text is padded/truncated to 160 tokens. The target is the short
    string mapped from the example's label via ``LABEL_TOKEN_MAP``,
    padded/truncated to 2 tokens. Every pad position in the target is replaced
    by -100 before it is stored under ``"labels"``.

    Args:
        example: Mapping with the keys ``input_text`` and ``target_text``.

    Returns:
        The encoding of the input text with an added ``labels`` entry.
    """
    features = tokenizer(
        example["input_text"],
        max_length=160,
        padding="max_length",
        truncation=True,
    )
    short_label = LABEL_TOKEN_MAP[example["target_text"]]
    label_ids = tokenizer(
        short_label,
        max_length=2,
        padding="max_length",
        truncation=True,
    )["input_ids"]

    masked_labels = []
    for token_id in label_ids:
        # Swap padding for -100 in the labels.
        masked_labels.append(-100 if token_id == tokenizer.pad_token_id else token_id)

    features["labels"] = masked_labels
    return features

# Tokenize both splits and drop their original text columns.
train_ds, val_ds = (
    split.map(tokenize, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)

# Training
training_args = TrainingArguments(
    # Output location and reporting
    output_dir=str(SAVE_DIR),
    report_to="none",
    # Optimisation: 5 epochs, lr 2e-4, batch 16 per device with 2 accumulation steps, bf16
    num_train_epochs=5,
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    bf16=True,
    # Evaluate, checkpoint and log once per epoch; reload the best checkpoint when training ends
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): newer transformers releases appear to prefer `processing_class=`
# over `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

# Train, then write the model followed by the tokenizer to SAVE_DIR
trainer.train()
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_DIR)

In [None]:
from huggingface_hub import login, upload_folder, create_repo
from google.colab import userdata
from pathlib import Path

# Configuration
HF_TOKEN = userdata.get("HF_TOKEN")
HF_REPO_ID = "YangZexi/mt5-xl-stance-lora"
# Must be the directory the training cell saves the model and tokenizer to.
# The previous value ("/content/Output/mt5-xl-finetuned-") did not match it, so
# a clean run would upload a missing or stale folder.
SAVE_DIR = Path("/content/Output/mt5-xl-finetuned-noprompt")

# Login
if not HF_TOKEN:
    raise ValueError("Kein HF_TOKEN gefunden – unter 'Notebook > Secrets > HF_TOKEN' setzen.")
# Fail before touching the Hub if there is nothing to upload.
if not SAVE_DIR.is_dir():
    raise FileNotFoundError(f"Modellverzeichnis nicht gefunden: {SAVE_DIR}")
login(HF_TOKEN)

# Create the repository (no-op if it already exists)
create_repo(repo_id=HF_REPO_ID, repo_type="model", exist_ok=True, token=HF_TOKEN)

# Upload
upload_folder(
    repo_id=HF_REPO_ID,
    folder_path=SAVE_DIR,
    path_in_repo=".",
    repo_type="model",
    token=HF_TOKEN,
    # Publish only the final artifacts in the top-level directory; skip the
    # per-epoch checkpoint-* subfolders written during training.
    ignore_patterns=["checkpoint-*/*"],
)

print(f"Modell erfolgreich hochgeladen: https://huggingface.co/{HF_REPO_ID}")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...etuned-/checkpoint-495/spiece.model:  95%|#########5| 4.11MB / 4.31MB            

  ...tuned-/checkpoint-99/tokenizer.json:  99%|#########9| 16.3MB / 16.4MB            

  ...ut/mt5-xl-finetuned-/tokenizer.json:  99%|#########9| 16.3MB / 16.4MB            

  ...etuned-/checkpoint-396/spiece.model:  95%|#########5| 4.11MB / 4.31MB            

  ...point-198/adapter_model.safetensors:   0%|          | 8.13kB / 75.5MB            

  ...inetuned-/adapter_model.safetensors:   0%|          | 8.13kB / 75.5MB            

  ...uned-/checkpoint-396/tokenizer.json:  99%|#########9| 16.3MB / 16.4MB            

  ...kpoint-99/adapter_model.safetensors:   0%|          | 8.13kB / 75.5MB            

  ...uned-/checkpoint-495/tokenizer.json:  99%|#########9| 16.3MB / 16.4MB            

  ...etuned-/checkpoint-198/spiece.model:  95%|#########5| 4.11MB / 4.31MB            

✅ Modell erfolgreich hochgeladen: https://huggingface.co/YangZexi/mt5-xl-stance-lora


In [None]:
# Test Evaluation Script – mT5-XL (Kein Prompt)
import torch
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import classification_report

# Configuration
MAX_LEN = 160
MODEL_DIR = Path("/content/Output") / "mt5-xl-finetuned-noprompt"  # adjust if needed
TEST_PATH = Path("/content") / "test4.csv"

# Full label name -> short string the model generates for it
LABEL_TOKEN_MAP = dict(Zustimmung="Zu", Ablehnung="Ab", Neutral="Ne")
# Label names in map order, and the reverse lookup (short string -> label name)
LABELS = [*LABEL_TOKEN_MAP]
ID2LABEL = dict(zip(LABEL_TOKEN_MAP.values(), LABEL_TOKEN_MAP))

# Load the fine-tuned model and its tokenizer from MODEL_DIR, then switch the model to eval mode
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model.eval()

# Load test data: keep only rows with a known label and build "<name>; <text>" inputs
df = (
    pd.read_csv(TEST_PATH, sep=";", encoding="utf-8-sig")
    .loc[lambda frame: frame["label"].isin(LABELS)]
    .copy()
    .assign(input_text=lambda frame: frame["name"] + "; " + frame["text"])
)

# Prediction function (no prompt)
def predict_label(text: str) -> str:
    """Generate a short answer for ``text`` and map it back to a label name.

    The text is tokenized (truncated to ``MAX_LEN``), moved to the model's
    device, and at most two new tokens are generated without gradient
    tracking. The decoded output is compared case-insensitively against the
    short strings in ``LABEL_TOKEN_MAP``, in map order.

    Args:
        text: Input string in the same "<name>; <text>" form used for training.

    Returns:
        The first label whose short string occurs in the decoded output, or
        ``"Unklar"`` if none does.
    """
    encoded = tokenizer(
        text,
        max_length=MAX_LEN,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    encoded = encoded.to(model.device)

    with torch.no_grad():
        generated = model.generate(**encoded, max_new_tokens=2)

    answer = tokenizer.decode(generated[0], skip_special_tokens=True).strip().lower()
    # Substring match; the first hit in map order wins.
    return next(
        (label for label, token in LABEL_TOKEN_MAP.items() if token.lower() in answer),
        "Unklar",
    )

# Run predictions row by row
df["pred"] = [predict_label(sample) for sample in df["input_text"]]

# Evaluation: per-class metrics restricted to the known labels
report = classification_report(df["label"], df["pred"], labels=LABELS)
print("\nKlassifizierungsbericht:")
print(report)

# Export predictions next to the model directory
OUT_FILE = MODEL_DIR.with_name(f"{MODEL_DIR.name}_predictions.csv")
df.to_csv(OUT_FILE, sep=";", index=False, encoding="utf-8-sig")
print(f"\nVorhersagen gespeichert unter: {OUT_FILE}")


🔢 Classification Report:
              precision    recall  f1-score   support

  Zustimmung       0.68      0.73      0.70       145
   Ablehnung       0.76      0.89      0.82       389
     Neutral       0.76      0.33      0.46       141

    accuracy                           0.74       675
   macro avg       0.73      0.65      0.66       675
weighted avg       0.74      0.74      0.72       675


📂 Predictions saved to: /content/Output/mt5-xl-finetuned-noprompt_predictions.csv
