In [None]:
# Umgebung prÃ¼fen & Installation
import importlib, logging, os, sys, subprocess
from packaging import version
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger("setup")

def ensure_min(pkg: str, min_ver: str):
    try:
        cur_ver = importlib.metadata.version(pkg)
        if version.parse(cur_ver) < version.parse(min_ver):
            raise importlib.metadata.PackageNotFoundError
        logger.info(f"{pkg} â‰¥ {min_ver} bereits installiert.")
    except importlib.metadata.PackageNotFoundError:
        subprocess.run([sys.executable, "-m", "pip", "install", f"{pkg}>={min_ver}"], check=True)

ensure_min("transformers", "4.40.0")
ensure_min("peft", "0.10.0")
ensure_min("datasets", "2.19.0")
ensure_min("bitsandbytes", "0.43.1")
ensure_min("accelerate", "0.27.2")

# Imports
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType

# Konfiguration
MODEL_NAME = "google/flan-t5-xl"
DATA_PATH = Path("/content/")
SAVE_DIR = DATA_PATH / "Output" / "flan-t5-xl-finetuned"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

LABEL_TOKEN_MAP = {
    "Zustimmung": "Zu",
    "Ablehnung": "Ab",
    "Neutral": "Ne",
}

# Datensatz laden & formatieren
def load_dataset(path: Path) -> Dataset:
    df = pd.read_csv(path, sep=";", encoding="utf-8-sig")
    print("\nCSV-Spalten:", df.columns.tolist())
    df = df[df["label"].isin(LABEL_TOKEN_MAP.keys())].copy()
    df["input_text"] = df["name"] + "; " + df["text"]
    df["target_text"] = df["label"]
    return Dataset.from_pandas(df[["input_text", "target_text"]])

train_ds = load_dataset(DATA_PATH / "train.csv")
val_ds = load_dataset(DATA_PATH / "val.csv")
val_ds = val_ds.shuffle(seed=42).select(range(200))

# Tokenizer & Modell
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v", "k", "o", "wi", "wo"],
)
model = get_peft_model(model, peft_config)
model.enable_input_require_grads()
model.config.use_cache = False
model.gradient_checkpointing_enable()

# Tokenisierung
def tokenize(example):
    name, text = example["input_text"].split(";", 1)
    text = text.strip()
    label = example["target_text"]

    prompt = (
        "### Aufgabe\n"
        f"Bewerte die Haltung des folgenden Tweets gegenÃ¼ber \"{name}\". "
        "BerÃ¼cksichtige Wortlaut, UntertÃ¶ne, Ironie und politische Anspielungen.\n\n"
        f"Tweet: {text}\n\n"
        "### Frage\n"
        f"Welche Haltung drÃ¼ckt der Tweet gegenÃ¼ber \"{name}\" aus?\n\n"
        "### AntwortmÃ¶glichkeiten:\n"
        "â€¢ Zustimmung: Der Tweet Ã¤uÃŸert sich explizit oder implizit positiv oder unterstÃ¼tzend Ã¼ber das Ziel.\n"
        "â€¢ Ablehnung: Der Tweet Ã¤uÃŸert sich explizit oder implizit negativ oder kritisch Ã¼ber das Ziel.\n"
        "â€¢ Neutral: Der Tweet ist sachlich, ambivalent oder zeigt keine erkennbare Haltung."
    )

    input_enc = tokenizer(prompt, truncation=True, max_length=160, padding="max_length")
    target_enc = tokenizer(LABEL_TOKEN_MAP[label], truncation=True, max_length=2, padding="max_length")

    # Maskiere Padding fÃ¼r den Loss mit -100
    labels = [tid if tid != tokenizer.pad_token_id else -100 for tid in target_enc["input_ids"]]
    input_enc["labels"] = labels

    return input_enc

train_ds = train_ds.map(tokenize, remove_columns=train_ds.column_names)
val_ds   = val_ds.map(tokenize, remove_columns=val_ds.column_names)

# Training
training_args = TrainingArguments(
    output_dir=str(SAVE_DIR),
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    learning_rate=2e-4,
    bf16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
)

# Trainieren
trainer.train()
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)


ðŸ“‹ CSV-Spalten: ['text', 'label', 'name']

ðŸ“‹ CSV-Spalten: ['text', 'label', 'name']


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/3150 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,1.063,0.361679
2,0.4143,0.350641
3,0.3702,0.331481
4,0.3444,0.332641
5,0.329,0.310808


('/content/Output/flan-t5-xl-finetuned/tokenizer_config.json',
 '/content/Output/flan-t5-xl-finetuned/special_tokens_map.json',
 '/content/Output/flan-t5-xl-finetuned/spiece.model',
 '/content/Output/flan-t5-xl-finetuned/added_tokens.json',
 '/content/Output/flan-t5-xl-finetuned/tokenizer.json')

In [None]:
# Stance Detection â€” FLANâ€‘T5 (XL) Training mit LoRA (4â€‘bit)
# Zeroâ€‘Shot Prompt, Ausgabe NUR als KÃ¼rzel: Zu / Ab / Ne
# Speichereffizient (QLoRAâ€‘Ã¤hnlich)

# Umgebung prÃ¼fen & Installation
import importlib, logging, os, sys, subprocess
from packaging import version
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger("setup")

def ensure_min(pkg: str, min_ver: str):
    try:
        cur_ver = importlib.metadata.version(pkg)
        if version.parse(cur_ver) < version.parse(min_ver):
            raise importlib.metadata.PackageNotFoundError
        logger.info(f"{pkg} â‰¥ {min_ver} bereits installiert.")
    except importlib.metadata.PackageNotFoundError:
        subprocess.run([sys.executable, "-m", "pip", "install", f"{pkg}>={min_ver}"], check=True)

ensure_min("transformers", "4.40.0")
ensure_min("peft", "0.10.0")
ensure_min("datasets", "2.19.0")
ensure_min("bitsandbytes", "0.43.1")
ensure_min("accelerate", "0.27.2")

# Imports
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType

# Konfiguration
MODEL_NAME = "google/flan-t5-xl"
DATA_PATH  = Path("/content/")
SAVE_DIR   = DATA_PATH / "Output" / "flan-t5-xl-finetuned"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

LABEL_TOKEN_MAP = {
    "Zustimmung": "Zu",
    "Ablehnung":  "Ab",
    "Neutral":    "Ne",
}

# Zeroâ€‘Shot Prompt (ohne Fewâ€‘Shots)
def build_prompt_text(name: str, text: str) -> str:
    return (
        f"### Aufgabe\n"
        f"Bewerte die Haltung des folgenden Tweets gegenÃ¼ber \"{name}\".\n\n"
        f"Tweet: {text}\n\n"
        "### AntwortmÃ¶glichkeiten:\n"
        "â€¢ Zustimmung: Der Tweet Ã¤uÃŸert sich explizit oder implizit positiv oder unterstÃ¼tzend Ã¼ber das Ziel.\n"
        "â€¢ Ablehnung: Der Tweet Ã¤uÃŸert sich explizit oder implizit negativ oder kritisch Ã¼ber das Ziel.\n"
        "â€¢ Neutral: Der Tweet ist sachlich, ambivalent oder zeigt keine erkennbare Haltung.\n"
        "### Ausgabeformat (Kurzform):\n"
        "Gib genau eines der folgenden KÃ¼rzel zurÃ¼ck (ohne AnfÃ¼hrungszeichen, ohne Punkt):\n"
        "Zu\nAb\nNe"
    )

# Datensatz laden & formatieren
# erwartet CSV mit Spalten: name; text; label
def load_dataset(path: Path) -> Dataset:
    df = pd.read_csv(path, sep=";", encoding="utf-8-sig")
    print("\nCSV-Spalten:", df.columns.tolist())
    df = df[df["label"].isin(LABEL_TOKEN_MAP.keys())].copy()
    df["input_text"]  = df["name"].astype(str) + "; " + df["text"].astype(str)
    df["target_text"] = df["label"]
    return Dataset.from_pandas(df[["input_text", "target_text"]])

train_ds = load_dataset(DATA_PATH / "train.csv")
val_ds   = load_dataset(DATA_PATH / "val.csv")
# optional kÃ¼rzen fÃ¼r schnellere Evals
val_ds   = val_ds.shuffle(seed=42).select(range(min(200, len(val_ds))))

# Tokenizer & Modell (4â€‘bit) + LoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# LoRAâ€‘Zielmodule fÃ¼r T5 (sauber fÃ¼r Q/K/V/O + FFN)
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "k", "v", "o", "wi", "wo"],  # T5-Namen
)
model = get_peft_model(model, peft_config)
model.enable_input_require_grads()   # wichtig bei 4â€‘bit
model.config.use_cache = False
model.gradient_checkpointing_enable()

# Tokenisierung
def tokenize(example):
    # Zerlege "name; text"
    name, text = example["input_text"].split(";", 1)
    name  = name.strip()
    text  = text.strip()
    label = example["target_text"]

    # Zeroâ€‘Shot Prompt
    prompt = build_prompt_text(name, text)

    # Eingabe/Target â€“ Target auf KÃ¼rzel â€žZu/Ab/Neâ€œ
    input_enc  = tokenizer(prompt, truncation=True, max_length=192, padding="max_length")
    target_enc = tokenizer(LABEL_TOKEN_MAP[label], truncation=True, max_length=3, padding="max_length")

    # Padding im Ziel maskieren
    labels = [tid if tid != tokenizer.pad_token_id else -100 for tid in target_enc["input_ids"]]
    input_enc["labels"] = labels
    return input_enc

train_ds = train_ds.map(tokenize, remove_columns=train_ds.column_names)
val_ds   = val_ds.map(tokenize,   remove_columns=val_ds.column_names)

# Training
training_args = TrainingArguments(
    output_dir=str(SAVE_DIR),
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    learning_rate=2e-4,
    bf16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
)

# Trainieren & Speichern
trainer.train()
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print(f"\nModell gespeichert unter: {SAVE_DIR}")


ðŸ“‹ CSV-Spalten: ['text', 'label', 'name']

ðŸ“‹ CSV-Spalten: ['text', 'label', 'name']


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/3150 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.7938,0.327253
2,0.3994,0.318699
3,0.3759,0.32505
4,0.3496,0.326682
5,0.3347,0.305165



âœ… Fertig. Modell gespeichert unter: /content/Output/flan-t5-xl-finetuned


In [None]:
# Test Evaluation Script â€“ FLAN-T5-XL (Seq2Seq)
import torch
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import classification_report

# Konfiguration
MODEL_DIR = Path("/content/Output/flan-t5-xl-finetuned")
TEST_PATH = Path("/content/test3.csv")
MAX_LEN   = 160

LABEL_TOKEN_MAP = {
    "Zustimmung": "Zu",
    "Ablehnung":  "Ab",
    "Neutral":    "Ne",
}
LABELS    = list(LABEL_TOKEN_MAP.keys())
ID2LABEL  = {v: k for k, v in LABEL_TOKEN_MAP.items()}

# Modell & Tokenizer laden
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR, device_map="auto")
model.eval()

# Testdaten laden
df = pd.read_csv(TEST_PATH, sep=";", encoding="utf-8-sig")
df = df[df["label"].isin(LABELS)].copy()
df["input_text"] = df["name"] + "; " + df["text"]

# Prompted Prediction Funktion (Seq2Seq)
def predict_label(name: str, text: str) -> str:
    def build_prompt_text(name: str, text: str) -> str:
        return (
            f"### Aufgabe\n"
            f"Bewerte die Haltung des folgenden Tweets gegenÃ¼ber \"{name}\".\n\n"
            f"Tweet: {text}\n\n"
            "### AntwortmÃ¶glichkeiten:\n"
            "â€¢ Zustimmung: Der Tweet Ã¤uÃŸert sich explizit oder implizit positiv oder unterstÃ¼tzend Ã¼ber das Ziel.\n"
            "â€¢ Ablehnung: Der Tweet Ã¤uÃŸert sich explizit oder implizit negativ oder kritisch Ã¼ber das Ziel.\n"
            "â€¢ Neutral: Der Tweet ist sachlich, ambivalent oder zeigt keine erkennbare Haltung.\n"
            "### Ausgabeformat (Kurzform):\n"
            "Gib genau eines der folgenden KÃ¼rzel zurÃ¼ck (ohne AnfÃ¼hrungszeichen, ohne Punkt):\n"
            "Zu\nAb\nNe"
        )

    prompt = build_prompt_text(name, text)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LEN,
        padding=True,
    ).to(model.device)

    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=2)

    decoded = tokenizer.decode(out[0], skip_special_tokens=True).strip()
    for label, token in LABEL_TOKEN_MAP.items():
        if token.lower() in decoded.lower():
            return label
    return "Unklar"

# Vorhersagen ausfÃ¼hren
df["pred"] = df.apply(lambda r: predict_label(r["name"], r["text"]), axis=1)

# Evaluation
print("\nClassification Report:")
print(classification_report(df["label"], df["pred"], labels=LABELS))

# Vorhersagen exportieren
OUT_FILE = MODEL_DIR.parent / f"{MODEL_DIR.name}_predictions.csv"
df.to_csv(OUT_FILE, sep=";", index=False, encoding="utf-8-sig")
print(f"\nVorhersagen gespeichert unter: {OUT_FILE}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


ðŸ”¢ Classification Report:
              precision    recall  f1-score   support

  Zustimmung       0.63      0.69      0.66       150
   Ablehnung       0.80      0.86      0.83       387
     Neutral       0.68      0.47      0.56       138

    accuracy                           0.74       675
   macro avg       0.70      0.67      0.68       675
weighted avg       0.74      0.74      0.73       675


ðŸ“‚ Predictions saved to: /content/Output/flan-t5-xl-finetuned_predictions.csv


In [None]:
from huggingface_hub import login, upload_folder, create_repo
from google.colab import userdata
from pathlib import Path

# Konfiguration
HF_TOKEN = userdata.get("HF_TOKEN")
HF_REPO_ID = "YangZexi/flan-t5-xl-stance-lora-v2"
SAVE_DIR = Path("/content/Output/flan-t5-xl-finetuned")

# Login
if not HF_TOKEN:
    raise ValueError("Kein HF_TOKEN gefunden â€“ unter 'Notebook > Secrets > HF_TOKEN' setzen.")
login(HF_TOKEN)

# Repository erstellen (falls nicht vorhanden)
create_repo(repo_id=HF_REPO_ID, repo_type="model", exist_ok=True, token=HF_TOKEN)

# Hochladen
upload_folder(
    repo_id=HF_REPO_ID,
    folder_path=SAVE_DIR,
    path_in_repo=".",
    repo_type="model",
    token=HF_TOKEN
)

print(f"Modell erfolgreich hochgeladen: https://huggingface.co/{HF_REPO_ID}")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...netuned/checkpoint-198/optimizer.pt:   0%|          |  130kB /  391MB            

  ...netuned/checkpoint-297/scheduler.pt: 100%|##########| 1.06kB / 1.06kB            

  ...netuned/checkpoint-198/scheduler.pt: 100%|##########| 1.06kB / 1.06kB            

  ...netuned/checkpoint-396/optimizer.pt:   0%|          |  130kB /  391MB            

  ...etuned/checkpoint-198/rng_state.pth:  78%|#######7  | 11.0kB / 14.2kB            

  ...netuned/checkpoint-396/scheduler.pt: 100%|##########| 1.06kB / 1.06kB            

  ...etuned/checkpoint-396/rng_state.pth:  78%|#######7  | 11.0kB / 14.2kB            

  ...netuned/checkpoint-396/spiece.model: 100%|##########|  792kB /  792kB            

  ...netuned/checkpoint-495/scheduler.pt: 100%|##########| 1.06kB / 1.06kB            

  ...netuned/checkpoint-495/spiece.model: 100%|##########|  792kB /  792kB            

âœ… Modell erfolgreich hochgeladen: https://huggingface.co/YangZexi/flan-t5-xl-stance-lora-v2
