In [None]:
# Inferenz – LLaMA-3.1-8B ODER Mistral-7B Instruct
# LoRA aus HF laden, HARD-MERGE & evaluieren

import os, time, re, json, hashlib, warnings
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
from pathlib import Path
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
transformers.logging.set_verbosity_error()
warnings.filterwarnings("ignore", message=".*generation flags are not valid.*")
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from peft import PeftModel, PeftConfig
from huggingface_hub import login

# Model selection: "llama" or "mistral" (any other value is rejected by the
# selection code further down in this cell).
MODEL_CHOICE = "mistral"

# Hugging Face repos that hold the LoRA adapters
HF_REPO_LLAMA   = "YangZexi/llama-3.1-8B-Instruct-stance-lora-v2"
HF_REPO_MISTRAL = "YangZexi/mistral-7b-stance-lora-v2"

# Default base models (may be overridden by the value stored in the adapter's PeftConfig)
BASE_LLAMA   = "meta-llama/Meta-Llama-3.1-8B-Instruct"
BASE_MISTRAL = "mistralai/Mistral-7B-Instruct-v0.3"

# Paths / names
TEST_CSV   = "/content/test3.csv"
# Tag used for the progress-bar label and the output file names.
# NOTE: every MODEL_CHOICE other than "llama" yields the Mistral tag here.
MODEL_TAG  = "llama-3.1-8b-ft" if MODEL_CHOICE == "llama" else "mistral-7b-ft"

# Generation settings
MAX_LEN_IN  = 512    # input token budget consumed by predict_one
MAX_NEW_TOK = 3      # number of new tokens generated per example
TEMPERATURE = 0.0    # only forwarded to generate() when DO_SAMPLE is True
DO_SAMPLE   = False  # False -> greedy decoding

# Full class label -> short code the prompt asks the model to answer with.
LABEL_TOKEN_MAP = {"Zustimmung": "Zu", "Ablehnung": "Ab", "Neutral": "Ne"}
LABELS = list(LABEL_TOKEN_MAP.keys())

# Login (optional): prefer the Colab secret store, otherwise fall back to the
# HF_TOKEN environment variable. Without a token no login is attempted.
try:
    from google.colab import userdata
    HF_TOKEN = userdata.get("HF_TOKEN")
except Exception:
    # Not running in Colab (import fails) or the secret lookup raised.
    HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Helfer Funktionen
_CLEAN_RE = re.compile(r'[\"\'\.\,\:\;\!\?\-\—\–\(\)\[\]\{\}]')
def parse_short_to_label(raw_output: str) -> str:
    """Map a raw model generation to one of the stance labels.

    The text is lower-cased, punctuation is replaced by spaces, and the words
    are scanned left to right. The first word that *starts with* one of the
    short codes ("zu", "ab", "ne") decides the label, so "Zu", "Zustimmung"
    and "Antwort: Ab" are all understood.

    Matching is done per word rather than by substring on the whole text:
    a substring test fires inside unrelated words (e.g. "keine" contains
    "ne", "dazu" contains "zu") and would turn noise into a confident label.

    Args:
        raw_output: Decoded generation; anything that is not a ``str`` is
            treated as unparseable.

    Returns:
        "Zustimmung", "Ablehnung" or "Neutral", or "Unklar" when no word
        starts with a known short code.
    """
    if not isinstance(raw_output, str):
        return "Unklar"

    prefix_to_label = (
        ("zu", "Zustimmung"),
        ("ab", "Ablehnung"),
        ("ne", "Neutral"),
    )
    # split() also collapses runs of whitespace and drops leading/trailing
    # blanks, so an empty or punctuation-only answer yields no words at all.
    words = _CLEAN_RE.sub(" ", raw_output.lower()).split()
    for word in words:
        for prefix, label in prefix_to_label:
            if word.startswith(prefix):
                return label
    return "Unklar"

def pick_attn_impl():
    """Choose the attention implementation name passed to ``from_pretrained``.

    Returns "sdpa" when ``torch.backends.cuda.sdp_kernel`` can be looked up
    on the installed torch build, and "eager" when that lookup raises.
    """
    try:
        # Only the attribute lookup matters; the object itself is not used.
        torch.backends.cuda.sdp_kernel
    except Exception:
        return "eager"
    else:
        return "sdpa"

def tensor_md5(t):
    """Return the MD5 hex digest of a tensor's values, or "NA" on any failure.

    The tensor is detached, cast to float, moved to the CPU and converted to
    a NumPy array; the digest is taken over that array's raw bytes. Any
    exception along the way (e.g. ``t`` is not a tensor) produces "NA".
    """
    try:
        raw_bytes = t.detach().float().cpu().numpy().tobytes()
        digest = hashlib.md5(raw_bytes).hexdigest()
    except Exception:
        return "NA"
    return digest

# Prompt (chat-template compatible)
def build_messages(name: str, text: str):
    """Build the system + user messages for one classification request.

    Args:
        name: Target of the stance; inserted in double quotes into the task line.
        text: Tweet text; inserted after "Tweet: ".

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the fixed system
        instruction followed by the user prompt (task, answer options and the
        short-code output format).
    """
    system_content = (
        "Du bist ein Stance-Klassifizierer für politische Tweets. "
        "Kategorisiere die Haltung als genau eine der drei Klassen: Zustimmung, Ablehnung oder Neutral."
    )

    # Variable part: the target and the tweet.
    task_section = (
        "### Aufgabe\n"
        f"Bewerte die Haltung des folgenden Tweets gegenüber \"{name}\".\n\n"
        f"Tweet: {text}\n\n"
    )
    # Fixed part: class descriptions followed by the expected answer format.
    options_section = "".join([
        "### Antwortmöglichkeiten:\n",
        "• Zustimmung: Der Tweet äußert sich explizit oder implizit positiv oder unterstützend über das Ziel.\n",
        "• Ablehnung: Der Tweet äußert sich explizit oder implizit negativ oder kritisch über das Ziel.\n",
        "• Neutral: Der Tweet ist sachlich, ambivalent oder zeigt keine erkennbare Haltung.\n",
    ])
    format_section = "".join([
        "### Ausgabeformat (Kurzform):\n",
        "Gib **genau eines** der folgenden Kürzel zurück (ohne Anführungszeichen, ohne Punkt):\n",
        "Zu\nAb\nNe",
    ])

    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": task_section + options_section + format_section},
    ]

# Model & adapter selection
if MODEL_CHOICE not in ("llama", "mistral"):
    raise ValueError("MODEL_CHOICE muss 'llama' oder 'mistral' sein.")
HF_REPO_ID = HF_REPO_LLAMA if MODEL_CHOICE == "llama" else HF_REPO_MISTRAL
BASE_MODEL = BASE_LLAMA if MODEL_CHOICE == "llama" else BASE_MISTRAL

# The adapter config names the base model it belongs to; when that name is
# set and differs from the default above, it takes precedence.
peft_cfg = PeftConfig.from_pretrained(HF_REPO_ID)
if peft_cfg.base_model_name_or_path != BASE_MODEL and peft_cfg.base_model_name_or_path:
    print(f" Adapter meldet base={peft_cfg.base_model_name_or_path} überschreibe BASE_MODEL.")
    BASE_MODEL = peft_cfg.base_model_name_or_path

# Load & MERGE
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Applied unconditionally: padding always uses the EOS id, even when the
# tokenizer ships its own pad token.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

# NOTE(review): BASE_MODEL may come from the downloaded adapter config, and
# trust_remote_code=True allows that repo to execute its own Python code.
# Llama and Mistral do not appear to need it — confirm and consider dropping it.
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=dtype,
    trust_remote_code=True,
    attn_implementation=pick_attn_impl(),
)

# Merge check: hash one weight before and after merging the adapter.
# The probe must be a weight the adapter actually modifies. The first
# ".weight" entry of the state dict is the token embedding, which LoRA does
# not touch unless explicitly targeted, so probing it reports "unchanged"
# even for a perfectly applied adapter. Pick the first weight that belongs to
# one of the adapter's target modules instead.
_targets = getattr(peft_cfg, "target_modules", None) or ()
if isinstance(_targets, str):
    # A single string may also be a regex in PEFT; in that case no suffix
    # matches and the check is reported as skipped below.
    _targets = (_targets,)
_probe_suffixes = tuple(f".{t}.weight" for t in _targets)
_base_state = base.state_dict()
probe_name = next((k for k in _base_state if k.endswith(_probe_suffixes)), None)
# The hash must be taken now: merging updates the base weights in place.
pre_hash = tensor_md5(_base_state[probe_name]) if probe_name else "NA"
del _base_state

model = PeftModel.from_pretrained(base, HF_REPO_ID)
model = model.merge_and_unload(safe_merge=True)
model.eval()

# .get() keeps a renamed/missing key from crashing the run; tensor_md5 maps
# a missing tensor to "NA".
post_hash = tensor_md5(model.state_dict().get(probe_name)) if probe_name else "NA"
if "NA" in (pre_hash, post_hash):
    # Two "NA" values compare equal, which must not be read as "unchanged".
    print(" Merge-Check: übersprungen – kein prüfbares Adapter-Zielgewicht gefunden.")
elif pre_hash == post_hash:
    print(" Merge-Check: Probe-Gewicht unverändert – Adapter evtl. nicht angewendet.")
else:
    print(" Merge-Check: Gewichte geändert – Merge ok.")

# Load data: semicolon-separated CSV; "utf-8-sig" strips a leading BOM if present.
df = pd.read_csv(TEST_CSV, sep=";", encoding="utf-8-sig")
# Keep only rows whose gold label is one of the three known classes.
df = df[df["label"].isin(LABELS)].copy()

# Prediction
def predict_one(name: str, text: str) -> str:
    """Classify the stance of one tweet toward ``name`` with the merged model.

    The prompt is rendered with the tokenizer's chat template, a few tokens
    are generated (greedy unless DO_SAMPLE is set) and the decoded
    continuation is mapped to a label by ``parse_short_to_label``.

    The MAX_LEN_IN cap is applied to the tweet only. Truncating the rendered
    chat text instead removes tokens from its *end*, i.e. the output-format
    instructions and the generation prompt that tells the model an answer is
    expected, which silently corrupts the request for long inputs.

    Args:
        name: Target of the stance.
        text: Tweet text.

    Returns:
        "Zustimmung", "Ablehnung", "Neutral" or "Unklar".
    """
    tweet_ids = tokenizer.encode(text, add_special_tokens=False)
    if len(tweet_ids) > MAX_LEN_IN:
        text = tokenizer.decode(tweet_ids[:MAX_LEN_IN], skip_special_tokens=True)

    messages = build_messages(name, text)
    chat_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # NOTE(review): chat_text already contains the template's special tokens,
    # and the tokenizer adds its own by default (possible duplicate BOS). Left
    # unchanged because the adapter may have been trained with the same
    # preprocessing — confirm against the training script.
    enc = tokenizer(
        chat_text,
        return_tensors="pt",
        padding=False,
    ).to(model.device)

    gen_kwargs = dict(
        max_new_tokens=MAX_NEW_TOK,
        do_sample=DO_SAMPLE,
        pad_token_id=tokenizer.eos_token_id,
    )
    if DO_SAMPLE:
        # Temperature is only meaningful (and only accepted) when sampling.
        gen_kwargs["temperature"] = TEMPERATURE

    with torch.no_grad():
        gen = model.generate(**enc, **gen_kwargs)

    # generate() returns prompt + continuation; decode only the new tokens.
    prompt_len = enc["input_ids"].shape[1]
    out = tokenizer.decode(gen[0][prompt_len:], skip_special_tokens=True).strip()
    return parse_short_to_label(out)

# Register tqdm's progress_apply on pandas, then predict row by row.
tqdm.pandas(desc=f" Vorhersage ({MODEL_TAG})", dynamic_ncols=True, leave=True)
start = time.time()
# name/text are coerced to str and stripped before being put into the prompt.
df["pred"] = df.progress_apply(lambda r: predict_one(str(r["name"]).strip(), str(r["text"]).strip()), axis=1)
dt = time.time() - start  # wall-clock seconds for the whole prediction pass


# Metrics & saving
# Every score is restricted to LABELS. Unparseable generations are stored as
# "Unklar"; without `labels=` scikit-learn treats that string as a fourth
# class (support 0, score 0) in the macro averages, so the saved numbers would
# disagree with the classification report printed below, which already uses
# labels=LABELS. "Unklar" rows still count as misses for their true class.
# zero_division=0 keeps the value scikit-learn would use anyway but without
# emitting the UndefinedMetricWarning.
conf_mat = confusion_matrix(df["label"], df["pred"], labels=LABELS)
per_class_f1 = f1_score(df["label"], df["pred"], labels=LABELS, average=None, zero_division=0)

metrics = {
    "model": MODEL_TAG,
    "base_model": BASE_MODEL,
    "adapter_repo": HF_REPO_ID,
    "accuracy": accuracy_score(df["label"], df["pred"]),
    "macro_f1": f1_score(df["label"], df["pred"], labels=LABELS, average="macro", zero_division=0),
    "weighted_f1": f1_score(df["label"], df["pred"], labels=LABELS, average="weighted", zero_division=0),
    "precision_macro": precision_score(df["label"], df["pred"], labels=LABELS, average="macro", zero_division=0),
    "recall_macro": recall_score(df["label"], df["pred"], labels=LABELS, average="macro", zero_division=0),
    "per_class_f1": dict(zip(LABELS, per_class_f1.tolist())),
    "confusion_matrix": conf_mat.tolist(),
    # Seconds per 1000 examples; max(..., 1) guards against an empty test set.
    "inference_time_per_1000": (dt / max(len(df), 1)) * 1000,
}

# Output locations, all derived from MODEL_TAG.
OUT_PREDS   = Path(f"/content/{MODEL_TAG}_preds.csv")
OUT_METRICS = Path(f"/content/{MODEL_TAG}_metrics.json")
OUT_MD      = Path(f"/content/{MODEL_TAG}_metrics.md")

# Predictions use the same CSV dialect as the input (semicolon, UTF-8 with BOM).
df.to_csv(OUT_PREDS, sep=";", index=False, encoding="utf-8-sig")
with open(OUT_METRICS, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2, ensure_ascii=False)

# Markdown table with the scalar entries only; dict/list values
# (per-class F1, confusion matrix) match neither branch and are skipped.
md = "| Metric | Value |\n|---|---|\n"
for k, v in metrics.items():
    if isinstance(v, (int, float)):
        md += f"| {k} | {v:.4f} |\n"
    elif isinstance(v, str):
        md += f"| {k} | {v} |\n"
with open(OUT_MD, "w", encoding="utf-8") as f:
    f.write(md)

print("\n Classification Report:")
print(classification_report(df["label"], df["pred"], labels=LABELS))
print(f"\n Vorhersagen: {OUT_PREDS}")
print(f" Metriken:     {OUT_METRICS}")
print(f" Metriken MD:  {OUT_MD}")

# Best-effort browser download; silently skipped when not running in Colab
# or when a download fails.
try:
    from google.colab import files
    files.download(OUT_PREDS)
    files.download(OUT_METRICS)
    files.download(OUT_MD)
except Exception:
    pass

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

❗ Merge-Check: Probe-Gewicht unverändert – Adapter evtl. nicht angewendet.


🔍 Predicting (mistral-7b-ft): 100%|██████████| 675/675 [02:17<00:00,  4.90it/s]



🔢 Classification Report:
              precision    recall  f1-score   support

  Zustimmung       0.89      0.69      0.78       150
   Ablehnung       0.88      0.86      0.87       387
     Neutral       0.62      0.80      0.70       138

   micro avg       0.81      0.81      0.81       675
   macro avg       0.80      0.78      0.78       675
weighted avg       0.83      0.81      0.82       675


📂 Predictions: /content/mistral-7b-ft_preds.csv
📊 Metrics:     /content/mistral-7b-ft_metrics.json
📝 Metrics MD:  /content/mistral-7b-ft_metrics.md


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Stance Detection — Mistral-7B / LLaMA-3.1 8B
# Modes: 1=Zero-shot, 2=Few-shot

import os, time, re, json, warnings
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

from pathlib import Path
import pandas as pd
import torch
from tqdm import tqdm

import transformers
transformers.logging.set_verbosity_error()
warnings.filterwarnings("ignore", message=".*generation flags are not valid.*")

from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import (
    classification_report, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
)

# Configuration
MODE = 2                  # 1 = zero-shot, 2 = few-shot
MODEL_CHOICE = "llama"    # "mistral" or "llama" (see load_model_and_tokenizer)
TEST_CSV = "/content/test3.csv"
MAX_INPUT_TOKENS = 384    # input token budget consumed by predict_one

# Full class label -> short code the model is asked to answer with.
LABEL_TOKEN_MAP = {"Zustimmung": "Zu", "Ablehnung": "Ab", "Neutral": "Ne"}
# Reverse lookup keyed by the lower-cased short code.
SHORT_TO_LABEL  = {v.lower(): k for k, v in LABEL_TOKEN_MAP.items()}
SHORTS = list(LABEL_TOKEN_MAP.values())

# Few-shot examples (compact): one example per class, each consisting of a
# tweet, the stance question and the expected short code.
FIXED_FEWSHOT = (
    '1:Tweet: "Polizeiliche Schnellgerichte für Haftstrafen bei Angriffen auf Einsatzkräfte – absolut richtig!"\n'
    'Bewerte die Haltung des folgenden Tweets gegenüber "CDU/CSU"?\nAntwort: Zu\n\n'
    '2:Tweet: "Sie/Ihr versteht es nicht und werdet es niemals verstehen."\n'
    'Bewerte die Haltung des folgenden Tweets gegenüber "Lang, Ricarda"?\nAntwort: Ab\n\n'
    '3:Tweet: "Energiesektor: #Subventionen im #Energiesektor müssen gezielt und nachhaltig eingesetzt werden."\n'
    'Bewerte die Haltung des folgenden Tweets gegenüber "Söder, Markus"?\nAntwort: Ne'
)

# Prompt builder (user content)
def build_prompt(name: str, text: str, few_shot: bool) -> str:
    """Assemble the user prompt for one tweet.

    Args:
        name: Target of the stance; inserted in double quotes into the task line.
        text: Tweet text; inserted after "Tweet: ".
        few_shot: When truthy, the FIXED_FEWSHOT examples are inserted between
            the answer options and the output-format section.

    Returns:
        The prompt string: task, answer options, optional examples and the
        short-code output format.
    """
    sections = [
        "### Aufgabe\n",
        f"Bewerte die Haltung des folgenden Tweets gegenüber \"{name}\".\n\n",
        f"Tweet: {text}\n\n",
        "### Antwortmöglichkeiten:\n",
        "• Zustimmung: Der Tweet äußert sich explizit oder implizit positiv oder unterstützend über das Ziel.\n",
        "• Ablehnung: Der Tweet äußert sich explizit oder implizit negativ oder kritisch über das Ziel.\n",
        "• Neutral: Der Tweet ist sachlich, ambivalent oder zeigt keine erkennbare Haltung.\n",
    ]
    if few_shot:
        sections.append(f"\n### Beispiele\n{FIXED_FEWSHOT}\n")
    sections.extend([
        "\n### Ausgabeformat (Kurzform):\n",
        "Gib genau eines der folgenden Kürzel zurück (ohne Anführungszeichen, ohne Punkt):\n",
        "Zu\n",
        "Ab\n",
        "Ne",
    ])
    return "".join(sections)

# Messages (system + user) and chat template
def build_messages(name: str, text: str, few_shot: bool):
    """Return the [system, user] message list for one tweet.

    The system message is a fixed classifier instruction; the user message
    carries the prompt produced by ``build_prompt(name, text, few_shot)``.
    """
    system_content = (
        "Du bist ein Stance-Klassifizierer für politische Tweets. "
        "Kategorisiere die Haltung als genau eine der drei Klassen: Zustimmung, Ablehnung oder Neutral."
    )
    user_content = build_prompt(name, text, few_shot)
    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]

def make_chat_text(tokenizer, name: str, text: str, few_shot: bool) -> str:
    """Render the messages for one tweet through the tokenizer's chat template.

    Returns the untokenized template output with the generation prompt
    appended (``tokenize=False, add_generation_prompt=True``).
    """
    conversation = build_messages(name, text, few_shot)
    rendered = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    return rendered

# Parser for Zu/Ab/Ne
_CLEAN_RE = re.compile(r'[\"\'\.\,\:\;\!\?\-\—\–\(\)\[\]\{\}]')
def parse_short_to_label(raw_output: str) -> str:
    """Translate a decoded generation into a stance label.

    Resolution order on the lower-cased, punctuation-free text:
      1. the first word is exactly a short code from SHORT_TO_LABEL,
      2. the first word starts with "zu" / "ab" / "ne",
      3. any short code occurs anywhere in the text (substring test),
      4. otherwise "Unklar".

    Non-string or empty input yields "Unklar".
    """
    if not isinstance(raw_output, str):
        return "Unklar"

    cleaned = _CLEAN_RE.sub(" ", raw_output.strip().lower())
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    if cleaned == "":
        return "Unklar"

    head = cleaned.split()[0]
    if head in SHORT_TO_LABEL:
        return SHORT_TO_LABEL[head]

    for prefix, label in (("zu", "Zustimmung"), ("ab", "Ablehnung"), ("ne", "Neutral")):
        if head.startswith(prefix):
            return label

    # Last resort: first short code (in SHORT_TO_LABEL order) found as a substring.
    return next(
        (label for code, label in SHORT_TO_LABEL.items() if code in cleaned),
        "Unklar",
    )

# Forced decoding (only Zu/Ab/Ne)
def build_allowed_token_seqs(tokenizer, shorts):
    """Encode every short code into the token-id sequence decoding may emit.

    Each code is encoded with a leading space first; if that yields no ids,
    the code is encoded without the space. Special tokens are never added.
    NOTE(review): whether the space-prefixed form is the natural first token
    after the chat template's assistant header depends on the tokenizer —
    confirm per model.

    Returns:
        A list with one token-id list per entry of ``shorts``, in order.
    """
    def _encode(code):
        with_space = tokenizer.encode(" " + code, add_special_tokens=False)
        if with_space:
            return with_space
        return tokenizer.encode(code, add_special_tokens=False)

    return [_encode(code) for code in shorts]

def make_prefix_allowed_tokens_fn(prompt_len, allowed_seqs, eos_id):
    """Create the ``prefix_allowed_tokens_fn`` callback for constrained decoding.

    Args:
        prompt_len: Number of prompt tokens; everything after that position
            in ``input_ids`` counts as generated.
        allowed_seqs: Token-id lists that the generated part may spell out.
        eos_id: Token allowed once a sequence is complete (``None`` for none).

    Returns:
        A function ``fn(batch_id, input_ids)`` returning the token ids that
        may come next: the first token of every sequence when nothing has
        been generated yet, the next token of each sequence matching the
        generated prefix, ``eos_id`` for fully generated sequences, and the
        set of first tokens again if none of these applies.
    """
    def _first_tokens():
        return {seq[0] for seq in allowed_seqs}

    def fn(batch_id, input_ids):
        generated = input_ids.tolist()[prompt_len:]
        if not generated:
            return list(_first_tokens())

        n_generated = len(generated)
        candidates = set()
        for seq in allowed_seqs:
            if seq[:n_generated] != generated:
                continue  # generated tokens are not a prefix of this sequence
            if n_generated < len(seq):
                candidates.add(seq[n_generated])
            elif eos_id is not None:
                candidates.add(eos_id)

        # An empty result would leave the sampler nothing to pick from.
        return list(candidates or _first_tokens())

    return fn

# Load model (base models only)
def load_model_and_tokenizer():
    """Load the base model and tokenizer selected by MODEL_CHOICE.

    Returns:
        ``(model, tokenizer, base_name)``: the model in eval mode, its
        tokenizer (pad token set to EOS when missing, right padding) and the
        short name used for output files.

    Raises:
        ValueError: If MODEL_CHOICE is neither "mistral" nor one of
            "llama" / "llama3" / "llama-3" (case-insensitive).
    """
    dtype = torch.float32
    if torch.cuda.is_available():
        dtype = torch.bfloat16

    choice = MODEL_CHOICE.lower()
    if choice == "mistral":
        repo_id, base_name = "mistralai/Mistral-7B-Instruct-v0.3", "mistral-7b"
    elif choice in {"llama", "llama3", "llama-3"}:
        repo_id, base_name = "meta-llama/Meta-Llama-3.1-8B-Instruct", "llama-3.1-8b"
    else:
        raise ValueError("MODEL_CHOICE muss 'mistral' oder 'llama' sein.")

    tok = AutoTokenizer.from_pretrained(repo_id, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "right"

    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        device_map="auto",
        torch_dtype=dtype,
        trust_remote_code=True,
    )
    model = model.eval()
    return model, tok, base_name

model, tokenizer, base_name = load_model_and_tokenizer()
# A MODE other than 1 or 2 fails here with a KeyError.
mode_name = {1: "zero", 2: "few"}[MODE]
MODEL_ID = f"{base_name}-{mode_name}"  # used for the output file names

# Data: semicolon-separated CSV; keep only rows with a known gold label.
df = pd.read_csv(TEST_CSV, sep=";", encoding="utf-8-sig")
valid = set(LABEL_TOKEN_MAP.keys())
df = df[df["label"].isin(valid)].copy()

# Prediction (progress bar, deterministic, no temperature/top_p)
# Token sequences the constrained decoder may produce, plus the EOS id that
# terminates a completed sequence.
allowed_seqs = build_allowed_token_seqs(tokenizer, SHORTS)
eos_id = tokenizer.eos_token_id

def predict_one(name: str, text: str):
    """Classify the stance of one tweet toward ``name`` with constrained decoding.

    The prompt (zero- or few-shot, depending on MODE) is rendered with the
    chat template and generation is restricted to the Zu/Ab/Ne token
    sequences followed by EOS.

    Two details matter for the prompt to reach the model intact:

    * MAX_INPUT_TOKENS caps the tweet, not the rendered chat text. The
      tokenizer truncates from the right, so truncating the full prompt cuts
      off its end — the output-format section and the assistant header —
      which is easily hit by the few-shot prompt at this budget.
    * The rendered template already contains its special tokens, so the
      text is tokenized with ``add_special_tokens=False`` to avoid a second
      BOS token.

    Args:
        name: Target of the stance.
        text: Tweet text.

    Returns:
        "Zustimmung", "Ablehnung", "Neutral" or "Unklar".
    """
    tweet_ids = tokenizer.encode(text, add_special_tokens=False)
    if len(tweet_ids) > MAX_INPUT_TOKENS:
        text = tokenizer.decode(tweet_ids[:MAX_INPUT_TOKENS], skip_special_tokens=True)

    chat_text = make_chat_text(tokenizer, name, text, few_shot=(MODE == 2))
    enc = tokenizer(
        chat_text,
        return_tensors="pt",
        add_special_tokens=False,
        padding=False,
    ).to(model.device)
    prompt_len = enc["input_ids"].shape[1]
    allowed_fn = make_prefix_allowed_tokens_fn(prompt_len, allowed_seqs, eos_id)
    with torch.no_grad():
        gen = model.generate(
            **enc,
            # Longest allowed sequence plus one step for the closing EOS.
            max_new_tokens=max(len(s) for s in allowed_seqs) + 1,
            do_sample=False,  # deterministic
            prefix_allowed_tokens_fn=allowed_fn,
            pad_token_id=tokenizer.eos_token_id,
        )
    # generate() returns prompt + continuation; decode only the new tokens.
    out = tokenizer.decode(gen[0][prompt_len:], skip_special_tokens=True).strip()
    return parse_short_to_label(out)

# Register tqdm's progress_apply on pandas, then predict row by row.
tqdm.pandas(desc=f" Vorhersage ({base_name} {mode_name})", dynamic_ncols=True, leave=True)
start = time.time()
# name/text are coerced to str and stripped before being put into the prompt.
df["pred"] = df.progress_apply(lambda r: predict_one(str(r["name"]).strip(), str(r["text"]).strip()), axis=1)
duration = time.time() - start  # wall-clock seconds for the whole prediction pass

# Metrics & saving
# Every score is restricted to the three known labels. The parser can return
# "Unklar"; without `labels=` scikit-learn would count that string as a fourth
# class (support 0, score 0) in the macro averages, so the saved numbers could
# disagree with the classification report, which already uses labels=labels.
# zero_division=0 keeps the value scikit-learn would use anyway but without
# emitting the UndefinedMetricWarning.
labels = list(LABEL_TOKEN_MAP.keys())
report = classification_report(df["label"], df["pred"], labels=labels, output_dict=True, zero_division=0)
conf_mat = confusion_matrix(df["label"], df["pred"], labels=labels)
per_class_f1 = f1_score(df["label"], df["pred"], labels=labels, average=None, zero_division=0)

metrics = {
    "model": MODEL_ID,
    "accuracy": accuracy_score(df["label"], df["pred"]),
    "macro_f1": f1_score(df["label"], df["pred"], labels=labels, average="macro", zero_division=0),
    "weighted_f1": f1_score(df["label"], df["pred"], labels=labels, average="weighted", zero_division=0),
    "precision_macro": precision_score(df["label"], df["pred"], labels=labels, average="macro", zero_division=0),
    "recall_macro": recall_score(df["label"], df["pred"], labels=labels, average="macro", zero_division=0),
    "per_class_f1": dict(zip(labels, per_class_f1.tolist())),
    "confusion_matrix": conf_mat.tolist(),
    # Seconds per 1000 examples; max(..., 1) guards against an empty test set.
    "inference_time_per_1000": (duration / max(len(df), 1)) * 1000,
}

# Output locations, derived from MODEL_ID (base model name + mode).
OUT_PREDS   = Path(f"/content/{MODEL_ID}_preds.csv")
OUT_METRICS = Path(f"/content/{MODEL_ID}_metrics.json")
# Predictions use the same CSV dialect as the input (semicolon, UTF-8 with BOM).
df.to_csv(OUT_PREDS, sep=";", index=False, encoding="utf-8-sig")
with open(OUT_METRICS, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2, ensure_ascii=False)

print("\nClassification Report:")
print(classification_report(df["label"], df["pred"], labels=labels))
print(f"\nPredictions: {OUT_PREDS}")
print(f"Metrics:     {OUT_METRICS}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

🔍 Predicting (llama-3.1-8b few): 100%|██████████| 675/675 [02:56<00:00,  3.83it/s]


Classification Report:
              precision    recall  f1-score   support

  Zustimmung       0.32      0.83      0.46       150
   Ablehnung       0.82      0.53      0.64       387
     Neutral       0.68      0.20      0.30       138

    accuracy                           0.53       675
   macro avg       0.60      0.52      0.47       675
weighted avg       0.68      0.53      0.53       675


Predictions: /content/llama-3.1-8b-few_preds.csv
Metrics:     /content/llama-3.1-8b-few_metrics.json





In [None]:
# Test Evaluation Skript – FLAN-T5-XL + kompakter Prompt + HF-LoRA
import torch, json, warnings
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import (
    classification_report, accuracy_score, f1_score, precision_score,
    recall_score, confusion_matrix
)
from peft import PeftModel, PeftConfig
from time import time
from huggingface_hub import login
from google.colab import userdata, files
import shutil

# Login: a missing token or a failed login is reported but does not stop the run.
try:
    login(token=userdata.get("HF_TOKEN"))
except Exception:
    print("Hugging Face Token nicht gefunden. Einige Modelle werden möglicherweise nicht geladen.")

# Configuration
HF_REPO_ID = "YangZexi/flan-t5-xl-stance-lora"  # LoRA adapter repo
MODEL_ID   = "flan-t5-xl"                       # used for the output file names
TEST_PATH  = Path("/content/test3.csv")
MAX_LEN    = 160                                # tokenizer max_length for the prompt

# Full class label -> short code the model is asked to answer with.
LABEL_TOKEN_MAP = {
    "Zustimmung": "Zu",
    "Ablehnung":  "Ab",
    "Neutral":    "Ne",
}
LABELS = list(LABEL_TOKEN_MAP.keys())

# Load model & tokenizer: the base model name is taken from the adapter config,
# the adapter is attached and then merged into the base weights.
peft_config = PeftConfig.from_pretrained(HF_REPO_ID)
base_model = AutoModelForSeq2SeqLM.from_pretrained(peft_config.base_model_name_or_path, device_map="auto")
model = PeftModel.from_pretrained(base_model, HF_REPO_ID)
model = model.merge_and_unload()
model.eval()

tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

# Load test data: semicolon-separated CSV; keep only rows with a known gold label.
df = pd.read_csv(TEST_PATH, sep=";", encoding="utf-8-sig")
df = df[df["label"].isin(LABELS)].copy()
# Convenience column; it is not read by the prediction code in this cell but
# is written to the predictions CSV along with the rest of the frame.
df["input_text"] = df["name"] + "; " + df["text"]

# Prediction function
def predict_label(name: str, text: str) -> str:
    """Classify the stance of one tweet toward ``name`` with the merged T5 model.

    The prompt is tokenized with truncation at MAX_LEN tokens, at most two
    new tokens are generated, and the first label whose short code occurs
    (case-insensitively) in the decoded output is returned.
    NOTE(review): the prompt can exceed MAX_LEN, in which case its tail is
    truncated — presumably matching the training setup; confirm.

    Returns:
        "Zustimmung", "Ablehnung", "Neutral" or "Unklar".
    """
    prompt = "".join([
        "### Aufgabe\n",
        f"Bewerte die Haltung des folgenden Tweets gegenüber \"{name}\".\n\n",
        f"Tweet: {text}\n\n",
        "### Antwortmöglichkeiten:\n",
        "• Zustimmung: Der Tweet äußert sich explizit oder implizit positiv oder unterstützend über das Ziel.\n",
        "• Ablehnung: Der Tweet äußert sich explizit oder implizit negativ oder kritisch über das Ziel.\n",
        "• Neutral: Der Tweet ist sachlich, ambivalent oder zeigt keine erkennbare Haltung.\n",
        "### Ausgabeformat (Kurzform):\n",
        "Gib genau eines der folgenden Kürzel zurück (ohne Anführungszeichen, ohne Punkt):\n",
        "Zu\nAb\nNe",
    ])

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LEN,
        padding=True,
    ).to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=2)

    decoded = tokenizer.decode(output[0], skip_special_tokens=True).strip()
    answer = decoded.lower()
    # First label (in LABEL_TOKEN_MAP order) whose short code appears in the answer.
    return next(
        (label for label, token in LABEL_TOKEN_MAP.items() if token.lower() in answer),
        "Unklar",
    )

# Predictions with progress bar
start = time()
tqdm.pandas(desc=" Vorhersage")
# Unlike the other cells, name/text are passed through without str()/strip().
df["pred"] = df.progress_apply(lambda r: predict_label(r["name"], r["text"]), axis=1)
duration = time() - start  # wall-clock seconds for the whole prediction pass

# Metrics
# Every score is restricted to LABELS. predict_label can return "Unklar";
# without `labels=` scikit-learn would count that string as a fourth class
# (support 0, score 0) in the macro averages, so the saved numbers could
# disagree with the classification report, which already uses labels=LABELS.
# zero_division=0 keeps the value scikit-learn would use anyway but without
# emitting the UndefinedMetricWarning.
report = classification_report(df["label"], df["pred"], labels=LABELS, output_dict=True, zero_division=0)
conf_mat = confusion_matrix(df["label"], df["pred"], labels=LABELS)
per_class_f1 = f1_score(df["label"], df["pred"], labels=LABELS, average=None, zero_division=0)

metrics = {
    "model": MODEL_ID,
    "accuracy": accuracy_score(df["label"], df["pred"]),
    "macro_f1": f1_score(df["label"], df["pred"], labels=LABELS, average="macro", zero_division=0),
    "weighted_f1": f1_score(df["label"], df["pred"], labels=LABELS, average="weighted", zero_division=0),
    "precision_macro": precision_score(df["label"], df["pred"], labels=LABELS, average="macro", zero_division=0),
    "recall_macro": recall_score(df["label"], df["pred"], labels=LABELS, average="macro", zero_division=0),
    "per_class_f1": dict(zip(LABELS, per_class_f1.tolist())),
    "confusion_matrix": conf_mat.tolist(),
    # Seconds per 1000 examples; max(..., 1) avoids a ZeroDivisionError when
    # the label filter leaves no rows.
    "inference_time_per_1000": duration / max(len(df), 1) * 1000,
}

# Saving
df_out = Path(f"/content/{MODEL_ID}_preds.csv")
metrics_json = Path(f"/content/{MODEL_ID}_metrics.json")
# NOTE(review): double underscore in the file name — possibly a typo; confirm
# before relying on the path.
metrics_md = Path(f"/content/{MODEL_ID}__metrics.md")

# Predictions use the same CSV dialect as the input (semicolon, UTF-8 with BOM).
df.to_csv(df_out, sep=";", index=False, encoding="utf-8-sig")
with open(metrics_json, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2, ensure_ascii=False)

# Markdown table: scalar entries only; dict/list values are skipped,
# numbers are printed with four decimals, everything else verbatim.
markdown_table = "| Metric | Value |\n|---|---|\n"
for k, v in metrics.items():
    if isinstance(v, dict) or isinstance(v, list): continue
    markdown_table += f"| {k} | {v:.4f} |\n" if isinstance(v, (int, float)) else f"| {k} | {v} |\n"

with open(metrics_md, "w", encoding="utf-8") as f:
    f.write(markdown_table)

# Wrap-up
print("\n Classification Report:")
print(classification_report(df["label"], df["pred"], labels=LABELS))
print(f"\n Vorhersagen gespeichert in: {df_out}")
print(f" Metriken gespeichert in: {metrics_json}")

# Directory for downloads; it is created here but not used by the visible code.
DOWNLOAD_DIR = Path(f"/content/{MODEL_ID}_downloads")
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/195M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

🔍 Predicting: 100%|██████████| 675/675 [01:32<00:00,  7.31it/s]



🔢 Classification Report:
              precision    recall  f1-score   support

  Zustimmung       0.66      0.64      0.65       150
   Ablehnung       0.81      0.82      0.81       387
     Neutral       0.59      0.58      0.58       138

    accuracy                           0.73       675
   macro avg       0.68      0.68      0.68       675
weighted avg       0.73      0.73      0.73       675


📂 Predictions saved to: /content/flan-t5-xl_preds.csv
📊 Metrics saved to: /content/flan-t5-xl_metrics.json
