In [None]:
# Setup: Environment Check & Installation (Colab Compatible)
import importlib, logging, os, sys, subprocess
from packaging import version
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger("setup")

def ensure_min(pkg: str, min_ver: str):
    """Install ``pkg`` with pip unless at least ``min_ver`` is already installed.

    Args:
        pkg: Distribution name as known to pip / importlib.metadata.
        min_ver: Minimum acceptable version string (e.g. "4.43.0").

    Raises:
        subprocess.CalledProcessError: If the pip invocation exits non-zero.
    """
    # `import importlib` does not load the `metadata` submodule; import it
    # explicitly so the lookup cannot fail with AttributeError.
    import importlib.metadata as importlib_metadata

    try:
        cur_ver = importlib_metadata.version(pkg)
    except importlib_metadata.PackageNotFoundError:
        cur_ver = None  # not installed at all

    if cur_ver is not None and version.parse(cur_ver) >= version.parse(min_ver):
        logger.info(f"{pkg} ≥ {min_ver} bereits installiert.")
        return

    # Missing or too old: install/upgrade into the interpreter running this kernel.
    logger.info(f"Installiere {pkg}>={min_ver} (gefunden: {cur_ver}).")
    subprocess.run([sys.executable, "-m", "pip", "install", f"{pkg}>={min_ver}"], check=True)

# Meta-Llama-3.1 checkpoints need transformers >= 4.43.0 (older releases reject
# the model's rope_scaling config), and the `eval_strategy` argument passed to
# TrainingArguments below is only available from 4.41 on.
ensure_min("transformers", "4.43.0")
ensure_min("peft", "0.10.0")
ensure_min("datasets", "2.19.0")
ensure_min("bitsandbytes", "0.43.1")
ensure_min("accelerate", "0.27.2")

# Imports
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType

# Config
# Token budget for prompt + answer of one training example (tune as needed).
MAX_LEN = 384

# Base checkpoint that gets fine-tuned.
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Directory holding the input CSVs; the fine-tuned output is stored below it.
DATA_PATH = Path("/content/")
SAVE_DIR = DATA_PATH.joinpath("Output", "llama-3-8b-finetuned")
os.makedirs(SAVE_DIR, exist_ok=True)

# Full stance label -> short code used as the training target.
LABEL_TOKEN_MAP = dict(
    [
        ("Zustimmung", "Zu"),
        ("Ablehnung", "Ab"),
        ("Neutral", "Ne"),
    ]
)
# Accepted label values, in the order of the mapping above.
VALID_LABELS = [*LABEL_TOKEN_MAP]

# Dataset Load & Formatting
def load_dataset(path: Path) -> Dataset:
    """Read a ';'-separated stance CSV and return it as a two-column Dataset.

    Rows are kept only if their label is one of VALID_LABELS and both `name`
    and `text` are present; the number of skipped rows is printed. The result
    has the columns `input_text` ("<name>; <text>") and `target_text` (label).

    Args:
        path: CSV file with at least the columns `name`, `text` and `label`.

    Returns:
        Dataset with the columns `input_text` and `target_text`.

    Raises:
        ValueError: If one of the required columns is missing.
    """
    df = pd.read_csv(path, sep=";", encoding="utf-8-sig")
    print("\nCSV-Spalten:", df.columns.tolist())
    need = {"name", "text", "label"}
    miss = need - set(df.columns)
    if miss:
        raise ValueError(f"Fehlende Spalten in {path}: {miss}")

    # Missing name/text would otherwise be stringified to the literal "nan"
    # and end up as training text.
    usable = df["label"].isin(VALID_LABELS) & df["name"].notna() & df["text"].notna()
    skipped = int((~usable).sum())
    if skipped:
        print(f"{skipped} Zeile(n) ohne gültiges Label/Name/Text übersprungen.")

    # Reset the index so the filtered row numbers are not carried into the
    # Dataset as an extra index column.
    df = df[usable].reset_index(drop=True)
    df["input_text"]  = df["name"].astype(str) + "; " + df["text"].astype(str)
    df["target_text"] = df["label"].astype(str)
    return Dataset.from_pandas(df[["input_text", "target_text"]])

# Load the training and validation splits from DATA_PATH (in that order).
train_ds, val_ds = (
    load_dataset(DATA_PATH / csv_name) for csv_name in ("train.csv", "val.csv")
)

# Tokenizer & Model (4-bit + LoRA)
# 4-bit NF4 quantisation with double quantisation; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as pad token so padding works

# `trust_remote_code` is intentionally not enabled: the checkpoint is loaded
# through the stock AutoModelForCausalLM path, so there is no reason to allow
# execution of code shipped in the model repository.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)

# LoRA adapter settings: rank 32, alpha 64, dropout 0.05, no bias training,
# configured for causal language modelling.
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",     # LLaMA
        "gate_proj", "up_proj", "down_proj",        # (if present)
    ],
)
# Wrap the quantised base model with the LoRA adapter.
model = get_peft_model(model, peft_config)
# Disable the generation cache on the model config (presumably because it is
# not needed during training — confirm).
model.config.use_cache = False
# Explicitly mark every parameter whose name contains "lora" as trainable.
for n, p in model.named_parameters():
    if "lora" in n.lower():
        p.requires_grad_(True)
# Chat-Prompt (ohne Few-Shot) – Ausgabeformat am Ende
def build_messages(name: str, text: str):
    """Build the chat messages (system + user) for one stance example.

    Args:
        name: Target the stance refers to; inserted in quotes into the task line.
        text: Tweet text to classify.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        instruction followed by the user prompt, which ends with the
        short-code output format.
    """
    system_content = (
        "Du bist ein Stance-Klassifizierer für politische Tweets. "
        "Kategorisiere die Haltung als genau eine der drei Klassen: Zustimmung, Ablehnung oder Neutral."
    )
    # The prompt pieces carry their own newlines and are concatenated as-is.
    prompt_parts = (
        f"### Aufgabe\n",
        f"Bewerte die Haltung des folgenden Tweets gegenüber \"{name}\".\n\n",
        f"Tweet: {text}\n\n",
        "### Antwortmöglichkeiten:\n",
        "• Zustimmung: Der Tweet äußert sich explizit oder implizit positiv oder unterstützend über das Ziel.\n",
        "• Ablehnung: Der Tweet äußert sich explizit oder implizit negativ oder kritisch über das Ziel.\n",
        "• Neutral: Der Tweet ist sachlich, ambivalent oder zeigt keine erkennbare Haltung.\n",
        "### Ausgabeformat (Kurzform):\n",
        "Gib **genau eines** der folgenden Kürzel zurück (ohne Anführungszeichen, ohne Punkt):\n",
        "Zu\nAb\nNe",
    )
    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": "".join(prompt_parts)},
    ]

def render_chat(messages: list) -> str:
    """Render chat messages into one prompt string with the tokenizer's chat template.

    Args:
        messages: List of ``{"role", "content"}`` dicts.

    Returns:
        Whatever the module-level tokenizer's ``apply_chat_template`` returns
        when asked for untokenized text with the generation prompt appended.
    """
    template_options = {
        "tokenize": False,              # return text instead of token ids
        "add_generation_prompt": True,  # opens the assistant turn
    }
    return tokenizer.apply_chat_template(messages, **template_options)

# Tokenization – Prompt maskieren, nur Antwort "Zu/Ab/Ne" lernen
def tokenize(ex):
    """Tokenize one example so that only the short answer is learned.

    The chat prompt is rendered from the example's name/text, the short label
    code plus the tokenizer's EOS token is appended as the answer, and the
    result is padded/truncated to MAX_LEN. In `labels`, every position that is
    prompt or padding is set to -100 (the index ignored by the loss); only the
    answer tokens keep their ids.

    Args:
        ex: Mapping with `input_text` ("<name>; <text>") and `target_text`
            (a key of LABEL_TOKEN_MAP).

    Returns:
        Dict with `input_ids`, `attention_mask` and `labels`, each a list of
        length MAX_LEN.

    Note:
        If the prompt alone already fills MAX_LEN, the answer is truncated away
        and all labels of that example are -100.
    """
    name, text = ex["input_text"].split(";", 1)
    name = name.strip()
    text = text.strip()
    label_short = LABEL_TOKEN_MAP[ex["target_text"]]  # "Zu"/"Ab"/"Ne"

    # 1) Chat prompt without the answer
    chat_prompt = render_chat(build_messages(name, text))

    # 2) Target answer: short code followed by EOS, so exactly one stop token
    #    is supervised instead of the whole padding tail.
    answer_text = label_short + (tokenizer.eos_token or "")

    # 3) Tokenize the prompt alone and prompt + answer with identical settings.
    # NOTE(review): the rendered template may already start with a BOS token and
    # this call may add another one — confirm; the evaluation code builds its
    # inputs the same way.
    prompt_enc = tokenizer(
        chat_prompt,
        truncation=True, max_length=MAX_LEN, padding="max_length"
    )
    full_enc = tokenizer(
        chat_prompt + answer_text,
        truncation=True, max_length=MAX_LEN, padding="max_length"
    )

    input_ids      = full_enc["input_ids"]
    attention_mask = full_enc["attention_mask"]

    # Number of real (non-padding) prompt tokens.
    prompt_len = sum(prompt_enc["attention_mask"])

    # Build the labels by walking over the real tokens only. Padding is found
    # via the attention mask rather than the token id, because the pad token
    # may be the EOS token; this also works for left and right padding alike.
    labels = []
    real_seen = 0
    for token_id, is_real in zip(input_ids, attention_mask):
        if not is_real:
            labels.append(-100)          # padding never contributes to the loss
            continue
        labels.append(token_id if real_seen >= prompt_len else -100)
        real_seen += 1

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Tokenize both splits; the original text columns are removed from the result.
train_ds_tok, val_ds_tok = (
    split.map(tokenize, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)

# Training
# Hyper-parameters for the run. With a per-device batch of 4 and 4 gradient
# accumulation steps, each optimizer step covers 16 examples per device.
# Evaluation, checkpointing and logging all happen once per epoch, and the best
# checkpoint is reloaded at the end (selection metric is left at the library
# default — presumably eval loss, confirm). External experiment reporting is off.
training_args = TrainingArguments(
    output_dir=str(SAVE_DIR),
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=1e-4,
    bf16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)

# Trainer over the tokenized splits. The tokenizer is handed over as well
# (presumably so it is used for batching and stored with checkpoints — confirm).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_tok,
    eval_dataset=val_ds_tok,
    tokenizer=tokenizer,
)

# Train & Save
# Run the fine-tuning, then write the (PEFT-wrapped) model and the tokenizer to
# SAVE_DIR — the same directory the Trainer uses as its output_dir.
trainer.train()
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print(f"Fertig! Gespeichert in: {SAVE_DIR}")


📋 CSV-Spalten: ['text', 'label', 'name']

📋 CSV-Spalten: ['text', 'label', 'name']


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Map:   0%|          | 0/3150 [00:00<?, ? examples/s]

Map:   0%|          | 0/675 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.06,0.004095
2,0.003,0.004681
3,0.0012,0.004885


✅ Fertig! Gespeichert in: /content/Output/llama-3-8b-finetuned


In [None]:
# Test Evaluation Script – LLaMA 3.1 + Chat-Prompt (Zu/Ab/Ne) + Metriken + Ladebalken
import torch, warnings, json, re
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import (
    classification_report, accuracy_score, f1_score, precision_score,
    recall_score, confusion_matrix
)
from peft import PeftModel, PeftConfig
from time import time

# Configuration
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
MODEL_DIR = Path("/content/Output/llama-3-8b-finetuned")
MODEL_ID = MODEL_DIR.name          # used as prefix for the result files
TEST_PATH = Path("/content/test3.csv")

# Generation settings
MAX_LEN_INPUT = 512                # token limit for the prompt (system + user)
MAX_NEW_TOKENS = 3                 # enough for "Zu"/"Ab"/"Ne" (1–3 BPE tokens)
DO_SAMPLE = False
TEMPERATURE = 0.0

# Label vocabulary: full label -> short code, plus derived lookups.
LABEL_TOKEN_MAP = dict([("Zustimmung", "Zu"), ("Ablehnung", "Ab"), ("Neutral", "Ne")])
LABELS = [*LABEL_TOKEN_MAP]
SHORTS = [*LABEL_TOKEN_MAP.values()]
SHORT_TO_LABEL = dict(zip((code.lower() for code in SHORTS), LABELS))

# Hilfen
def build_messages(name: str, text: str):
    """Build the chat messages (system + user) for one tweet to classify.

    Args:
        name: Target the stance refers to; inserted in quotes into the task line.
        text: Tweet text to classify.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        instruction, then the user prompt with the output format at its end.
    """
    system_content = (
        "Du bist ein Stance-Klassifizierer für politische Tweets. "
        "Kategorisiere die Haltung als genau eine der drei Klassen: Zustimmung, Ablehnung oder Neutral."
    )
    # Each piece carries its own newlines; they are concatenated unchanged.
    prompt_parts = (
        f"### Aufgabe\n",
        f"Bewerte die Haltung des folgenden Tweets gegenüber \"{name}\".\n\n",
        f"Tweet: {text}\n\n",
        "### Antwortmöglichkeiten:\n",
        "• Zustimmung: Der Tweet äußert sich explizit oder implizit positiv oder unterstützend über das Ziel.\n",
        "• Ablehnung: Der Tweet äußert sich explizit oder implizit negativ oder kritisch über das Ziel.\n",
        "• Neutral: Der Tweet ist sachlich, ambivalent oder zeigt keine erkennbare Haltung.\n",
        "### Ausgabeformat (Kurzform):\n",
        "Gib **genau eines** der folgenden Kürzel zurück (ohne Anführungszeichen, ohne Punkt):\n",
        "Zu\nAb\nNe",
    )
    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": "".join(prompt_parts)},
    ]

def apply_chat_template(tokenizer, messages: list) -> str:
    """Render chat messages into one prompt string using the given tokenizer.

    Args:
        tokenizer: Object providing ``apply_chat_template``.
        messages: List of ``{"role", "content"}`` dicts.

    Returns:
        The tokenizer's rendering as untokenized text, with the generation
        prompt appended.
    """
    template_options = {
        "tokenize": False,              # return text instead of token ids
        "add_generation_prompt": True,  # opens the assistant turn
    }
    return tokenizer.apply_chat_template(messages, **template_options)

# Punctuation that is replaced by spaces before the output is split into words.
_CLEAN_RE = re.compile(r'[\"\'\.\,\:\;\!\?\-\—\–\(\)\[\]\{\}]')
def parse_short_to_label(raw_output: str) -> str:
    """Map raw model output to a full stance label, or "Unklar".

    The output is lower-cased, punctuation is replaced by spaces and the text is
    split into words. The first word decides if it equals or starts with one of
    the short codes in SHORT_TO_LABEL. Otherwise the remaining words are
    scanned for a whole-word short code or a word starting with a full label
    name. Matching on whole words (instead of substrings) keeps ordinary words
    such as "keine" or "habe" from being read as a class.

    Args:
        raw_output: Decoded generation; non-string values yield "Unklar".

    Returns:
        One of the values of SHORT_TO_LABEL, or "Unklar" if nothing matches.
    """
    if not isinstance(raw_output, str):
        return "Unklar"
    words = _CLEAN_RE.sub(" ", raw_output.strip().lower()).split()
    if not words:
        return "Unklar"

    # The first word is the expected answer position: accept an exact short
    # code or any word beginning with one (e.g. "zustimmung").
    first = words[0]
    if first in SHORT_TO_LABEL:
        return SHORT_TO_LABEL[first]
    for short, label in SHORT_TO_LABEL.items():
        if first.startswith(short):
            return label

    # Fallback: look for an answer later in the output, whole words only.
    for word in words[1:]:
        if word in SHORT_TO_LABEL:
            return SHORT_TO_LABEL[word]
        for label in SHORT_TO_LABEL.values():
            if word.startswith(label.lower()):
                return label
    return "Unklar"

# Load model & tokenizer
# The tokenizer comes from the base model. The pad token id is always set to
# the EOS token id by the last line, regardless of the preceding check.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Check whether MODEL_DIR holds a LoRA adapter. Only this detection is guarded:
# errors raised while loading the base model or merging the adapter must
# surface as they are instead of being reported as "no adapter found".
try:
    PeftConfig.from_pretrained(MODEL_DIR)
except (FileNotFoundError, ValueError):
    is_lora_adapter = False
else:
    is_lora_adapter = True

if is_lora_adapter:
    # Load the base model in bfloat16, attach the adapter and merge its weights
    # into the base model.
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, device_map="auto", torch_dtype=torch.bfloat16
    )
    model = PeftModel.from_pretrained(base, MODEL_DIR)
    model = model.merge_and_unload()
else:
    warnings.warn("Kein LoRA-Adapter gefunden – lade vollständiges Modell")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_DIR, device_map="auto", torch_dtype=torch.bfloat16
    )
model.eval()

# Load test data
# Read the ';'-separated test CSV and keep only rows whose label is one of
# LABELS. `input_text` is not read by the prediction code in this cell, but it
# is part of the frame that is later written to the predictions CSV.
df = pd.read_csv(TEST_PATH, sep=";", encoding="utf-8-sig")
df = df[df["label"].isin(LABELS)].copy()
df["input_text"] = df["name"] + "; " + df["text"]

# Vorhersagefunktion (Chat-Prompt)
def predict_label(name: str, text: str) -> str:
    """Classify one tweet by generating a short answer and parsing it.

    Args:
        name: Target the stance refers to.
        text: Tweet text.

    Returns:
        The label returned by ``parse_short_to_label`` for the newly generated
        text (the prompt part of the generation is cut off before decoding).
    """
    chat_text = apply_chat_template(tokenizer, build_messages(name, text))

    inputs = tokenizer(
        chat_text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LEN_INPUT,
        padding=False,
    ).to(model.device)

    gen_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": DO_SAMPLE,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if DO_SAMPLE:
        gen_kwargs["temperature"] = TEMPERATURE
    else:
        # Sampling-only flags are meaningless for greedy decoding; passing them
        # (or inheriting them from the checkpoint's generation config) makes
        # generate() log an "invalid generation flags" warning on every call.
        # Unset them explicitly — intended to silence that warning; confirm
        # against the installed transformers version.
        gen_kwargs["temperature"] = None
        gen_kwargs["top_p"] = None

    with torch.no_grad():
        generated = model.generate(**inputs, **gen_kwargs)

    # Decode only the newly generated part
    prompt_len = inputs["input_ids"].shape[1]
    pred_text = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True).strip()
    return parse_short_to_label(pred_text)

# Predictions with progress bar
# One generation per row; the wall-clock time of the whole loop is kept in
# `duration` for the throughput metric computed afterwards.
start = time()
tqdm.pandas(desc="Predicting")
df["pred"] = df.progress_apply(lambda r: predict_label(r["name"], r["text"]), axis=1)
duration = time() - start

# Metrics
# Every score is restricted to the three real classes in LABELS. Predictions
# that could not be parsed ("Unklar") therefore count as errors for the true
# class instead of forming a fourth class in the averages, which keeps these
# numbers consistent with the classification report. zero_division=0 makes the
# value for a class without predictions explicit.
report = classification_report(
    df["label"], df["pred"], labels=LABELS, output_dict=True, zero_division=0
)
conf_mat = confusion_matrix(df["label"], df["pred"], labels=LABELS)
per_class_f1 = f1_score(
    df["label"], df["pred"], labels=LABELS, average=None, zero_division=0
)

metrics = {
    "model": MODEL_ID,
    "accuracy": accuracy_score(df["label"], df["pred"]),
    "macro_f1": f1_score(
        df["label"], df["pred"], labels=LABELS, average="macro", zero_division=0
    ),
    "weighted_f1": f1_score(
        df["label"], df["pred"], labels=LABELS, average="weighted", zero_division=0
    ),
    "precision_macro": precision_score(
        df["label"], df["pred"], labels=LABELS, average="macro", zero_division=0
    ),
    "recall_macro": recall_score(
        df["label"], df["pred"], labels=LABELS, average="macro", zero_division=0
    ),
    "per_class_f1": dict(zip(LABELS, per_class_f1.tolist())),
    "confusion_matrix": conf_mat.tolist(),
    # seconds of inference per 1000 examples
    "inference_time_per_1000": duration / max(len(df), 1) * 1000,
}

# Save
# Output files are named after the model directory.
df_out, metrics_json, metrics_md = (
    Path(f"/content/{MODEL_ID}_chat_preds.csv"),
    Path(f"/content/{MODEL_ID}_chat_metrics.json"),
    Path(f"/content/{MODEL_ID}_chat_metrics.md"),
)

df.to_csv(df_out, sep=";", index=False, encoding="utf-8-sig")
with metrics_json.open("w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2, ensure_ascii=False)

# Markdown table: scalar metrics only, numbers with four decimals.
table_rows = ["| Metric | Value |\n|---|---|\n"]
for k, v in metrics.items():
    if isinstance(v, (dict, list)):
        continue
    if isinstance(v, (int, float)):
        table_rows.append(f"| {k} | {v:.4f} |\n")
    else:
        table_rows.append(f"| {k} | {v} |\n")
markdown_table = "".join(table_rows)
with metrics_md.open("w", encoding="utf-8") as f:
    f.write(markdown_table)

# Summary
print("\nClassification Report:")
print(classification_report(df["label"], df["pred"], labels=LABELS))
print(f"\nPredictions gespeichert unter: {df_out}")
print(f"Metriken gespeichert unter: {metrics_json}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

🔍 Predicting:   0%|          | 0/675 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🔍 Predicting:   0%|          | 2/675 [00:00<03:04,  3.66it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🔍 Predicting:   1%|          | 4/675 [00:00<01:54,  5.84it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🔍 Predicting:   1%|          | 5/675 [00:00<01:43,  6.44it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and 


🔢 Classification Report:
              precision    recall  f1-score   support

  Zustimmung       0.85      0.74      0.79       150
   Ablehnung       0.85      0.92      0.88       387
     Neutral       0.77      0.69      0.73       138

    accuracy                           0.83       675
   macro avg       0.82      0.78      0.80       675
weighted avg       0.83      0.83      0.83       675


📂 Predictions saved to: /content/llama-3-8b-finetuned_chat_preds.csv
📊 Metrics saved to: /content/llama-3-8b-finetuned_chat_metrics.json


In [None]:
from huggingface_hub import login, upload_folder, create_repo
from google.colab import userdata
from pathlib import Path

# Configuration
SAVE_DIR = Path("/content/Output/llama-3-8b-finetuned")
HF_REPO_ID = "YangZexi/llama-3.1-8B-Instruct-stance-lora-v2"
HF_TOKEN = userdata.get("HF_TOKEN")  # read from the Colab secrets store

# Login (fails early when no token is configured)
if HF_TOKEN:
    login(HF_TOKEN)
else:
    raise ValueError("Kein HF_TOKEN gefunden – unter 'Notebook > Secrets > HF_TOKEN' setzen.")

# Create the repository (if it does not exist yet)
create_repo(repo_id=HF_REPO_ID, repo_type="model", exist_ok=True, token=HF_TOKEN)

# Upload
# SAVE_DIR is also the Trainer's output directory, so it contains one
# `checkpoint-*` folder per epoch including optimizer/scheduler/RNG state.
# Those are excluded; only the final adapter and tokenizer files are published.
upload_folder(
    repo_id=HF_REPO_ID,
    folder_path=SAVE_DIR,
    path_in_repo=".",
    repo_type="model",
    token=HF_TOKEN,
    ignore_patterns=["checkpoint-*/*"],
)

print(f"Modell erfolgreich hochgeladen: https://huggingface.co/{HF_REPO_ID}")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...llama-3-8b-finetuned/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

  ...netuned/checkpoint-197/scheduler.pt: 100%|##########| 1.06kB / 1.06kB            

  ...etuned/checkpoint-197/rng_state.pth:  78%|#######7  | 11.0kB / 14.2kB            

  ...netuned/checkpoint-394/scheduler.pt: 100%|##########| 1.06kB / 1.06kB            

  ...tuned/checkpoint-394/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

  ...netuned/checkpoint-197/optimizer.pt:   0%|          |  554kB /  671MB            

  ...finetuned/adapter_model.safetensors:   0%|          |  559kB /  336MB            

  ...point-197/adapter_model.safetensors:   0%|          |  559kB /  336MB            

  ...point-394/adapter_model.safetensors:   0%|          |  558kB /  336MB            

  ...netuned/checkpoint-591/scheduler.pt: 100%|##########| 1.06kB / 1.06kB            

✅ Modell erfolgreich hochgeladen: https://huggingface.co/YangZexi/llama-3.1-8B-Instruct-stance-lora-v2
