In [None]:
# Install the listed third-party packages into the kernel's environment; -U asks pip to upgrade them.
%pip install transformers peft bitsandbytes requests Pillow python-dotenv -U datasets qwen_vl_utils



In [None]:
# 📦 Dateiverwaltung & Umgebung
import os
from dotenv import load_dotenv

# 🔗 Laufwerk (für Colab)
from google.colab import drive

# ⚙️ PyTorch
import torch

# 📚 Hugging Face
from datasets import load_dataset
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
    BitsAndBytesConfig,
)
from qwen_vl_utils import process_vision_info


# 🧠 PEFT & LoRA
from peft import LoraConfig, get_peft_model, PeftModel

# 📈 BLEU
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# 📊 Logging
import wandb

# Image
from PIL import Image
import requests
from io import BytesIO

In [None]:
# Make Google Drive available; the .env file with the secrets is stored there.
drive.mount('/content/drive')

# Load the .env file from Drive, then read the Hugging Face token from the
# process environment.
load_dotenv("/content/drive/MyDrive/ma-colab/.env")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Re-export a non-empty token under the same variable name so that
# huggingface_hub can pick it up implicitly (e.g. for a later model push).
if HF_TOKEN:
    os.environ["HF_TOKEN"] = HF_TOKEN

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)


def _load_quantized_base():
    """Load a fresh quantized instance of the ``MODEL_ID`` checkpoint."""
    return Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        use_cache=True,
    )


# Baseline model: the unmodified base checkpoint.
baseline_model = _load_quantized_base()

# Fine-tuned model: a separate base instance wrapped with the LoRA adapter.
finetuned_model_base = _load_quantized_base()
finetuned_model = PeftModel.from_pretrained(
    finetuned_model_base,
    "Alex23o4/Qwen2.5-VL-7B_news_alttext",
)

# One processor (loaded from the base checkpoint) is used for both models.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/57.6k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not in

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/913 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/104M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/5.70k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [None]:
# System prompt placed in the "system" turn of every sample by format_data().
# It states the assistant's task and the output constraints (under 150
# characters, no subjective language, visible content and context entities).
SYSTEM_MESSAGE = (
    "You are a helpful assistant specialized in generating concise and context-sensitive alternative text descriptions for images. "
    "Your goal is to create meaningful and accessible alt texts using the provided article context, including the headline, abstract, and image caption. "
    "Keep descriptions under 150 characters, avoid subjective language, and focus strictly on the visible content and relevant contextual entities."
)

# User prompt template. The {headline}, {abstract} and {caption} placeholders
# are filled per sample with str.format() in format_data().
PROMPT_TEMPLATE = (
    'Given the article headline: "{headline}", the abstract: "{abstract}", and the caption: "{caption}", '
    'generate a short and descriptive alt text for the provided image. '
    'Ensure the description includes key visual elements and relevant entities from the context. '
    'Do not exceed 150 characters and avoid unnecessary details or subjective opinions.'
)

def format_data(example):
    """Turn one dataset row into a chat-style sample.

    Args:
        example: Mapping that provides ``headline``, ``abstract``, ``caption``,
            ``openai_alt_text_refined`` and ``image_id``. ``image_url_clean``
            is read with ``.get`` and may be absent, in which case the image
            entry is ``None``.

    Returns:
        A dict with the row's ``image_id`` under ``"id"`` and a three-turn
        ``"messages"`` list: system prompt, user turn (image + filled prompt
        template) and assistant turn holding the reference alt text.

    Raises:
        KeyError: If one of the required fields is missing from ``example``.
    """
    def _text(value):
        # Every textual content part shares this shape.
        return {"type": "text", "text": value}

    user_prompt = PROMPT_TEMPLATE.format(
        **{field: example[field] for field in ("headline", "abstract", "caption")}
    )

    system_turn = {"role": "system", "content": [_text(SYSTEM_MESSAGE)]}
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": example.get("image_url_clean")},
            _text(user_prompt),
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [_text(example["openai_alt_text_refined"])],
    }

    return {
        "id": example["image_id"],
        "messages": [system_turn, user_turn, assistant_turn],
    }



In [None]:
# Load only the test split of the evaluation dataset.
test_ds = load_dataset(
    "Alex23o4/n24news_sample_synthetic_alttext_reduced",
    split="test",
)

separator = "-" * 30

# A bare `len(test_ds)` in the middle of a cell is never displayed, so the
# sample count is printed explicitly.
print(f"Number of test samples: {len(test_ds)}")
print(separator)
print(test_ds)
print(separator)
print(test_ds[0])
print(separator)

# Convert every row into the chat format expected by the generation code.
test_ds_formatted = [format_data(example) for example in test_ds]

print(test_ds_formatted[0])

README.md:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.90M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/362k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/367k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7680 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/960 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/960 [00:00<?, ? examples/s]

------------------------------
Dataset({
    features: ['abstract', 'image_id', 'image_url_clean', 'openai_alt_text_refined', 'headline', 'caption', 'section', 'section_label'],
    num_rows: 960
})
------------------------------
{'abstract': 'In the Chicago teachers\' strike, Mayor Rahm Emanuel\'s oft-cited goal of expanding charter schools is not officially on the table, but a union official called it "the elephant in the room."', 'image_id': 'a0aa8aa0-4ed7-501c-8f3e-7fe90696ff43', 'image_url_clean': 'https://static01.nyt.com/images/2012/09/13/us/CHARTERS-1/CHARTERS-1-articleLarge.jpg', 'openai_alt_text_refined': 'A teacher assists a student in a classroom with several students seated at tables. Educational materials are visible on the walls.', 'headline': 'Push to Add Charter Schools Hangs Over Strike', 'caption': 'Blair Burson, a second-grade teacher at Chicago International Charter School Bucktown, helped a student on Wednesday.', 'section': 'Education', 'section_label': 5}
------

In [None]:
@torch.inference_mode()
def text_generator(
    sample_data,
    model,
    processor,
    device,
    max_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
):
    """Generate an alt text for one formatted sample.

    Args:
        sample_data: Dict with a ``"messages"`` list as produced by
            ``format_data``; only the first two turns (system + user) are used
            as the prompt, so the reference answer is never shown to the model.
        model: Object exposing ``generate(**inputs, ...)``.
        processor: Object exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        device: Device the processed inputs are moved to before generation.
        max_tokens: Upper bound for the number of newly generated tokens.
        do_sample: Whether to sample; if ``False`` the sampling parameters
            below are not passed to ``generate``.
        temperature: Sampling temperature (used only when ``do_sample``).
        top_p: Nucleus-sampling threshold (used only when ``do_sample``).

    Returns:
        The generated text only (a stripped ``str``); the reference text is
        not returned.
    """
    # Prompt turns only: system + user. The third turn holds the reference.
    prompt_messages = sample_data["messages"][:2]

    # 1) Build the textual prompt, ending with the generation prompt.
    prompt = processor.apply_chat_template(
        prompt_messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # 2) Extract the image(s) from the same turns that form the prompt.
    images, _ = process_vision_info(prompt_messages)

    # 3) Tokenize the prompt and preprocess the image(s).
    model_inputs = processor(
        text=[prompt],
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)

    # 4) Generate. Sampling parameters are only forwarded when sampling is on.
    sampling_kwargs = {"do_sample": do_sample}
    if do_sample:
        sampling_kwargs["top_p"] = top_p
        sampling_kwargs["temperature"] = temperature

    gen_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_tokens,
        **sampling_kwargs,
    )

    # Keep only the answer part: drop the prompt tokens from each output.
    trimmed = [
        output[len(inp):] for inp, output in zip(model_inputs.input_ids, gen_ids)
    ]
    gen_text = processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0].strip()

    return gen_text


In [None]:
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Verwende Gerät: {device}")

# text_generator samples by default, so seed PyTorch's RNG once before the
# loop to make the run repeatable. (Bit-exact GPU results are still not
# guaranteed across hardware or library versions.)
SEED = 42
torch.manual_seed(SEED)

# One record per sample: the image id plus both models' generations.
results = []

for sample in tqdm(test_ds_formatted):
    results.append({
        "id": sample["id"],  # equals the dataset's image_id
        "qwen_baseline": text_generator(sample, baseline_model, processor, device),
        "qwen_finetuned": text_generator(sample, finetuned_model, processor, device),
    })




Verwende Gerät: cuda


100%|██████████| 960/960 [1:36:13<00:00,  6.01s/it]


In [None]:
# Lookup table: image_id -> result record with the generated texts.
results_dict = {entry["id"]: entry for entry in results}

# Resolve every dataset row to its result record; rows without a record fall
# back to an empty dict and therefore to empty strings below.
matched_results = [results_dict.get(example["image_id"], {}) for example in test_ds]
baseline_outputs = [match.get("qwen_baseline", "") for match in matched_results]
finetuned_outputs = [match.get("qwen_finetuned", "") for match in matched_results]

# add_column() rejects a column name that already exists, so drop prediction
# columns left over from an earlier run of this cell to keep it re-runnable.
stale_columns = [
    name
    for name in ("generated_baseline", "generated_finetuned")
    if name in test_ds.column_names
]
if stale_columns:
    test_ds = test_ds.remove_columns(stale_columns)

# Attach the predictions as new columns.
test_ds = test_ds.add_column("generated_baseline", baseline_outputs)
test_ds = test_ds.add_column("generated_finetuned", finetuned_outputs)

In [None]:
from datetime import datetime

# Timestamp the file name so repeated exports do not overwrite each other.
now = datetime.now()
output_path = f"/content/drive/MyDrive/ma-colab/testset_with_predictions_{now.strftime('%Y%m%d_%H%M%S')}.json"

# Write the test set, including the prediction columns, as JSON Lines.
test_ds.to_json(output_path, orient="records", lines=True)


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

950274

In [None]:
import os
import signal
import time

# Grace period before the runtime is released. The announcement below is
# derived from this value so message and actual wait cannot drift apart.
RELEASE_DELAY_S = 30

try:
    # Only works inside a real Colab session.
    from google.colab import runtime
    print(f"GPU wird in {RELEASE_DELAY_S} s freigegeben …")
    time.sleep(RELEASE_DELAY_S)
    runtime.unassign()                 # official disconnect
except Exception:
    # Fallback (import or disconnect failed): terminate the kernel process.
    os.kill(os.getpid(), signal.SIGTERM)
