# VLM Captioning - Exploration Notebook

Use this notebook to:
- Load and inspect the dataset
- Visualize sample images with their captions
- Test model inference interactively
- Compare generated captions against ground truth

In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
from PIL import Image

# Load annotations.
# Later cells index this object positionally and read entry["image"] and
# entry["caption"], so the file is expected to hold a list of dicts.
# The encoding is given explicitly so captions with non-ASCII characters are
# decoded the same way regardless of the platform's default locale.
with open("../data/train.json", encoding="utf-8") as f:
    annotations = json.load(f)

print(f"Total entries: {len(annotations)}")
# Guard the preview so an empty annotation file is reported cleanly instead
# of raising IndexError.
if annotations:
    print(f"Sample entry: {annotations[0]}")
else:
    print("Sample entry: <none - annotation file is empty>")

In [None]:
# Visualize a few samples: show up to three images side by side, each titled
# with its (possibly truncated) ground-truth caption.
IMAGE_ROOT = Path("../data/images/")
MAX_TITLE_CHARS = 60  # longer captions are cut so titles fit above the panel

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, entry in zip(axes, annotations[:3]):
    # imshow copies the pixel data, so the file handle can be released as
    # soon as the image has been drawn.
    with Image.open(IMAGE_ROOT / entry["image"]) as img:
        ax.imshow(img)
    caption = entry["caption"]
    # Only append an ellipsis when the caption was actually truncated.
    if len(caption) > MAX_TITLE_CHARS:
        caption = caption[:MAX_TITLE_CHARS] + "..."
    ax.set_title(caption, fontsize=9)

# Hide ticks and frames on every panel, including panels left empty when
# there are fewer than three annotations.
for ax in axes:
    ax.axis("off")
plt.tight_layout()
plt.show()

In [None]:
# Load a fine-tuned model and test inference
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import PeftModel

CHECKPOINT = "../outputs/final"  # Update this path
BASE_MODEL = "Salesforce/blip2-opt-2.7b"

# Pick the device first because the weight dtype below depends on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Half precision is only a safe default on GPU; on CPU, float16 kernels are
# slow or missing depending on the PyTorch version, so fall back to float32.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

# The processor is read from the checkpoint directory, the base weights from
# the hub model id, and the PEFT adapter from the checkpoint is applied on top.
processor = AutoProcessor.from_pretrained(CHECKPOINT)
base_model = AutoModelForVision2Seq.from_pretrained(BASE_MODEL, torch_dtype=model_dtype)
model = PeftModel.from_pretrained(base_model, CHECKPOINT)
model = model.to(device)
model.eval()  # inference only: switch off dropout and other train-time behavior

print("Model loaded.")

In [None]:
# Generate a caption for a sample image and compare it with the ground truth.
sample = annotations[0]
# Convert inside the context manager: convert() returns a fully loaded copy,
# so the underlying file handle can be closed right away.
with Image.open(IMAGE_ROOT / sample["image"]) as img:
    image = img.convert("RGB")

# The processor emits float32 pixel values. Cast them to the model's dtype
# (e.g. float16 on GPU) while moving to the device, otherwise generate() can
# fail with an input/weight dtype mismatch. Only floating-point tensors are
# cast by BatchFeature.to; integer tensors are just moved.
inputs = processor(images=image, return_tensors="pt").to(device, model.dtype)

with torch.no_grad():
    output_ids = model.generate(**inputs, num_beams=5, max_new_tokens=50)

# batch_decode returns one string per sequence; a single image was passed in,
# so take the first entry.
generated = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

print(f"Ground truth: {sample['caption']}")
print(f"Generated:    {generated}")

plt.imshow(image)
plt.axis("off")
plt.title(generated, fontsize=10)
plt.show()