In [9]:
# @title Image Captioning Benchmark Replication (COCO Subset)
# @markdown This script compares GIT, BLIP, ViT-GPT2, and BLIP2 on 200 MS-COCO images.
# @markdown It computes CLIP Scores and Semantic Similarity (SBERT).

import torch
import gc
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from PIL import Image
import io

# --- 1. Installation & Setup ---
print("Installing dependencies... (This may take a minute)")
!pip install -q transformers datasets sentence-transformers torchmetrics[image] evaluate rouge_score

import evaluate
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from torchmetrics.multimodal.clip_score import CLIPScore
from transformers import (
    AutoProcessor, AutoModelForCausalLM,
    BlipProcessor, BlipForConditionalGeneration,
    Blip2Processor, Blip2ForConditionalGeneration,
    VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
)

# Setup Hardware
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# --- 2. Configuration ---

# We select 200 random images from COCO 2017 validation set
NUM_IMAGES = 200
SEED = 42

# Model Registry (Mapping names to HuggingFace IDs)
# Note: For BLIP2, we use 2.7b to fit in Colab GPU memory. 6.7b requires A100.
MODELS_CONFIG = {
    "GIT-base": {
        "id": "microsoft/git-base",
        "type": "git"
    },
    "GIT-large": {
        "id": "microsoft/git-large",
        "type": "git"
    },
    "BLIP-base": {
        "id": "Salesforce/blip-image-captioning-base",
        "type": "blip"
    },
    "BLIP-large": {
        "id": "Salesforce/blip-image-captioning-large",
        "type": "blip"
    },
    "ViT-GPT-2": {
        "id": "nlpconnect/vit-gpt2-image-captioning",
        "type": "vit-gpt2"
    },
    "BLIP2-OPT": {
        "id": "Salesforce/blip2-opt-2.7b",
        "type": "blip2"
    }
}

# --- 3. Load Data ---
# --- 3. Load Data (DIRECT DOWNLOAD METHOD) ---
# This method bypasses the broken 'datasets' library scripts by fetching
# directly from the official COCO source.

import requests
import zipfile
import os
import json
import random
from collections import defaultdict

print("Downloading COCO annotations (Ground Truth)...")
# Download the official annotation zip (small file ~240MB)
if not os.path.exists("annotations_trainval2017.zip"):
    !wget -q http://images.cocodataset.org/annotations/annotations_trainval2017.zip
    !unzip -q annotations_trainval2017.zip

# Load the JSON containing captions
with open("annotations/captions_val2017.json", "r") as f:
    coco_data = json.load(f)

# Group captions by Image ID so we have 5 references per image
print("Parsing annotations...")
img_to_caps = defaultdict(list)
for ann in coco_data['annotations']:
    img_to_caps[ann['image_id']].append(ann['caption'])

# Select 200 random images that have captions
all_ids = list(img_to_caps.keys())
random.seed(SEED) # Ensure replicability
selected_ids = random.sample(all_ids, NUM_IMAGES)

data_samples = []
print(f"Downloading {NUM_IMAGES} images directly from COCO servers...")

for img_id in tqdm(selected_ids):
    # Construct the official URL for COCO Validation images
    # Format: http://images.cocodataset.org/val2017/000000xxxxxx.jpg
    img_url = f"http://images.cocodataset.org/val2017/{img_id:012d}.jpg"

    try:
        # Stream the image content directly into memory
        resp = requests.get(img_url, stream=True)
        resp.raise_for_status()

        # Open as PIL Image
        img = Image.open(resp.raw).convert("RGB")

        data_samples.append({
            "image": img,
            "references": img_to_caps[img_id], # List of 5 strings
            "id": img_id
        })
    except Exception as e:
        print(f"Skipping Image ID {img_id}: {e}")

print(f"\nData loaded successfully. Total samples: {len(data_samples)}")
print("Sample Reference:", data_samples[0]['references'][0])

# --- 4. Helper Functions ---

def load_model_pipeline(config):
    """Loads model and processor based on type."""
    model_id = config["id"]
    m_type = config["type"]

    print(f"Loading {model_id}...")

    if m_type == "git":
        processor = AutoProcessor.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

    elif m_type == "blip":
        processor = BlipProcessor.from_pretrained(model_id)
        model = BlipForConditionalGeneration.from_pretrained(model_id).to(device)

    elif m_type == "blip2":
        processor = Blip2Processor.from_pretrained(model_id)
        # Load in float16 to save memory for BLIP2
        model = Blip2ForConditionalGeneration.from_pretrained(
            model_id, torch_dtype=torch.float16
        ).to(device)

    elif m_type == "vit-gpt2":
        processor = ViTImageProcessor.from_pretrained(model_id)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = VisionEncoderDecoderModel.from_pretrained(model_id).to(device)
        return model, processor, tokenizer # Special return for VitGPT2

    return model, processor, None

def generate_captions(model, processor, tokenizer, m_type, images):
    """Generates captions for a batch of images."""
    captions = []

    # Processing one by one to be safe on RAM with large models
    for img in tqdm(images, leave=False):
        if m_type == "git":
            inputs = processor(images=img, return_tensors="pt").to(device)
            generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)
            cap = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        elif m_type == "blip":
            inputs = processor(img, return_tensors="pt").to(device)
            out = model.generate(**inputs, max_new_tokens=50)
            cap = processor.decode(out[0], skip_special_tokens=True)

        elif m_type == "blip2":
            inputs = processor(img, return_tensors="pt").to(device, torch.float16)
            out = model.generate(**inputs, max_new_tokens=50)
            cap = processor.decode(out[0], skip_special_tokens=True)

        elif m_type == "vit-gpt2":
            pixel_values = processor(images=img, return_tensors="pt").pixel_values.to(device)
            output_ids = model.generate(pixel_values, max_length=50, num_beams=4)
            cap = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        captions.append(cap.strip())
    return captions

# --- 5. Main Evaluation Loop ---

results_registry = {
    "image_ids": [d['id'] for d in data_samples],
    "references": [d['references'] for d in data_samples] # List of lists
}

for model_name, config in MODELS_CONFIG.items():
    print(f"\n--- Processing {model_name} ---")

    # 1. Load Model
    try:
        if config["type"] == "vit-gpt2":
            model, processor, tokenizer = load_model_pipeline(config)
        else:
            model, processor, _ = load_model_pipeline(config)
            tokenizer = None

        # 2. Generate
        images = [d['image'] for d in data_samples]
        generated_captions = generate_captions(model, processor, tokenizer, config["type"], images)

        # Store results
        results_registry[model_name] = generated_captions

        # 3. Cleanup (Crucial for Colab)
        del model
        del processor
        if tokenizer: del tokenizer
        torch.cuda.empty_cache()
        gc.collect()

    except Exception as e:
        print(f"Failed to process {model_name}: {e}")
        results_registry[model_name] = ["FAILED"] * NUM_IMAGES

# --- 6. Metric Calculation ---
# --- 6. Metric Calculation (CORRECTED) ---
print("\n--- Computing Metrics ---")

# Initialize Metric Objects
# 1. CLIP Score (Image-Text similarity)
# Note: Expects images (C,H,W) in 0-255 range and text list
clip_metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch32").to(device)

# 2. Semantic Similarity (Text-Text using SBERT)
sbert_model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# 3. Traditional NLP Metrics
rouge = evaluate.load('rouge')
meteor = evaluate.load('meteor') # Adding METEOR for extra robustness if desired

final_metrics = []

for model_name in MODELS_CONFIG.keys():
    if model_name not in results_registry: continue

    preds = results_registry[model_name]
    if preds[0] == "FAILED": continue

    print(f"Scoring {model_name}...")

    # A. Semantic Similarity (Pred vs Reference)
    sim_scores = []
    # We compare prediction to the references
    for pred, refs in zip(preds, results_registry["references"]):
        pred_emb = sbert_model.encode(pred, convert_to_tensor=True)
        ref_embs = sbert_model.encode(refs, convert_to_tensor=True)

        # Compute cosine sim
        cosine_scores = util.cos_sim(pred_emb, ref_embs)
        # Take the maximum similarity found among the 5 human references
        sim_scores.append(torch.max(cosine_scores).item())

    avg_semantic_sim = sum(sim_scores) / len(sim_scores)

    # B. CLIP Score (Image vs Pred)
    clip_total = 0

    # Process one by one to avoid memory issues
    for img_data, pred in zip(data_samples, preds):
        # Prepare Image: (C, H, W) in range [0, 255]
        # np.array(PIL) is (H,W,C), so we permute to (C,H,W)
        img_tensor = torch.tensor(np.array(img_data['image'])).permute(2, 0, 1)

        # FIX: Pass Image first, then Text list.
        # Do not wrap text in torch.tensor()
        score = clip_metric(img_tensor.unsqueeze(0).to(device), [pred])
        clip_total += score.item()

    # CLIPScore output is often on scale 0-100 or 0-1 depending on version.
    # Torchmetrics usually defaults to 0-100 scale for CLIP. We divide by 100 to match your table (0.3 etc)
    avg_clip = (clip_total / len(preds)) / 100.0

    # C. ROUGE-L (Text Overlap)
    rouge_res = rouge.compute(predictions=preds, references=results_registry["references"])

    # D. METEOR (Optional, good for synonyms)
    meteor_res = meteor.compute(predictions=preds, references=results_registry["references"])

    final_metrics.append({
        "Model": model_name,
        "CLIP Score": round(avg_clip, 4),
        "Semantic Similarity": round(avg_semantic_sim, 4),
        "ROUGE-L": round(rouge_res['rougeL'], 4),
        "METEOR": round(meteor_res['meteor'], 4)
    })
    # --- 6. Metric Calculation (CORRECTED) ---
    # --- 6. Metric Calculation (CORRECTED) ---
print("\n--- Computing Metrics ---")

# Initialize Metric Objects
# 1. CLIP Score (Image-Text similarity)
# Note: Expects images (C,H,W) in 0-255 range and text list
clip_metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch32").to(device)

# 2. Semantic Similarity (Text-Text using SBERT)
sbert_model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# 3. Traditional NLP Metrics
rouge = evaluate.load('rouge')
meteor = evaluate.load('meteor') # Adding METEOR for extra robustness if desired

final_metrics = []

for model_name in MODELS_CONFIG.keys():
    if model_name not in results_registry: continue

    preds = results_registry[model_name]
    if preds[0] == "FAILED": continue

    print(f"Scoring {model_name}...")

    # A. Semantic Similarity (Pred vs Reference)
    sim_scores = []
    # We compare prediction to the references
    for pred, refs in zip(preds, results_registry["references"]):
        pred_emb = sbert_model.encode(pred, convert_to_tensor=True)
        ref_embs = sbert_model.encode(refs, convert_to_tensor=True)

        # Compute cosine sim
        cosine_scores = util.cos_sim(pred_emb, ref_embs)
        # Take the maximum similarity found among the 5 human references
        sim_scores.append(torch.max(cosine_scores).item())

    avg_semantic_sim = sum(sim_scores) / len(sim_scores)

    # B. CLIP Score (Image vs Pred)
    clip_total = 0

    # Process one by one to avoid memory issues
    for img_data, pred in zip(data_samples, preds):
        # Prepare Image: (C, H, W) in range [0, 255]
        # np.array(PIL) is (H,W,C), so we permute to (C,H,W)
        img_tensor = torch.tensor(np.array(img_data['image'])).permute(2, 0, 1)

        # FIX: Pass Image first, then Text list.
        # Do not wrap text in torch.tensor()
        score = clip_metric(img_tensor.unsqueeze(0).to(device), [pred])
        clip_total += score.item()

    # CLIPScore output is often on scale 0-100 or 0-1 depending on version.
    # Torchmetrics usually defaults to 0-100 scale for CLIP. We divide by 100 to match your table (0.3 etc)
    avg_clip = (clip_total / len(preds)) / 100.0

    # C. ROUGE-L (Text Overlap)
    rouge_res = rouge.compute(predictions=preds, references=results_registry["references"])

    # D. METEOR (Optional, good for synonyms)
    meteor_res = meteor.compute(predictions=preds, references=results_registry["references"])

    final_metrics.append({
        "Model": model_name,
        "CLIP Score": round(avg_clip, 4),
        "Semantic Similarity": round(avg_semantic_sim, 4),
        "ROUGE-L": round(rouge_res['rougeL'], 4),
        "METEOR": round(meteor_res['meteor'], 4)
    })

# --- 7. Final Output ---
df_results = pd.DataFrame(final_metrics)
print("\n" + "="*50)
print("REPLICATION RESULTS (COCO/Flickr 200 Samples)")
print("="*50)
print(df_results.to_string(index=False))

# Optional: Save to CSV
df_results.to_csv("captioning_benchmark_results.csv", index=False)
print("\nResults saved to captioning_benchmark_results.csv")

# --- 7. Final Output ---
df_results = pd.DataFrame(final_metrics)
print("\n" + "="*50)
print("REPLICATION RESULTS (COCO/Flickr 200 Samples)")
print("="*50)
print(df_results.to_string(index=False))

# Optional: Save to CSV
df_results.to_csv("captioning_benchmark_results.csv", index=False)
print("\nResults saved to captioning_benchmark_results.csv")

# --- 7. Final Output ---
df_results = pd.DataFrame(final_metrics)
print("\n" + "="*50)
print("REPLICATION RESULTS (COCO/Flickr 200 Samples)")
print("="*50)
print(df_results.to_string(index=False))

# Optional: Save to CSV
df_results.to_csv("captioning_benchmark_results.csv", index=False)
print("\nResults saved to captioning_benchmark_results.csv")

# --- 7. Final Output ---
df_results = pd.DataFrame(final_metrics)
print("\n" + "="*50)
print("REPLICATION RESULTS (COCO 200 Samples)")
print("="*50)
print(df_results.to_string(index=False))

# Optional: Save to CSV
df_results.to_csv("captioning_benchmark_results.csv", index=False)
print("\nResults saved to captioning_benchmark_results.csv")

Installing dependencies... (This may take a minute)
Using device: cuda
Downloading COCO annotations (Ground Truth)...
Parsing annotations...
Downloading 200 images directly from COCO servers...


  0%|          | 0/200 [00:00<?, ?it/s]


Data loaded successfully. Total samples: 200
Sample Reference: A giraffe running across a grass covered field.

--- Processing GIT-base ---
Loading microsoft/git-base...


  0%|          | 0/200 [00:00<?, ?it/s]


--- Processing GIT-large ---
Loading microsoft/git-large...


  0%|          | 0/200 [00:00<?, ?it/s]


--- Processing BLIP-base ---
Loading Salesforce/blip-image-captioning-base...


  0%|          | 0/200 [00:00<?, ?it/s]


--- Processing BLIP-large ---
Loading Salesforce/blip-image-captioning-large...


  0%|          | 0/200 [00:00<?, ?it/s]


--- Processing ViT-GPT-2 ---
Loading nlpconnect/vit-gpt2-image-captioning...


  0%|          | 0/200 [00:00<?, ?it/s]


--- Processing BLIP2-OPT ---
Loading Salesforce/blip2-opt-2.7b...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]


--- Computing Metrics ---


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Scoring GIT-base...
Scoring GIT-large...
Scoring BLIP-base...
Scoring BLIP-large...
Scoring ViT-GPT-2...
Scoring BLIP2-OPT...

--- Computing Metrics ---


Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Scoring GIT-base...
Scoring GIT-large...
Scoring BLIP-base...
Scoring BLIP-large...
Scoring ViT-GPT-2...
Scoring BLIP2-OPT...

REPLICATION RESULTS (COCO/Flickr 200 Samples)
     Model  CLIP Score  Semantic Similarity  ROUGE-L  METEOR
  GIT-base      0.2754               0.6604   0.4503  0.3103
 GIT-large      0.2711               0.6576   0.4362  0.3005
 BLIP-base      0.2870               0.7243   0.5473  0.4237
BLIP-large      0.2936               0.7675   0.5157  0.4992
 ViT-GPT-2      0.2931               0.7406   0.5660  0.4890
 BLIP2-OPT      0.3008               0.7801   0.5903  0.4849

Results saved to captioning_benchmark_results.csv

REPLICATION RESULTS (COCO/Flickr 200 Samples)
     Model  CLIP Score  Semantic Similarity  ROUGE-L  METEOR
  GIT-base      0.2754               0.6604   0.4503  0.3103
 GIT-large      0.2711               0.6576   0.4362  0.3005
 BLIP-base      0.2870               0.7243   0.5473  0.4237
BLIP-large      0.2936               0.7675   0.5157  0.49