In [1]:
from google.colab import drive
import shutil
import os

# Mount Google Drive at /content/drive so that files stored there can be
# reached through ordinary filesystem paths for the rest of the notebook.
drive.mount('/content/drive')




Mounted at /content/drive


In [2]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
import json
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt

In [4]:
# For metrics
!pip install -q rouge-score
!pip install -q torchmetrics
import nltk
nltk.download('wordnet')

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m961.5/961.5 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# tqdm for progress bar
from tqdm import tqdm

# NLTK for BLEU and METEOR scores
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.meteor_score import meteor_score
import nltk
# NOTE(review): the install cell earlier in the notebook already imports nltk
# and downloads 'wordnet'; repeating it here looks intended to let this
# imports cell run on its own — confirm before removing either copy.
nltk.download('wordnet')  # Download WordNet for METEOR

# ROUGE scorer from rouge_score package
from rouge_score import rouge_scorer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
def load_model_and_processor_from_huggingface():
    """Load the pretrained BLIP captioning checkpoint and ready it for inference.

    The "Salesforce/blip-image-captioning-base" model and its processor are
    loaded via ``from_pretrained``; the model is moved to CUDA when torch
    reports it as available (CPU otherwise) and put into evaluation mode.

    Returns:
        tuple: ``(model, processor, device)`` on success. If any step raises,
        the error is printed and ``(None, None, None)`` is returned instead.
    """
    try:
        checkpoint = "Salesforce/blip-image-captioning-base"
        model = BlipForConditionalGeneration.from_pretrained(checkpoint)
        processor = BlipProcessor.from_pretrained(checkpoint, use_fast=True)

        # Prefer the GPU when torch can see one; otherwise stay on the CPU.
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")

        # The model is only used for generation, so switch it to eval mode.
        model.to(device)
        model.eval()

        print(f"✅ Model loaded successfully from Hugging Face on {device}")
        return model, processor, device
    except Exception as e:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"❌ Error loading model: {e}")
        return None, None, None

def load_ground_truth(local_json_path):
    """Load ground-truth captions from a JSON file into a lookup table.

    The file must contain a JSON array of objects, each providing a
    "filename" and a "description" key. If the same filename occurs more than
    once, the last occurrence wins.

    Args:
        local_json_path: Path of the captions JSON file.

    Returns:
        dict: Mapping of filename -> description. An empty dict is returned
        (after printing the error) when the file cannot be opened or parsed,
        or when an entry lacks one of the expected keys.
    """
    try:
        # JSON text is UTF-8; name the encoding explicitly instead of relying
        # on the platform default, which would garble non-ASCII captions on
        # systems whose default is not UTF-8.
        with open(local_json_path, "r", encoding="utf-8") as f:
            records = json.load(f)
        # Create a dictionary mapping filename to its ground truth description.
        gt_data = {item["filename"]: item["description"] for item in records}
        print(f"Loaded {len(gt_data)} ground truth captions.")
        return gt_data
    except Exception as e:
        print(f"Error loading ground truth: {e}")
        return {}

def load_image(image_path):
    """Open an image file and return it converted to RGB.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The RGB image produced by ``convert("RGB")``, or ``None`` (after
        printing the error) if the file cannot be opened or converted.
    """
    try:
        # Use the opened image as a context manager so the underlying file is
        # released deterministically; convert() hands back a separate image,
        # so the result stays usable after the source is closed.
        with Image.open(image_path) as source:
            return source.convert("RGB")
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        return None

def generate_caption(model, processor, device, image, max_new_tokens=10):
    """Generate a caption for one image.

    Args:
        model: Captioning model exposing ``generate(**inputs, max_new_tokens=...)``.
        processor: Processor that turns the image into model inputs and
            decodes generated token ids back to text.
        device: Device the processed inputs are moved to before generation.
        image: The image to caption.
        max_new_tokens: Upper bound on the number of tokens generated for the
            caption. Defaults to 10, the value that used to be hard-coded;
            raise it when the reference captions are longer.

    Returns:
        str: The decoded caption with special tokens removed, or an empty
        string (after printing the error) if any step raises.
    """
    try:
        inputs = processor(images=image, return_tensors="pt")
        # Move every input tensor to the device the model lives on.
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            # Call the underlying model's generate method
            output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Only the first returned sequence is decoded (one image in, one caption out).
        caption = processor.decode(output_ids[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        print(f"Error generating caption: {e}")
        return ""


In [7]:

def display_image_with_captions(image_path, gt_caption, gen_caption, bleu_score, meteor_score_val, rouge_score_val):
    """Show an image with its captions and scores rendered as the figure title.

    Args:
        image_path: Path of the image file to display.
        gt_caption: Ground-truth caption text.
        gen_caption: Caption produced by the model.
        bleu_score: BLEU score for this image (formatted to 4 decimals).
        meteor_score_val: METEOR score for this image (formatted to 4 decimals).
        rouge_score_val: ROUGE-L score for this image (formatted to 4 decimals).

    Note:
        The image is opened here without any error handling, so a failure to
        open or convert it propagates to the caller.
    """
    picture = Image.open(image_path).convert("RGB")

    # One 8x6 inch figure holding a single axes, with ticks and frame hidden.
    _, ax = plt.subplots(figsize=(8, 6))
    ax.imshow(picture)
    ax.axis("off")
    ax.set_title(
        f"GT: {gt_caption}\nGen: {gen_caption}\nBLEU: {bleu_score:.4f} | METEOR: {meteor_score_val:.4f} | ROUGE-L: {rouge_score_val:.4f}",
        fontsize=10,
    )
    plt.show()

In [8]:
from nltk.translate.bleu_score import SmoothingFunction

def evaluate_model(image_folder, gt_json_path, max_images=None):
    """Caption the images in a folder with BLIP and score them against ground truth.

    For every evaluated image the generated caption is compared with its
    ground-truth description using sentence-level BLEU (smoothed), METEOR and
    ROUGE-L F-measure. Captions are tokenized by plain whitespace splitting.
    Per-image results are printed and shown with the image; a summary with
    corpus-level BLEU and the mean METEOR / ROUGE-L is printed at the end.

    Args:
        image_folder: Directory containing the images (.png / .jpg / .jpeg).
        gt_json_path: Path of the ground-truth captions JSON file.
        max_images: Optional cap on the number of images to evaluate. Images
            are taken in sorted filename order, and only images that have a
            ground-truth caption count towards the cap. ``None`` evaluates all.

    Returns:
        None. Results are reported via printed output and displayed figures.
        The function returns early (after printing a message) when no ground
        truth could be loaded or the model failed to load.
    """
    gt_captions = load_ground_truth(gt_json_path)
    if not gt_captions:
        print("No ground truth data available. Exiting evaluation.")
        return

    model, processor, device = load_model_and_processor_from_huggingface()
    if model is None:
        print("Model loading failed. Exiting evaluation.")
        return

    references = []  # Ground truth tokenized captions
    hypotheses = []  # Generated tokenized captions
    meteor_scores = []
    rouge_scores = []

    # os.listdir() returns entries in arbitrary order; sort so that the subset
    # selected by max_images is the same on every run.
    image_files = sorted(
        f for f in os.listdir(image_folder)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    print(f"🔹 Found {len(image_files)} images in {image_folder}.")

    # Drop images without a ground-truth caption *before* truncating, so that
    # max_images counts images that can actually be scored.
    image_files = [f for f in image_files if f in gt_captions]

    if max_images is not None:
        image_files = image_files[:max_images]

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smoothing = SmoothingFunction().method1  # For BLEU smoothing

    for filename in tqdm(image_files, desc="Evaluating Images"):
        image_path = os.path.join(image_folder, filename)
        image = load_image(image_path)
        if image is None:
            # load_image already reported the problem; skip this file.
            continue

        gt_caption = gt_captions[filename]
        gen_caption = generate_caption(model, processor, device, image)
        hypothesis = gen_caption.split()
        reference = [gt_caption.split()]  # list of references, one per image here

        # Calculate sentence-level BLEU with smoothing
        bleu_score = sentence_bleu(reference, hypothesis, smoothing_function=smoothing)
        # Calculate METEOR score (takes pre-tokenized references and hypothesis)
        meteor = meteor_score(reference, hypothesis)
        meteor_scores.append(meteor)
        # Calculate ROUGE-L F-measure (the scorer takes the raw strings)
        rouge = scorer.score(gt_caption, gen_caption)["rougeL"].fmeasure
        rouge_scores.append(rouge)

        references.append(reference)
        hypotheses.append(hypothesis)

        # Print ground truth and generated captions to console
        print(f"\nFilename: {filename}")
        print(f"Ground Truth: {gt_caption}")
        print(f"Generated: {gen_caption}")
        print(f"BLEU: {bleu_score:.4f} | METEOR: {meteor:.4f} | ROUGE-L: {rouge:.4f}")

        # Display image with overlaid captions and scores
        display_image_with_captions(image_path, gt_caption, gen_caption, bleu_score, meteor, rouge)

    # Compute corpus-level scores. Every aggregate is guarded against the
    # "nothing was evaluated" case: corpus BLEU is not meaningful on an empty
    # corpus (and may raise, depending on the NLTK version).
    corpus_bleu_score = (
        corpus_bleu(references, hypotheses, smoothing_function=smoothing)
        if hypotheses else 0
    )
    avg_meteor_score = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
    avg_rouge_score = sum(rouge_scores) / len(rouge_scores) if rouge_scores else 0

    print("\n--- Evaluation Summary ---")
    print(f"Corpus BLEU Score: {corpus_bleu_score:.4f}")
    print(f"Average METEOR Score: {avg_meteor_score:.4f}")
    print(f"Average ROUGE-L Score: {avg_rouge_score:.4f}")


In [None]:
# Evaluation inputs (both live on the mounted Google Drive)
IMAGE_FOLDER = "/content/drive/MyDrive/Gemini_Captions/kg_train"  # directory holding the images to caption
CAPTIONS_FILE = "/content/drive/MyDrive/Gemini_Captions/kg_train.json"  # JSON file with the ground-truth captions

# Evaluate only a small sample: max_images caps how many image files are processed.
evaluate_model(
    image_folder=IMAGE_FOLDER,
    gt_json_path=CAPTIONS_FILE,
    max_images=20,
)

Loaded 10224 ground truth captions.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

✅ Model loaded successfully from Hugging Face on cuda
🔹 Found 10224 images in /content/drive/MyDrive/Gemini_Captions/kg_train.


Evaluating Images:   0%|          | 0/20 [00:00<?, ?it/s]