<a href="https://colab.research.google.com/github/alierenc/di725-transformers-and-attention-based-deep-networks-term-project/blob/main/Phase%20III/1.1.%20PaliGemma%20-%20Zero-Shot%20Image%20Captioning%20Inference%20with%20Some%20Outputs%20Cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store the access token in the notebook itself: take it from the
# HF_TOKEN environment variable when it is set, otherwise ask for it
# interactively without echoing the input.
hf_token = (os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")).strip()
login(token = hf_token)

In [None]:
!pip install -U datasets

In [None]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from datasets import load_dataset, DatasetDict
import torch

# Fetch the full RISCM dataset; only its "train" split is used below.
full = load_dataset('caglarmert/full_riscm')["train"]

# Fixed, contiguous index ranges define the three evaluation splits:
#   test       -> [0, 3150)
#   validation -> [3150, 6300)
#   train      -> [6300, end)
TEST_END = 3150
VAL_END = 6300
test_ds = full.select(range(TEST_END))
val_ds = full.select(range(TEST_END, VAL_END))
train_ds = full.select(range(VAL_END, len(full)))

# Re-bundle the three subsets under the names used by the later cells.
ds = DatasetDict({
    "val": val_ds,
    "test": test_ds,
    "train": train_ds,
})

# Load the PaliGemma checkpoint together with its matching processor.
model_id = "google/paligemma-3b-mix-224"
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# Run on the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

In [None]:
from tqdm import tqdm

model.eval()

# The processor warns unless the prompt carries an explicit <image> placeholder.
prompt = "<image> caption en"
predictions = []

for i in tqdm(range(len(ds["test"])), desc="Generating captions"):
    # Get the image
    image = ds["test"][i]["image"]

    # Preprocess image and prompt; keyword arguments keep the call
    # independent of the processor's positional parameter order.
    inputs = processor(images=image, text=prompt, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move to GPU
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate caption
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=30)

    # The returned sequence starts with the prompt tokens. Decoding the whole
    # sequence would echo the task prompt into every caption, and a string
    # comparison against `prompt` cannot remove it because the special
    # <image> token is dropped by skip_special_tokens. Decode only the
    # continuation instead.
    generated_tokens = output[0][prompt_length:]
    caption = processor.decode(generated_tokens, skip_special_tokens=True).strip()

    predictions.append(caption)

Generating captions: 100%|██████████| 3150/3150 [30:45<00:00,  1.71it/s]


In [None]:
# Get the references
# Define a variable to store the reference captions (five per test sample)
caption_columns = [f"caption_{j}" for j in range(1, 6)]

# Work on a view that holds only the caption columns, and read each row once.
# Indexing ds["test"][i] for every caption fetched the complete row
# (image included) five times per sample.
test_captions = ds["test"].select_columns(caption_columns)

all_references = []
for sample in tqdm(test_captions, desc="Collecting reference captions"):
    # Get the references of this sample
    reference_per_sample = []
    for j, column in enumerate(caption_columns, start=1):
        reference = sample[column]
        reference_per_sample.append(reference)
        print(f"The reference caption_{j}:")
        print(repr(reference))

    print()
    all_references.append(reference_per_sample)

In [None]:
# Peek at the first five samples to confirm the layout:
# one inner list of reference captions per sample.
reference_preview = all_references[:5]
print(reference_preview)

[['A gray plane on the runway and the lawn beside .', 'A grey plane is on the runway by the lawn .', 'There is an airplane on the runway with a large lawn by the runway .', 'A plane is parked on the runway next to the grass .', 'There is a plane on the runway beside the grass .'], ['Three small planes parked in a line on the airport and a big plane behind them .', 'There are four aircraft on the open ground, The largest of which is three times as large as the smallest one .', 'There are many planes of different sizes in a clearing .', 'Four planes are parked on the runway .', 'Four planes of different sizes were on the marked ground .'], ['A plane parked in a line on the airport with some marks .', 'A white plane was parked on the instruction line .', 'An airplane parked in an open area with many containers next to it .', 'A plane is parked on the open space .', 'There is 1 plane on the ground marked .'], ['A small plane and a big plane parked next to boarding bridges .', 'A white plan

In [None]:
# Peek at the first five predicted captions. Raw predictions may begin with
# the echoed prompt text " caption en\n"; this is easiest to see right after
# the generation cell has been rerun.
prediction_preview = predictions[:5]
print(prediction_preview)

["A large jetliner sits proudly on the runway, its powerful engines roaring. The plane's wing extends gracefully, while the tail gracefully curves upward.", 'A group of four airplanes are parked on a runway, their wings resting on the ground. The runway is made of concrete and has white lines painted on', "A large jetliner sits proudly on the tarmac, its powerful engines idling. The plane's tail and wing extend gracefully, while the white lines on", 'An aerial view of an airport with several planes parked at gates. The tarmac is gray, and the runway is also gray. There are several jet bridges', 'Two airplanes are parked on the tarmac at an airport. The planes are white, with red and white stripes on their tails. The wing of the plane']


In [None]:
# Clean the predicted captions for a better evaluation at later stages:
# drop leading whitespace/newlines and the echoed task prompt.
def _strip_prompt_echo(caption, echoed_prompt="caption en\n"):
    """Return `caption` without leading whitespace and without the echoed prompt.

    The prompt is removed as an exact prefix. `str.lstrip(chars)` must not be
    used for this: it treats its argument as a *set of characters* and would
    also eat the beginning of any caption that starts with those letters
    (e.g. "an aerial view" -> "rial view").
    """
    cleaned = caption.lstrip()
    if cleaned.startswith(echoed_prompt):
        cleaned = cleaned[len(echoed_prompt):]
    return cleaned.lstrip()


# Update the list in place so every later cell keeps using `predictions`.
predictions[:] = [_strip_prompt_echo(caption) for caption in predictions]

predictions[:5]

["A large jetliner sits proudly on the runway, its powerful engines roaring. The plane's wing extends gracefully, while the tail gracefully curves upward.",
 'A group of four airplanes are parked on a runway, their wings resting on the ground. The runway is made of concrete and has white lines painted on',
 "A large jetliner sits proudly on the tarmac, its powerful engines idling. The plane's tail and wing extend gracefully, while the white lines on",
 'An aerial view of an airport with several planes parked at gates. The tarmac is gray, and the runway is also gray. There are several jet bridges',
 'Two airplanes are parked on the tarmac at an airport. The planes are white, with red and white stripes on their tails. The wing of the plane']

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

# Make sure the tokenizer resources used by nltk.word_tokenize are present.
for resource in ('punkt_tab', 'punkt'):
    nltk.download(resource)

# Tokenize references and predictions (lower-cased first):
#   tokenized_refs -> per sample, one token list for each reference caption
#   tokenized_hyps -> per sample, one token list for the predicted caption
tokenized_refs = []
for sample_refs in all_references:
    sample_tokens = [nltk.word_tokenize(text.lower()) for text in sample_refs]
    tokenized_refs.append(sample_tokens)

tokenized_hyps = []
for hypothesis in predictions:
    tokenized_hyps.append(nltk.word_tokenize(hypothesis.lower()))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Display the tokenized reference captions of the first test sample
# as a sanity check of the tokenization step.
tokenized_refs[0]

[['a',
  'gray',
  'plane',
  'on',
  'the',
  'runway',
  'and',
  'the',
  'lawn',
  'beside',
  '.'],
 ['a', 'grey', 'plane', 'is', 'on', 'the', 'runway', 'by', 'the', 'lawn', '.'],
 ['there',
  'is',
  'an',
  'airplane',
  'on',
  'the',
  'runway',
  'with',
  'a',
  'large',
  'lawn',
  'by',
  'the',
  'runway',
  '.'],
 ['a',
  'plane',
  'is',
  'parked',
  'on',
  'the',
  'runway',
  'next',
  'to',
  'the',
  'grass',
  '.'],
 ['there',
  'is',
  'a',
  'plane',
  'on',
  'the',
  'runway',
  'beside',
  'the',
  'grass',
  '.']]

In [None]:
# Sentence-level BLEU-2
# Each hypothesis is scored against every one of its references separately;
# the best (highest) of those scores is reported for the sample.
smooth = SmoothingFunction().method1
bleu2_weights = (1/2, 1/2)
for example_no, (refs_per_sample, hyp_tok) in enumerate(
    zip(tokenized_refs, tokenized_hyps), start=1
):
    max_score = max(
        sentence_bleu(
            [single_ref],
            hyp_tok,
            weights=bleu2_weights,
            smoothing_function=smooth,
        )
        for single_ref in refs_per_sample
    )
    print(f"Example {example_no:2d} BLEU-2: {max_score*100:.2f}")

In [None]:
# Corpus-level BLEU-2
# corpus_bleu takes, per sample, a list of tokenized references
# (tokenized_refs) and a single tokenized hypothesis (tokenized_hyps).
# `smooth` is the smoothing function created in the sentence-level cell.
bleu2_weights = (1/2, 1/2)
corpus_score = corpus_bleu(
    tokenized_refs, tokenized_hyps,
    weights=bleu2_weights, smoothing_function=smooth,
)
print(f"\nCorpus BLEU-2: {corpus_score*100:.2f}")


Corpus BLEU-2: 18.37


In [None]:
# Sentence-level BLEU-3
# Each hypothesis is scored against every one of its references separately;
# the best (highest) of those scores is reported for the sample.
smooth = SmoothingFunction().method1
bleu3_weights = (1/3, 1/3, 1/3)
for example_no, (refs_per_sample, hyp_tok) in enumerate(
    zip(tokenized_refs, tokenized_hyps), start=1
):
    max_score = max(
        sentence_bleu(
            [single_ref],
            hyp_tok,
            weights=bleu3_weights,
            smoothing_function=smooth,
        )
        for single_ref in refs_per_sample
    )
    print(f"Example {example_no:2d} BLEU-3: {max_score*100:.2f}")

In [None]:
# Corpus-level BLEU-3
# corpus_bleu takes, per sample, a list of tokenized references
# (tokenized_refs) and a single tokenized hypothesis (tokenized_hyps).
# `smooth` is the smoothing function created in the sentence-level cell.
bleu3_weights = (1/3, 1/3, 1/3)
corpus_score = corpus_bleu(
    tokenized_refs, tokenized_hyps,
    weights=bleu3_weights, smoothing_function=smooth,
)
print(f"\nCorpus BLEU-3: {corpus_score*100:.2f}")


Corpus BLEU-3: 8.91


In [None]:
# Sentence-level BLEU-4
# Each hypothesis is scored against every one of its references separately;
# the best (highest) of those scores is reported for the sample.
smooth = SmoothingFunction().method1
bleu4_weights = (1/4, 1/4, 1/4, 1/4)
for example_no, (refs_per_sample, hyp_tok) in enumerate(
    zip(tokenized_refs, tokenized_hyps), start=1
):
    max_score = max(
        sentence_bleu(
            [single_ref],
            hyp_tok,
            weights=bleu4_weights,
            smoothing_function=smooth,
        )
        for single_ref in refs_per_sample
    )
    print(f"Example {example_no:2d} BLEU-4: {max_score*100:.2f}")

In [None]:
# Corpus-level BLEU-4
# corpus_bleu takes, per sample, a list of tokenized references
# (tokenized_refs) and a single tokenized hypothesis (tokenized_hyps).
# `smooth` is the smoothing function created in the sentence-level cell.
bleu4_weights = (1/4, 1/4, 1/4, 1/4)
corpus_score = corpus_bleu(
    tokenized_refs, tokenized_hyps,
    weights=bleu4_weights, smoothing_function=smooth,
)
print(f"\nCorpus BLEU-4: {corpus_score*100:.2f}")


Corpus BLEU-4: 4.12


In [None]:
# Go on to calculate ROUGE scores
!pip install rouge-score



In [None]:
import nltk
from collections import Counter

# Ensure the tokenizer data used by nltk.word_tokenize (called in rouge_n
# below) is available; quiet=True suppresses the download log.
nltk.download('punkt', quiet=True)

def rouge_n(ref: str, hyp: str, n: int = 4):
    """Score one hypothesis against one reference by n-gram overlap.

    Both strings are lower-cased and tokenized with ``nltk.word_tokenize``.
    Matches are clipped: an n-gram counts at most as often as it occurs in
    both texts.

    Args:
        ref: Reference caption.
        hyp: Hypothesis (predicted) caption.
        n: Size of the n-grams to compare.

    Returns:
        Tuple ``(recall, precision, f1)`` as fractions (not percentages).
        Recall divides the overlap by the reference n-gram count, precision
        by the hypothesis n-gram count; a count of zero is replaced by 1 so
        the division is always defined. A small epsilon in the F1
        denominator avoids dividing by zero when both are 0.
    """
    def _ngram_counts(text):
        # Multiset of the n-grams of the lower-cased, tokenized text.
        tokens = nltk.word_tokenize(text.lower())
        return Counter(nltk.ngrams(tokens, n))

    ref_counts = _ngram_counts(ref)
    hyp_counts = _ngram_counts(hyp)

    # Counter intersection keeps the minimum count of every shared n-gram.
    overlap = sum((ref_counts & hyp_counts).values())

    ref_total = sum(ref_counts.values())
    hyp_total = sum(hyp_counts.values())
    recall = overlap / max(ref_total, 1)
    precision = overlap / max(hyp_total, 1)
    f1 = 2 * recall * precision / (recall + precision + 1e-8)
    return (recall, precision, f1)


# Compute ROUGE-2
# Score every prediction against each of its references and keep the
# recall/precision/F1 of the reference with the highest F1 (the first one
# when several references tie).
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    triples = [rouge_n(ref, pred, n=2) for ref in refs]
    max_index = max(range(len(triples)), key=lambda k: triples[k][2])
    best_recall, best_precision, best_f1 = triples[max_index]

    all_recalls.append(best_recall)
    all_precisions.append(best_precision)
    all_f1s.append(best_f1)

    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-2 Recall:    {best_recall*100:.2f}")
    print(f"   ROUGE-2 Precision: {best_precision*100:.2f}")
    print(f"   ROUGE-2 F1:        {best_f1*100:.2f}\n")

In [None]:
# Report overall averages: arithmetic mean of the per-sample best-reference
# ROUGE-2 values collected above, printed as percentages.
avg_r, avg_p, avg_f = (
    sum(values) / len(values)
    for values in (all_recalls, all_precisions, all_f1s)
)
print(
    "=== AVERAGE ROUGE-2 METRICS ===",
    f"Recall:    {avg_r*100:.2f}",
    f"Precision: {avg_p*100:.2f}",
    f"F1:        {avg_f*100:.2f}",
    sep="\n",
)

=== AVERAGE ROUGE-2 METRICS ===
Recall:    10.81
Precision: 5.30
F1:        6.66


In [None]:
# Compute ROUGE-3
# Score every prediction against each of its references and keep the
# recall/precision/F1 of the reference with the highest F1 (the first one
# when several references tie).
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    triples = [rouge_n(ref, pred, n=3) for ref in refs]
    max_index = max(range(len(triples)), key=lambda k: triples[k][2])
    best_recall, best_precision, best_f1 = triples[max_index]

    all_recalls.append(best_recall)
    all_precisions.append(best_precision)
    all_f1s.append(best_f1)

    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-3 Recall:    {best_recall*100:.2f}")
    print(f"   ROUGE-3 Precision: {best_precision*100:.2f}")
    print(f"   ROUGE-3 F1:        {best_f1*100:.2f}\n")

In [None]:
# Report overall averages: arithmetic mean of the per-sample best-reference
# ROUGE-3 values collected above, printed as percentages.
avg_r, avg_p, avg_f = (
    sum(values) / len(values)
    for values in (all_recalls, all_precisions, all_f1s)
)
print(
    "=== AVERAGE ROUGE-3 METRICS ===",
    f"Recall:    {avg_r*100:.2f}",
    f"Precision: {avg_p*100:.2f}",
    f"F1:        {avg_f*100:.2f}",
    sep="\n",
)

=== AVERAGE ROUGE-3 METRICS ===
Recall:    3.17
Precision: 1.38
F1:        1.79


In [None]:
# Compute ROUGE-4
# Score every prediction against each of its references and keep the
# recall/precision/F1 of the reference with the highest F1 (the first one
# when several references tie).
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    triples = [rouge_n(ref, pred, n=4) for ref in refs]
    max_index = max(range(len(triples)), key=lambda k: triples[k][2])
    best_recall, best_precision, best_f1 = triples[max_index]

    all_recalls.append(best_recall)
    all_precisions.append(best_precision)
    all_f1s.append(best_f1)

    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-4 Recall:    {best_recall*100:.2f}")
    print(f"   ROUGE-4 Precision: {best_precision*100:.2f}")
    print(f"   ROUGE-4 F1:        {best_f1*100:.2f}\n")

In [None]:

# Report overall averages: arithmetic mean of the per-sample best-reference
# ROUGE-4 values collected above, printed as percentages.
avg_r, avg_p, avg_f = (
    sum(values) / len(values)
    for values in (all_recalls, all_precisions, all_f1s)
)
print(
    "=== AVERAGE ROUGE-4 METRICS ===",
    f"Recall:    {avg_r*100:.2f}",
    f"Precision: {avg_p*100:.2f}",
    f"F1:        {avg_f*100:.2f}",
    sep="\n",
)

=== AVERAGE ROUGE-4 METRICS ===
Recall:    0.71
Precision: 0.31
F1:        0.39
