In [1]:
from huggingface_hub import login
import os
from getpass import getpass

# Take the Hugging Face access token from the HF_TOKEN environment variable,
# falling back to an interactive prompt that does not echo the input, so the
# secret is never written into the notebook source.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install -U datasets

In [6]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

from datasets import load_dataset, DatasetDict

# Fetch the full RISCM dataset and use its "train" split as the pool of samples
ds = load_dataset('caglarmert/full_riscm')
full = ds["train"]

# Cut the pool into three contiguous index ranges:
#   test       -> [0, 3150)
#   validation -> [3150, 6300)
#   train      -> [6300, len(full))
test_end = 3150
val_end = 6300
test_ds = full.select(range(0, test_end))
val_ds = full.select(range(test_end, val_end))
train_ds = full.select(range(val_end, len(full)))

# Re-bind `ds` to a DatasetDict holding the three subsets under named splits
named_splits = {"val": val_ds, "test": test_ds, "train": train_ds}
ds = DatasetDict(named_splits)

# Processor and model both come from the same checkpoint id
model_id = "google/paligemma-3b-mix-224"
processor = AutoProcessor.from_pretrained(model_id)
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Run inference on every sample of the test split
prompt = "caption en"
test_split = ds["test"]

# Predicted captions, stored in the same order as the test samples
predictions = []
for i in range(len(test_split)):
    print(f"Predicting a caption for sample {i + 1}")
    # Get the image
    image_file = test_split[i]["image"]
    # Pass text and images by keyword: the positional order of these two
    # arguments is not the same in every transformers release.
    inputs = processor(text=prompt, images=image_file, return_tensors="pt")
    # The generated sequence starts with the input tokens, so cut it at the
    # input length to keep only the newly generated caption tokens. This is
    # more robust than slicing the decoded string by len(prompt), which also
    # leaves the separator that follows the prompt in the result.
    prompt_len = inputs["input_ids"].shape[-1]
    output = model.generate(**inputs, max_new_tokens=40)
    predicted_caption = processor.decode(output[0][prompt_len:], skip_special_tokens=True)
    # Store the predicted caption and print it
    predictions.append(predicted_caption)
    print("The predicted caption:")
    print(repr(predicted_caption))
    print()

In [None]:
# Collect the reference captions
caption_columns = [f"caption_{j}" for j in range(1, 6)]
# Work on a view that holds only the caption columns and read every row once.
# Looking up ds["test"][i] for each caption fetched the complete row, image
# column included, five times per sample.
captions_only = ds["test"].select_columns(caption_columns)

# One inner list of five reference captions per test sample, in test order
all_references = []
for row in captions_only:
    reference_per_sample = []
    for j, column in enumerate(caption_columns, start=1):
        reference = row[column]
        reference_per_sample.append(reference)
        print(f"The reference caption_{j}:")
        print(repr(reference))

    print()
    all_references.append(reference_per_sample)

In [18]:
# Look at the first five reference groups to confirm their structure
reference_preview = all_references[:5]
print(reference_preview)

[['A gray plane on the runway and the lawn beside .', 'A grey plane is on the runway by the lawn .', 'There is an airplane on the runway with a large lawn by the runway .', 'A plane is parked on the runway next to the grass .', 'There is a plane on the runway beside the grass .'], ['Three small planes parked in a line on the airport and a big plane behind them .', 'There are four aircraft on the open ground, The largest of which is three times as large as the smallest one .', 'There are many planes of different sizes in a clearing .', 'Four planes are parked on the runway .', 'Four planes of different sizes were on the marked ground .'], ['A plane parked in a line on the airport with some marks .', 'A white plane was parked on the instruction line .', 'An airplane parked in an open area with many containers next to it .', 'A plane is parked on the open space .', 'There is 1 plane on the ground marked .'], ['A small plane and a big plane parked next to boarding bridges .', 'A white plan

In [19]:
# Look at the first five predicted captions; a raw caption is expected to
# begin with a newline character
prediction_preview = predictions[:5]
print(prediction_preview)

['A plane sits on the runway at an airport, its wings level with the ground. The runway is long and straight, with a grassy area on one side and a dirt area on the other. The', 'A group of four airplanes are parked on a runway. The planes are white and have a wing on each side. The wing of the plane is white and the tail of the plane is also white.', "A large jetliner sits proudly on the tarmac, its powerful engines idling. The plane's tail and wing extend gracefully, while the white lines on the ground guide its path. The black and white", 'An aerial view of an airport with planes parked at gates. The tarmac is gray, and the planes are white. There are several planes parked at gates, including a large jetliner and a small plane', 'Two airplanes are parked on the tarmac at an airport. The planes are white and have red stripes on their tails. The wing of the plane is visible, as well as the tail and the engine on']


In [20]:
# Remove leading newline characters from every predicted caption so that the
# later evaluation steps see clean text; the list is updated in place
predictions[:] = [caption.lstrip("\n") for caption in predictions]

predictions[:5]

['A plane sits on the runway at an airport, its wings level with the ground. The runway is long and straight, with a grassy area on one side and a dirt area on the other. The',
 'A group of four airplanes are parked on a runway. The planes are white and have a wing on each side. The wing of the plane is white and the tail of the plane is also white.',
 "A large jetliner sits proudly on the tarmac, its powerful engines idling. The plane's tail and wing extend gracefully, while the white lines on the ground guide its path. The black and white",
 'An aerial view of an airport with planes parked at gates. The tarmac is gray, and the planes are white. There are several planes parked at gates, including a large jetliner and a small plane',
 'Two airplanes are parked on the tarmac at an airport. The planes are white and have red stripes on their tails. The wing of the plane is visible, as well as the tail and the engine on']

In [21]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

# Fetch the tokenizer resources used by nltk.word_tokenize
for resource_name in ('punkt_tab', 'punkt'):
    nltk.download(resource_name)


def _lower_tokens(text):
    """Lower-case `text` and split it into word tokens."""
    return nltk.word_tokenize(text.lower())


# References keep their per-sample grouping; predictions form one flat list
tokenized_refs = [list(map(_lower_tokens, refs)) for refs in all_references]

tokenized_hyps = list(map(_lower_tokens, predictions))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
# Display the tokenized reference captions of the first test sample
tokenized_refs[0]

[['a',
  'gray',
  'plane',
  'on',
  'the',
  'runway',
  'and',
  'the',
  'lawn',
  'beside',
  '.'],
 ['a', 'grey', 'plane', 'is', 'on', 'the', 'runway', 'by', 'the', 'lawn', '.'],
 ['there',
  'is',
  'an',
  'airplane',
  'on',
  'the',
  'runway',
  'with',
  'a',
  'large',
  'lawn',
  'by',
  'the',
  'runway',
  '.'],
 ['a',
  'plane',
  'is',
  'parked',
  'on',
  'the',
  'runway',
  'next',
  'to',
  'the',
  'grass',
  '.'],
 ['there',
  'is',
  'a',
  'plane',
  'on',
  'the',
  'runway',
  'beside',
  'the',
  'grass',
  '.']]

In [48]:
# Sentence-level BLEU-2: score each hypothesis against every reference
# separately and report the best of those scores
smooth = SmoothingFunction().method1
bigram_weights = (1/2, 1/2)
for i, (refs_per_sample, hyp_tok) in enumerate(zip(tokenized_refs, tokenized_hyps)):
    max_score = max(
        sentence_bleu([refs_tok], hyp_tok, weights=bigram_weights, smoothing_function=smooth)
        for refs_tok in refs_per_sample
    )
    print(f"Example {i+1:2d} BLEU-2: {max_score*100:.2f}")

# Corpus-level BLEU-2
# corpus_bleu is given, for every hypothesis, its list of tokenized references,
# together with the list of tokenized hypotheses
corpus_score = corpus_bleu(
    tokenized_refs, tokenized_hyps, weights=bigram_weights, smoothing_function=smooth
)
print(f"\nCorpus BLEU-2: {corpus_score*100:.2f}")

Example  1 BLEU-2: 16.01
Example  2 BLEU-2: 14.32
Example  3 BLEU-2: 8.43
Example  4 BLEU-2: 16.43
Example  5 BLEU-2: 8.77
Example  6 BLEU-2: 7.35
Example  7 BLEU-2: 12.40
Example  8 BLEU-2: 20.41
Example  9 BLEU-2: 12.40
Example 10 BLEU-2: 11.31
Example 11 BLEU-2: 9.81
Example 12 BLEU-2: 22.04
Example 13 BLEU-2: 40.45
Example 14 BLEU-2: 11.60
Example 15 BLEU-2: 50.03
Example 16 BLEU-2: 15.19
Example 17 BLEU-2: 13.87
Example 18 BLEU-2: 29.42
Example 19 BLEU-2: 13.16
Example 20 BLEU-2: 15.19
Example 21 BLEU-2: 15.19
Example 22 BLEU-2: 10.83
Example 23 BLEU-2: 7.16
Example 24 BLEU-2: 29.88
Example 25 BLEU-2: 16.79
Example 26 BLEU-2: 15.19
Example 27 BLEU-2: 12.40
Example 28 BLEU-2: 20.41
Example 29 BLEU-2: 13.86
Example 30 BLEU-2: 2.07
Example 31 BLEU-2: 16.98
Example 32 BLEU-2: 14.32
Example 33 BLEU-2: 16.41
Example 34 BLEU-2: 13.16
Example 35 BLEU-2: 13.87
Example 36 BLEU-2: 10.67
Example 37 BLEU-2: 11.32
Example 38 BLEU-2: 15.11
Example 39 BLEU-2: 17.90
Example 40 BLEU-2: 9.00
Example

In [49]:
# Sentence-level BLEU-3: score each hypothesis against every reference
# separately and report the best of those scores
smooth = SmoothingFunction().method1
trigram_weights = (1/3, 1/3, 1/3)
for i, (refs_per_sample, hyp_tok) in enumerate(zip(tokenized_refs, tokenized_hyps)):
    max_score = max(
        sentence_bleu([refs_tok], hyp_tok, weights=trigram_weights, smoothing_function=smooth)
        for refs_tok in refs_per_sample
    )
    print(f"Example {i+1:2d} BLEU-3: {max_score*100:.2f}")

# Corpus-level BLEU-3
# corpus_bleu is given, for every hypothesis, its list of tokenized references,
# together with the list of tokenized hypotheses
corpus_score = corpus_bleu(
    tokenized_refs, tokenized_hyps, weights=trigram_weights, smoothing_function=smooth
)
print(f"\nCorpus BLEU-3: {corpus_score*100:.2f}")

Example  1 BLEU-3: 8.77
Example  2 BLEU-3: 8.14
Example  3 BLEU-3: 5.82
Example  4 BLEU-3: 11.34
Example  5 BLEU-3: 2.73
Example  6 BLEU-3: 2.44
Example  7 BLEU-3: 9.32
Example  8 BLEU-3: 8.41
Example  9 BLEU-3: 7.40
Example 10 BLEU-3: 3.29
Example 11 BLEU-3: 7.97
Example 12 BLEU-3: 13.80
Example 13 BLEU-3: 33.13
Example 14 BLEU-3: 7.08
Example 15 BLEU-3: 36.99
Example 16 BLEU-3: 8.47
Example 17 BLEU-3: 7.97
Example 18 BLEU-3: 18.98
Example 19 BLEU-3: 9.69
Example 20 BLEU-3: 3.93
Example 21 BLEU-3: 12.21
Example 22 BLEU-3: 4.25
Example 23 BLEU-3: 2.38
Example 24 BLEU-3: 11.42
Example 25 BLEU-3: 9.05
Example 26 BLEU-3: 8.47
Example 27 BLEU-3: 7.40
Example 28 BLEU-3: 8.41
Example 29 BLEU-3: 6.19
Example 30 BLEU-3: 1.06
Example 31 BLEU-3: 13.16
Example 32 BLEU-3: 10.26
Example 33 BLEU-3: 7.40
Example 34 BLEU-3: 7.40
Example 35 BLEU-3: 7.97
Example 36 BLEU-3: 3.16
Example 37 BLEU-3: 6.96
Example 38 BLEU-3: 5.94
Example 39 BLEU-3: 11.90
Example 40 BLEU-3: 2.80
Example 41 BLEU-3: 14.48
Examp

In [50]:
# Sentence-level BLEU-4: score each hypothesis against every reference
# separately and report the best of those scores
smooth = SmoothingFunction().method1
fourgram_weights = (1/4, 1/4, 1/4, 1/4)
for i, (refs_per_sample, hyp_tok) in enumerate(zip(tokenized_refs, tokenized_hyps)):
    max_score = max(
        sentence_bleu([refs_tok], hyp_tok, weights=fourgram_weights, smoothing_function=smooth)
        for refs_tok in refs_per_sample
    )
    print(f"Example {i+1:2d} BLEU-4: {max_score*100:.2f}")

# Corpus-level BLEU-4
# corpus_bleu is given, for every hypothesis, its list of tokenized references,
# together with the list of tokenized hypotheses
corpus_score = corpus_bleu(
    tokenized_refs, tokenized_hyps, weights=fourgram_weights, smoothing_function=smooth
)
print(f"\nCorpus BLEU-4: {corpus_score*100:.2f}")

Example  1 BLEU-4: 3.67
Example  2 BLEU-4: 3.48
Example  3 BLEU-4: 2.74
Example  4 BLEU-4: 7.98
Example  5 BLEU-4: 1.53
Example  6 BLEU-4: 1.42
Example  7 BLEU-4: 6.84
Example  8 BLEU-4: 5.61
Example  9 BLEU-4: 3.23
Example 10 BLEU-4: 1.79
Example 11 BLEU-4: 3.42
Example 12 BLEU-4: 5.20
Example 13 BLEU-4: 25.97
Example 14 BLEU-4: 3.13
Example 15 BLEU-4: 29.80
Example 16 BLEU-4: 3.58
Example 17 BLEU-4: 3.42
Example 18 BLEU-4: 11.66
Example 19 BLEU-4: 7.04
Example 20 BLEU-4: 2.01
Example 21 BLEU-4: 9.96
Example 22 BLEU-4: 2.85
Example 23 BLEU-4: 1.38
Example 24 BLEU-4: 7.39
Example 25 BLEU-4: 3.76
Example 26 BLEU-4: 3.58
Example 27 BLEU-4: 3.23
Example 28 BLEU-4: 5.61
Example 29 BLEU-4: 2.87
Example 30 BLEU-4: 0.76
Example 31 BLEU-4: 10.53
Example 32 BLEU-4: 7.35
Example 33 BLEU-4: 3.23
Example 34 BLEU-4: 3.23
Example 35 BLEU-4: 3.42
Example 36 BLEU-4: 1.73
Example 37 BLEU-4: 3.09
Example 38 BLEU-4: 3.87
Example 39 BLEU-4: 8.22
Example 40 BLEU-4: 1.57
Example 41 BLEU-4: 11.32
Example 42 

In [None]:
# Go on to calculate ROUGE scores
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install rouge-score

In [51]:
import nltk
from collections import Counter

# Make sure the NLTK 'punkt' tokenizer data is available before tokenizing;
# quiet=True keeps the downloader from printing its progress messages
nltk.download('punkt', quiet=True)

def rouge_n(ref: str, hyp: str, n: int = 4):
    """Score a hypothesis against a single reference by clipped n-gram overlap.

    Both strings are lower-cased and split with ``nltk.word_tokenize``. A
    shared n-gram is counted as many times as it occurs in whichever of the
    two texts contains it less often.

    Args:
        ref: Reference text.
        hyp: Hypothesis (predicted) text.
        n: Order of the n-grams to compare.

    Returns:
        A ``(recall, precision, f1)`` tuple: the overlap divided by the number
        of reference n-grams, the overlap divided by the number of hypothesis
        n-grams, and the harmonic mean of the two. All three are 0.0 when
        nothing overlaps, which includes texts with fewer than ``n`` tokens.
    """
    ref_counts = Counter(nltk.ngrams(nltk.word_tokenize(ref.lower()), n))
    hyp_counts = Counter(nltk.ngrams(nltk.word_tokenize(hyp.lower()), n))

    # Counter intersection keeps the minimum count of every shared n-gram.
    overlap = sum((ref_counts & hyp_counts).values())

    # max(..., 1) guards the division when a text yields no n-grams at all.
    recall = overlap / max(sum(ref_counts.values()), 1)
    precision = overlap / max(sum(hyp_counts.values()), 1)

    # Exact harmonic mean. Padding the denominator with a small epsilon would
    # pull every non-zero F1 slightly low, so a perfect match would never score
    # exactly 1.0; the zero-overlap case is handled explicitly instead.
    if overlap == 0:
        f1 = 0.0
    else:
        f1 = 2 * recall * precision / (recall + precision)
    return (recall, precision, f1)


# Compute ROUGE-2
# For each sample the prediction is scored against every reference, and the
# recall/precision/F1 of the reference with the highest F1 is kept.
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    per_ref_scores = [rouge_n(ref, pred, n=2) for ref in refs]
    recalls_per_sample = [triple[0] for triple in per_ref_scores]
    precisions_per_sample = [triple[1] for triple in per_ref_scores]
    f1s_per_sample = [triple[2] for triple in per_ref_scores]

    # index() returns the first position holding the best F1
    max_score = max(f1s_per_sample)
    max_index = f1s_per_sample.index(max_score)

    all_recalls.append(recalls_per_sample[max_index])
    all_precisions.append(precisions_per_sample[max_index])
    all_f1s.append(f1s_per_sample[max_index])

    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-2 Recall:    {recalls_per_sample[max_index]:.2f}")
    print(f"   ROUGE-2 Precision: {precisions_per_sample[max_index]:.2f}")
    print(f"   ROUGE-2 F1:        {f1s_per_sample[max_index]:.2f}\n")

# Mean of the kept per-sample values over all samples
avg_r = sum(all_recalls) / len(all_recalls)
avg_p = sum(all_precisions) / len(all_precisions)
avg_f = sum(all_f1s) / len(all_f1s)
print("=== AVERAGE ROUGE-2 METRICS ===")
print(f"Recall:    {avg_r:.2f}")
print(f"Precision: {avg_p:.2f}")
print(f"F1:        {avg_f:.2f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   ROUGE-2 Recall:    0.00
   ROUGE-2 Precision: 0.00
   ROUGE-2 F1:        0.00

REF:  'The roundabout is on the grass next to trees and buildings .'
HYP:  'In this image we can see a map. On the map we can see buildings, trees, roads and vehicles.'
   ROUGE-2 Recall:    0.09
   ROUGE-2 Precision: 0.05
   ROUGE-2 F1:        0.06

REF:  'There are several moving vehicles on the road at the roundabout, There are several buildings and some trees and some lawns around the roundabout .'
HYP:  'In this image we can see a house, trees, vehicles on the road, grass and a fence.'
   ROUGE-2 Recall:    0.12
   ROUGE-2 Precision: 0.15
   ROUGE-2 F1:        0.13

REF:  'There are several vehicles driving on the road at the roundabout, There are some buildings and some trees and some lawns around the roundabout, And many cars are parked in the open space beside the roundabout .'
HYP:  'An aerial view of a roundabout with a grassy area

In [52]:
# Compute ROUGE-3
# For each sample the prediction is scored against every reference, and the
# recall/precision/F1 of the reference with the highest F1 is kept.
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    per_ref_scores = [rouge_n(ref, pred, n=3) for ref in refs]
    recalls_per_sample = [triple[0] for triple in per_ref_scores]
    precisions_per_sample = [triple[1] for triple in per_ref_scores]
    f1s_per_sample = [triple[2] for triple in per_ref_scores]

    # index() returns the first position holding the best F1
    max_score = max(f1s_per_sample)
    max_index = f1s_per_sample.index(max_score)

    all_recalls.append(recalls_per_sample[max_index])
    all_precisions.append(precisions_per_sample[max_index])
    all_f1s.append(f1s_per_sample[max_index])

    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-3 Recall:    {recalls_per_sample[max_index]:.2f}")
    print(f"   ROUGE-3 Precision: {precisions_per_sample[max_index]:.2f}")
    print(f"   ROUGE-3 F1:        {f1s_per_sample[max_index]:.2f}\n")

# Mean of the kept per-sample values over all samples
avg_r = sum(all_recalls) / len(all_recalls)
avg_p = sum(all_precisions) / len(all_precisions)
avg_f = sum(all_f1s) / len(all_f1s)
print("=== AVERAGE ROUGE-3 METRICS ===")
print(f"Recall:    {avg_r:.2f}")
print(f"Precision: {avg_p:.2f}")
print(f"F1:        {avg_f:.2f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   ROUGE-3 Recall:    0.00
   ROUGE-3 Precision: 0.00
   ROUGE-3 F1:        0.00

REF:  'The roundabout with three exits and entrances is in the residential area .'
HYP:  'In this image we can see a map. On the map we can see buildings, trees, roads and vehicles.'
   ROUGE-3 Recall:    0.00
   ROUGE-3 Precision: 0.00
   ROUGE-3 F1:        0.00

REF:  'There are several moving vehicles on the road at the roundabout, There are several buildings and some trees and some lawns around the roundabout .'
HYP:  'In this image we can see a house, trees, vehicles on the road, grass and a fence.'
   ROUGE-3 Recall:    0.08
   ROUGE-3 Precision: 0.11
   ROUGE-3 F1:        0.09

REF:  'The roundabout is on the grass next to trees and buildings .'
HYP:  'An aerial view of a roundabout with a grassy area in the middle. The roundabout is surrounded by buildings with red roofs and trees. There are also cars parked on the side of the road a

In [53]:
# Compute ROUGE-4
# For each sample the prediction is scored against every reference, and the
# recall/precision/F1 of the reference with the highest F1 is kept.
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    per_ref_scores = [rouge_n(ref, pred, n=4) for ref in refs]
    recalls_per_sample = [triple[0] for triple in per_ref_scores]
    precisions_per_sample = [triple[1] for triple in per_ref_scores]
    f1s_per_sample = [triple[2] for triple in per_ref_scores]

    # index() returns the first position holding the best F1
    max_score = max(f1s_per_sample)
    max_index = f1s_per_sample.index(max_score)

    all_recalls.append(recalls_per_sample[max_index])
    all_precisions.append(precisions_per_sample[max_index])
    all_f1s.append(f1s_per_sample[max_index])

    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-4 Recall:    {recalls_per_sample[max_index]:.2f}")
    print(f"   ROUGE-4 Precision: {precisions_per_sample[max_index]:.2f}")
    print(f"   ROUGE-4 F1:        {f1s_per_sample[max_index]:.2f}\n")

# Mean of the kept per-sample values over all samples
avg_r = sum(all_recalls) / len(all_recalls)
avg_p = sum(all_precisions) / len(all_precisions)
avg_f = sum(all_f1s) / len(all_f1s)
print("=== AVERAGE ROUGE-4 METRICS ===")
print(f"Recall:    {avg_r:.2f}")
print(f"Precision: {avg_p:.2f}")
print(f"F1:        {avg_f:.2f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   ROUGE-4 Recall:    0.00
   ROUGE-4 Precision: 0.00
   ROUGE-4 F1:        0.00

REF:  'The roundabout with three exits and entrances is in the residential area .'
HYP:  'In this image we can see a map. On the map we can see buildings, trees, roads and vehicles.'
   ROUGE-4 Recall:    0.00
   ROUGE-4 Precision: 0.00
   ROUGE-4 F1:        0.00

REF:  'There are several moving vehicles on the road at the roundabout, There are several buildings and some trees and some lawns around the roundabout .'
HYP:  'In this image we can see a house, trees, vehicles on the road, grass and a fence.'
   ROUGE-4 Recall:    0.04
   ROUGE-4 Precision: 0.06
   ROUGE-4 F1:        0.05

REF:  'The roundabout connects five roads and some houses are next to the roundabout .'
HYP:  'An aerial view of a roundabout with a grassy area in the middle. The roundabout is surrounded by buildings with red roofs and trees. There are also cars parked on the