<a href="https://colab.research.google.com/github/alierenc/di725-transformers-and-attention-based-deep-networks-term-project/blob/main/Phase%20III/3.1.%20SigLIP-T5-Decoder%20Custom%20VLM%20-%20Image%20Captioning%20Fine-Tuning%20and%20Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from huggingface_hub import login
hf_token = " " # Huggingface token
login(token = hf_token)

In [2]:
# Access google drive to save the model
from google.colab import drive
drive.mount('/content/drive')

# Import and log in wandb
import wandb

wandb.login()
# Initialize W&B run
wandb.init(project="term-project-vision-language-model", name="siglip-t5decoder")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[34m[1mwandb[0m: Currently logged in as: [33maeren[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
!pip install -U datasets
!pip install bitsandbytes --upgrade



In [4]:
from datasets import load_dataset, DatasetDict

# Load the dataset of full riscm
ds = load_dataset('caglarmert/full_riscm')

full = ds["train"]

# test   = indices [0, 3150)
test_ds = full.select(range(3150))

# validation = indices [3150, 6300)
val_ds = full.select(range(3150, 6300))

# train  = indices [6300, end)
train_ds = full.select(range(6300, len(full)))

# bundle into a DatasetDict
ds = DatasetDict({
    "test": test_ds,
    "val": val_ds,
    "train": train_ds,
})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
ds["test"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=224x224>,
 'caption_1': 'A gray plane on the runway and the lawn beside .',
 'caption_2': 'A grey plane is on the runway by the lawn .',
 'caption_3': 'There is an airplane on the runway with a large lawn by the runway .',
 'caption_4': 'A plane is parked on the runway next to the grass .',
 'caption_5': 'There is a plane on the runway beside the grass .'}

In [6]:
import torch
import torch.nn as nn
from transformers.modeling_outputs import BaseModelOutput

class CustomVLM(nn.Module):
    def __init__(self, vision_model, language_model, vision_hidden_size, language_hidden_size):
        super().__init__()
        self.vision_model = vision_model
        self.language_model = language_model
        self.vision_proj = nn.Linear(vision_hidden_size, language_hidden_size)

    def forward(self, image, input_ids=None, attention_mask=None, labels=None):
        # Step 1: Encode image
        vision_output = self.vision_model(pixel_values=image).last_hidden_state  # [B, N, D]
        vision_embedding = vision_output.mean(dim=1)                             # [B, D]
        encoder_hidden_states = self.vision_proj(vision_embedding).unsqueeze(1)  # [B, 1, d_model]

        # Step 2: Package encoder output for T5
        encoder_outputs = BaseModelOutput(last_hidden_state=encoder_hidden_states)

        # Step 3: Decode using prompt + image context
        output = self.language_model(
            input_ids=input_ids,                # prompt like "caption en"
            attention_mask=attention_mask,      # attention mask for prompt
            encoder_outputs=encoder_outputs,    # SigLIP embedding as context
            labels=labels                       # optional: ground-truth caption for training
        )

        return output


In [7]:
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    SiglipVisionModel,
    AutoImageProcessor,
    BitsAndBytesConfig
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import bitsandbytes
import torch

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# QLoRA quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load full T5 model (needed for decoder + lm_head)
language_model = T5ForConditionalGeneration.from_pretrained(
    "t5-base",
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare for QLoRA
language_model = prepare_model_for_kbit_training(language_model)

# Define LoRA config (on full model — affects both encoder & decoder if needed)
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],  # extend to 'k', 'o', etc. for more aggressive tuning
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

# Apply LoRA
language_model = get_peft_model(language_model, lora_config)

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")
tokenizer.pad_token = tokenizer.eos_token

# Load SigLIP vision encoder
vision_model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
vision_model.requires_grad_(False)

# Load image processor
image_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")

# Determine hidden sizes
vision_hidden_size = vision_model.config.hidden_size     # e.g. 768
language_hidden_size = language_model.config.d_model     # 768 for t5-base

# Initialize CustomVLM with full language model
model = CustomVLM(
    vision_model=vision_model,
    language_model=language_model,
    vision_hidden_size=vision_hidden_size,
    language_hidden_size=language_hidden_size
)

# Move to device
model = model.to(device)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [8]:
print("Language model device:", next(model.language_model.parameters()).device)
print("Vision model device:", next(model.vision_model.parameters()).device)
print("Vision projection layer device:", next(model.vision_proj.parameters()).device)

Language model device: cuda:0
Vision model device: cuda:0
Vision projection layer device: cuda:0


In [9]:
def count_parameters(module):
    total = sum(p.numel() for p in module.parameters())
    trainable = sum(p.numel() for p in module.parameters() if p.requires_grad)
    return {"Total": total, "Trainable": trainable}

print("Printing the total number of parameters and the number of trainable parameters:")
print("Vision Encoder (SigLIP):", count_parameters(model.vision_model))
print("Vision Projection Layer:", count_parameters(model.vision_proj))
print("Language Model (full T5):", count_parameters(model.language_model))

# Count only T5 decoder (within the full model)
print("T5 Decoder (only):", count_parameters(model.language_model.base_model.decoder))



Printing the total number of parameters and the number of trainable parameters:
Vision Encoder (SigLIP): {'Total': 92884224, 'Trainable': 0}
Vision Projection Layer: {'Total': 590592, 'Trainable': 590592}
Language Model (full T5): {'Total': 153009408, 'Trainable': 884736}
T5 Decoder (only): {'Total': 96071808, 'Trainable': 589824}


In [10]:
from transformers.modeling_outputs import BaseModelOutput

# We check whether the model can produce captions
# Set model to evaluation mode
model.eval()
max_new_tokens = 30
eos_token_id = tokenizer.eos_token_id

for i in range(10):
    print(f"Generating caption for sample {i + 1}")

    # Preprocess image
    image = ds["test"][i]["image"]
    pixel_values = image_processor(image, return_tensors="pt")["pixel_values"].to(device)

    with torch.no_grad():
        # Vision encoding inside VLM forward
        vision_output = model.vision_model(pixel_values=pixel_values).last_hidden_state
        vision_embedding = vision_output.mean(dim=1)
        encoder_hidden_states = model.vision_proj(vision_embedding).unsqueeze(1)

        # Wrap as BaseModelOutput
        encoder_outputs = BaseModelOutput(last_hidden_state=encoder_hidden_states)

        # Prompt setup
        prompt = "caption en"
        tokenized = tokenizer(prompt, return_tensors="pt").to(device)
        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]

        # Remove <pad> if it appears as the first token
        if input_ids[0, 0] == tokenizer.pad_token_id:
            input_ids = input_ids[:, 1:]
            attention_mask = attention_mask[:, 1:]

        # Generate
        generated_ids = model.language_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_outputs=encoder_outputs,
            max_new_tokens=max_new_tokens,
            eos_token_id=eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=False
        )

        # Decode output
        decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        caption = decoded.replace(prompt, "").strip()
        caption = decoded.replace("<pad>", "").strip()
        print("Generated caption:", repr(caption))
        print()


Generating caption for sample 1
Generated caption: 'jurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjurjur'

Generating caption for sample 2
Generated caption: ''

Generating caption for sample 3
Generated caption: 'dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés dés'

Generating caption for sample 4
Generated caption: 'virvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvirvir'

Generating caption for sample 5
Generated caption: 'drondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondrondron'

Generating caption for sample 6
Generated caption: 'Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi Zi'

Generating caption for sample 7
Generated caption: ''

Generating caption for sample 8
Generated caption: 'rumrumrumrumrumrumrumrumrumrumrumrumrumrumrumrumrumrumrumrumr

In [11]:
import wandb
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from torch.nn import functional as F

# Collate function for T5 captioning
def collate_fn(batch):
    images = [image_processor(example["image"], return_tensors="pt")["pixel_values"].squeeze(0) for example in batch]
    captions = [example["caption_3"] for example in batch]

    pixel_values = torch.stack(images)

    # For T5: prompt goes into input_ids, caption goes into labels
    prompts = ["caption en"] * len(captions)
    tokenized_input = tokenizer(prompts, padding=True, return_tensors="pt", truncation=True, max_length=512)
    tokenized_labels = tokenizer(captions, padding=True, return_tensors="pt", truncation=True, max_length=512)

    return {
        "pixel_values": pixel_values,
        "input_ids": tokenized_input["input_ids"],
        "attention_mask": tokenized_input["attention_mask"],
        "labels": tokenized_labels["input_ids"]  # T5 will shift internally
    }

# DataLoaders
train_loader = DataLoader(ds["train"], batch_size=256, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(ds["val"], batch_size=256, shuffle=False, collate_fn=collate_fn)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_samples = 0

    for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")):
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Replace <pad> tokens in labels with -100 so they are ignored in loss
        labels[labels == tokenizer.pad_token_id] = -100

        optimizer.zero_grad()

        outputs = model(
            image=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        batch_size = input_ids.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size

        wandb.log({
            "train/loss": loss.item(),
            "train/step": epoch * len(train_loader) + step
        })


    avg_train_loss = total_loss / total_samples
    print(f"Epoch {epoch+1} completed. Average Train Loss: {avg_train_loss:.4f}")
    wandb.log({"train/avg_epoch_loss": avg_train_loss, "epoch": epoch + 1})

    # Validation Loop
    model.eval()
    val_loss = 0.0
    val_samples = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            pixel_values = batch["pixel_values"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Same label cleaning
            labels[labels == tokenizer.pad_token_id] = -100

            outputs = model(
                image=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            batch_size = input_ids.size(0)
            val_loss += outputs.loss.item() * batch_size
            val_samples += batch_size

    avg_val_loss = val_loss / val_samples
    print(f"Average Validation Loss: {avg_val_loss:.4f}")
    wandb.log({"val/loss": avg_val_loss, "epoch": epoch + 1})



Epoch 1/10:   0%|          | 0/150 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
Epoch 1/10: 100%|██████████| 150/150 [04:39<00:00,  1.86s/it]


Epoch 1 completed. Average Train Loss: 5.7107


Validating:   0%|          | 0/13 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Validating: 100%|██████████| 13/13 [00:17<00:00,  1.34s/it]


Average Validation Loss: 4.3752


Epoch 2/10: 100%|██████████| 150/150 [04:40<00:00,  1.87s/it]


Epoch 2 completed. Average Train Loss: 4.2606


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.36s/it]


Average Validation Loss: 2.8540


Epoch 3/10: 100%|██████████| 150/150 [04:40<00:00,  1.87s/it]


Epoch 3 completed. Average Train Loss: 3.0588


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.33s/it]


Average Validation Loss: 2.0175


Epoch 4/10: 100%|██████████| 150/150 [04:40<00:00,  1.87s/it]


Epoch 4 completed. Average Train Loss: 2.5203


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.34s/it]


Average Validation Loss: 1.7390


Epoch 5/10: 100%|██████████| 150/150 [04:41<00:00,  1.88s/it]


Epoch 5 completed. Average Train Loss: 2.2820


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.34s/it]


Average Validation Loss: 1.5744


Epoch 6/10: 100%|██████████| 150/150 [04:40<00:00,  1.87s/it]


Epoch 6 completed. Average Train Loss: 2.1308


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.34s/it]


Average Validation Loss: 1.4601


Epoch 7/10: 100%|██████████| 150/150 [04:41<00:00,  1.88s/it]


Epoch 7 completed. Average Train Loss: 2.0189


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.34s/it]


Average Validation Loss: 1.3625


Epoch 8/10: 100%|██████████| 150/150 [04:40<00:00,  1.87s/it]


Epoch 8 completed. Average Train Loss: 1.9280


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.34s/it]


Average Validation Loss: 1.2900


Epoch 9/10: 100%|██████████| 150/150 [04:41<00:00,  1.88s/it]


Epoch 9 completed. Average Train Loss: 1.8566


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.34s/it]


Average Validation Loss: 1.2275


Epoch 10/10: 100%|██████████| 150/150 [04:40<00:00,  1.87s/it]


Epoch 10 completed. Average Train Loss: 1.7949


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.34s/it]

Average Validation Loss: 1.1789





In [15]:
import os

save_dir = "/content/drive/MyDrive/DI725 - Transformers and Attention-based Deep Networks/Term Project/siglip-t5-custom_vlm_finetuned"
os.makedirs(save_dir, exist_ok=True)

# Save tokenizer
tokenizer.save_pretrained(save_dir)

# Save merged language model (now a clean T5)
model.language_model.save_pretrained(save_dir)

# Save vision encoder and image processor
model.vision_model.save_pretrained(f"{save_dir}/vision_encoder")
image_processor.save_pretrained(f"{save_dir}/vision_encoder")

# Save vision projection layer
torch.save(model.vision_proj.state_dict(), f"{save_dir}/vision_proj.pt")

# Save config for reinitialization
import json
config = {
    "vision_encoder_path": "vision_encoder",
    "language_model_path": ".",
    "vision_proj_path": "vision_proj.pt",
    "vision_hidden_size": model.vision_proj.in_features,
    "language_hidden_size": model.vision_proj.out_features
}
with open(os.path.join(save_dir, "custom_vlm_config.json"), "w") as f:
    json.dump(config, f, indent=2)


In [16]:
# Set model to evaluation mode
model.eval()
max_new_tokens = 30
eos_token_id = tokenizer.eos_token_id
predictions = []

for i in tqdm(range(len(ds["test"])), desc="Generating captions"):
    # Preprocess image
    image = ds["test"][i]["image"]
    pixel_values = image_processor(image, return_tensors="pt")["pixel_values"].to(device)

    with torch.no_grad():
        # Vision encoding inside VLM forward
        vision_output = model.vision_model(pixel_values=pixel_values).last_hidden_state
        vision_embedding = vision_output.mean(dim=1)
        encoder_hidden_states = model.vision_proj(vision_embedding).unsqueeze(1)

        # Wrap as BaseModelOutput
        encoder_outputs = BaseModelOutput(last_hidden_state=encoder_hidden_states)

        # Prompt setup
        prompt = "caption en"
        tokenized = tokenizer(prompt, return_tensors="pt").to(device)
        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]

        # Remove <pad> if it appears as the first token
        if input_ids[0, 0] == tokenizer.pad_token_id:
            input_ids = input_ids[:, 1:]
            attention_mask = attention_mask[:, 1:]

        # Generate
        generated_ids = model.language_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_outputs=encoder_outputs,
            max_new_tokens=max_new_tokens,
            eos_token_id=eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=False
        )

        # Decode output
        decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        caption = decoded.replace(prompt, "").strip()
        caption = decoded.replace("<pad>", "").strip()
        predictions.append(caption)

Generating captions: 100%|██████████| 3150/3150 [54:16<00:00,  1.03s/it]


In [17]:
# Get the references
# Define a varible to store the reference captions
all_references = []
for i in range(len(ds["test"])):
    # Get the reference
    reference_per_sample = []
    for j in range(1,6):
        reference = ds["test"][i][f"caption_{j}"]
        reference_per_sample.append(reference)
        print(f"The reference caption_{j}:")
        print(repr(reference))

    print()
    all_references.append(reference_per_sample)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
'The sparse residential is on the grass next to some trees .'
The reference caption_4:
'Many trees are beside the sparse residential .'
The reference caption_5:
'A sparse residential area with some green trees and a brown building .'

The reference caption_1:
'There are blocks of residential areas between the jungles .'
The reference caption_2:
'In sparse residential areas there are several houses of different sizes and many green trees surrounded by grass .'
The reference caption_3:
'The sparse residential is on the grass next to some trees and a road .'
The reference caption_4:
'Several buildings are in sparse residential .'
The reference caption_5:
'A sparse residential area with three brown buildings and a black building .'

The reference caption_1:
'Two paths lead to a small black house .'
The reference caption_2:
'There are several houses of different sizes and several green trees in the sparse residential area, Sur

In [18]:
# Check the format of the reference captions
print(all_references[:5])

[['A gray plane on the runway and the lawn beside .', 'A grey plane is on the runway by the lawn .', 'There is an airplane on the runway with a large lawn by the runway .', 'A plane is parked on the runway next to the grass .', 'There is a plane on the runway beside the grass .'], ['Three small planes parked in a line on the airport and a big plane behind them .', 'There are four aircraft on the open ground, The largest of which is three times as large as the smallest one .', 'There are many planes of different sizes in a clearing .', 'Four planes are parked on the runway .', 'Four planes of different sizes were on the marked ground .'], ['A plane parked in a line on the airport with some marks .', 'A white plane was parked on the instruction line .', 'An airplane parked in an open area with many containers next to it .', 'A plane is parked on the open space .', 'There is 1 plane on the ground marked .'], ['A small plane and a big plane parked next to boarding bridges .', 'A white plan

In [19]:
# Check the format of the predicted captions. Each sample starts with a new line
print(predictions[:5])

['There are several runways on the leased land next to the road . There are several cars on the freeway . There are several cars', 'There were two airplanes in the clearing at the airport . Two airplanes were cleared by the airport . There were three airplanes parked', 'There are several airplanes in the airport . Two airplanes parked in the airport . There are many buildings . There are many buildings', 'An airplane is parked in the airport . There are many buildings parked in the airport . There are many planes parked in the', 'There were two planes of different sizes of different sizes in the airport . Two different planes . Two different sizes of different sizes .']


In [20]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

nltk.download('punkt_tab')
nltk.download('punkt')

# Tokenize references and predictions:
tokenized_refs = [
    [nltk.word_tokenize(ref.lower()) for ref in refs]
    for refs in all_references
]

tokenized_hyps = [nltk.word_tokenize(pred.lower()) for pred in predictions]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
tokenized_refs[0]

[['a',
  'gray',
  'plane',
  'on',
  'the',
  'runway',
  'and',
  'the',
  'lawn',
  'beside',
  '.'],
 ['a', 'grey', 'plane', 'is', 'on', 'the', 'runway', 'by', 'the', 'lawn', '.'],
 ['there',
  'is',
  'an',
  'airplane',
  'on',
  'the',
  'runway',
  'with',
  'a',
  'large',
  'lawn',
  'by',
  'the',
  'runway',
  '.'],
 ['a',
  'plane',
  'is',
  'parked',
  'on',
  'the',
  'runway',
  'next',
  'to',
  'the',
  'grass',
  '.'],
 ['there',
  'is',
  'a',
  'plane',
  'on',
  'the',
  'runway',
  'beside',
  'the',
  'grass',
  '.']]

In [22]:
# Sentence-level BLEU-2
smooth = SmoothingFunction().method1
for i, (refs_per_sample, hyp_tok) in enumerate(zip(tokenized_refs, tokenized_hyps)):
    scores = []
    for refs_tok in refs_per_sample:
        score = sentence_bleu(
            [refs_tok],
            hyp_tok,
            weights=(1/2, 1/2),
            smoothing_function=smooth
        )
        scores.append(score)
    max_score = max(scores)
    print(f"Example {i+1:2d} BLEU-2: {max_score*100:.2f}")

Example  1 BLEU-2: 17.32
Example  2 BLEU-2: 10.43
Example  3 BLEU-2: 13.46
Example  4 BLEU-2: 22.12
Example  5 BLEU-2: 32.40
Example  6 BLEU-2: 19.50
Example  7 BLEU-2: 22.36
Example  8 BLEU-2: 13.46
Example  9 BLEU-2: 21.60
Example 10 BLEU-2: 15.81
Example 11 BLEU-2: 16.64
Example 12 BLEU-2: 27.74
Example 13 BLEU-2: 19.74
Example 14 BLEU-2: 45.57
Example 15 BLEU-2: 15.28
Example 16 BLEU-2: 18.67
Example 17 BLEU-2: 25.16
Example 18 BLEU-2: 21.60
Example 19 BLEU-2: 16.33
Example 20 BLEU-2: 24.17
Example 21 BLEU-2: 24.08
Example 22 BLEU-2: 25.82
Example 23 BLEU-2: 11.94
Example 24 BLEU-2: 25.54
Example 25 BLEU-2: 22.19
Example 26 BLEU-2: 21.35
Example 27 BLEU-2: 23.53
Example 28 BLEU-2: 19.22
Example 29 BLEU-2: 19.61
Example 30 BLEU-2: 8.16
Example 31 BLEU-2: 14.68
Example 32 BLEU-2: 19.97
Example 33 BLEU-2: 13.93
Example 34 BLEU-2: 15.10
Example 35 BLEU-2: 18.06
Example 36 BLEU-2: 16.63
Example 37 BLEU-2: 22.19
Example 38 BLEU-2: 21.60
Example 39 BLEU-2: 14.06
Example 40 BLEU-2: 18.06
E

In [23]:
# Corpus-level BLEU-2
# corpus_bleu expects list-of-list-of-tokens refs, and list-of-tokens hyps
corpus_score = corpus_bleu(
    tokenized_refs,
    tokenized_hyps,
    weights=(1/2, 1/2),
    smoothing_function=smooth
)
print(f"\nCorpus BLEU-2: {corpus_score*100:.2f}")


Corpus BLEU-2: 37.53


In [24]:
# Sentence-level BLEU-3
smooth = SmoothingFunction().method1
for i, (refs_per_sample, hyp_tok) in enumerate(zip(tokenized_refs, tokenized_hyps)):
    scores = []
    for refs_tok in refs_per_sample:
        score = sentence_bleu(
            [refs_tok],
            hyp_tok,
            weights=(1/3, 1/3, 1/3),
            smoothing_function=smooth
        )
        scores.append(score)
    max_score = max(scores)
    print(f"Example {i+1:2d} BLEU-3: {max_score*100:.2f}")

Example  1 BLEU-3: 10.93
Example  2 BLEU-3: 3.67
Example  3 BLEU-3: 4.35
Example  4 BLEU-3: 16.44
Example  5 BLEU-3: 28.37
Example  6 BLEU-3: 15.12
Example  7 BLEU-3: 12.95
Example  8 BLEU-3: 4.35
Example  9 BLEU-3: 18.26
Example 10 BLEU-3: 12.95
Example 11 BLEU-3: 10.49
Example 12 BLEU-3: 21.26
Example 13 BLEU-3: 15.74
Example 14 BLEU-3: 37.31
Example 15 BLEU-3: 8.98
Example 16 BLEU-3: 12.74
Example 17 BLEU-3: 20.78
Example 18 BLEU-3: 18.26
Example 19 BLEU-3: 4.88
Example 20 BLEU-3: 18.01
Example 21 BLEU-3: 19.05
Example 22 BLEU-3: 20.56
Example 23 BLEU-3: 8.29
Example 24 BLEU-3: 16.64
Example 25 BLEU-3: 16.01
Example 26 BLEU-3: 15.39
Example 27 BLEU-3: 16.65
Example 28 BLEU-3: 11.54
Example 29 BLEU-3: 11.54
Example 30 BLEU-3: 3.07
Example 31 BLEU-3: 9.65
Example 32 BLEU-3: 11.68
Example 33 BLEU-3: 4.00
Example 34 BLEU-3: 4.50
Example 35 BLEU-3: 14.37
Example 36 BLEU-3: 5.09
Example 37 BLEU-3: 12.71
Example 38 BLEU-3: 12.66
Example 39 BLEU-3: 9.80
Example 40 BLEU-3: 11.40
Example 41 B

In [25]:
# Corpus-level BLEU-3
# corpus_bleu expects list-of-list-of-tokens refs, and list-of-tokens hyps
corpus_score = corpus_bleu(
    tokenized_refs,
    tokenized_hyps,
    weights=(1/3, 1/3, 1/3),
    smoothing_function=smooth
)
print(f"\nCorpus BLEU-3: {corpus_score*100:.2f}")


Corpus BLEU-3: 29.04


In [26]:
# Sentence-level BLEU-4
smooth = SmoothingFunction().method1
for i, (refs_per_sample, hyp_tok) in enumerate(zip(tokenized_refs, tokenized_hyps)):
    scores = []
    for refs_tok in refs_per_sample:
        score = sentence_bleu(
            [refs_tok],
            hyp_tok,
            weights=(1/4, 1/4, 1/4, 1/4),
            smoothing_function=smooth
        )
        scores.append(score)
    max_score = max(scores)
    print(f"Example {i+1:2d} BLEU-4: {max_score*100:.2f}")

Example  1 BLEU-4: 4.93
Example  2 BLEU-4: 2.20
Example  3 BLEU-4: 2.50
Example  4 BLEU-4: 12.06
Example  5 BLEU-4: 23.62
Example  6 BLEU-4: 11.33
Example  7 BLEU-4: 5.61
Example  8 BLEU-4: 2.50
Example  9 BLEU-4: 15.34
Example 10 BLEU-4: 9.97
Example 11 BLEU-4: 4.73
Example 12 BLEU-4: 14.30
Example 13 BLEU-4: 11.97
Example 14 BLEU-4: 28.69
Example 15 BLEU-4: 4.26
Example 16 BLEU-4: 6.02
Example 17 BLEU-4: 18.50
Example 18 BLEU-4: 15.34
Example 19 BLEU-4: 2.69
Example 20 BLEU-4: 13.24
Example 21 BLEU-4: 16.02
Example 22 BLEU-4: 16.77
Example 23 BLEU-4: 3.93
Example 24 BLEU-4: 6.85
Example 25 BLEU-4: 11.56
Example 26 BLEU-4: 11.10
Example 27 BLEU-4: 6.69
Example 28 BLEU-4: 5.09
Example 29 BLEU-4: 5.03
Example 30 BLEU-4: 1.91
Example 31 BLEU-4: 4.44
Example 32 BLEU-4: 5.08
Example 33 BLEU-4: 2.17
Example 34 BLEU-4: 2.48
Example 35 BLEU-4: 10.90
Example 36 BLEU-4: 2.85
Example 37 BLEU-4: 5.46
Example 38 BLEU-4: 5.51
Example 39 BLEU-4: 4.66
Example 40 BLEU-4: 5.15
Example 41 BLEU-4: 5.78
E

In [27]:
# Corpus-level BLEU-4
# corpus_bleu expects list-of-list-of-tokens refs, and list-of-tokens hyps
corpus_score = corpus_bleu(
    tokenized_refs,
    tokenized_hyps,
    weights=(1/4, 1/4, 1/4, 1/4),
    smoothing_function=smooth
)
print(f"\nCorpus BLEU-4: {corpus_score*100:.2f}")


Corpus BLEU-4: 22.73


In [28]:
# Go on to calculate ROUGE scores
!pip install rouge-score



In [29]:
import nltk
from collections import Counter

# Ensure tokenizer
nltk.download('punkt', quiet=True)

def rouge_n(ref: str, hyp: str, n: int = 4):
    ref_toks = nltk.word_tokenize(ref.lower())
    hyp_toks = nltk.word_tokenize(hyp.lower())
    ref_ngrams = list(nltk.ngrams(ref_toks, n))
    hyp_ngrams = list(nltk.ngrams(hyp_toks, n))
    ref_counts = Counter(ref_ngrams)
    hyp_counts = Counter(hyp_ngrams)
    overlap = sum(min(ref_counts[ng], hyp_counts[ng]) for ng in ref_counts)
    recall = overlap / max(len(ref_ngrams), 1)
    precision = overlap / max(len(hyp_ngrams), 1)
    f1 = 2 * recall * precision / (recall + precision + 1e-8)
    return (recall, precision, f1)

# Compute ROUGE-2
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    recalls_per_sample, precisions_per_sample, f1s_per_sample = [], [], []
    for ref in refs:
        r, p, f = rouge_n(ref, pred, n=2)
        recalls_per_sample.append(r)
        precisions_per_sample.append(p)
        f1s_per_sample.append(f)

    max_score = max(f1s_per_sample)
    max_index = f1s_per_sample.index(max_score)
    all_recalls.append(recalls_per_sample[max_index])
    all_precisions.append(precisions_per_sample[max_index])
    all_f1s.append(f1s_per_sample[max_index])
    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-2 Recall:    {recalls_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-2 Precision: {precisions_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-2 F1:        {f1s_per_sample[max_index] * 100:.2f}%\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   ROUGE-2 F1:        47.06%

REF:  'The roundabout is next to grass and parking lot .'
HYP:  'The roundabout is on the grass next to some trees . There are many trees on the grass . There are many trees on the grass'
   ROUGE-2 Recall:    33.33%
   ROUGE-2 Precision: 12.00%
   ROUGE-2 F1:        17.65%

REF:  'The roundabout is on the grass next to trees and buildings .'
HYP:  'The roundabout is on the grass next to trees and trees . There are many buildings and trees . There are many trees on the grass'
   ROUGE-2 Recall:    81.82%
   ROUGE-2 Precision: 36.00%
   ROUGE-2 F1:        50.00%

REF:  'The roundabout is on the bare land next to buildings and trees .'
HYP:  'The soaring building is on the grass next to some trees and trees . The road is surrounded by trees and trees . The road'
   ROUGE-2 Recall:    41.67%
   ROUGE-2 Precision: 20.83%
   ROUGE-2 F1:        27.78%

REF:  'The roundabout is on the grass next to 

In [30]:
# Report overall averages
avg_r = sum(all_recalls) / len(all_recalls)
avg_p = sum(all_precisions) / len(all_precisions)
avg_f = sum(all_f1s) / len(all_f1s)
print("=== AVERAGE ROUGE-2 METRICS ===")
print(f"Recall:    {avg_r*100:.2f}")
print(f"Precision: {avg_p*100:.2f}")
print(f"F1:        {avg_f*100:.2f}")

=== AVERAGE ROUGE-2 METRICS ===
Recall:    47.23
Precision: 22.40
F1:        29.56


In [31]:
# Compute ROUGE-3
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    recalls_per_sample, precisions_per_sample, f1s_per_sample = [], [], []
    for ref in refs:
        r, p, f = rouge_n(ref, pred, n=3)
        recalls_per_sample.append(r)
        precisions_per_sample.append(p)
        f1s_per_sample.append(f)

    max_score = max(f1s_per_sample)
    max_index = f1s_per_sample.index(max_score)
    all_recalls.append(recalls_per_sample[max_index])
    all_precisions.append(precisions_per_sample[max_index])
    all_f1s.append(f1s_per_sample[max_index])
    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-3 Recall:    {recalls_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-3 Precision: {precisions_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-3 F1:        {f1s_per_sample[max_index] * 100:.2f}%\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   ROUGE-3 F1:        37.50%

REF:  'The roundabout is next to grass and parking lot .'
HYP:  'The roundabout is on the grass next to some trees . There are many trees on the grass . There are many trees on the grass'
   ROUGE-3 Recall:    12.50%
   ROUGE-3 Precision: 4.17%
   ROUGE-3 F1:        6.25%

REF:  'The roundabout is on the grass next to trees and buildings .'
HYP:  'The roundabout is on the grass next to trees and trees . There are many buildings and trees . There are many trees on the grass'
   ROUGE-3 Recall:    80.00%
   ROUGE-3 Precision: 33.33%
   ROUGE-3 F1:        47.06%

REF:  'The roundabout is on the bare land next to buildings and trees .'
HYP:  'The soaring building is on the grass next to some trees and trees . The road is surrounded by trees and trees . The road'
   ROUGE-3 Recall:    18.18%
   ROUGE-3 Precision: 8.70%
   ROUGE-3 F1:        11.76%

REF:  'The roundabout is on the grass next to tre

In [32]:
# Report overall averages
avg_r = sum(all_recalls) / len(all_recalls)
avg_p = sum(all_precisions) / len(all_precisions)
avg_f = sum(all_f1s) / len(all_f1s)
print("=== AVERAGE ROUGE-3 METRICS ===")
print(f"Recall:    {avg_r*100:.2f}")
print(f"Precision: {avg_p*100:.2f}")
print(f"F1:        {avg_f*100:.2f}")

=== AVERAGE ROUGE-3 METRICS ===
Recall:    34.11
Precision: 14.85
F1:        20.10


In [33]:
# Compute ROUGE-4
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    recalls_per_sample, precisions_per_sample, f1s_per_sample = [], [], []
    for ref in refs:
        r, p, f = rouge_n(ref, pred, n=4)
        recalls_per_sample.append(r)
        precisions_per_sample.append(p)
        f1s_per_sample.append(f)

    max_score = max(f1s_per_sample)
    max_index = f1s_per_sample.index(max_score)
    all_recalls.append(recalls_per_sample[max_index])
    all_precisions.append(precisions_per_sample[max_index])
    all_f1s.append(f1s_per_sample[max_index])
    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-4 Recall:    {recalls_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-4 Precision: {precisions_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-4 F1:        {f1s_per_sample[max_index] * 100:.2f}%\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   ROUGE-4 F1:        33.33%

REF:  'The roundabout connects three roads and a sculpture is in the middle of the roundabout .'
HYP:  'The roundabout is on the grass next to some trees . There are many trees on the grass . There are many trees on the grass'
   ROUGE-4 Recall:    0.00%
   ROUGE-4 Precision: 0.00%
   ROUGE-4 F1:        0.00%

REF:  'The roundabout is on the grass next to trees and buildings .'
HYP:  'The roundabout is on the grass next to trees and trees . There are many buildings and trees . There are many trees on the grass'
   ROUGE-4 Recall:    77.78%
   ROUGE-4 Precision: 30.43%
   ROUGE-4 F1:        43.75%

REF:  'The roundabout connects four roads and a building is next to the roundabout .'
HYP:  'The soaring building is on the grass next to some trees and trees . The road is surrounded by trees and trees . The road'
   ROUGE-4 Recall:    0.00%
   ROUGE-4 Precision: 0.00%
   ROUGE-4 F1:        0.00%



In [34]:
# Report overall averages
avg_r = sum(all_recalls) / len(all_recalls)
avg_p = sum(all_precisions) / len(all_precisions)
avg_f = sum(all_f1s) / len(all_f1s)
print("=== AVERAGE ROUGE-4 METRICS ===")
print(f"Recall:    {avg_r*100:.2f}")
print(f"Precision: {avg_p*100:.2f}")
print(f"F1:        {avg_f*100:.2f}")

=== AVERAGE ROUGE-4 METRICS ===
Recall:    24.80
Precision: 10.08
F1:        13.90


In [35]:
import os

save_dir = "/content/drive/MyDrive/DI725 - Transformers and Attention-based Deep Networks/Term Project/siglip-t5-custom_vlm_finetuned"
os.makedirs(save_dir, exist_ok=True)

# Save tokenizer
tokenizer.save_pretrained(save_dir)

# Save merged language model (now a clean T5)
model.language_model.save_pretrained(save_dir)

# Save vision encoder and image processor
model.vision_model.save_pretrained(f"{save_dir}/vision_encoder")
image_processor.save_pretrained(f"{save_dir}/vision_encoder")

# Save vision projection layer
torch.save(model.vision_proj.state_dict(), f"{save_dir}/vision_proj.pt")

# Save config for reinitialization
import json
config = {
    "vision_encoder_path": "vision_encoder",
    "language_model_path": ".",
    "vision_proj_path": "vision_proj.pt",
    "vision_hidden_size": model.vision_proj.in_features,
    "language_hidden_size": model.vision_proj.out_features
}
with open(os.path.join(save_dir, "custom_vlm_config.json"), "w") as f:
    json.dump(config, f, indent=2)


In [36]:
print("DONE")

DONE
