In [1]:
!pip install sacrebleu
!pip install gTTS
!pip install nltk
!pip install tqdm
!pip install gradio

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1
Collecting gTTS
  Downloading gTTS-2.

In [2]:
# Mount Google Drive inside the Colab runtime. The caption file, image folder
# and checkpoint paths used later in this notebook all live under
# /content/drive/MyDrive, so this cell has to run before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import os
import random
import nltk
from PIL import Image
import sacrebleu
from tqdm import tqdm

# Download the 'punkt' and 'punkt_tab' NLTK packages.
# (presumably both are needed by word_tokenize, which Vocabulary.tokenizer_eng
# calls below — TODO confirm which one the installed NLTK version requires)
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

# Pick the compute device once: CUDA when torch reports it as available,
# otherwise CPU. `device` is reused by the training and inference cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Using device: cpu


In [4]:
# Vocabulary class (as in the repository)
class Vocabulary:
    """Bidirectional token <-> index mapping built from caption text.

    Indices 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Every other token receives the next free index
    the moment its running count reaches ``freq_threshold``.

    Attributes:
        freq_threshold: minimum number of occurrences before a token is kept.
        itos: dict mapping index -> token.
        stoi: dict mapping token -> index (the inverse of ``itos``).
    """

    def __init__(self, freq_threshold):
        self.freq_threshold = freq_threshold
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {v: k for k, v in self.itos.items()}

    def __len__(self):
        """Return the number of known tokens, special tokens included."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Split ``text`` with NLTK's ``word_tokenize`` and lower-case every token."""
        return [tok.lower() for tok in word_tokenize(text)]

    def build_vocabulary(self, sentence_list):
        """Add every sufficiently frequent token of ``sentence_list`` to the mapping.

        Counts are local to one call. New tokens are appended after the
        entries that already exist, so calling this method again extends the
        vocabulary instead of overwriting indices handed out earlier.

        Args:
            sentence_list: iterable of raw caption strings.
        """
        frequencies = {}
        # Continue after the last assigned index rather than a hard-coded 4,
        # otherwise a second call would clobber existing ``itos`` entries.
        idx = len(self.itos)
        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1
                # ``>=`` together with the membership test admits a word
                # exactly once, the first time it reaches the threshold. It
                # also makes a threshold <= 0 mean "keep every word" instead
                # of silently keeping none.
                if frequencies[word] >= self.freq_threshold and word not in self.stoi:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        """Convert ``text`` to a list of indices; unknown tokens map to ``<UNK>``."""
        unk_idx = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_idx) for token in self.tokenizer_eng(text)]


# Custom Dataset with train/val/test splitting (for .txt file with CSV format)
class FlickrDataset(Dataset):
    """Image/caption dataset with a deterministic train/val/test split.

    The captions file is parsed into ``img2caps`` (image name -> list of
    captions). The image names are then shuffled with a fixed seed and sliced
    according to ``split_ratio``, so the three splits are disjoint as long as
    every instance is built from the same file.
    """

    def __init__(self, root_dir, captions_file, vocabulary, transform=None, split="train", split_ratio=(0.80, 0.08, 0.12)):
        """
        Args:
            root_dir: Directory with images.
            captions_file: Path to the .txt file with captions.
                           Expected format:
                           image,caption
                           1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .
                           ...
            vocabulary: A Vocabulary object.
            transform: Image transformations.
            split: One of 'train', 'val', or 'test'.
            split_ratio: Tuple for train/val/test splits. Only the first two
                         entries are read; the test split is whatever remains.

        Raises:
            ValueError: if ``split`` is not 'train', 'val' or 'test'.
        """
        # Validate first so a typo fails before any file parsing happens.
        # ValueError is an Exception subclass, so existing handlers still match.
        if split not in ("train", "val", "test"):
            raise ValueError("split must be one of 'train', 'val', or 'test'")

        self.root_dir = root_dir
        self.transform = transform
        self.vocabulary = vocabulary
        self.split = split

        with open(captions_file, 'r') as f:
            lines = f.readlines()

        # Skip header if present (assuming first line starts with "image").
        # The ``lines and`` guard keeps an empty file from raising IndexError.
        if lines and lines[0].strip().lower().startswith("image"):
            lines = lines[1:]

        # Map images to their captions, keeping first-seen image order.
        self.img2caps = {}
        for line in lines:
            line = line.strip()
            if not line:
                continue
            parts = line.split(',', 1)  # split at the first comma only; captions may contain commas
            if len(parts) < 2:
                continue
            self.img2caps.setdefault(parts[0].strip(), []).append(parts[1].strip())
        self.imgs = list(self.img2caps.keys())
        print("Total images found:", len(self.imgs))

        # A private generator seeded with 42 produces the same permutation as
        # seeding the module-level generator with 42, but does not reset the
        # global ``random`` state each time a dataset is constructed.
        random.Random(42).shuffle(self.imgs)
        total = len(self.imgs)
        train_end = int(split_ratio[0] * total)
        val_end = train_end + int(split_ratio[1] * total)

        if split == "train":
            self.imgs = self.imgs[:train_end]
        elif split == "val":
            self.imgs = self.imgs[train_end:val_end]
        else:  # "test"
            self.imgs = self.imgs[val_end:]

    def __len__(self):
        """Return the number of images in this split."""
        return len(self.imgs)

    def __getitem__(self, index):
        """Return ``(image, caption_ids)`` for the image at ``index``.

        The caption is wrapped in ``<SOS>`` / ``<EOS>`` indices and returned
        as a 1-D tensor. ``image`` is the RGB PIL image, passed through
        ``transform`` when one was given.
        """
        img_id = self.imgs[index]
        caps = self.img2caps[img_id]
        # For training, pick a random caption; for validation/testing, use the first caption
        caption = random.choice(caps) if self.split == "train" else caps[0]
        img_path = os.path.join(self.root_dir, img_id)
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        # Add start and end tokens
        numericalized_caption = [self.vocabulary.stoi["<SOS>"]]
        numericalized_caption += self.vocabulary.numericalize(caption)
        numericalized_caption.append(self.vocabulary.stoi["<EOS>"])
        return image, torch.tensor(numericalized_caption)

# Collate function to pad sequences
def collate_fn(batch):
    """Merge a list of ``(image, caption)`` samples into one padded batch.

    Args:
        batch: sequence of ``(image_tensor, caption_tensor)`` pairs. The
            images must share one shape; captions may differ in length.

    Returns:
        ``(images, captions)``: the images stacked along a new leading
        dimension, and the captions right-padded with 0 to the longest
        caption in the batch (batch-first layout).
    """
    image_tuple, caption_tuple = zip(*batch)
    stacked_images = torch.stack(image_tuple, dim=0)
    padded_captions = pad_sequence(caption_tuple, batch_first=True, padding_value=0)
    return stacked_images, padded_captions

# InceptionV3 expects 299x299 images.
# Training pipeline: resize, then light augmentation (random horizontal flip,
# rotation within 10 degrees, mild colour jitter), then tensor conversion and
# per-channel normalisation (x - 0.5) / 0.5.
# NOTE(review): any checkpoint trained with these values expects the same
# normalisation at inference time — keep it in sync with val_test_transform.
train_transform = transforms.Compose([
    transforms.Resize((299, 299)),
    transforms.RandomHorizontalFlip(p=0.5),           # augmentation: mirror half of the images
    transforms.RandomRotation(10),                    # augmentation: rotate by up to 10 degrees
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),  # augmentation: small colour changes
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Validation/test pipeline: same resize and normalisation as training, with no
# random augmentation. The inference cell reuses it as well.
val_test_transform = transforms.Compose([
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
# Build vocabulary from the captions file (skip header if present)
def build_vocab(captions_file, freq_threshold):
    """Build a ``Vocabulary`` from the captions of an ``image,caption`` text file.

    Args:
        captions_file: path to the captions file. A first line starting with
            "image" (case-insensitive) is treated as a header and skipped.
        freq_threshold: forwarded to ``Vocabulary``.

    Returns:
        A ``Vocabulary`` whose ``build_vocabulary`` was fed every caption in
        file order. Blank lines and lines without a comma are ignored. An
        empty file yields a vocabulary built from no captions.
    """
    with open(captions_file, 'r') as f:
        lines = f.readlines()
    # ``lines and`` guards the header probe so an empty file does not raise
    # IndexError.
    if lines and lines[0].strip().lower().startswith("image"):
        lines = lines[1:]
    captions = []
    for line in lines:
        # Split at the first comma only: the caption text may contain commas.
        _, comma, caption = line.strip().partition(',')
        if not comma:
            continue  # blank line, or no image/caption separator
        captions.append(caption.strip())
    vocab = Vocabulary(freq_threshold)
    vocab.build_vocabulary(captions)
    return vocab

# Update these paths to your dataset locations.
# NOTE(review): both are absolute Google Drive paths, so this cell only works
# after the Drive mount cell has run.
captions_path = "/content/drive/MyDrive/Flickr/captions.txt"
images_root = "/content/drive/MyDrive/Flickr/Images"

# Build vocabulary (adjust frequency threshold as needed).
# The vocabulary is built from every caption in the file, i.e. before the
# train/val/test split is applied.
vocab = build_vocab(captions_path, freq_threshold=5)
print("Vocabulary size:", len(vocab))
# 0 is passed to every DataLoader below as its num_workers argument.
num_workers = 0

# Create dataset objects for train, validation, and test splits.
# Each constructor re-reads and re-parses the captions file; only the train
# split uses the augmenting transform.
train_dataset = FlickrDataset(root_dir=images_root, captions_file=captions_path, vocabulary=vocab, transform=train_transform, split="train")
val_dataset   = FlickrDataset(root_dir=images_root, captions_file=captions_path, vocabulary=vocab, transform=val_test_transform, split="val")
test_dataset  = FlickrDataset(root_dir=images_root, captions_file=captions_path, vocabulary=vocab, transform=val_test_transform, split="test")

# Create DataLoaders. Only the training loader shuffles; collate_fn pads the
# variable-length captions of each batch.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn, num_workers=num_workers)
val_loader   = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn, num_workers=num_workers)
test_loader  = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn, num_workers=num_workers)

print("Train, val, test sizes:", len(train_dataset), len(val_dataset), len(test_dataset))

Vocabulary size: 3006
Total images found: 8091
Total images found: 8091
Total images found: 8091
Train, val, test sizes: 6472 647 972


In [5]:
import torch
import torch.nn as nn
from torchvision.models import inception_v3, Inception_V3_Weights

from torchvision.models import inception_v3, Inception_V3_Weights

class EncoderCNN(nn.Module):
    """InceptionV3 feature extractor followed by a trainable linear projection.

    The convolutional trunk (``Conv2d_1a_3x3`` .. ``Mixed_7c``) is taken from
    a pretrained torchvision InceptionV3 and chained in an ``nn.Sequential``,
    so the model's own ``forward`` (and its avgpool / fc / auxiliary head) is
    never used. The trunk is frozen; only ``self.linear`` has parameters that
    require gradients.

    Args:
        embed_size: size each 2048-channel spatial feature is projected to.
    """

    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        # Instantiate InceptionV3 with default weights and aux_logits=True (as required by the weights API)
        inception = inception_v3(weights=Inception_V3_Weights.DEFAULT, aux_logits=True)
        # Manually assemble the layers up to Mixed_7c using the correct attribute names.
        # avgpool / fc are deliberately not included, so there is no need to
        # overwrite them on the temporary `inception` object.
        self.features = nn.Sequential(
            inception.Conv2d_1a_3x3,
            inception.Conv2d_2a_3x3,
            inception.Conv2d_2b_3x3,
            inception.maxpool1,             # previously MaxPool_3a_3x3
            inception.Conv2d_3b_1x1,
            inception.Conv2d_4a_3x3,
            inception.maxpool2,             # previously MaxPool_4a_3x3
            inception.Mixed_5b,
            inception.Mixed_5c,
            inception.Mixed_5d,
            inception.Mixed_6a,
            inception.Mixed_6b,
            inception.Mixed_6c,
            inception.Mixed_6d,
            inception.Mixed_6e,
            inception.Mixed_7a,
            inception.Mixed_7b,
            inception.Mixed_7c
        )
        # The training cell hands only `encoder.linear` (plus the decoder) to
        # the optimizer, so the trunk weights are never updated. Turning off
        # their gradients makes that explicit and stops autograd from
        # recording the whole backbone on every training step. Forward
        # outputs and state_dict keys are unchanged.
        # NOTE(review): in train() mode the trunk's BatchNorm layers still use
        # and update batch statistics, exactly as before this change.
        self.features.requires_grad_(False)
        # Linear projection: project 2048-dim features to embed_size for each spatial location.
        self.linear = nn.Linear(2048, embed_size)

    def forward(self, images):
        """Encode a batch of images into per-location feature vectors.

        Args:
            images: float tensor, expected (batch, 3, 299, 299).

        Returns:
            Tensor of shape (batch, num_pixels, embed_size) where
            num_pixels = H * W of the final feature map
            (expected 8 * 8 for 299x299 inputs).
        """
        x = self.features(images)  # Expected shape: (batch, 2048, H, W), e.g., (batch, 2048, 8, 8)
        batch_size, C, H, W = x.size()
        # Collapse the spatial grid and move channels last: (batch, H*W, 2048).
        x = x.view(batch_size, C, -1).permute(0, 2, 1)
        # Project each spatial feature into the embedding space.
        x = self.linear(x)  # (batch, num_pixels, embed_size)
        return x


# ----- Attention Module -----
class Attention(nn.Module):
    """Additive attention over the spatial locations of the encoder output.

    Encoder features and the decoder hidden state are each projected to
    ``attention_dim``, summed, passed through ReLU and reduced to one score
    per location. A softmax over locations turns the scores into weights.
    """

    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        super().__init__()
        # Sub-module names are part of the checkpoint's state_dict keys.
        self.encoder_att = nn.Linear(encoder_dim, attention_dim)  # projects encoder features
        self.decoder_att = nn.Linear(decoder_dim, attention_dim)  # projects decoder hidden state
        self.full_att = nn.Linear(attention_dim, 1)               # one scalar score per location
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, encoder_out, decoder_hidden):
        """Compute the attention-weighted context vector.

        Args:
            encoder_out: (batch, num_pixels, encoder_dim)
            decoder_hidden: (batch, decoder_dim)

        Returns:
            ``(context, alpha)`` with context of shape (batch, encoder_dim)
            and alpha, the attention weights, of shape (batch, num_pixels).
        """
        pixel_proj = self.encoder_att(encoder_out)                # (batch, num_pixels, attention_dim)
        state_proj = self.decoder_att(decoder_hidden)             # (batch, attention_dim)
        combined = self.relu(pixel_proj + state_proj.unsqueeze(1))
        scores = self.full_att(combined)                          # (batch, num_pixels, 1)
        alpha = self.softmax(scores.squeeze(2))                   # (batch, num_pixels)
        weighted_features = encoder_out * alpha.unsqueeze(2)
        return weighted_features.sum(dim=1), alpha

# ----- Decoder with Attention and Beam Search -----
class DecoderRNNWithAttention(nn.Module):
    """LSTM caption decoder that attends over the encoder features at every step.

    At each time step the previous LSTM hidden state (zeros on the first
    step) queries the attention module. The resulting context vector is
    concatenated with the current word embedding and fed to the LSTM, and a
    linear layer maps the LSTM output to vocabulary logits.
    """

    def __init__(self, embed_size, decoder_dim, vocab_size, num_layers, attention_dim, sos_idx=1, eos_idx=2):
        """
        embed_size: dimension of word embeddings and encoder output projection.
        decoder_dim: LSTM hidden size.
        vocab_size: number of tokens.
        num_layers: number of LSTM layers.
        attention_dim: dimension used in attention module.
        sos_idx: index of the start-of-sequence token used by sample()/sample_beam().
        eos_idx: index of the end-of-sequence token used by sample()/sample_beam().

        The defaults for sos_idx / eos_idx are the indices that ``Vocabulary``
        reserves for "<SOS>" and "<EOS>". Storing them here means decoding no
        longer depends on a notebook-level ``vocab`` variable being defined.
        """
        super(DecoderRNNWithAttention, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(encoder_dim=embed_size, decoder_dim=decoder_dim, attention_dim=attention_dim)
        # Input dimension to LSTM: word embedding (embed_size) + context vector (embed_size)
        self.lstm = nn.LSTM(embed_size + embed_size, decoder_dim, num_layers, batch_first=True)
        self.linear = nn.Linear(decoder_dim, vocab_size)
        # NOTE(review): this dropout layer is created but never applied in any
        # method of this class. It is kept so the module structure is unchanged.
        self.dropout = nn.Dropout(0.5)
        self.decoder_dim = decoder_dim
        # Plain ints: not parameters or buffers, so state_dict keys are unchanged.
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx

    def _step(self, word_emb, encoder_out, state):
        """Run one attention + LSTM step shared by training and decoding.

        Args:
            word_emb: (batch, embed_size) embedding of the current input token.
            encoder_out: (batch, num_pixels, embed_size) encoder features.
            state: ``None`` on the first step, otherwise the ``(hidden, cell)``
                tuple returned by the previous step.

        Returns:
            ``(logits, state)``: logits of shape (batch, vocab_size) and the
            updated LSTM state.
        """
        if state is None:
            # No hidden state yet: attend with an all-zero query.
            query = torch.zeros(encoder_out.size(0), self.decoder_dim, device=encoder_out.device)
        else:
            query = state[0][-1]  # hidden state of the last LSTM layer, (batch, decoder_dim)
        context, _ = self.attention(encoder_out, query)
        lstm_input = torch.cat([word_emb, context], dim=1).unsqueeze(1)  # (batch, 1, 2*embed_size)
        output, state = self.lstm(lstm_input, state)
        return self.linear(output.squeeze(1)), state

    def forward(self, encoder_out, captions):
        """
        Teacher forcing: captions is ground-truth. Attention is recomputed
        step by step (not vectorised over time).

        Args:
            encoder_out: (batch, num_pixels, embed_size)
            captions: (batch, seq_len) token indices. The last column is
                dropped from the inputs, so step t predicts captions[:, t + 1].

        Returns:
            Logits of shape (batch, seq_len - 1, vocab_size).
        """
        embeddings = self.embed(captions[:, :-1])  # (batch, seq_len - 1, embed_size)
        state = None
        outputs = []
        for t in range(embeddings.size(1)):
            logits, state = self._step(embeddings[:, t, :], encoder_out, state)
            outputs.append(logits)
        return torch.stack(outputs, dim=1)

    def sample(self, encoder_out, max_len=20):
        """Greedy decoding with attention (for batch size = 1).

        Returns the generated token ids as a list of ints. Generation stops
        after ``max_len`` tokens or right after ``eos_idx`` is produced (the
        EOS id is included in the result).
        """
        batch_size = encoder_out.size(0)
        assert batch_size == 1, "sample() supports only batch_size = 1"
        sampled_ids = []
        state = None
        # Start with <SOS> token.
        token = torch.tensor([self.sos_idx], device=encoder_out.device)
        for _ in range(max_len):
            logits, state = self._step(self.embed(token), encoder_out, state)
            token = logits.argmax(1)
            token_id = token.item()
            sampled_ids.append(token_id)
            if token_id == self.eos_idx:
                break
        return sampled_ids

    def sample_beam(self, encoder_out, beam_size=3, max_len=20):
        """
        Beam search decoding with attention for batch_size = 1.
        Returns the best sequence as a list of token ids.

        Beams are ranked by summed log-probability (no length normalisation).
        A beam that has produced ``eos_idx`` is carried forward unchanged.
        ``beam_size`` is coerced to int and clamped to [1, vocab_size] so
        that ``topk`` cannot be asked for more entries than exist.
        """
        assert encoder_out.size(0) == 1, "sample_beam() supports only batch_size = 1"
        vocab_size = self.linear.out_features
        k = max(1, min(int(beam_size), vocab_size))

        # First step: feed <SOS> with no previous state and seed the beams
        # with the k most likely first tokens.
        start_token = torch.tensor([self.sos_idx], device=encoder_out.device)
        logits, state = self._step(self.embed(start_token), encoder_out, None)
        topk_log_probs, topk_indices = torch.log_softmax(logits, dim=1).topk(k)
        beams = [
            (topk_indices[0, i].unsqueeze(0), topk_log_probs[0, i].item(), state)
            for i in range(k)
        ]

        for _ in range(max_len - 1):
            candidates = []
            for seq, score, beam_state in beams:
                if seq[-1].item() == self.eos_idx:
                    # Finished hypothesis: keep it in the running as-is.
                    candidates.append((seq, score, beam_state))
                    continue
                emb = self.embed(seq[-1].unsqueeze(0))  # (1, embed_size)
                logits, new_state = self._step(emb, encoder_out, beam_state)
                topk_log_probs, topk_indices = torch.log_softmax(logits, dim=1).topk(k)
                for i in range(k):
                    new_seq = torch.cat([seq, topk_indices[0, i].unsqueeze(0)])
                    new_score = score + topk_log_probs[0, i].item()
                    candidates.append((new_seq, new_score, new_state))
            beams = sorted(candidates, key=lambda tup: tup[1], reverse=True)[:k]
            if all(seq[-1].item() == self.eos_idx for seq, _, _ in beams):
                break
        best_seq = beams[0][0]
        return best_seq.tolist()

In [29]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

def evaluate(encoder, decoder, criterion, data_loader, device):
    """Return the mean per-batch loss of the encoder/decoder pair on ``data_loader``.

    Both models are switched to eval mode (and left in it) and gradients are
    disabled for the whole pass. The target for each decoder output is the
    caption shifted left by one token (``captions[:, 1:]``).

    Args:
        encoder: module mapping an image batch to features.
        decoder: module called as ``decoder(features, captions)`` and
            returning logits of shape (batch, steps, vocab).
        criterion: loss called with flattened logits and flattened targets.
        data_loader: sized iterable of ``(images, captions)`` batches.
        device: device the batches are moved to.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    for module in (encoder, decoder):
        module.eval()

    loss_sum = 0.0
    with torch.no_grad():
        progress = tqdm(data_loader, desc="Evaluating", leave=False)
        for batch_images, batch_captions in progress:
            batch_images = batch_images.to(device)
            batch_captions = batch_captions.to(device)
            logits = decoder(encoder(batch_images), batch_captions)
            flat_logits = logits.reshape(-1, logits.shape[2])
            # Targets are the captions without their first token.
            flat_targets = batch_captions[:, 1:].reshape(-1)
            batch_loss = criterion(flat_logits, flat_targets).item()
            loss_sum += batch_loss
            progress.set_postfix({"val_loss": batch_loss})
    return loss_sum / len(data_loader)

def train(encoder, decoder, criterion, optimizer, train_loader, val_loader, num_epochs, device, save_path="Downloads/Flickr/best_model.pth"):
    """Train the captioning model and checkpoint the best validation epoch.

    For every epoch: one teacher-forced pass over ``train_loader``, one
    evaluation pass over ``val_loader``, a ``ReduceLROnPlateau`` step on the
    validation loss (factor 0.5, patience 2), and a checkpoint written to
    ``save_path`` whenever the validation loss improves on the best so far.

    Args:
        encoder, decoder: the models; moved to ``device`` in place.
        criterion: loss called with flattened logits and flattened targets.
        optimizer: optimizer already holding the parameters to update.
        train_loader, val_loader: sized iterables of ``(images, captions)``.
        num_epochs: number of epochs to run.
        device: device used for models and batches.
        save_path: checkpoint file. Its parent directory is created if it
            does not exist.

    Returns:
        None. The checkpoint dict holds ``encoder_state_dict``,
        ``decoder_state_dict``, ``optimizer_state_dict``, ``epoch``
        (0-based) and ``val_loss``.
    """
    import os  # local import so the function does not rely on another cell's imports

    # Create the checkpoint directory up front. Otherwise the first save
    # fails only after a full epoch of training has already been spent.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    encoder.to(device)
    decoder.to(device)
    best_val_loss = float('inf')
    # `verbose=True` is deprecated on PyTorch schedulers, so learning-rate
    # reductions are reported by the explicit print further down instead.
    scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=2)

    for epoch in range(num_epochs):
        encoder.train()
        decoder.train()
        running_loss = 0.0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
        for images, captions in pbar:
            images, captions = images.to(device), captions.to(device)
            features = encoder(images)
            outputs = decoder(features, captions)
            # Targets are the captions shifted left by one token.
            loss = criterion(outputs.reshape(-1, outputs.shape[2]), captions[:, 1:].reshape(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            batch_loss = loss.item()
            running_loss += batch_loss
            pbar.set_postfix({"Loss": batch_loss})
        avg_train_loss = running_loss / len(train_loader)
        avg_val_loss = evaluate(encoder, decoder, criterion, val_loader, device)

        # Update the learning rate based on validation loss and report any change.
        lrs_before = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(avg_val_loss)
        for group_idx, (old_lr, group) in enumerate(zip(lrs_before, optimizer.param_groups)):
            if group['lr'] != old_lr:
                print(f"Epoch {epoch+1}: reducing learning rate of group {group_idx} to {group['lr']:.4e}.")

        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Save checkpoint if validation loss decreases
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save({
                'encoder_state_dict': encoder.state_dict(),
                'decoder_state_dict': decoder.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch,
                'val_loss': avg_val_loss,
            }, save_path)
            print(f"Checkpoint saved at epoch {epoch+1} with val loss {avg_val_loss:.4f}")

In [None]:
# Hyperparameters (matching the original repo)
embed_size    = 256
hidden_size   = 512
decoder_dim   = hidden_size   # decoder's LSTM hidden state dimension
attention_dim = 256         # dimension for the attention module
num_layers    = 1
learning_rate = 3e-4
num_epochs    = 100
vocab_size    = len(vocab)   # depends on the `vocab` built in the data-preparation cell

# Initialize encoder and decoder models
encoder = EncoderCNN(embed_size)
decoder = DecoderRNNWithAttention(embed_size, decoder_dim, vocab_size, num_layers, attention_dim)

# Define loss (ignoring the <PAD> token) and optimizer.
# Label smoothing of 0.1 is applied to the cross-entropy targets.
criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"], label_smoothing=0.1)
# Update only the decoder and the newly added layers of the encoder:
# only these parameters are handed to Adam, so the Inception backbone weights
# are never stepped by the optimizer.
params = list(decoder.parameters()) + list(encoder.linear.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate, weight_decay=1e-5)

# Start training.
# NOTE(review): save_path is relative to the current working directory, while
# the inference cell loads a checkpoint from a Google Drive path — confirm how
# the trained file is meant to get from one location to the other.
train(encoder, decoder, criterion, optimizer, train_loader, val_loader, num_epochs, device, save_path="Downloads/Flickr/best_model.pth")



Epoch [1/100], Train Loss: 5.2917, Val Loss: 4.7495
Checkpoint saved at epoch 1 with val loss 4.7495




Epoch [2/100], Train Loss: 4.5492, Val Loss: 4.3383
Checkpoint saved at epoch 2 with val loss 4.3383




Epoch [3/100], Train Loss: 4.2770, Val Loss: 4.1528
Checkpoint saved at epoch 3 with val loss 4.1528




Epoch [4/100], Train Loss: 4.1208, Val Loss: 4.0250
Checkpoint saved at epoch 4 with val loss 4.0250




Epoch [5/100], Train Loss: 3.9827, Val Loss: 3.9135
Checkpoint saved at epoch 5 with val loss 3.9135




Epoch [6/100], Train Loss: 3.8923, Val Loss: 3.8531
Checkpoint saved at epoch 6 with val loss 3.8531




Epoch [7/100], Train Loss: 3.8073, Val Loss: 3.7827
Checkpoint saved at epoch 7 with val loss 3.7827




Epoch [8/100], Train Loss: 3.7465, Val Loss: 3.7374
Checkpoint saved at epoch 8 with val loss 3.7374




Epoch [9/100], Train Loss: 3.6902, Val Loss: 3.6853
Checkpoint saved at epoch 9 with val loss 3.6853




Epoch [10/100], Train Loss: 3.6528, Val Loss: 3.6621
Checkpoint saved at epoch 10 with val loss 3.6621




Epoch [11/100], Train Loss: 3.5933, Val Loss: 3.6267
Checkpoint saved at epoch 11 with val loss 3.6267




Epoch [12/100], Train Loss: 3.5561, Val Loss: 3.6131
Checkpoint saved at epoch 12 with val loss 3.6131




Epoch [13/100], Train Loss: 3.5280, Val Loss: 3.5945
Checkpoint saved at epoch 13 with val loss 3.5945




Epoch [14/100], Train Loss: 3.5053, Val Loss: 3.5659
Checkpoint saved at epoch 14 with val loss 3.5659




Epoch [15/100], Train Loss: 3.4696, Val Loss: 3.5522
Checkpoint saved at epoch 15 with val loss 3.5522




Epoch [16/100], Train Loss: 3.4579, Val Loss: 3.5346
Checkpoint saved at epoch 16 with val loss 3.5346




Epoch [17/100], Train Loss: 3.4151, Val Loss: 3.5156
Checkpoint saved at epoch 17 with val loss 3.5156




Epoch [18/100], Train Loss: 3.3961, Val Loss: 3.5072
Checkpoint saved at epoch 18 with val loss 3.5072




Epoch [19/100], Train Loss: 3.3688, Val Loss: 3.4929
Checkpoint saved at epoch 19 with val loss 3.4929




Epoch [20/100], Train Loss: 3.3411, Val Loss: 3.4915
Checkpoint saved at epoch 20 with val loss 3.4915




Epoch [21/100], Train Loss: 3.3330, Val Loss: 3.4800
Checkpoint saved at epoch 21 with val loss 3.4800




Epoch [22/100], Train Loss: 3.3051, Val Loss: 3.4735
Checkpoint saved at epoch 22 with val loss 3.4735




Epoch [23/100], Train Loss: 3.2913, Val Loss: 3.4621
Checkpoint saved at epoch 23 with val loss 3.4621




Epoch [24/100], Train Loss: 3.2755, Val Loss: 3.4658




Epoch [25/100], Train Loss: 3.2667, Val Loss: 3.4588
Checkpoint saved at epoch 25 with val loss 3.4588




Epoch [26/100], Train Loss: 3.2549, Val Loss: 3.4432
Checkpoint saved at epoch 26 with val loss 3.4432




Epoch [27/100], Train Loss: 3.2399, Val Loss: 3.4437




Epoch [28/100], Train Loss: 3.2174, Val Loss: 3.4492




Epoch [29/100], Train Loss: 3.1949, Val Loss: 3.4334
Checkpoint saved at epoch 29 with val loss 3.4334




Epoch [30/100], Train Loss: 3.1801, Val Loss: 3.4308
Checkpoint saved at epoch 30 with val loss 3.4308




Epoch [31/100], Train Loss: 3.1828, Val Loss: 3.4348




Epoch [32/100], Train Loss: 3.1512, Val Loss: 3.4310




Epoch [33/100], Train Loss: 3.1516, Val Loss: 3.4113
Checkpoint saved at epoch 33 with val loss 3.4113




Epoch [34/100], Train Loss: 3.1268, Val Loss: 3.4127




Epoch [35/100], Train Loss: 3.1271, Val Loss: 3.4123




Epoch [36/100], Train Loss: 3.1043, Val Loss: 3.4146




Epoch [37/100], Train Loss: 3.0660, Val Loss: 3.4064
Checkpoint saved at epoch 37 with val loss 3.4064




Epoch [38/100], Train Loss: 3.0536, Val Loss: 3.4068




Epoch [39/100], Train Loss: 3.0393, Val Loss: 3.4008
Checkpoint saved at epoch 39 with val loss 3.4008




Epoch [40/100], Train Loss: 3.0453, Val Loss: 3.4018




Epoch [41/100], Train Loss: 3.0399, Val Loss: 3.4003
Checkpoint saved at epoch 41 with val loss 3.4003




Epoch [42/100], Train Loss: 3.0298, Val Loss: 3.3958
Checkpoint saved at epoch 42 with val loss 3.3958




Epoch [43/100], Train Loss: 3.0171, Val Loss: 3.3923
Checkpoint saved at epoch 43 with val loss 3.3923




Epoch [44/100], Train Loss: 3.0009, Val Loss: 3.3966




Epoch [45/100], Train Loss: 3.0078, Val Loss: 3.3965




Epoch [46/100], Train Loss: 3.0008, Val Loss: 3.3902
Checkpoint saved at epoch 46 with val loss 3.3902




Epoch [47/100], Train Loss: 2.9989, Val Loss: 3.3831
Checkpoint saved at epoch 47 with val loss 3.3831




Epoch [48/100], Train Loss: 2.9906, Val Loss: 3.3949




Epoch [49/100], Train Loss: 2.9754, Val Loss: 3.3917




Epoch [50/100], Train Loss: 2.9742, Val Loss: 3.3968




Epoch [51/100], Train Loss: 2.9484, Val Loss: 3.3882




Epoch [52/100], Train Loss: 2.9490, Val Loss: 3.3796
Checkpoint saved at epoch 52 with val loss 3.3796




Epoch [53/100], Train Loss: 2.9389, Val Loss: 3.3839




Epoch [54/100], Train Loss: 2.9437, Val Loss: 3.3863




Epoch [55/100], Train Loss: 2.9328, Val Loss: 3.3795
Checkpoint saved at epoch 55 with val loss 3.3795




Epoch [56/100], Train Loss: 2.9385, Val Loss: 3.3787
Checkpoint saved at epoch 56 with val loss 3.3787




Epoch [57/100], Train Loss: 2.9272, Val Loss: 3.3799




Epoch [58/100], Train Loss: 2.9230, Val Loss: 3.3823




Epoch [59/100], Train Loss: 2.9220, Val Loss: 3.3792




Epoch [60/100], Train Loss: 2.9143, Val Loss: 3.3770
Checkpoint saved at epoch 60 with val loss 3.3770




Epoch [61/100], Train Loss: 2.9166, Val Loss: 3.3793




Epoch [62/100], Train Loss: 2.9157, Val Loss: 3.3793




Epoch [63/100], Train Loss: 2.9096, Val Loss: 3.3753
Checkpoint saved at epoch 63 with val loss 3.3753




Epoch [64/100], Train Loss: 2.9173, Val Loss: 3.3800




Epoch [65/100], Train Loss: 2.9141, Val Loss: 3.3769




Epoch [66/100], Train Loss: 2.9061, Val Loss: 3.3794




Epoch [67/100], Train Loss: 2.9137, Val Loss: 3.3808




Epoch [68/100], Train Loss: 2.9124, Val Loss: 3.3772




Epoch [69/100], Train Loss: 2.8996, Val Loss: 3.3772




Epoch [70/100], Train Loss: 2.9028, Val Loss: 3.3798




Epoch [71/100], Train Loss: 2.9037, Val Loss: 3.3782




Epoch [72/100], Train Loss: 2.8987, Val Loss: 3.3774




Epoch [73/100], Train Loss: 2.8979, Val Loss: 3.3790




Epoch [74/100], Train Loss: 2.9097, Val Loss: 3.3774




Epoch [75/100], Train Loss: 2.8946, Val Loss: 3.3753




Epoch [76/100], Train Loss: 2.8945, Val Loss: 3.3786




Epoch [77/100], Train Loss: 2.8998, Val Loss: 3.3759




Epoch [78/100], Train Loss: 2.9050, Val Loss: 3.3779




Epoch [79/100], Train Loss: 2.9014, Val Loss: 3.3797




Epoch [80/100], Train Loss: 2.9005, Val Loss: 3.3773




Epoch [81/100], Train Loss: 2.8994, Val Loss: 3.3777




Epoch [82/100], Train Loss: 2.8994, Val Loss: 3.3772




Epoch [83/100], Train Loss: 2.8990, Val Loss: 3.3799




Epoch [84/100], Train Loss: 2.8972, Val Loss: 3.3809




Epoch [85/100], Train Loss: 2.8934, Val Loss: 3.3774




Epoch [86/100], Train Loss: 2.9044, Val Loss: 3.3762




Epoch [87/100], Train Loss: 2.9086, Val Loss: 3.3776




Epoch [88/100], Train Loss: 2.8983, Val Loss: 3.3768




Epoch [89/100], Train Loss: 2.9017, Val Loss: 3.3785




Epoch [90/100], Train Loss: 2.8866, Val Loss: 3.3757




Epoch [91/100], Train Loss: 2.9068, Val Loss: 3.3792




Epoch [92/100], Train Loss: 2.8874, Val Loss: 3.3784




Epoch [93/100], Train Loss: 2.9007, Val Loss: 3.3761




Epoch [94/100], Train Loss: 2.8988, Val Loss: 3.3772




Epoch [95/100], Train Loss: 2.9046, Val Loss: 3.3768




Epoch [96/100], Train Loss: 2.8911, Val Loss: 3.3777




Epoch [97/100], Train Loss: 2.9041, Val Loss: 3.3775




Epoch [98/100], Train Loss: 2.8932, Val Loss: 3.3781




Epoch [99/100], Train Loss: 2.8993, Val Loss: 3.3778


                                                                                                                       

Epoch [100/100], Train Loss: 2.9005, Val Loss: 3.3772




In [6]:
import os
import random
import torch
import gradio as gr
from PIL import Image
from gtts import gTTS

# ----------------------------------------------------------------
# Load Trained Model Checkpoint
# ----------------------------------------------------------------
# NOTE(review): torch.load deserialises the file; only point this at a
# checkpoint you created or trust.
checkpoint_path = "/content/drive/MyDrive/Flickr/best_model_100_attention.pth"
# Architecture hyperparameters. These have to match the values the checkpoint
# was trained with, otherwise load_state_dict will report shape mismatches.
embed_size    = 256
hidden_size   = 512
decoder_dim   = hidden_size   # decoder's LSTM hidden state dimension
attention_dim = 256         # dimension for the attention module
num_layers    = 1
learning_rate = 3e-4         # not used in this cell
num_epochs    = 100          # not used in this cell
vocab_size    = len(vocab)   # requires the same `vocab` that was used for training

# Initialize encoder and decoder models
encoder = EncoderCNN(embed_size)
decoder = DecoderRNNWithAttention(embed_size, decoder_dim, vocab_size, num_layers, attention_dim)

# Restore the trained weights (map_location lets a checkpoint load on whatever
# `device` was selected earlier), then switch both models to eval mode for
# inference.
checkpoint = torch.load(checkpoint_path, map_location=device)
encoder.load_state_dict(checkpoint['encoder_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])
encoder.to(device)
decoder.to(device)
encoder.eval()
decoder.eval()

Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth
100%|██████████| 104M/104M [00:01<00:00, 73.3MB/s]


DecoderRNNWithAttention(
  (embed): Embedding(3006, 256)
  (attention): Attention(
    (encoder_att): Linear(in_features=256, out_features=256, bias=True)
    (decoder_att): Linear(in_features=512, out_features=256, bias=True)
    (full_att): Linear(in_features=256, out_features=1, bias=True)
    (relu): ReLU()
    (softmax): Softmax(dim=1)
  )
  (lstm): LSTM(512, 512, batch_first=True)
  (linear): Linear(in_features=512, out_features=3006, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [8]:
import os
import random
import torch
import gradio as gr
from PIL import Image
from gtts import gTTS
import base64
from io import BytesIO

# [Your existing model loading code here...]

def generate_captions_and_audio(files, beam_size, max_len=20):
    """Caption every uploaded image and return one HTML fragment with the results.

    For each image the caption is produced with ``decoder.sample_beam``, spoken
    with gTTS, and rendered as a card containing the image, the caption text and
    an inline audio player. Image and audio are embedded as base64 data URIs.

    Args:
        files: A single image path, a list of image paths, or ``None``/empty
            when nothing was uploaded.
        beam_size: Beam width for beam search; cast to ``int`` before use
            because UI sliders may deliver floats.
        max_len: Maximum caption length for beam search; cast to ``int``.

    Returns:
        str: A wrapper ``<div>`` holding one card per image. The wrapper is
        empty when no files were given.
    """
    from html import escape  # local import: only this function needs it

    if not files:
        # Nothing uploaded (None or empty list): return the empty wrapper
        # instead of failing inside Image.open(None).
        return "<div style='margin: 1rem;'></div>"
    if not isinstance(files, list):
        files = [files]

    beam_size = int(beam_size)
    max_len = int(max_len)

    html_blocks = []
    with torch.no_grad():
        for file_path in files:
            # Process image; the context manager releases the file handle as
            # soon as the RGB copy has been made.
            with Image.open(file_path) as source_image:
                image = source_image.convert("RGB")
            image_tensor = val_test_transform(image).unsqueeze(0).to(device)
            features = encoder(image_tensor)

            # Generate caption: stop at <EOS>, drop <SOS>/<UNK>.
            token_ids = decoder.sample_beam(features, beam_size=beam_size, max_len=max_len)
            caption_words = []
            for token_id in token_ids:
                word = vocab.itos[token_id]
                if word == "<EOS>":
                    break
                if word not in ["<SOS>", "<UNK>"]:
                    caption_words.append(word)
            caption = " ".join(caption_words)

            # Generate audio. gTTS rejects empty text, so an empty caption
            # yields an empty audio payload instead of an exception.
            audio_b64 = ""
            if caption:
                tts = gTTS(text=caption, lang="en")
                audio_bytes = BytesIO()
                tts.write_to_fp(audio_bytes)
                audio_b64 = base64.b64encode(audio_bytes.getvalue()).decode()

            # Convert image to base64
            buffered = BytesIO()
            image.save(buffered, format="JPEG")
            img_b64 = base64.b64encode(buffered.getvalue()).decode()

            # Create HTML block. The caption is escaped because vocabulary
            # tokens such as "<PAD>" would otherwise be parsed as markup.
            html_blocks.append(f"""
                        <div style="margin: 1rem; padding: 1rem; border: 1px solid #ddd; border-radius: 8px;">
                            <div style="display: flex; gap: 1rem; align-items: center;">
                                <img src="data:image/jpeg;base64,{img_b64}" style="max-width: 200px; height: auto;"/>
                                <div>
                                    <p style="font-size: 18px; margin: 0.5rem 0;">
                                        <strong style="font-size: 16px;">Caption:</strong>
                                        <span style="font-size: 16px;">{escape(caption)}</span>
                                    </p>
                                    <audio controls style="margin-top: 0.5rem;">
                                        <source src="data:audio/mpeg;base64,{audio_b64}" type="audio/mpeg">
                                    </audio>
                                </div>
                            </div>
                      </div>
                      """)

    html_output = "".join(html_blocks)
    return f"<div style='margin: 1rem;'>{html_output}</div>"

# Create Gradio interface
with gr.Blocks(title="Image Captioning with Audio") as interface:
    gr.Markdown("# Image Captioning with Audio")
    gr.Markdown("Upload Images to Generate Captions with Playable Audio. (Up to 10 Images)")
    gr.Markdown("Model used is InceptionV3 and Training Dataset is Flickr8k.")
    gr.Markdown("Beam Search is used as the Decoding Strategy.")

    with gr.Row():
        # Order matters: these are passed positionally as (files, beam_size, max_len).
        inputs = [
            gr.File(label="Upload Images", file_count="multiple", file_types=["image"], type="filepath"),
            gr.Slider(1, 10, step=1, value=3, label="Beam Size"),
            gr.Slider(10, 50, step=1, value=20, label="Max Caption Length")
        ]
        submit = gr.Button("Generate", variant="primary")

    output = gr.HTML(label="Results")

    def validate_files(files):
        """Return ``files`` unchanged if it holds 1 to 10 entries, else raise ``gr.Error``.

        ``None`` (nothing uploaded yet) is treated like an empty selection.
        """
        if not files:
            raise gr.Error("Please upload at least 1 image!")
        if len(files) > 10:
            raise gr.Error("Maximum 10 images allowed!")
        return files

    def validate_and_generate(files, beam_size, max_len):
        """Click handler: enforce the image-count limits, then generate the HTML.

        Raising in the upload handler only shows a message and leaves the
        rejected selection in the file component, so the limits are checked
        again here before any inference runs.
        """
        return generate_captions_and_audio(validate_files(files), beam_size, max_len)

    # Give immediate feedback as soon as files are uploaded.
    inputs[0].upload(
        validate_files,
        inputs[0],
        inputs[0]
    )

    submit.click(
        validate_and_generate,
        inputs=inputs,
        outputs=output
    )

if __name__ == "__main__":
    interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5bf6e6e5347bd3cff4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-2.0.3-py3-none-any.whl.metadata (9.0 kB)
Collecting numpy>=1.25 (from bayesian-optimization)
  Downloading numpy-2.2.4-cp310-cp310-win_amd64.whl.metadata (60 kB)
Downloading bayesian_optimization-2.0.3-py3-none-any.whl (31 kB)
Downloading numpy-2.2.4-cp310-cp310-win_amd64.whl (12.9 MB)
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   ----- ---------------------------------- 1.8/12.9 MB 10.1 MB/s eta 0:00:02
   -------------- ------------------------- 4.7/12.9 MB 13.0 MB/s eta 0:00:01
   ----------------------- ---------------- 7.6/12.9 MB 13.4 MB/s eta 0:00:01
   ------------------------------ --------- 10.0/12.9 MB 12.9 MB/s eta 0:00:01
   ------------------------------------ --- 11.8/12.9 MB 12.3 MB/s eta 0:00:01
   ---------------------------------------- 12.9/12.9 MB 11.6 MB/s eta 0:00:00
Installing collected packages: numpy, bayesian-optimization
  Attempting uninstall: numpy
    Found exi

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.59.1 requires numpy<1.27,>=1.22, but you have numpy 2.2.4 which is incompatible.
tensorflow-intel 2.13.0 requires numpy<=1.24.3,>=1.22, but you have numpy 2.2.4 which is incompatible.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.12.2 which is incompatible.


In [None]:
from bayes_opt import BayesianOptimization

def train_and_evaluate(learning_rate, hidden_size, dropout_rate, num_epochs):
    """Objective for the hyperparameter search: train a fresh model and score it.

    Args:
        learning_rate: Learning rate handed to Adam.
        hidden_size: Decoder hidden size; truncated to ``int``.
        dropout_rate: Dropout probability; clamped into ``[0, 1]``.
        num_epochs: Number of training epochs; truncated to ``int``.

    Returns:
        The negated validation loss returned by ``evaluate``, so that a
        maximizer prefers lower loss.
    """
    # The search proposes floats, so integer-valued settings are truncated
    # and the dropout probability is clamped to a valid range.
    hidden_size, num_epochs = int(hidden_size), int(num_epochs)
    capped_dropout = min(dropout_rate, 1)
    dropout_rate = max(capped_dropout, 0)

    # Settings that are held fixed across all trials.
    embed_size, num_layers, attention_dim = 256, 1, 256
    vocab_size = len(vocab)

    # Build a fresh encoder/decoder pair for this trial.
    cnn_encoder = EncoderCNN(embed_size)
    attn_decoder = DecoderRNNWithAttention(embed_size, hidden_size, vocab_size, num_layers, attention_dim)
    # Swap in a dropout layer using the trial's probability.
    attn_decoder.dropout = nn.Dropout(dropout_rate)

    for module in (cnn_encoder, attn_decoder):
        module.to(device)

    # Padding positions do not contribute to the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"])
    # Optimize the whole decoder plus only the encoder's `linear` and `bn` layers.
    trainable_params = [
        *attn_decoder.parameters(),
        *cnn_encoder.linear.parameters(),
        *cnn_encoder.bn.parameters(),
    ]
    adam = torch.optim.Adam(trainable_params, lr=learning_rate)

    # Train for the requested number of epochs.
    train(cnn_encoder, attn_decoder, loss_fn, adam, train_loader, val_loader, num_epochs, device, save_path="temp_model.pth")

    # Score on the validation set; negate so that higher is better.
    return -evaluate(cnn_encoder, attn_decoder, loss_fn, val_loader, device)

# Define the search space for each hyperparameter, including num_epochs
# Each entry is a (lower, upper) bound. hidden_size and num_epochs are
# converted with int() inside train_and_evaluate.
pbounds = {
    'learning_rate': (1e-5, 1e-3),  # Learning rate between 0.00001 and 0.001
    'hidden_size': (256, 1024),       # LSTM hidden size between 256 and 1024
    'dropout_rate': (0.0, 0.5),       # Dropout between 0 and 0.5
    'num_epochs': (3, 10)             # Number of epochs between 3 and 10 (as a continuous parameter, cast to int)
}

# The objective returns the negated validation loss, so maximizing it
# minimizes the loss. random_state fixes the optimizer's seed.
optimizer_bo = BayesianOptimization(
    f=train_and_evaluate,
    pbounds=pbounds,
    random_state=42,
)

# Run optimization: for example, 5 random initial points and 10 iterations
# Every evaluated point calls train_and_evaluate, which trains a new model.
optimizer_bo.maximize(
    init_points=5,
    n_iter=10,
)

# optimizer_bo.max holds the best result found by the search.
print("Best hyperparameters found:")
print(optimizer_bo.max)

In [None]:
# Cell 7: Load the best model checkpoint and evaluate sacreBLEU score on the test set
# also model loading code for gradio UI and HF Spaces Deployment
# NOTE(review): this cell reads embed_size, decoder_dim, vocab_size, num_layers and
# attention_dim without defining them; they are set in the earlier checkpoint-loading cell.
# NOTE(review): the path below ("best_model.pth") differs from the checkpoint loaded in
# that earlier cell ("best_model_100_attention.pth") — confirm which one is intended.

# Load checkpoint
# Tensors are first mapped to CPU; the models are moved to `device` further down.
checkpoint = torch.load("/content/drive/MyDrive/Flickr/best_model.pth", map_location=torch.device('cpu'))

# Instantiate your models with the same architecture and hyperparameters
encoder = EncoderCNN(embed_size)
decoder = DecoderRNNWithAttention(embed_size, decoder_dim, vocab_size, num_layers, attention_dim)

# Load the saved weights into the models
encoder.load_state_dict(checkpoint['encoder_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])

# Move the models to the correct device and set them to evaluation mode
encoder.to(device)
decoder.to(device)
encoder.eval()
decoder.eval()

In [None]:
# Compute and print the BLEU score using the test loader
# NOTE(review): compute_bleu_score is defined in a later cell of this notebook, so that
# cell has to be executed before this one.
bleu_score = compute_bleu_score(encoder, decoder, test_loader, vocab, device)
print("Test sacreBLEU score: {:.2f}".format(bleu_score))

# Compute METEOR score
# NOTE(review): compute_meteor_score must already be defined in the kernel — confirm where.
meteor = compute_meteor_score(encoder, decoder, test_loader, vocab, device)
print("Test METEOR score: {:.4f}".format(meteor))

# Compute CIDEr score
# NOTE(review): compute_cider_score must already be defined in the kernel — confirm where.
cider = compute_cider_score(encoder, decoder, test_loader, vocab, device)
print("Test CIDEr score: {:.4f}".format(cider))

In [12]:
import os
from PIL import Image

# Folder path containing test images
folder_path = "/content/drive/MyDrive/Flickr/test_images/"

# Dictionary to store captions for each file (filename -> caption string)
captions_dict = {}

encoder.eval()
decoder.eval()

with torch.no_grad():
    # os.listdir returns entries in arbitrary order; sorting keeps the printed
    # output and the insertion order of captions_dict reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        # Only image files are captioned; everything else in the folder is skipped.
        if not filename.lower().endswith((".png", ".jpg", ".jpeg")):
            continue

        image_path = os.path.join(folder_path, filename)
        # The context manager releases the file handle once the RGB copy exists.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = val_test_transform(image).unsqueeze(0).to(device)

        features = encoder(image_tensor)
        # Use beam search for caption generation; you can switch to greedy with decoder.sample(features)
        best_seq = decoder.sample_beam(features, beam_size=3, max_len=20)

        # Convert token IDs to words: stop at <EOS>, drop <SOS>/<UNK>
        caption_words = []
        for token_id in best_seq:
            word = vocab.itos[token_id]
            if word == "<EOS>":
                break
            if word in ["<SOS>", "<UNK>"]:
                continue
            caption_words.append(word)
        caption = " ".join(caption_words)
        captions_dict[filename] = caption
        print(f"{filename}: {caption}")

# Optionally, captions_dict now holds the mapping for further processing.

boat.png: a boat is in the water .
bus.png: a man in a white shirt is standing in front of a bus .
child.jpg: a little boy in a red shirt is playing with a toy .
dog.jpg: a brown dog is running on the beach .
horse.png: a man and a dog are standing on a beach .


In [33]:
# Inference Cell: Generate a caption for a sample test image using Beam Search with Attention

from PIL import Image

# Update with the path to your sample test image
sample_image_path = "/content/drive/MyDrive/Flickr/test_images/dog.jpg"
sample_image = Image.open(sample_image_path).convert("RGB")
# NOTE: `sample_image` is rebound here from the PIL image to the transformed tensor
# with a leading batch axis added by unsqueeze(0).
sample_image = val_test_transform(sample_image).unsqueeze(0).to(device)

encoder.eval()
decoder.eval()

# Gradients are not needed for inference.
with torch.no_grad():
    # Obtain spatial features from the encoder; shape: (1, num_pixels, embed_size)
    features = encoder(sample_image)
    # Use beam search to generate a caption (adjust beam_size and max_len as needed)
    best_seq = decoder.sample_beam(features, beam_size=3, max_len=20)

# Convert the sequence of token IDs to words, skipping unwanted tokens
# Decoding stops at the first <EOS>; <SOS> and <UNK> are dropped.
caption_words = []
for token_id in best_seq:
    word = vocab.itos[token_id]
    if word == "<EOS>":
        break
    if word in ["<SOS>", "<UNK>"]:
        continue
    caption_words.append(word)
# `caption` stays in the kernel namespace and is read by the text-to-speech cell below.
caption = " ".join(caption_words)
print("Generated Caption (Beam Search with Attention):", caption)

Generated Caption (Beam Search with Attention): a brown dog is running on the beach .


In [None]:
from gtts import gTTS
from IPython.display import Audio, display

# Suppose 'caption' is the generated caption from your inference cell.
# If it's not defined, set it manually for testing.
# caption = "A child in a pink dress is climbing up a set of stairs in an entry way."

# Convert text to speech
# NOTE(review): gTTS presumably needs network access and a non-empty caption — confirm.
tts = gTTS(text=caption, lang='en')

# Save the audio file
# The relative name resolves against the current working directory, and the
# same file name is reused on every run.
audio_filename = "caption_audio.mp3"
tts.save(audio_filename)

# Play the audio in the notebook
display(Audio(audio_filename, autoplay=True))

In [None]:
import gradio as gr
from PIL import Image
from gtts import gTTS
import torch

def generate_caption_and_audio(image):
    """Caption a single image and synthesize the caption as speech.

    The image is preprocessed with ``val_test_transform`` (the same transform
    used by the other inference cells), encoded, and decoded with beam search
    (``beam_size=3``, ``max_len=20``). Decoding stops at ``<EOS>``; ``<SOS>``
    and ``<UNK>`` are dropped from the caption.

    Args:
        image: A PIL image, or ``None`` when the user submitted without
            providing one.

    Returns:
        tuple: ``(caption, audio_file)`` where ``caption`` is the generated
        text and ``audio_file`` is the path of the MP3 written by gTTS.
        ``audio_file`` is ``None`` when there is nothing to speak (no image,
        or an empty caption).
    """
    if image is None:
        # Submitted without an image: return empty outputs instead of raising.
        return "", None

    # Ensure the image is RGB and preprocessed with the evaluation transform.
    image = image.convert("RGB")
    image_tensor = val_test_transform(image).unsqueeze(0).to(device)

    # Set models to eval mode (redundant if already done globally)
    encoder.eval()
    decoder.eval()

    with torch.no_grad():
        # Obtain encoder spatial features and run beam search decoding
        features = encoder(image_tensor)
        best_seq = decoder.sample_beam(features, beam_size=3, max_len=20)

    # Convert token IDs to words and filter unwanted tokens
    caption_words = []
    for token_id in best_seq:
        word = vocab.itos[token_id]
        if word == "<EOS>":
            break
        if word in ["<SOS>", "<UNK>"]:
            continue
        caption_words.append(word)
    caption = " ".join(caption_words)

    if not caption:
        # gTTS rejects empty text, so skip synthesis when nothing was decoded.
        return caption, None

    # Convert the caption text to speech using gTTS and save to an MP3 file
    tts = gTTS(text=caption, lang='en')
    audio_file = "caption_audio.mp3"
    tts.save(audio_file)

    return caption, audio_file

# Create a Gradio Interface with an image input and text + audio outputs.
# NOTE(review): this rebinds the name `interface`, which the earlier multi-image
# cell also uses for its gr.Blocks app.
interface = gr.Interface(
    fn=generate_caption_and_audio,
    # type="pil" asks Gradio to pass the upload to fn as a PIL image.
    inputs=gr.Image(type="pil", label="Input Image"),
    # Outputs map positionally onto the (caption, audio_file) tuple returned by fn.
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Caption Audio")
    ],
    title="Image Captioning with Audio Output",
    description="Upload an image to get a generated caption (text) along with an audio version of the caption."
)

# Start the app.
interface.launch()

In [None]:
def compute_bleu_score(encoder, decoder, data_loader, vocab, device):
    """
    Computes sacreBLEU score using all available reference captions for each image.

    Assumes that:
      - data_loader.dataset.img2caps maps image ids to a list of reference captions.
      - data_loader.dataset.imgs is an ordered list of image ids.
      - The DataLoader is used with shuffle=False.

    Args:
        encoder: Image encoder; called on a batch of images moved to ``device``.
        decoder: Caption decoder exposing ``sample(feature)`` (greedy decoding).
        data_loader: Iterable of ``(images, _)`` batches with a ``dataset`` attribute.
        vocab: Vocabulary exposing ``itos`` (token id -> word).
        device: Device the image batches are moved to.

    Returns:
        float: Corpus-level BLEU score, or ``0.0`` when the loader yields no images.
    """
    from itertools import zip_longest  # local import: only needed for the transpose below

    encoder.eval()
    decoder.eval()
    dataset = data_loader.dataset
    all_hypotheses = []
    all_references = []  # one list of reference captions per image
    global_idx = 0      # index to track image id order in dataset

    with torch.no_grad():
        for images, _ in data_loader:
            images = images.to(device)
            features = encoder(images)
            batch_size = images.size(0)
            for i in range(batch_size):
                feature = features[i].unsqueeze(0)
                # Generate caption using greedy decoding (or replace with beam search if desired)
                sampled_ids = decoder.sample(feature)
                sampled_caption = []
                for token_id in sampled_ids:
                    word = vocab.itos[token_id]
                    if word == "<EOS>":
                        break
                    if word in ["<SOS>", "<UNK>"]:
                        continue
                    sampled_caption.append(word)
                all_hypotheses.append(" ".join(sampled_caption))

                # Retrieve all reference captions for the current image
                image_id = dataset.imgs[global_idx]
                all_references.append(list(dataset.img2caps[image_id]))
                global_idx += 1

    if not all_hypotheses:
        # Nothing to score; avoid calling sacreBLEU with empty inputs.
        return 0.0

    # sacreBLEU expects reference *streams*: stream k holds the k-th reference
    # of every hypothesis, in hypothesis order. Passing one list per image makes
    # it score only the first few hypotheses against the wrong references.
    # Transpose here and pad images that have fewer references with None.
    reference_streams = [
        list(stream) for stream in zip_longest(*all_references, fillvalue=None)
    ]

    bleu = sacrebleu.corpus_bleu(all_hypotheses, reference_streams)
    return bleu.score