In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#!pip install efficientnet_pytorch

In [3]:
!pip show efficientnet_pytorch

Name: efficientnet_pytorch
Version: 0.7.1
Summary: EfficientNet implemented in PyTorch.
Home-page: https://github.com/lukemelas/EfficientNet-PyTorch
Author: Luke
Author-email: lmelaskyriazi@college.harvard.edu
License: Apache
Location: /usr/local/lib/python3.11/dist-packages
Requires: torch
Required-by: 


In [4]:
import sys
import os

In [5]:
import json
import pickle
from pathlib import Path

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader

In [6]:
# Enable full CUDA error reporting
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [7]:
# Change this to your actual Drive path
base_path = "/content/drive/MyDrive/UTS/SEM 4/DL/AT3/AT3-DL-image-captioning"

In [8]:
# Append utils path to Python path
sys.path.append(os.path.join(base_path))

# Set paths
image_folder = os.path.join(base_path, "data/Flicker8k_Dataset")
text_folder = os.path.join(base_path, "data/Flickr8k_text")
processed_folder = os.path.join(base_path, "data/processed")

In [9]:
from utils.dataloader import get_transforms, load_split_ids, build_caption_dataset
from utils.caption_dataset import CaptionDataset

In [10]:
from nltk.translate.bleu_score import corpus_bleu


In [11]:
# Load vocabulary
with open(os.path.join(processed_folder, "word2idx.json"), "r") as f:
    word2idx = json.load(f)

with open(os.path.join(processed_folder, "image_caption_seqs.pkl"), "rb") as f:
    image_caption_seqs = pickle.load(f)

In [12]:
# Fix invalid token indices
vocab_size = len(word2idx)
UNK_IDX = word2idx.get("<unk>", word2idx["<pad>"])
for img_id, seqs in image_caption_seqs.items():
    image_caption_seqs[img_id] = [
        [w if w < vocab_size else UNK_IDX for w in seq] for seq in seqs
    ]

In [13]:
# Load splits
train_ids = load_split_ids(os.path.join(text_folder, "Flickr_8k.trainImages.txt"))
val_ids   = load_split_ids(os.path.join(text_folder, "Flickr_8k.devImages.txt"))
test_ids  = load_split_ids(os.path.join(text_folder, "Flickr_8k.testImages.txt"))

In [14]:
# Define transforms
#transform_train = get_transforms("train")
#transform_val = get_transforms("val")

from torchvision import transforms

transform_train = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

transform_val = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [15]:
# Build datasets
train_dataset = build_caption_dataset(train_ids, image_caption_seqs, word2idx, image_folder, transform_train)
val_dataset = build_caption_dataset(val_ids, image_caption_seqs, word2idx, image_folder, transform_val)
test_dataset = build_caption_dataset(test_ids, image_caption_seqs, word2idx, image_folder, transform_val)

In [16]:
# === Positional Encoding ===
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [17]:
# === Encoder: EfficientNet ===
class EncoderEfficientNet(nn.Module):
    def __init__(self, encoded_image_size=14, output_dim=256):
        super().__init__()
        self.backbone = EfficientNet.from_pretrained('efficientnet-b3')
        self.pool = nn.AdaptiveAvgPool2d((encoded_image_size, encoded_image_size))
        self.project = nn.Linear(1536, output_dim)  # Project to match decoder embed_dim

    def forward(self, images):
        features = self.backbone.extract_features(images)               # (B, 1536, H, W)
        pooled = self.pool(features)                                    # (B, 1536, 14, 14)
        flattened = pooled.flatten(2).permute(0, 2, 1)                  # (B, S, 1536)
        projected = self.project(flattened)                             # (B, S, 256)
        return projected

In [18]:
# === Decoder: Transformer ===
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, n_heads=4, num_layers=2, ff_dim=256, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embedding.weight.requires_grad = False  # Freeze embedding layer
        self.pos_encoder = PositionalEncoding(embed_dim)

        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=n_heads,
                                                   dim_feedforward=ff_dim, dropout=dropout)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, tgt_key_padding_mask=None):
        tgt = self.embedding(tgt)
        tgt = self.pos_encoder(tgt)
        output = self.transformer_decoder(tgt.transpose(0, 1), memory.transpose(0, 1),
                                          tgt_mask=tgt_mask,
                                          tgt_key_padding_mask=tgt_key_padding_mask)
        return self.fc(output.transpose(0, 1))

In [19]:
# === Full Model ===
class TransformerCaptioningModel(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, images, captions):
        memory = self.encoder(images)  # (B, S, D)
        return self.decoder(captions, memory)

In [20]:
from torch.cuda.amp import autocast, GradScaler

In [21]:
# === Training Loop with AMP + Assertion ===
def train_model(model, train_dataset, val_dataset, word2idx, device, batch_size=8, epochs=20, patience=3, lr=1e-4):
    pad_idx = word2idx['<pad>']
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

    scaler = GradScaler()
    #train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    #val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, drop_last=True, num_workers=2, pin_memory=True)
    val_loader   = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=2, pin_memory=True)

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for images, captions, _ in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
            images, captions = images.to(device), captions.to(device)

            # === Assertion ===
            assert captions[:, 1:].max().item() < len(word2idx), "Target index exceeds vocab size"
            assert captions[:, 1:].min().item() >= 0, "Negative target index detected"

            optimizer.zero_grad()
            with autocast():
                outputs = model(images, captions[:, :-1])
                loss = criterion(outputs.reshape(-1, outputs.size(-1)), captions[:, 1:].reshape(-1))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_losses.append(loss.item())

        val_losses = []
        references, hypotheses = [], []
        model.eval()
        with torch.no_grad():
            for images, captions, _ in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
                images, captions = images.to(device), captions.to(device)
                with autocast():
                    outputs = model(images, captions[:, :-1])
                    loss = criterion(outputs.reshape(-1, outputs.size(-1)), captions[:, 1:].reshape(-1))
                val_losses.append(loss.item())

                preds = torch.argmax(outputs, dim=2)
                for ref, pred in zip(captions, preds):
                    ref_tokens = [w for w in ref.tolist() if w not in {pad_idx, word2idx['<start>'], word2idx['<end>']}]
                    pred_tokens = [w for w in pred.tolist() if w not in {pad_idx, word2idx['<start>'], word2idx['<end>']}]
                    references.append([ref_tokens])
                    hypotheses.append(pred_tokens)

        avg_train_loss = np.mean(train_losses)
        avg_val_loss = np.mean(val_losses)
        scheduler.step(avg_val_loss)

        bleu1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
        bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
        bleu3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0))
        bleu4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

        print(f"\nEpoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")
        print(f"BLEU-1 = {bleu1:.4f}, BLEU-2 = {bleu2:.4f}, BLEU-3 = {bleu3:.4f}, BLEU-4 = {bleu4:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), os.path.join(base_path, "transformer_captioning_best.pt"))
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    print("Training complete.")
    return model

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [23]:
!nvidia-smi

Mon May 19 14:02:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   61C    P8             18W /   72W |       3MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [24]:
from efficientnet_pytorch import EfficientNet
from tqdm import tqdm

In [25]:
# === Run Training ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

vocab_size = len(word2idx)
encoder = EncoderEfficientNet()
decoder = TransformerDecoder(vocab_size=vocab_size)
model = TransformerCaptioningModel(encoder, decoder).to(device)

trained_model = train_model(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    word2idx=word2idx,
    device=device,
    batch_size=8,
    epochs=20,
    patience=3,
    lr=1e-4
)

Using device: cuda


  scaler = GradScaler()


Loaded pretrained weights for efficientnet-b3


  with autocast():
Epoch 1 [Train]: 100%|██████████| 3750/3750 [36:38<00:00,  1.71it/s]
  with autocast():
Epoch 1 [Val]: 100%|██████████| 625/625 [07:39<00:00,  1.36it/s]



Epoch 1: Train Loss = 2.5361, Val Loss = 0.9982
BLEU-1 = 0.7849, BLEU-2 = 0.7290, BLEU-3 = 0.6788, BLEU-4 = 0.6263


Epoch 2 [Train]: 100%|██████████| 3750/3750 [09:48<00:00,  6.37it/s]
Epoch 2 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.22it/s]



Epoch 2: Train Loss = 0.6350, Val Loss = 0.3562
BLEU-1 = 0.8595, BLEU-2 = 0.8348, BLEU-3 = 0.8121, BLEU-4 = 0.7860


Epoch 3 [Train]: 100%|██████████| 3750/3750 [09:49<00:00,  6.36it/s]
Epoch 3 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.44it/s]



Epoch 3: Train Loss = 0.2440, Val Loss = 0.1785
BLEU-1 = 0.7375, BLEU-2 = 0.7195, BLEU-3 = 0.7035, BLEU-4 = 0.6818


Epoch 4 [Train]: 100%|██████████| 3750/3750 [09:50<00:00,  6.35it/s]
Epoch 4 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.04it/s]



Epoch 4: Train Loss = 0.1085, Val Loss = 0.1041
BLEU-1 = 0.7666, BLEU-2 = 0.7526, BLEU-3 = 0.7404, BLEU-4 = 0.7227


Epoch 5 [Train]: 100%|██████████| 3750/3750 [09:50<00:00,  6.35it/s]
Epoch 5 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.48it/s]



Epoch 5: Train Loss = 0.0570, Val Loss = 0.0820
BLEU-1 = 0.8127, BLEU-2 = 0.8008, BLEU-3 = 0.7903, BLEU-4 = 0.7752


Epoch 6 [Train]: 100%|██████████| 3750/3750 [09:50<00:00,  6.35it/s]
Epoch 6 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.07it/s]



Epoch 6: Train Loss = 0.0346, Val Loss = 0.0718
BLEU-1 = 0.7172, BLEU-2 = 0.7038, BLEU-3 = 0.6922, BLEU-4 = 0.6743


Epoch 7 [Train]: 100%|██████████| 3750/3750 [09:51<00:00,  6.34it/s]
Epoch 7 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.34it/s]



Epoch 7: Train Loss = 0.0245, Val Loss = 0.0600
BLEU-1 = 0.7299, BLEU-2 = 0.7174, BLEU-3 = 0.7066, BLEU-4 = 0.6897


Epoch 8 [Train]: 100%|██████████| 3750/3750 [09:50<00:00,  6.35it/s]
Epoch 8 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.16it/s]



Epoch 8: Train Loss = 0.0202, Val Loss = 0.0521
BLEU-1 = 0.7159, BLEU-2 = 0.7032, BLEU-3 = 0.6923, BLEU-4 = 0.6751


Epoch 9 [Train]: 100%|██████████| 3750/3750 [09:51<00:00,  6.34it/s]
Epoch 9 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.03it/s]



Epoch 9: Train Loss = 0.0165, Val Loss = 0.0500
BLEU-1 = 0.6853, BLEU-2 = 0.6725, BLEU-3 = 0.6617, BLEU-4 = 0.6441


Epoch 10 [Train]: 100%|██████████| 3750/3750 [09:50<00:00,  6.35it/s]
Epoch 10 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.21it/s]



Epoch 10: Train Loss = 0.0141, Val Loss = 0.0511
BLEU-1 = 0.9462, BLEU-2 = 0.9408, BLEU-3 = 0.9359, BLEU-4 = 0.9294


Epoch 11 [Train]: 100%|██████████| 3750/3750 [09:50<00:00,  6.35it/s]
Epoch 11 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.33it/s]



Epoch 11: Train Loss = 0.0123, Val Loss = 0.0439
BLEU-1 = 0.7058, BLEU-2 = 0.6935, BLEU-3 = 0.6830, BLEU-4 = 0.6661


Epoch 12 [Train]: 100%|██████████| 3750/3750 [09:50<00:00,  6.35it/s]
Epoch 12 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.17it/s]



Epoch 12: Train Loss = 0.0119, Val Loss = 0.0415
BLEU-1 = 0.7313, BLEU-2 = 0.7196, BLEU-3 = 0.7096, BLEU-4 = 0.6935


Epoch 13 [Train]: 100%|██████████| 3750/3750 [09:51<00:00,  6.35it/s]
Epoch 13 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.21it/s]



Epoch 13: Train Loss = 0.0109, Val Loss = 0.0409
BLEU-1 = 0.8955, BLEU-2 = 0.8886, BLEU-3 = 0.8824, BLEU-4 = 0.8732


Epoch 14 [Train]: 100%|██████████| 3750/3750 [09:52<00:00,  6.33it/s]
Epoch 14 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.34it/s]



Epoch 14: Train Loss = 0.0101, Val Loss = 0.0448
BLEU-1 = 0.9574, BLEU-2 = 0.9530, BLEU-3 = 0.9490, BLEU-4 = 0.9436


Epoch 15 [Train]: 100%|██████████| 3750/3750 [09:51<00:00,  6.34it/s]
Epoch 15 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.21it/s]



Epoch 15: Train Loss = 0.0098, Val Loss = 0.0405
BLEU-1 = 0.9030, BLEU-2 = 0.8966, BLEU-3 = 0.8907, BLEU-4 = 0.8822


Epoch 16 [Train]: 100%|██████████| 3750/3750 [09:51<00:00,  6.34it/s]
Epoch 16 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.32it/s]



Epoch 16: Train Loss = 0.0094, Val Loss = 0.0407
BLEU-1 = 0.8052, BLEU-2 = 0.7954, BLEU-3 = 0.7867, BLEU-4 = 0.7732


Epoch 17 [Train]: 100%|██████████| 3750/3750 [09:50<00:00,  6.35it/s]
Epoch 17 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.35it/s]



Epoch 17: Train Loss = 0.0084, Val Loss = 0.0355
BLEU-1 = 0.7622, BLEU-2 = 0.7515, BLEU-3 = 0.7422, BLEU-4 = 0.7273


Epoch 18 [Train]: 100%|██████████| 3750/3750 [09:52<00:00,  6.33it/s]
Epoch 18 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.00it/s]



Epoch 18: Train Loss = 0.0083, Val Loss = 0.0372
BLEU-1 = 0.9093, BLEU-2 = 0.9032, BLEU-3 = 0.8978, BLEU-4 = 0.8898


Epoch 19 [Train]: 100%|██████████| 3750/3750 [09:51<00:00,  6.34it/s]
Epoch 19 [Val]: 100%|██████████| 625/625 [00:29<00:00, 20.99it/s]



Epoch 19: Train Loss = 0.0080, Val Loss = 0.0460
BLEU-1 = 0.8537, BLEU-2 = 0.8453, BLEU-3 = 0.8378, BLEU-4 = 0.8264


Epoch 20 [Train]: 100%|██████████| 3750/3750 [09:50<00:00,  6.35it/s]
Epoch 20 [Val]: 100%|██████████| 625/625 [00:29<00:00, 21.19it/s]



Epoch 20: Train Loss = 0.0073, Val Loss = 0.0393
BLEU-1 = 0.8539, BLEU-2 = 0.8457, BLEU-3 = 0.8384, BLEU-4 = 0.8273
Early stopping at epoch 20
Training complete.


In [26]:
save_path = "/content/drive/MyDrive/UTS/SEM 4/DL/AT3/transformer_captioning_best.pt"

In [27]:
torch.save(model.state_dict(), save_path)