In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install efficientnet_pytorch

Collecting efficientnet_pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->efficientnet_pytorch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->efficientnet_pytorch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->efficientnet_pytorch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->efficientnet_pytorch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->efficientnet_pytorch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metada

In [2]:
!pip show efficientnet_pytorch

Name: efficientnet_pytorch
Version: 0.7.1
Summary: EfficientNet implemented in PyTorch.
Home-page: https://github.com/lukemelas/EfficientNet-PyTorch
Author: Luke
Author-email: lmelaskyriazi@college.harvard.edu
License: Apache
Location: /usr/local/lib/python3.11/dist-packages
Requires: torch
Required-by: 


In [3]:
import sys
import os

In [4]:
import json
import pickle
from pathlib import Path

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader

In [5]:
# Enable full CUDA error reporting
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [6]:
# Change this to your actual Drive path
base_path = "/content/drive/MyDrive/UTS/SEM 4/DL/AT3/AT3-DL-image-captioning"

In [7]:
# Append utils path to Python path
sys.path.append(os.path.join(base_path))

# Set paths
image_folder = os.path.join(base_path, "data/Flicker8k_Dataset")
text_folder = os.path.join(base_path, "data/Flickr8k_text")
processed_folder = os.path.join(base_path, "data/processed")

In [8]:
from utils.dataloader import get_transforms, load_split_ids, build_caption_dataset
from utils.caption_dataset import CaptionDataset

In [9]:
from nltk.translate.bleu_score import corpus_bleu


In [10]:
# Load vocabulary
with open(os.path.join(processed_folder, "word2idx.json"), "r") as f:
    word2idx = json.load(f)

with open(os.path.join(processed_folder, "image_caption_seqs.pkl"), "rb") as f:
    image_caption_seqs = pickle.load(f)

In [11]:
# Fix invalid token indices
vocab_size = len(word2idx)
UNK_IDX = word2idx.get("<unk>", word2idx["<pad>"])
for img_id, seqs in image_caption_seqs.items():
    image_caption_seqs[img_id] = [
        [w if w < vocab_size else UNK_IDX for w in seq] for seq in seqs
    ]

In [12]:
# Load splits
train_ids = load_split_ids(os.path.join(text_folder, "Flickr_8k.trainImages.txt"))
val_ids   = load_split_ids(os.path.join(text_folder, "Flickr_8k.devImages.txt"))
test_ids  = load_split_ids(os.path.join(text_folder, "Flickr_8k.testImages.txt"))

In [13]:
# Define transforms
transform_train = get_transforms("train")
transform_val = get_transforms("val")

In [14]:
# Build datasets
train_dataset = build_caption_dataset(train_ids, image_caption_seqs, word2idx, image_folder, transform_train)
val_dataset = build_caption_dataset(val_ids, image_caption_seqs, word2idx, image_folder, transform_val)
test_dataset = build_caption_dataset(test_ids, image_caption_seqs, word2idx, image_folder, transform_val)

In [15]:
# === Positional Encoding ===
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [16]:
# === Encoder: EfficientNet ===
class EncoderEfficientNet(nn.Module):
    def __init__(self, encoded_image_size=14, output_dim=256):
        super().__init__()
        self.backbone = EfficientNet.from_pretrained('efficientnet-b3')
        self.pool = nn.AdaptiveAvgPool2d((encoded_image_size, encoded_image_size))
        self.project = nn.Linear(1536, output_dim)  # Project to match decoder embed_dim

    def forward(self, images):
        features = self.backbone.extract_features(images)               # (B, 1536, H, W)
        pooled = self.pool(features)                                    # (B, 1536, 14, 14)
        flattened = pooled.flatten(2).permute(0, 2, 1)                  # (B, S, 1536)
        projected = self.project(flattened)                             # (B, S, 256)
        return projected

In [17]:
# === Decoder: Transformer ===
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, n_heads=8, num_layers=3, ff_dim=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoder = PositionalEncoding(embed_dim)

        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=n_heads,
                                                   dim_feedforward=ff_dim, dropout=dropout)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, tgt_key_padding_mask=None):
        tgt = self.embedding(tgt)
        tgt = self.pos_encoder(tgt)
        output = self.transformer_decoder(tgt.transpose(0, 1), memory.transpose(0, 1),
                                          tgt_mask=tgt_mask,
                                          tgt_key_padding_mask=tgt_key_padding_mask)
        return self.fc(output.transpose(0, 1))

In [18]:
# === Full Model ===
class TransformerCaptioningModel(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, images, captions):
        memory = self.encoder(images)  # (B, S, D)
        return self.decoder(captions, memory)

In [19]:
from torch.cuda.amp import autocast, GradScaler

In [20]:
# === Training Loop with AMP + Assertion ===
def train_model(model, train_dataset, val_dataset, word2idx, device, batch_size=8, epochs=20, patience=3, lr=1e-4):
    pad_idx = word2idx['<pad>']
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

    scaler = GradScaler()
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for images, captions, _ in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
            images, captions = images.to(device), captions.to(device)

            # === Assertion ===
            assert captions[:, 1:].max().item() < len(word2idx), "Target index exceeds vocab size"
            assert captions[:, 1:].min().item() >= 0, "Negative target index detected"

            optimizer.zero_grad()
            with autocast():
                outputs = model(images, captions[:, :-1])
                loss = criterion(outputs.reshape(-1, outputs.size(-1)), captions[:, 1:].reshape(-1))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_losses.append(loss.item())

        val_losses = []
        references, hypotheses = [], []
        model.eval()
        with torch.no_grad():
            for images, captions, _ in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
                images, captions = images.to(device), captions.to(device)
                with autocast():
                    outputs = model(images, captions[:, :-1])
                    loss = criterion(outputs.reshape(-1, outputs.size(-1)), captions[:, 1:].reshape(-1))
                val_losses.append(loss.item())

                preds = torch.argmax(outputs, dim=2)
                for ref, pred in zip(captions, preds):
                    ref_tokens = [w for w in ref.tolist() if w not in {pad_idx, word2idx['<start>'], word2idx['<end>']}]
                    pred_tokens = [w for w in pred.tolist() if w not in {pad_idx, word2idx['<start>'], word2idx['<end>']}]
                    references.append([ref_tokens])
                    hypotheses.append(pred_tokens)

        avg_train_loss = np.mean(train_losses)
        avg_val_loss = np.mean(val_losses)
        scheduler.step(avg_val_loss)

        bleu1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
        bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
        bleu3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0))
        bleu4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

        print(f"\nEpoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")
        print(f"BLEU-1 = {bleu1:.4f}, BLEU-2 = {bleu2:.4f}, BLEU-3 = {bleu3:.4f}, BLEU-4 = {bleu4:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), os.path.join(base_path, "transformer_captioning_best.pt"))
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    print("Training complete.")
    return model

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [22]:
!nvidia-smi

Sun May 18 06:51:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   64C    P8             10W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [23]:
from efficientnet_pytorch import EfficientNet
from tqdm import tqdm

In [None]:
# === Run Training ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

vocab_size = len(word2idx)
encoder = EncoderEfficientNet()
decoder = TransformerDecoder(vocab_size=vocab_size)
model = TransformerCaptioningModel(encoder, decoder).to(device)

trained_model = train_model(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    word2idx=word2idx,
    device=device,
    batch_size=8,
    epochs=20,
    patience=3,
    lr=1e-4
)

Using device: cuda


  scaler = GradScaler()


Loaded pretrained weights for efficientnet-b3


  with autocast():
Epoch 1 [Train]: 100%|██████████| 3750/3750 [18:04<00:00,  3.46it/s]
  with autocast():
Epoch 1 [Val]: 100%|██████████| 625/625 [01:25<00:00,  7.29it/s]



Epoch 1: Train Loss = 2.2492, Val Loss = 0.8029
BLEU-1 = 0.8711, BLEU-2 = 0.8229, BLEU-3 = 0.7788, BLEU-4 = 0.7330


Epoch 2 [Train]: 100%|██████████| 3750/3750 [18:02<00:00,  3.47it/s]
Epoch 2 [Val]: 100%|██████████| 625/625 [01:26<00:00,  7.22it/s]



Epoch 2: Train Loss = 0.5109, Val Loss = 0.2793
BLEU-1 = 0.9548, BLEU-2 = 0.9371, BLEU-3 = 0.9207, BLEU-4 = 0.9029


Epoch 3 [Train]: 100%|██████████| 3750/3750 [17:54<00:00,  3.49it/s]
Epoch 3 [Val]: 100%|██████████| 625/625 [01:24<00:00,  7.36it/s]



Epoch 3: Train Loss = 0.1799, Val Loss = 0.1415
BLEU-1 = 0.9691, BLEU-2 = 0.9593, BLEU-3 = 0.9504, BLEU-4 = 0.9405


Epoch 4 [Train]: 100%|██████████| 3750/3750 [17:50<00:00,  3.50it/s]
Epoch 4 [Val]: 100%|██████████| 625/625 [01:24<00:00,  7.40it/s]



Epoch 4: Train Loss = 0.0708, Val Loss = 0.0852
BLEU-1 = 0.9831, BLEU-2 = 0.9777, BLEU-3 = 0.9727, BLEU-4 = 0.9671


Epoch 5 [Train]: 100%|██████████| 3750/3750 [17:43<00:00,  3.53it/s]
Epoch 5 [Val]: 100%|██████████| 625/625 [01:25<00:00,  7.31it/s]



Epoch 5: Train Loss = 0.0318, Val Loss = 0.0642
BLEU-1 = 0.9905, BLEU-2 = 0.9867, BLEU-3 = 0.9833, BLEU-4 = 0.9797


Epoch 6 [Train]:  88%|████████▊ | 3287/3750 [15:33<02:12,  3.48it/s]