In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
import sys, os, json, pickle
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torchvision.models as models
from torch.utils.data import DataLoader
from PIL import Image
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu

In [4]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [5]:
# === PATHS ===
base_path = "/content/drive/MyDrive/UTS/SEM 4/DL/AT3/AT3-DL-image-captioning"
sys.path.append(os.path.join(base_path))
image_folder = os.path.join(base_path, "data/Flicker8k_Dataset")
text_folder = os.path.join(base_path, "data/Flickr8k_text")
processed_folder = os.path.join(base_path, "data/processed")

In [6]:
# === LOAD UTILS ===
from utils.dataloader import get_transforms, load_split_ids, build_caption_dataset
from utils.caption_dataset import CaptionDataset

In [7]:
# === LOAD VOCAB & SEQUENCES ===
with open(os.path.join(processed_folder, "word2idx.json"), "r") as f:
    word2idx = json.load(f)

with open(os.path.join(processed_folder, "image_caption_seqs.pkl"), "rb") as f:
    image_caption_seqs = pickle.load(f)

In [8]:
# === LOAD SPLITS & TRANSFORMS ===
train_ids = load_split_ids(os.path.join(text_folder, "Flickr_8k.trainImages.txt"))
val_ids   = load_split_ids(os.path.join(text_folder, "Flickr_8k.devImages.txt"))
test_ids  = load_split_ids(os.path.join(text_folder, "Flickr_8k.testImages.txt"))

transform_train = get_transforms("train")
transform_val   = get_transforms("val")

train_dataset = build_caption_dataset(train_ids, image_caption_seqs, word2idx, image_folder, transform_train)
val_dataset   = build_caption_dataset(val_ids, image_caption_seqs, word2idx, image_folder, transform_val)
test_dataset  = build_caption_dataset(test_ids, image_caption_seqs, word2idx, image_folder, transform_val)

In [9]:
# === ENCODER: DENSENET121 ===
from torchvision.models import densenet121

class DenseNetEncoder(nn.Module):
    def __init__(self, encoded_image_size=7, fine_tune=True):
        super(DenseNetEncoder, self).__init__()
        self.enc_image_size = encoded_image_size
        densenet = densenet121(pretrained=True)
        self.features = densenet.features
        self.pool = nn.AdaptiveAvgPool2d((encoded_image_size, encoded_image_size))
        self.fine_tune(fine_tune)

    def forward(self, images):
        features = self.features(images)
        out = self.pool(features)
        out = out.permute(0, 2, 3, 1)
        return out

    def fine_tune(self, fine_tune=True):
        for p in self.features.parameters():
            p.requires_grad = False
        if fine_tune:
            for c in list(self.features.children())[-4:]:
                for p in c.parameters():
                    p.requires_grad = True

In [10]:
# === ATTENTION ===
class Attention(nn.Module):
    def __init__(self, encoder_dim, hidden_dim, attention_dim):
        super(Attention, self).__init__()
        self.encoder_att = nn.Linear(encoder_dim, attention_dim)
        self.decoder_att = nn.Linear(hidden_dim, attention_dim)
        self.full_att = nn.Linear(attention_dim, 1)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, encoder_out, decoder_hidden):
        att1 = self.encoder_att(encoder_out)
        att2 = self.decoder_att(decoder_hidden).unsqueeze(1)
        att = self.full_att(self.relu(att1 + att2)).squeeze(2)
        alpha = self.softmax(att)
        context = (encoder_out * alpha.unsqueeze(2)).sum(dim=1)
        return context, alpha

In [11]:
# === DECODER ===
class DecoderRNNWithAttention(nn.Module):
    def __init__(self, attention_dim, embed_dim, hidden_dim, vocab_size, encoder_dim=1024, dropout=0.5):
        super().__init__()
        self.encoder_dim = encoder_dim
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.dropout = dropout
        self.attention = Attention(encoder_dim, hidden_dim, attention_dim)

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.dropout_layer = nn.Dropout(p=dropout)

        self.decode_step = nn.LSTMCell(embed_dim + encoder_dim, hidden_dim, bias=True)
        self.init_h = nn.Linear(encoder_dim, hidden_dim)
        self.init_c = nn.Linear(encoder_dim, hidden_dim)

        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, encoder_out, captions):
        batch_size = captions.size(0)
        max_len = captions.size(1)
        encoder_out = encoder_out.view(batch_size, -1, self.encoder_dim)

        # === DEBUG: Check for bad token indices ===
        if torch.max(captions) >= self.vocab_size:
          print("🚨 Invalid token detected in captions!")
          print("Max token index:", torch.max(captions).item(), ">= vocab_size:", self.vocab_size)
          print("Problematic captions:", captions)
          raise ValueError("Token index out of range for embedding.")

        embeddings = self.embedding(captions)
        h, c = self.init_hidden_state(encoder_out.mean(dim=1))
        outputs = torch.zeros(batch_size, max_len, self.vocab_size).to(captions.device)

        for t in range(max_len):
            context, _ = self.attention(encoder_out, h)
            lstm_input = torch.cat([embeddings[:, t, :], context], dim=1)
            h, c = self.decode_step(lstm_input, (h, c))
            preds = self.fc(self.dropout_layer(h))
            outputs[:, t, :] = preds

        return outputs

    def init_hidden_state(self, mean_encoder_out):
        h = self.init_h(mean_encoder_out)
        c = self.init_c(mean_encoder_out)
        return h, c

In [12]:
# === WRAPPER ===
class CaptioningModel(nn.Module):
    def __init__(self, encoder, decoder):
        super(CaptioningModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, images, captions):
        encoder_out = self.encoder(images)
        outputs = self.decoder(encoder_out, captions)
        return outputs

In [13]:
# === TRAINING LOOP ===
def train_model(model, train_dataset, val_dataset, word2idx, device='cuda',
                batch_size=32, epochs=20, patience=3, lr=1e-4):
    pad_idx = word2idx['<pad>']
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_losses = []
        tqdm_train = tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]")
        for images, captions, _ in tqdm_train:
            images, captions = images.to(device), captions.to(device)
            optimizer.zero_grad()
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), captions[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            tqdm_train.set_postfix(loss=np.mean(train_losses))

        avg_train_loss = np.mean(train_losses)

        # === VALIDATION ===
        model.eval()
        val_losses = []
        references = []
        hypotheses = []

        tqdm_val = tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]")
        with torch.no_grad():
            for images, captions, _ in tqdm_val:
                images, captions = images.to(device), captions.to(device)
                outputs = model(images, captions[:, :-1])
                loss = criterion(outputs.reshape(-1, outputs.size(-1)), captions[:, 1:].reshape(-1))
                val_losses.append(loss.item())

                preds = torch.argmax(outputs, dim=2)
                for ref, pred in zip(captions, preds):
                    ref_tokens = [w for w in ref.tolist() if w not in {pad_idx, word2idx['<start>'], word2idx['<end>']}]
                    pred_tokens = [w for w in pred.tolist() if w not in {pad_idx, word2idx['<start>'], word2idx['<end>']}]
                    references.append([ref_tokens])
                    hypotheses.append(pred_tokens)

        avg_val_loss = np.mean(val_losses)
        scheduler.step(avg_val_loss)

        bleu1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
        bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
        bleu3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0))
        bleu4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

        print(f"\nEpoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")
        print(f"BLEU-1 = {bleu1:.4f}, BLEU-2 = {bleu2:.4f}, BLEU-3 = {bleu3:.4f}, BLEU-4 = {bleu4:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), "best_model_densenet.pt")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

    print("Training complete.")
    return model

In [14]:
# === DEVICE SETUP ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [15]:
!nvidia-smi

Thu May 22 02:44:54 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   56C    P8             18W /   72W |       3MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [16]:
# === DEVICE SETUP ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [17]:
# === BUILD AND TRAIN MODEL ===
embed_dim = 256
hidden_dim = 512
attention_dim = 256
dropout = 0.5
#vocab_size = len(word2idx) + 1
vocab_size = max(word2idx.values()) + 2  # allows for max token ID 2989

encoder = DenseNetEncoder(encoded_image_size=7, fine_tune=True)
decoder = DecoderRNNWithAttention(attention_dim, embed_dim, hidden_dim, vocab_size, encoder_dim=1024, dropout=dropout)
model = nn.Sequential()  # Dummy fix to avoid Colab bug
model = CaptioningModel(encoder, decoder).to(device)

trained_model = train_model(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    word2idx=word2idx,
    device=device,
    batch_size=8,
    epochs=20,
    patience=3,
    lr=1e-4
)

Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /root/.cache/torch/hub/checkpoints/densenet121-a639ec97.pth
100%|██████████| 30.8M/30.8M [00:00<00:00, 185MB/s]
Epoch 1 [Train]: 100%|██████████| 3750/3750 [1:28:34<00:00,  1.42s/it, loss=4.15]
Epoch 1 [Val]: 100%|██████████| 625/625 [13:23<00:00,  1.29s/it]



Epoch 1: Train Loss = 4.1511, Val Loss = 3.5983
BLEU-1 = 0.3358, BLEU-2 = 0.1771, BLEU-3 = 0.0963, BLEU-4 = 0.0522


Epoch 2 [Train]: 100%|██████████| 3750/3750 [16:24<00:00,  3.81it/s, loss=3.41]
Epoch 2 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.24it/s]



Epoch 2: Train Loss = 3.4084, Val Loss = 3.2914
BLEU-1 = 0.3568, BLEU-2 = 0.1988, BLEU-3 = 0.1144, BLEU-4 = 0.0643


Epoch 3 [Train]: 100%|██████████| 3750/3750 [16:23<00:00,  3.81it/s, loss=3.13]
Epoch 3 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.26it/s]



Epoch 3: Train Loss = 3.1274, Val Loss = 3.1383
BLEU-1 = 0.3744, BLEU-2 = 0.2108, BLEU-3 = 0.1229, BLEU-4 = 0.0697


Epoch 4 [Train]: 100%|██████████| 3750/3750 [16:23<00:00,  3.81it/s, loss=2.95]
Epoch 4 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.25it/s]



Epoch 4: Train Loss = 2.9469, Val Loss = 3.0448
BLEU-1 = 0.3529, BLEU-2 = 0.1985, BLEU-3 = 0.1165, BLEU-4 = 0.0662


Epoch 5 [Train]: 100%|██████████| 3750/3750 [16:23<00:00,  3.81it/s, loss=2.81]
Epoch 5 [Val]: 100%|██████████| 625/625 [01:16<00:00,  8.21it/s]



Epoch 5: Train Loss = 2.8103, Val Loss = 2.9901
BLEU-1 = 0.3744, BLEU-2 = 0.2128, BLEU-3 = 0.1258, BLEU-4 = 0.0721


Epoch 6 [Train]: 100%|██████████| 3750/3750 [16:23<00:00,  3.81it/s, loss=2.7]
Epoch 6 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.28it/s]



Epoch 6: Train Loss = 2.6995, Val Loss = 2.9448
BLEU-1 = 0.3732, BLEU-2 = 0.2118, BLEU-3 = 0.1245, BLEU-4 = 0.0704


Epoch 7 [Train]: 100%|██████████| 3750/3750 [16:23<00:00,  3.81it/s, loss=2.61]
Epoch 7 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.27it/s]



Epoch 7: Train Loss = 2.6062, Val Loss = 2.9235
BLEU-1 = 0.3943, BLEU-2 = 0.2269, BLEU-3 = 0.1344, BLEU-4 = 0.0767


Epoch 8 [Train]: 100%|██████████| 3750/3750 [16:24<00:00,  3.81it/s, loss=2.52]
Epoch 8 [Val]: 100%|██████████| 625/625 [01:16<00:00,  8.22it/s]



Epoch 8: Train Loss = 2.5229, Val Loss = 2.8999
BLEU-1 = 0.3531, BLEU-2 = 0.2003, BLEU-3 = 0.1174, BLEU-4 = 0.0658


Epoch 9 [Train]: 100%|██████████| 3750/3750 [16:25<00:00,  3.81it/s, loss=2.45]
Epoch 9 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.25it/s]



Epoch 9: Train Loss = 2.4516, Val Loss = 2.8871
BLEU-1 = 0.2992, BLEU-2 = 0.1702, BLEU-3 = 0.1003, BLEU-4 = 0.0565


Epoch 10 [Train]: 100%|██████████| 3750/3750 [16:23<00:00,  3.81it/s, loss=2.39]
Epoch 10 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.26it/s]



Epoch 10: Train Loss = 2.3863, Val Loss = 2.8712
BLEU-1 = 0.2923, BLEU-2 = 0.1664, BLEU-3 = 0.0988, BLEU-4 = 0.0561


Epoch 11 [Train]: 100%|██████████| 3750/3750 [16:22<00:00,  3.82it/s, loss=2.32]
Epoch 11 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.28it/s]



Epoch 11: Train Loss = 2.3228, Val Loss = 2.8702
BLEU-1 = 0.3591, BLEU-2 = 0.2059, BLEU-3 = 0.1230, BLEU-4 = 0.0708


Epoch 12 [Train]: 100%|██████████| 3750/3750 [16:22<00:00,  3.82it/s, loss=2.27]
Epoch 12 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.29it/s]



Epoch 12: Train Loss = 2.2698, Val Loss = 2.8585
BLEU-1 = 0.3176, BLEU-2 = 0.1815, BLEU-3 = 0.1081, BLEU-4 = 0.0624


Epoch 13 [Train]: 100%|██████████| 3750/3750 [16:22<00:00,  3.82it/s, loss=2.22]
Epoch 13 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.26it/s]



Epoch 13: Train Loss = 2.2172, Val Loss = 2.8639
BLEU-1 = 0.3367, BLEU-2 = 0.1924, BLEU-3 = 0.1143, BLEU-4 = 0.0657


Epoch 14 [Train]: 100%|██████████| 3750/3750 [16:22<00:00,  3.82it/s, loss=2.17]
Epoch 14 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.31it/s]



Epoch 14: Train Loss = 2.1698, Val Loss = 2.8620
BLEU-1 = 0.2942, BLEU-2 = 0.1679, BLEU-3 = 0.1003, BLEU-4 = 0.0578


Epoch 15 [Train]: 100%|██████████| 3750/3750 [16:22<00:00,  3.82it/s, loss=2.12]
Epoch 15 [Val]: 100%|██████████| 625/625 [01:15<00:00,  8.27it/s]



Epoch 15: Train Loss = 2.1242, Val Loss = 2.8593
BLEU-1 = 0.3351, BLEU-2 = 0.1924, BLEU-3 = 0.1148, BLEU-4 = 0.0662
Early stopping triggered at epoch 15
Training complete.


In [18]:
# === SAVE FINAL MODEL ===
torch.save(model.state_dict(), os.path.join(base_path, "data/roy_densenet_model.pt"))
torch.save(model, os.path.join(base_path, "data/roy_densenet_model_full.pt"))

In [19]:
print("Max token ID in dataset:", max(max(seq) for seqs in image_caption_seqs.values() for seq in seqs))
print("Vocab size:", vocab_size)

Max token ID in dataset: 2989
Vocab size: 2991
