In [1]:
!pip install sacrebleu
!pip install gTTS
!pip install nltk
!pip install tqdm
!pip install gradio

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1
Collecting gTTS
  Downloading gTTS-2.

In [2]:
# Mount Google Drive at /content/drive. The captions file, image folders and model
# checkpoint referenced later in this notebook are all read from paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import os
import random
import nltk
from PIL import Image
import sacrebleu
from tqdm import tqdm

# Download the NLTK tokenizer data packages used by this notebook.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)
from nltk.tokenize import word_tokenize

# Pick the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using device: cpu


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [17]:
# Select the compute device and, on GPU runtimes, report which card is in use.
# `device` is created as a torch.device from the start: the earlier version kept it as a
# plain string while reading `device.index`, which raises AttributeError as soon as CUDA
# is actually available (str has no `.index`).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
if device.type == "cuda":
    # A torch.device without an explicit index refers to the current CUDA device.
    gpu_props = torch.cuda.get_device_properties(device)
    print(f"Device name: {gpu_props.name}")
    print(f"Device memory: {gpu_props.total_memory / 1024 ** 3} GB")

Using device: cpu


In [4]:
# Vocabulary class (as in the repository)
class Vocabulary:
    """Word <-> index lookup tables for caption text.

    Indices 0-3 are reserved for the special tokens <PAD>, <SOS>, <EOS> and <UNK>.
    Regular words are added by `build_vocabulary` once they have been seen
    `freq_threshold` times.
    """

    def __init__(self, freq_threshold):
        self.freq_threshold = freq_threshold
        # index -> token, seeded with the special tokens
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        # token -> index, the inverse of `itos`
        self.stoi = {token: index for index, token in self.itos.items()}

    def __len__(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Tokenize `text` with NLTK's word_tokenize and lowercase every token."""
        return list(map(lambda tok: tok.lower(), word_tokenize(text)))

    def build_vocabulary(self, sentence_list):
        """Count tokens over `sentence_list` and register each word the moment its
        count reaches `freq_threshold`, assigning indices sequentially from 4."""
        counts = {}
        next_index = 4
        for sentence in sentence_list:
            for token in self.tokenizer_eng(sentence):
                seen = counts.get(token, 0) + 1
                counts[token] = seen
                if seen != self.freq_threshold:
                    continue
                # The equality test fires exactly once per word, so no word is added twice.
                self.stoi[token] = next_index
                self.itos[next_index] = token
                next_index += 1

    def numericalize(self, text):
        """Convert `text` to a list of indices; unknown words map to the <UNK> index."""
        unk_index = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_index) for token in self.tokenizer_eng(text)]


# Custom Dataset with train/val/test splitting (for .txt file with CSV format)
class FlickrDataset(Dataset):
    """Image/caption dataset read from an `image,caption` text file, split by image.

    All captions belonging to one image are grouped together, and the train/val/test
    split is made over the list of distinct images, so an image never appears in more
    than one split.
    """

    def __init__(self, root_dir, captions_file, vocabulary, transform=None, split="train", split_ratio=(0.80, 0.08, 0.12)):
        """
        Args:
            root_dir: Directory with images.
            captions_file: Path to the .txt file with captions.
                           Expected format:
                           image,caption
                           1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .
                           ...
            vocabulary: A Vocabulary object.
            transform: Image transformations.
            split: One of 'train', 'val', or 'test'.
            split_ratio: Tuple for train/val/test splits. Only the first two entries
                         are used; the test split receives every remaining image.

        Raises:
            ValueError: if `split` is not 'train', 'val' or 'test'.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.vocabulary = vocabulary

        # Read with an explicit encoding so the result does not depend on the locale.
        with open(captions_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Skip header if present (assuming first line starts with "image").
        # The `lines and ...` guard keeps an empty file from raising IndexError.
        if lines and lines[0].strip().lower().startswith("image"):
            lines = lines[1:]

        # Map images to their captions, preserving file order.
        self.img2caps = {}
        for line in lines:
            line = line.strip()
            if not line:
                continue
            # Split at the FIRST comma only: captions themselves may contain commas.
            img_name, separator, caption = line.partition(',')
            if not separator:
                continue
            self.img2caps.setdefault(img_name.strip(), []).append(caption.strip())
        self.imgs = list(self.img2caps.keys())
        print("Total images found:", len(self.imgs))

        # Split dataset into train/val/test.
        # NOTE: this reseeds Python's global `random` module. Every split therefore sees
        # the same shuffled order, which is what keeps the three splits disjoint.
        random.seed(42)
        random.shuffle(self.imgs)
        total = len(self.imgs)
        train_end = int(split_ratio[0] * total)
        val_end = train_end + int(split_ratio[1] * total)

        if split == "train":
            self.imgs = self.imgs[:train_end]
        elif split == "val":
            self.imgs = self.imgs[train_end:val_end]
        elif split == "test":
            self.imgs = self.imgs[val_end:]
        else:
            # ValueError is a subclass of Exception, so existing `except Exception` callers still work.
            raise ValueError("split must be one of 'train', 'val', or 'test'")

        self.split = split

    def __len__(self):
        """Number of images in this split."""
        return len(self.imgs)

    def __getitem__(self, index):
        """Return `(image, caption_tensor)` for the image at `index`.

        The caption tensor is `<SOS>` + numericalized caption + `<EOS>`.
        """
        img_id = self.imgs[index]
        caps = self.img2caps[img_id]
        # For training, pick a random caption; for validation/testing, use the first caption
        caption = random.choice(caps) if self.split == "train" else caps[0]
        img_path = os.path.join(self.root_dir, img_id)
        # Close the underlying file as soon as the pixel data has been converted.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform:
            image = self.transform(image)
        # Add start and end tokens
        numericalized_caption = [self.vocabulary.stoi["<SOS>"]]
        numericalized_caption += self.vocabulary.numericalize(caption)
        numericalized_caption.append(self.vocabulary.stoi["<EOS>"])
        return image, torch.tensor(numericalized_caption)

# Collate function to pad sequences
def collate_fn(batch):
    """Combine `(image, caption)` samples into one batch.

    Images are stacked along a new leading dimension; captions are right-padded
    with index 0 to the length of the longest caption in the batch.
    """
    image_group, caption_group = zip(*batch)
    return (
        torch.stack(image_group, 0),
        pad_sequence(caption_group, batch_first=True, padding_value=0),
    )

# InceptionV3 expects 299x299 images.
# Preprocessing applied to every image: resize to 299x299, convert to a tensor, then
# normalize each of the three channels with mean 0.5 and std 0.5.
# This same `transform` object is used by the datasets (training) and by the inference
# cells, so any change here must be paired with retraining the saved checkpoint.
# NOTE(review): the encoder loads pretrained torchvision Inception V3 weights; confirm
# against the torchvision docs whether those weights expect ImageNet mean/std on input.
transform = transforms.Compose([
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Build vocabulary from the captions file (skip header if present)
def build_vocab(captions_file, freq_threshold):
    """Build a Vocabulary from every caption in an `image,caption` text file.

    Args:
        captions_file: Path to the captions file. A first line starting with
            "image" is treated as a header and skipped.
        freq_threshold: Passed to `Vocabulary`; a word is added once it has been
            seen this many times.

    Returns:
        The populated Vocabulary. For an empty file it contains only the special
        tokens (previously an empty file raised IndexError).

    Note:
        Captions of ALL images are used here, including those that later end up
        in the validation and test splits.
    """
    # Explicit encoding: do not depend on the platform's default locale.
    with open(captions_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Skip header line if it starts with "image" (guarded against an empty file).
    if lines and lines[0].strip().lower().startswith("image"):
        lines = lines[1:]
    captions = []
    for line in lines:
        # Split at the first comma only; blank lines and lines without a comma are ignored.
        _, separator, caption = line.strip().partition(',')
        if not separator:
            continue
        captions.append(caption.strip())
    # Named `vocabulary` rather than `vocab` to avoid shadowing the notebook-level global.
    vocabulary = Vocabulary(freq_threshold)
    vocabulary.build_vocabulary(captions)
    return vocabulary

# Dataset locations on the mounted Drive; edit these to point at your own copy.
captions_path = "/content/drive/MyDrive/Flickr/captions.txt"
images_root = "/content/drive/MyDrive/Flickr/Images"

# Build vocabulary; a word enters it once it has been seen `freq_threshold` times.
vocab = build_vocab(captions_path, freq_threshold=5)
print("Vocabulary size:", len(vocab))
num_workers = 0

# One dataset per split. Each construction re-reads the captions file and applies the
# same seeded shuffle, so the three splits are disjoint slices of one ordering.
train_dataset, val_dataset, test_dataset = (
    FlickrDataset(root_dir=images_root, captions_file=captions_path, vocabulary=vocab, transform=transform, split=split_name)
    for split_name in ("train", "val", "test")
)

# Matching loaders; only the training loader shuffles.
train_loader, val_loader, test_loader = (
    DataLoader(dataset=split_dataset, batch_size=32, shuffle=reshuffle, collate_fn=collate_fn, num_workers=num_workers)
    for split_dataset, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

print("Train, val, test sizes:", len(train_dataset), len(val_dataset), len(test_dataset))

Vocabulary size: 3006
Total images found: 8091
Total images found: 8091
Total images found: 8091
Train, val, test sizes: 6472 647 972


In [5]:
# Show the first 20 image file names assigned to the test split.
for img_name in test_dataset.imgs[:20]:
    print(img_name)

2363419943_717e6b119d.jpg
754852108_72f80d421f.jpg
1295669416_21cabf594d.jpg
3542418447_7c337360d6.jpg
135235570_5698072cd4.jpg
3375134059_7e9eb2ef01.jpg
358607894_5abb1250d3.jpg
1514957266_a19827c538.jpg
3372340429_91c4f4af30.jpg
2422018883_336519b5c6.jpg
2757803246_8aa3499d26.jpg
1419286010_b59af3962a.jpg
707941195_4386109029.jpg
3534548254_7bee952a0e.jpg
1176580356_9810d877bf.jpg
3053415073_5b667230ed.jpg
1691573772_1adef8e40e.jpg
3679341667_936769fd0c.jpg
2490365757_b869282cb3.jpg
2937497894_e3664a9513.jpg


In [6]:
from torchvision.models import Inception_V3_Weights

class EncoderCNN(nn.Module):
    """Image encoder: Inception V3 backbone, then a linear projection and batch norm.

    The attribute names `inception`, `linear` and `bn` are part of the checkpoint
    format (they appear in the saved state-dict keys) and must not be renamed.
    """

    def __init__(self, embed_size):
        super().__init__()
        # Default pretrained weights; aux_logits=True is required by the weights API.
        backbone = models.inception_v3(weights=Inception_V3_Weights.DEFAULT, aux_logits=True)
        # Swap the classifier head for an identity so the backbone yields features directly.
        backbone.fc = nn.Identity()
        self.inception = backbone
        # Project the 2048-dim backbone features down to the embedding size.
        self.linear = nn.Linear(2048, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Return one `embed_size`-dim feature vector per input image."""
        backbone_out = self.inception(images)
        if self.training:
            # In training mode the backbone returns a (main_output, aux_output) pair.
            pooled, _ = backbone_out
        else:
            pooled = backbone_out
        flat = pooled.view(pooled.size(0), -1)
        return self.bn(self.linear(flat))

# Decoder: LSTM-based decoder for caption generation (following the repository)
class DecoderRNN(nn.Module):
    """LSTM caption decoder that takes the image feature as its first input step.

    The attribute names `embed`, `lstm` and `linear` appear in saved state-dict keys
    and must not be renamed.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        # Kept so the module layout/repr is unchanged; forward() does not apply it.
        self.dropout = nn.Dropout(0.5)

    def forward(self, features, captions):
        """Teacher-forced pass.

        Args:
            features: image features, one `embed_size` vector per sample.
            captions: padded token-id matrix, one row per sample.

        Returns:
            Logits with one time step per caption position (the image feature
            replaces the dropped last caption column, so lengths match).
        """
        # Drop the last column of the (padded) captions before embedding ...
        embeddings = self.embed(captions[:, :-1])
        # ... and prepend the image feature as the first element of the sequence.
        embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        hiddens, _ = self.lstm(embeddings)
        return self.linear(hiddens)

    def sample_beam(self, features, beam_size=3, max_len=20, eos_token_id=None):
        """Beam search decoding for a single image.

        The image feature is fed as the first LSTM input; each later step feeds the
        embedding of the previously chosen token.

        Args:
            features: features of ONE image (batch dimension of size 1).
            beam_size: number of hypotheses kept per step. Coerced to int (UI sliders
                may pass floats) and clamped to [1, vocab_size]; 1 gives greedy decoding.
            max_len: maximum number of generated tokens. Coerced to int.
            eos_token_id: id that terminates a hypothesis. When None (default) it is
                read from the notebook-level `vocab`, which was the original behavior.

        Returns:
            The highest-scoring sequence as a list of token ids. Scores are summed
            log-probabilities with no length normalization.
        """
        if eos_token_id is None:
            eos_token_id = vocab.stoi["<EOS>"]
        vocab_size = self.linear.out_features
        # topk(k) requires an int with 1 <= k <= vocab_size.
        k = max(1, min(int(beam_size), vocab_size))
        max_len = int(max_len)

        # First step: run the LSTM on the image feature alone.
        inputs = features.unsqueeze(1)  # (1, 1, embed_size)
        output, states = self.lstm(inputs, None)
        log_probs = torch.log_softmax(self.linear(output.squeeze(1)), dim=1)  # (1, vocab_size)
        topk_log_probs, topk_indices = log_probs.topk(k)  # (1, k)

        # Each hypothesis is (token tensor, cumulative log-prob, LSTM state).
        sequences = [
            (topk_indices[0, i].unsqueeze(0), topk_log_probs[0, i].item(), states)
            for i in range(k)
        ]

        for _ in range(max_len - 1):
            all_candidates = []
            for seq, score, seq_states in sequences:
                # Finished hypotheses are carried forward unchanged.
                if seq[-1].item() == eos_token_id:
                    all_candidates.append((seq, score, seq_states))
                    continue

                # Advance the LSTM by one step from this hypothesis' last token.
                last_token = seq[-1].unsqueeze(0)  # (1,)
                emb = self.embed(last_token).unsqueeze(1)  # (1, 1, embed_size)
                output, new_states = self.lstm(emb, seq_states)
                log_probs = torch.log_softmax(self.linear(output.squeeze(1)), dim=1)
                topk_log_probs, topk_indices = log_probs.topk(k)

                # Expand this hypothesis with its k best next tokens.
                for i in range(k):
                    new_token = topk_indices[0, i].unsqueeze(0)
                    all_candidates.append((
                        torch.cat([seq, new_token]),
                        score + topk_log_probs[0, i].item(),
                        new_states,
                    ))

            # Keep the k best hypotheses overall.
            sequences = sorted(all_candidates, key=lambda cand: cand[1], reverse=True)[:k]

            # Stop early once every surviving hypothesis has ended.
            if all(seq[-1].item() == eos_token_id for seq, _, _ in sequences):
                break

        best_seq = sequences[0][0]
        return best_seq.tolist()

In [7]:
def evaluate(encoder, decoder, criterion, data_loader, device):
    """Compute the mean per-batch loss of the encoder/decoder pair over `data_loader`.

    Both models are switched to eval mode (and left there); no gradients are tracked.

    Args:
        encoder, decoder: the captioning model halves.
        criterion: loss taking `(N, vocab)` logits and `(N,)` targets.
        data_loader: iterable of `(images, captions)` batches.
        device: device the batches are moved to.

    Returns:
        Average of the batch losses as a float, or NaN when the loader yields no
        batches (previously this raised ZeroDivisionError).
    """
    encoder.eval()
    decoder.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        pbar = tqdm(data_loader, desc="Evaluating", leave=False)
        for images, captions in pbar:
            images, captions = images.to(device), captions.to(device)
            features = encoder(images)
            outputs = decoder(features, captions)
            # Flatten (batch, time, vocab) logits and (batch, time) targets for the criterion.
            loss = criterion(outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1))
            total_loss += loss.item()
            num_batches += 1
            pbar.set_postfix({"val_loss": loss.item()})
    if num_batches == 0:
        # Nothing to average, e.g. a validation split that rounded down to zero images.
        return float("nan")
    return total_loss / num_batches

def train(encoder, decoder, criterion, optimizer, train_loader, val_loader, num_epochs, device, save_path="Downloads/Flickr/best_model.pth"):
    """Train the encoder/decoder pair and checkpoint the best validation epoch.

    After each epoch the validation loss is computed with `evaluate`; whenever it
    improves on the best value so far, a checkpoint is written to `save_path`
    (overwriting the previous one).

    Args:
        encoder, decoder: models to train; both are moved to `device`.
        criterion: loss taking `(N, vocab)` logits and `(N,)` targets.
        optimizer: optimizer over whichever parameters should be updated.
        train_loader, val_loader: iterables of `(images, captions)` batches.
        num_epochs: number of passes over `train_loader`.
        device: device used for models and batches.
        save_path: checkpoint file. Missing parent directories are created.

    The checkpoint is a dict with keys 'encoder_state_dict', 'decoder_state_dict',
    'optimizer_state_dict', 'epoch' (0-based) and 'val_loss'.
    """
    encoder.to(device)
    decoder.to(device)
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        # evaluate() leaves the models in eval mode, so re-enable train mode every epoch.
        encoder.train()
        decoder.train()
        running_loss = 0.0
        num_batches = 0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
        for images, captions in pbar:
            images, captions = images.to(device), captions.to(device)
            features = encoder(images)
            outputs = decoder(features, captions)
            loss = criterion(outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
            pbar.set_postfix({"Loss": loss.item()})
        # An empty loader reports NaN instead of raising ZeroDivisionError.
        avg_train_loss = running_loss / num_batches if num_batches else float("nan")
        avg_val_loss = evaluate(encoder, decoder, criterion, val_loader, device)
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Save checkpoint if validation loss decreases.
        # (A NaN validation loss never compares as smaller, so it never triggers a save.)
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # torch.save does not create directories; without this, a missing folder
            # would only surface as an error after the first full epoch has run.
            checkpoint_dir = os.path.dirname(save_path)
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save({
                'encoder_state_dict': encoder.state_dict(),
                'decoder_state_dict': decoder.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch,
                'val_loss': avg_val_loss,
            }, save_path)
            print(f"Checkpoint saved at epoch {epoch+1} with val loss {avg_val_loss:.4f}")

In [None]:
# Hyperparameters (matching the original repo)
embed_size, hidden_size, num_layers = 256, 512, 1
learning_rate = 3e-4
num_epochs = 100
vocab_size = len(vocab)

# Build a fresh encoder/decoder pair.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

# Loss: positions holding the <PAD> index are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"])
# Optimize the decoder plus the encoder's projection and batch-norm layers only;
# the Inception backbone's parameters are not handed to the optimizer.
params = [
    *decoder.parameters(),
    *encoder.linear.parameters(),
    *encoder.bn.parameters(),
]
optimizer = torch.optim.Adam(params, lr=learning_rate)

# Run training.
# NOTE(review): checkpoints are written to "Downloads/Flickr/best_model.pth", while the
# later cells load "/content/drive/MyDrive/Flickr/best_model_100.pth" - confirm how the
# file gets from one location to the other.
train(
    encoder=encoder,
    decoder=decoder,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=num_epochs,
    device=device,
    save_path="Downloads/Flickr/best_model.pth",
)



Epoch [1/100], Train Loss: 4.7781, Val Loss: 3.8086
Checkpoint saved at epoch 1 with val loss 3.8086




Epoch [2/100], Train Loss: 3.5861, Val Loss: 3.4001
Checkpoint saved at epoch 2 with val loss 3.4001




Epoch [3/100], Train Loss: 3.3098, Val Loss: 3.1964
Checkpoint saved at epoch 3 with val loss 3.1964




Epoch [4/100], Train Loss: 3.1351, Val Loss: 3.0763
Checkpoint saved at epoch 4 with val loss 3.0763




Epoch [5/100], Train Loss: 3.0282, Val Loss: 2.9906
Checkpoint saved at epoch 5 with val loss 2.9906




Epoch [6/100], Train Loss: 2.9370, Val Loss: 2.9363
Checkpoint saved at epoch 6 with val loss 2.9363




Epoch [7/100], Train Loss: 2.8721, Val Loss: 2.8834
Checkpoint saved at epoch 7 with val loss 2.8834




Epoch [8/100], Train Loss: 2.7977, Val Loss: 2.8232
Checkpoint saved at epoch 8 with val loss 2.8232




Epoch [9/100], Train Loss: 2.7596, Val Loss: 2.7850
Checkpoint saved at epoch 9 with val loss 2.7850




Epoch [10/100], Train Loss: 2.7031, Val Loss: 2.7433
Checkpoint saved at epoch 10 with val loss 2.7433




Epoch [11/100], Train Loss: 2.6590, Val Loss: 2.7105
Checkpoint saved at epoch 11 with val loss 2.7105




Epoch [12/100], Train Loss: 2.6152, Val Loss: 2.6740
Checkpoint saved at epoch 12 with val loss 2.6740




Epoch [13/100], Train Loss: 2.5828, Val Loss: 2.6502
Checkpoint saved at epoch 13 with val loss 2.6502




Epoch [14/100], Train Loss: 2.5435, Val Loss: 2.6472
Checkpoint saved at epoch 14 with val loss 2.6472




Epoch [15/100], Train Loss: 2.5073, Val Loss: 2.6131
Checkpoint saved at epoch 15 with val loss 2.6131




Epoch [16/100], Train Loss: 2.4779, Val Loss: 2.6008
Checkpoint saved at epoch 16 with val loss 2.6008




Epoch [17/100], Train Loss: 2.4562, Val Loss: 2.5751
Checkpoint saved at epoch 17 with val loss 2.5751




Epoch [18/100], Train Loss: 2.4242, Val Loss: 2.5613
Checkpoint saved at epoch 18 with val loss 2.5613




Epoch [19/100], Train Loss: 2.4044, Val Loss: 2.5543
Checkpoint saved at epoch 19 with val loss 2.5543




Epoch [20/100], Train Loss: 2.3707, Val Loss: 2.5406
Checkpoint saved at epoch 20 with val loss 2.5406




Epoch [21/100], Train Loss: 2.3548, Val Loss: 2.5228
Checkpoint saved at epoch 21 with val loss 2.5228




Epoch [22/100], Train Loss: 2.3270, Val Loss: 2.5148
Checkpoint saved at epoch 22 with val loss 2.5148




Epoch [23/100], Train Loss: 2.2985, Val Loss: 2.5127
Checkpoint saved at epoch 23 with val loss 2.5127




Epoch [24/100], Train Loss: 2.2872, Val Loss: 2.4989
Checkpoint saved at epoch 24 with val loss 2.4989




Epoch [25/100], Train Loss: 2.2671, Val Loss: 2.4971
Checkpoint saved at epoch 25 with val loss 2.4971




Epoch [26/100], Train Loss: 2.2439, Val Loss: 2.4768
Checkpoint saved at epoch 26 with val loss 2.4768




Epoch [27/100], Train Loss: 2.2248, Val Loss: 2.4802




Epoch [28/100], Train Loss: 2.2093, Val Loss: 2.4760
Checkpoint saved at epoch 28 with val loss 2.4760




Epoch [29/100], Train Loss: 2.1933, Val Loss: 2.4715
Checkpoint saved at epoch 29 with val loss 2.4715




Epoch [30/100], Train Loss: 2.1654, Val Loss: 2.4618
Checkpoint saved at epoch 30 with val loss 2.4618




Epoch [31/100], Train Loss: 2.1576, Val Loss: 2.4574
Checkpoint saved at epoch 31 with val loss 2.4574




Epoch [32/100], Train Loss: 2.1222, Val Loss: 2.4559
Checkpoint saved at epoch 32 with val loss 2.4559




Epoch [33/100], Train Loss: 2.1181, Val Loss: 2.4583




Epoch [34/100], Train Loss: 2.1061, Val Loss: 2.4486
Checkpoint saved at epoch 34 with val loss 2.4486




Epoch [35/100], Train Loss: 2.0955, Val Loss: 2.4424
Checkpoint saved at epoch 35 with val loss 2.4424




Epoch [36/100], Train Loss: 2.0650, Val Loss: 2.4344
Checkpoint saved at epoch 36 with val loss 2.4344




Epoch [37/100], Train Loss: 2.0505, Val Loss: 2.4325
Checkpoint saved at epoch 37 with val loss 2.4325




Epoch [38/100], Train Loss: 2.0331, Val Loss: 2.4298
Checkpoint saved at epoch 38 with val loss 2.4298




Epoch [39/100], Train Loss: 2.0280, Val Loss: 2.4288
Checkpoint saved at epoch 39 with val loss 2.4288




Epoch [40/100], Train Loss: 2.0163, Val Loss: 2.4265
Checkpoint saved at epoch 40 with val loss 2.4265




Epoch [41/100], Train Loss: 1.9970, Val Loss: 2.4310




Epoch [42/100], Train Loss: 1.9834, Val Loss: 2.4253
Checkpoint saved at epoch 42 with val loss 2.4253




Epoch [43/100], Train Loss: 1.9675, Val Loss: 2.4219
Checkpoint saved at epoch 43 with val loss 2.4219




Epoch [44/100], Train Loss: 1.9541, Val Loss: 2.4209
Checkpoint saved at epoch 44 with val loss 2.4209




Epoch [45/100], Train Loss: 1.9268, Val Loss: 2.4184
Checkpoint saved at epoch 45 with val loss 2.4184




Epoch [46/100], Train Loss: 1.9209, Val Loss: 2.4194




Epoch [47/100], Train Loss: 1.9165, Val Loss: 2.4244




Epoch [48/100], Train Loss: 1.8985, Val Loss: 2.4265




Epoch [49/100], Train Loss: 1.8864, Val Loss: 2.4235




Epoch [50/100], Train Loss: 1.8723, Val Loss: 2.4356




Epoch [51/100], Train Loss: 1.8633, Val Loss: 2.4274




Epoch [52/100], Train Loss: 1.8378, Val Loss: 2.4259




Epoch [53/100], Train Loss: 1.8298, Val Loss: 2.4272




Epoch [54/100], Train Loss: 1.8191, Val Loss: 2.4263




Epoch [55/100], Train Loss: 1.8037, Val Loss: 2.4340




Epoch [56/100], Train Loss: 1.7912, Val Loss: 2.4326




Epoch [57/100], Train Loss: 1.7856, Val Loss: 2.4298




Epoch [58/100], Train Loss: 1.7668, Val Loss: 2.4378




Epoch [59/100], Train Loss: 1.7655, Val Loss: 2.4405




Epoch [60/100], Train Loss: 1.7457, Val Loss: 2.4416




Epoch [61/100], Train Loss: 1.7436, Val Loss: 2.4391




Epoch [62/100], Train Loss: 1.7226, Val Loss: 2.4421




Epoch [63/100], Train Loss: 1.7125, Val Loss: 2.4549




Epoch [64/100], Train Loss: 1.7049, Val Loss: 2.4582




Epoch [65/100], Train Loss: 1.6868, Val Loss: 2.4591




Epoch [66/100], Train Loss: 1.6805, Val Loss: 2.4489




Epoch [67/100], Train Loss: 1.6635, Val Loss: 2.4546




Epoch [68/100], Train Loss: 1.6574, Val Loss: 2.4577




Epoch [69/100], Train Loss: 1.6441, Val Loss: 2.4626




Epoch [70/100], Train Loss: 1.6349, Val Loss: 2.4683




Epoch [71/100], Train Loss: 1.6231, Val Loss: 2.4733




Epoch [72/100], Train Loss: 1.6097, Val Loss: 2.4803




Epoch [73/100], Train Loss: 1.5982, Val Loss: 2.4831




Epoch [74/100], Train Loss: 1.5821, Val Loss: 2.4825




Epoch [75/100], Train Loss: 1.5740, Val Loss: 2.4815




Epoch [76/100], Train Loss: 1.5645, Val Loss: 2.4916




Epoch [77/100], Train Loss: 1.5520, Val Loss: 2.4979




Epoch [78/100], Train Loss: 1.5501, Val Loss: 2.5077




Epoch [79/100], Train Loss: 1.5359, Val Loss: 2.4967




Epoch [80/100], Train Loss: 1.5338, Val Loss: 2.5003




Epoch [81/100], Train Loss: 1.5135, Val Loss: 2.5074




Epoch [82/100], Train Loss: 1.5035, Val Loss: 2.5228




Epoch [83/100], Train Loss: 1.4980, Val Loss: 2.5236




Epoch [84/100], Train Loss: 1.4946, Val Loss: 2.5196




Epoch [85/100], Train Loss: 1.4854, Val Loss: 2.5221




Epoch [86/100], Train Loss: 1.4657, Val Loss: 2.5403




Epoch [87/100], Train Loss: 1.4523, Val Loss: 2.5315




Epoch [88/100], Train Loss: 1.4528, Val Loss: 2.5343




Epoch [89/100], Train Loss: 1.4417, Val Loss: 2.5441




Epoch [90/100], Train Loss: 1.4204, Val Loss: 2.5574




Epoch [91/100], Train Loss: 1.4194, Val Loss: 2.5608




Epoch [92/100], Train Loss: 1.4120, Val Loss: 2.5652




Epoch [93/100], Train Loss: 1.4050, Val Loss: 2.5639




Epoch [94/100], Train Loss: 1.4018, Val Loss: 2.5622




Epoch [95/100], Train Loss: 1.3910, Val Loss: 2.5846




Epoch [96/100], Train Loss: 1.3790, Val Loss: 2.5774




Epoch [97/100], Train Loss: 1.3696, Val Loss: 2.5953




Epoch [98/100], Train Loss: 1.3598, Val Loss: 2.5948




Epoch [99/100], Train Loss: 1.3406, Val Loss: 2.6028


                                                                                                                       

Epoch [100/100], Train Loss: 1.3490, Val Loss: 2.5946




In [8]:
# Hyperparameters (matching the original repo)
# These must match the values the checkpoint was trained with, otherwise
# load_state_dict below fails on mismatched tensor shapes.
embed_size    = 256
hidden_size   = 512
num_layers    = 1
learning_rate = 3e-4
num_epochs    = 100
vocab_size    = len(vocab)

# Initialize encoder and decoder models
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

# Load the trained checkpoint into the freshly built models.
# (This cell only restores weights; no BLEU evaluation is performed here.)

# Load checkpoint
# Tensors are first mapped to CPU, then the models are moved to `device`.
checkpoint = torch.load("/content/drive/MyDrive/Flickr/best_model_100.pth", map_location=torch.device('cpu'))
encoder.load_state_dict(checkpoint['encoder_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])
encoder.to(device)
# Last expression of the cell: its return value (the decoder) is what the notebook displays.
decoder.to(device)

Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth
100%|██████████| 104M/104M [00:00<00:00, 122MB/s] 


DecoderRNN(
  (embed): Embedding(3006, 256)
  (lstm): LSTM(256, 512, batch_first=True)
  (linear): Linear(in_features=512, out_features=3006, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [11]:
import os
from PIL import Image

# Folder path containing test images
folder_path = "/content/drive/MyDrive/Flickr/test_images/"

# Dictionary to store captions for each file
captions_dict = {}

encoder.eval()
decoder.eval()

with torch.no_grad():
    # os.listdir() returns entries in arbitrary order; sort them so the printed output
    # (and the value `caption` is left holding afterwards) is the same on every run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith((".png", ".jpg", ".jpeg")):
            continue
        image_path = os.path.join(folder_path, filename)
        # Close the file handle as soon as the pixels have been converted.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = transform(image).unsqueeze(0).to(device)

        features = encoder(image_tensor)
        # Beam search with beam_size=1 keeps a single hypothesis, i.e. greedy decoding;
        # raise beam_size to explore more candidates.
        best_seq = decoder.sample_beam(features, beam_size=1, max_len=20)

        # Convert token IDs to words, filtering out special tokens
        caption_words = []
        for token_id in best_seq:
            word = vocab.itos[token_id]
            if word == "<EOS>":
                break
            if word in ["<SOS>", "<UNK>"]:
                continue
            caption_words.append(word)
        caption = " ".join(caption_words)
        captions_dict[filename] = caption
        print(f"{filename}: {caption}")

# Optionally, captions_dict now holds the mapping for further processing.

boat.png: a man is rowing a canoe down a river .
bus.png: a man in a black jacket and white hat is standing in front of a store .
child.jpg: a little girl in a pink dress is running through a field .
dog.jpg: a dog is running through the water .
horse.png: a man and a dog are standing on a rocky shore .


In [12]:
import os
import random
import torch
import gradio as gr
from PIL import Image
from gtts import gTTS

# ----------------------------------------------------------------
# Load Trained Model Checkpoint
# ----------------------------------------------------------------
# Rebuilds the encoder/decoder and restores their weights so the Gradio app below
# starts from a known state; `encoder` and `decoder` are rebound to the new objects.
checkpoint_path = "/content/drive/MyDrive/Flickr/best_model_100.pth"
# Architecture sizes must match the ones the checkpoint was trained with.
embed_size    = 256
hidden_size   = 512
num_layers    = 1
vocab_size    = len(vocab)

encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

# map_location loads the saved tensors directly onto the active device.
checkpoint = torch.load(checkpoint_path, map_location=device)
encoder.load_state_dict(checkpoint['encoder_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])
encoder.to(device)
decoder.to(device)
# Inference only from here on: switch both models to eval mode.
encoder.eval()
decoder.eval()

DecoderRNN(
  (embed): Embedding(3006, 256)
  (lstm): LSTM(256, 512, batch_first=True)
  (linear): Linear(in_features=512, out_features=3006, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [13]:
import os
import random
import torch
import gradio as gr
from PIL import Image
from gtts import gTTS
import base64
from io import BytesIO

# [Your existing model loading code here...]

def generate_captions_and_audio(files, beam_size, max_len=20):
    """Generate captions and audio for multiple images, returning HTML with embedded content.

    Args:
        files: a file path, a list of file paths, or None (treated as no files).
        beam_size: beam width for decoding; coerced to int because UI sliders may
            deliver floats.
        max_len: maximum caption length in tokens; coerced to int.

    Returns:
        An HTML string with one card per image (thumbnail, caption, audio player),
        wrapped in an outer <div>. With no files, only the empty wrapper is returned.
    """
    from html import escape  # local import: only this function needs it

    if files is None:
        files = []
    elif not isinstance(files, list):
        files = [files]

    beam_size = int(beam_size)
    max_len = int(max_len)

    html_output = ""
    with torch.no_grad():
        for file_path in files:
            # Process image (the file handle is closed once the pixels are converted)
            with Image.open(file_path) as raw_image:
                image = raw_image.convert("RGB")
            image_tensor = transform(image).unsqueeze(0).to(device)
            features = encoder(image_tensor)

            # Generate caption: stop at <EOS>, skip <SOS>/<UNK>
            token_ids = decoder.sample_beam(features, beam_size=beam_size, max_len=max_len)
            caption_words = []
            for token_id in token_ids:
                word = vocab.itos[token_id]
                if word == "<EOS>":
                    break
                if word not in ["<SOS>", "<UNK>"]:
                    caption_words.append(word)
            caption = " ".join(caption_words)

            # Generate audio. gTTS refuses empty text, so the player is simply omitted
            # when decoding produced no usable words.
            audio_html = ""
            if caption:
                tts = gTTS(text=caption, lang="en")
                audio_bytes = BytesIO()
                tts.write_to_fp(audio_bytes)
                audio_b64 = base64.b64encode(audio_bytes.getvalue()).decode()
                audio_html = f"""
                                    <audio controls style="margin-top: 0.5rem;">
                                        <source src="data:audio/mpeg;base64,{audio_b64}" type="audio/mpeg">
                                    </audio>"""

            # Convert image to base64
            buffered = BytesIO()
            image.save(buffered, format="JPEG")
            img_b64 = base64.b64encode(buffered.getvalue()).decode()

            # Create HTML block. The caption is escaped so vocabulary tokens containing
            # characters such as '<' or '&' cannot break the markup.
            html_output += f"""
                        <div style="margin: 1rem; padding: 1rem; border: 1px solid #ddd; border-radius: 8px;">
                            <div style="display: flex; gap: 1rem; align-items: center;">
                                <img src="data:image/jpeg;base64,{img_b64}" style="max-width: 200px; height: auto;"/>
                                <div>
                                    <p style="font-size: 18px; margin: 0.5rem 0;">
                                        <strong style="font-size: 16px;">Caption:</strong>
                                        <span style="font-size: 16px;">{escape(caption)}</span>
                                    </p>{audio_html}
                                </div>
                            </div>
                      </div>
                      """

    return f"<div style='margin: 1rem;'>{html_output}</div>"

# Create Gradio interface
# Components are created inside the `with` blocks in the order they should appear:
# four header lines, then one row holding the inputs and the button, then the output.
with gr.Blocks(title="Image Captioning with Audio") as interface:
    gr.Markdown("# Image Captioning with Audio")
    gr.Markdown("Upload Images to Generate Captions with Playable Audio. (Up to 10 Images)")
    gr.Markdown("Model used is InceptionV3 and Training Dataset is Flickr8k.")
    gr.Markdown("Beam Search is used as the Decoding Strategy.")

    with gr.Row():
        # Order matters: these are passed positionally to generate_captions_and_audio
        # as (files, beam_size, max_len).
        inputs = [
            gr.File(label="Upload Images", file_count="multiple", file_types=["image"], type="filepath"),
            gr.Slider(1, 10, step=1, value=3, label="Beam Size"),
            gr.Slider(10, 50, step=1, value=20, label="Max Caption Length")
        ]
        submit = gr.Button("Generate", variant="primary")

    output = gr.HTML(label="Results")

    # Validation function
    # Enforces the 1-10 image limit by raising gr.Error; returns the list unchanged otherwise.
    # NOTE(review): assumes `files` is a list here - confirm it can never be None.
    def validate_files(files):
        if len(files) < 1:
            raise gr.Error("Please upload at least 1 image!")
        if len(files) > 10:
            raise gr.Error("Maximum 10 images allowed!")
        return files

    # Add validation to file input: runs on the upload event of the file component and
    # writes the (unchanged) list back to that same component.
    inputs[0].upload(
        validate_files,
        inputs[0],
        inputs[0]
    )

    # Clicking "Generate" runs captioning + TTS and renders the returned HTML.
    submit.click(
        generate_captions_and_audio,
        inputs=inputs,
        outputs=output
    )

if __name__ == "__main__":
    interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d75a1f555efcf7e8fb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


# Extra Part

In [None]:
from gtts import gTTS
from IPython.display import Audio, display

# Suppose 'caption' is the generated caption from your inference cell.
# If it's not defined, set it manually for testing.
# NOTE: this cell relies on kernel state - `caption` is whatever an earlier cell last
# assigned to that name, so on a fresh kernel it fails with NameError unless the
# inference cell is run first or the line below is uncommented.
# caption = "A child in a pink dress is climbing up a set of stairs in an entry way."

# Convert text to speech
tts = gTTS(text=caption, lang='en')

# Save the audio file (written to the current working directory)
audio_filename = "caption_audio.mp3"
tts.save(audio_filename)

# Play the audio in the notebook
display(Audio(audio_filename, autoplay=True))