## Import Libraries


In [1]:
import torch
import torchvision as tv
import torch.nn as nn
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import torchvision.models as models
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader
import spacy
import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Prefer the GPU when CUDA is usable, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

## Load Data


In [None]:
# Root folder that holds ``captions.txt`` and the ``Images/`` directory.
BASE_DIRECTORY = "dataset"
# BASE_DIRECTORY = "/kaggle/input/flickr8k"

# One row per caption; the file provides an "image" and a "caption" column.
df = pd.read_csv(BASE_DIRECTORY + "/captions.txt")

# Full path of the image file each caption belongs to.
df["path"] = BASE_DIRECTORY + "/Images/" + df["image"]

# Use the file name without its extension as the image id.
df = df.rename(columns={"image": "id"})
# regex=False: replace the literal ".jpg". With regex matching, "." would
# match any character and could strip text such as "ajpg" from an id.
df["id"] = df["id"].str.replace(".jpg", "", regex=False)
df.head()

In [None]:
# Compare the number of caption rows with the number of distinct image ids.
unique_id_count = df["id"].nunique()
print("Dataframe shape:", df.shape)
print("Number of samples", unique_id_count)

### Build Dictionary


In [None]:
# Map every image id to its captions and its image path.
# Grouping by id (instead of stepping through the frame five rows at a time)
# works for any number of captions per image, does not depend on the rows of
# one image being adjacent, and does not rely on a default integer index.
# sort=False keeps the ids in their order of first appearance.
data = {}

for image_id, rows in df.groupby("id", sort=False):
    data[image_id] = {
        "captions": rows["caption"].tolist(),
        "path": rows["path"].iloc[0],
    }

keys = list(data.keys())

# Show the first entry as a sanity check.
key = keys[0]
value = data[key]

print(f"Key: {key}")
value

### Split the dictionary into train, test, and validation sets


In [None]:
def read_ids_from_file(filename):
    """Read one id per line from a text file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with the whitespace-stripped content of every non-blank line,
        in file order.
    """
    with open(filename, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines would otherwise become "" ids, which are not valid keys.
        return [image_id for image_id in stripped_lines if image_id]


# BASE_DIRECTORY = "/kaggle/input/id-files"

# Read the image ids that belong to each split.
train_keys, val_keys, test_keys = (
    read_ids_from_file(BASE_DIRECTORY + id_file)
    for id_file in ("/train_id.txt", "/val_id.txt", "/test_id.txt")
)

# Pick the caption/path record of every id out of the full dictionary.
train_data = dict((image_id, data[image_id]) for image_id in train_keys)
val_data = dict((image_id, data[image_id]) for image_id in val_keys)
test_data = dict((image_id, data[image_id]) for image_id in test_keys)

# Re-derive the key lists from the dictionaries so both always line up.
train_keys = list(train_data)
val_keys = list(val_data)
test_keys = list(test_data)

print("Training set size:", len(train_data))
print("Validation set size:", len(val_data))
print("Testing set size:", len(test_data))

## Read & Show Image Data


In [None]:
def read_image(path):
    """Load an image from disk as a 3-channel RGB PIL image.

    Args:
        path: Location of the image file.

    Returns:
        A fully loaded ``PIL.Image.Image`` in RGB mode. Converting here keeps
        grayscale or palette files compatible with the 3-channel
        normalisation applied by the transforms in this notebook.
    """
    # ``Image.open`` is lazy; converting inside the ``with`` block forces the
    # pixel data to load, and the file handle is closed on exit.
    with Image.open(path) as img:
        return img.convert("RGB")


def show_image(image):
    """Draw ``image`` with pyplot, hide the axes and show the figure."""
    plt.imshow(image)
    plt.gca().axis("off")
    plt.show()


def read_from_tensor(tensor, mean=None, std=None):
    """Display a channel-first image tensor with matplotlib.

    Args:
        tensor: Image tensor laid out as (channels, height, width). It may
            live on any device and may be tracking gradients.
        mean: Optional per-channel means. When given, they are added back
            before display to undo a ``Normalize`` step.
        std: Optional per-channel standard deviations. When given, the tensor
            is multiplied by them before ``mean`` is added back.

    Values are clipped to [0, 1] before plotting. With ``mean`` and ``std``
    left as ``None`` the tensor is shown as-is, so a normalised tensor will
    look colour-shifted.
    """
    # .numpy() requires a CPU tensor that is detached from the autograd graph.
    image = tensor.detach().cpu()
    if std is not None:
        image = image * torch.as_tensor(std, dtype=image.dtype).view(-1, 1, 1)
    if mean is not None:
        image = image + torch.as_tensor(mean, dtype=image.dtype).view(-1, 1, 1)

    # matplotlib expects (height, width, channels).
    img_numpy = image.permute(1, 2, 0).numpy()
    img_numpy = np.clip(img_numpy, 0, 1)
    plt.imshow(img_numpy)
    plt.axis("off")
    plt.show()

### Example of an image with captions


In [None]:
# Preview one training sample: the image first, then its five captions.
index = 0
sample = train_data[train_keys[index]]
show_image(read_image(sample["path"]))
for caption_idx in range(5):
    print(sample["captions"][caption_idx])

## Image Preprocessing


### Transformations


In [None]:
# Channel-wise normalisation shared by both pipelines.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
)

# Training pipeline: light random augmentation, then tensor conversion.
train_transform = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomResizedCrop(
            size=224, scale=(0.9, 1.0), ratio=(0.95, 1.05), antialias=True
        ),
        transforms.RandomRotation(degrees=10),
        transforms.ToTensor(),
        normalize,
    ]
)

# Validation/test pipeline: deterministic resize only, no augmentation.
val_test_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        normalize,
    ]
)

### Data to Tensor Conversion


In [None]:
# Load every image of each split, run it through the matching transform and
# stack the results into a single tensor per split.
# NOTE: the random training augmentations are drawn once here, because the
# transform is applied a single time per image at conversion time.
train_tensors = torch.stack(
    [train_transform(read_image(train_data[key]["path"])) for key in train_keys]
)

val_tensors = torch.stack(
    [val_test_transform(read_image(val_data[key]["path"])) for key in val_keys]
)

test_tensors = torch.stack(
    [val_test_transform(read_image(test_data[key]["path"])) for key in test_keys]
)


print("Training tensor shape:", train_tensors.shape)
print("Validation tensor shape:", val_tensors.shape)
print("Testing tensor shape:", test_tensors.shape)

### Display an image tensor with its captions


In [None]:
# Same sample as before, this time drawn from the pre-computed tensor.
read_from_tensor(train_tensors[index])
sample_captions = train_data[train_keys[index]]["captions"]
for caption_idx in range(5):
    print(sample_captions[caption_idx])

## Dataset & DataLoader


### Dataset


In [None]:
class ImageCaptionDataset(Dataset):
    """Pairs pre-computed image tensors with the captions of the same image.

    Args:
        data: Mapping from image id to a record holding a "captions" entry.
        keys: Sequence of image ids; position ``i`` belongs to ``tensors[i]``.
        tensors: Indexable collection of image tensors aligned with ``keys``.
    """

    def __init__(self, data, keys, tensors):
        self.data, self.keys, self.tensors = data, keys, tensors

    def __len__(self):
        """Number of samples, i.e. the number of image ids."""
        return len(self.keys)

    def __getitem__(self, index):
        """Return ``(image_tensor, captions)`` for the sample at ``index``."""
        image_id = self.keys[index]
        return self.tensors[index], self.data[image_id]["captions"]

In [None]:
# One dataset per split, each wrapping that split's records, ids and tensors.
train_dataset = ImageCaptionDataset(
    data=train_data, keys=train_keys, tensors=train_tensors
)
val_dataset = ImageCaptionDataset(data=val_data, keys=val_keys, tensors=val_tensors)
test_dataset = ImageCaptionDataset(
    data=test_data, keys=test_keys, tensors=test_tensors
)

# Inspect one training sample: raw tensor values, the image, and its captions.
X, y = train_dataset[index]
print(X)
read_from_tensor(X)
for caption_idx in range(5):
    print(y[caption_idx])

### DataLoader


In [None]:
# Batches of 32 samples loaded by 4 worker processes; only the training
# loader shuffles its samples.
train_dataloader = DataLoader(
    dataset=train_dataset, batch_size=32, shuffle=True, num_workers=4
)
val_dataloader = DataLoader(
    dataset=val_dataset, batch_size=32, shuffle=False, num_workers=4
)
test_dataloader = DataLoader(
    dataset=test_dataset, batch_size=32, shuffle=False, num_workers=4
)

## ResNet


In [46]:
class EncoderCNN(nn.Module):
    """Frozen pretrained ResNet-50 backbone followed by a trainable MLP head.

    The last two children of the torchvision ResNet-50 are dropped so the
    backbone returns a spatial feature map. That map is flattened and passed
    through two Linear+ReLU layers, giving a 512-dimensional vector per image.

    The first linear layer is sized for a 2048 x 7 x 7 feature map, which is
    what the backbone produces for the 224 x 224 inputs used in this notebook.
    Other input resolutions will not match ``fc1``.
    """

    def __init__(self):
        super(EncoderCNN, self).__init__()

        resnet = models.resnet50(pretrained=True)

        # Freeze the backbone weights; only fc1/fc2 are trainable.
        for param in resnet.parameters():
            param.requires_grad_(False)

        # Keep everything except the last two child modules.
        modules = list(resnet.children())[:-2]
        self.resnet = nn.Sequential(*modules)
        # A frozen backbone must also stay in eval mode. Otherwise its
        # BatchNorm layers keep updating their running statistics on every
        # forward pass, even though the weights receive no gradients.
        self.resnet.eval()

        self.fc1 = nn.Linear(2048 * 7 * 7, 1024)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(1024, 512)
        self.relu2 = nn.ReLU()

    def train(self, mode=True):
        """Switch the head between train/eval while the backbone stays in eval.

        Args:
            mode: ``True`` for training mode, ``False`` for evaluation mode.

        Returns:
            ``self``, matching ``nn.Module.train``.
        """
        super(EncoderCNN, self).train(mode)
        self.resnet.eval()
        return self

    def forward(self, images):
        """Encode a batch of images.

        Args:
            images: Float tensor of shape (batch, 3, 224, 224).

        Returns:
            Tensor of shape (batch, 512).
        """
        features = self.resnet(images)
        # Flatten the spatial feature map into one vector per image.
        features = features.view(features.size(0), -1)
        features = self.relu1(self.fc1(features))
        features = self.relu2(self.fc2(features))
        return features


# Build the encoder, move it to the selected device and print a layer summary
# for a single 3 x 224 x 224 input.
encoder = EncoderCNN()
encoder = encoder.to(device)
# torch.save(encoder, 'encoder_model.pth')
summary(encoder, (3, 224, 224))

torch.Size([2, 512])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-

## RNN Preprocessing


In [None]:
# Load spaCy's "en_core_web_sm" pipeline once at module level. In the code
# shown in this notebook only its tokenizer is used (see Vocab.tokenize).
nlp = spacy.load("en_core_web_sm")


class Vocab:
    """Word-level vocabulary built from caption strings.

    Attributes:
        vocab_map: token -> integer id. Ids 0-3 are reserved for the special
            tokens <PAD>, <SOS>, <EOS> and <UNK>.
        index_map: integer id -> token (inverse of ``vocab_map``).
        max_length: target length used by ``add_padd``.
        sequences: every raw caption string registered so far.
    """

    def __init__(self):
        self.vocab_map = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.index_map = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.max_length = 50
        self.sequences = []

    def add_sequence(self, sequence):
        """Register one raw caption string for a later ``build_vocab`` pass."""
        self.sequences.append(sequence)

    def build_vocab(self, train_data):
        """Collect the captions of ``train_data`` and index every new token.

        Args:
            train_data: Iterable of ``(image, captions)`` pairs, where
                ``captions`` is an iterable of caption strings.

        The method may be called more than once. New tokens always receive
        the next unused id, so existing entries are never overwritten.
        """
        for _, captions in train_data:
            for sequence in captions:
                self.add_sequence(sequence)

        for sequence in self.sequences:
            for word in self.tokenize(sequence):
                if word not in self.vocab_map:
                    # Ids are contiguous, so the current size is the next id.
                    index = len(self.vocab_map)
                    self.index_map[index] = word
                    self.vocab_map[word] = index

    def tokenize(self, sequence):
        """Split ``sequence`` with the spaCy tokenizer and lower-case tokens."""
        tokenized_sequence = [token.text.lower() for token in nlp.tokenizer(sequence)]
        return tokenized_sequence

    def add_padd(self, sequence):
        """Right-pad a list of token ids with <PAD> up to ``max_length``.

        Sequences that already have ``max_length`` or more items come back
        with their content unchanged (nothing is truncated).
        """
        sequence = sequence + [self.vocab_map["<PAD>"]] * (
            self.max_length - len(sequence)
        )
        return sequence

    def __len__(self):
        """Number of tokens in the vocabulary, special tokens included."""
        return len(self.vocab_map)

    def vectorize(self, sequence):
        """Convert a caption string into a list of token ids.

        Tokens missing from the vocabulary map to the <UNK> id. No <SOS> or
        <EOS> markers are added.
        """
        unk_id = self.vocab_map["<UNK>"]
        return [self.vocab_map.get(token, unk_id) for token in self.tokenize(sequence)]

## RNN


### Attention Mechanism


In [None]:
class Attention(nn.Module):
    """Additive (tanh) attention over a set of encoder feature vectors.

    Args:
        encoder_dim: Size of each encoder feature vector.
        decoder_dim: Size of the decoder hidden state.
        attention_dim: Size of the shared projection space.
    """

    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        super(Attention, self).__init__()
        self.encoder_att = nn.Linear(encoder_dim, attention_dim)
        self.decoder_att = nn.Linear(decoder_dim, attention_dim)
        self.full_att = nn.Linear(attention_dim, 1)
        self.softmax = nn.Softmax(dim=1)
        self.tanh = nn.Tanh()

    def forward(self, encoder_out, decoder_hidden):
        """Compute the attention-weighted summary of ``encoder_out``.

        Args:
            encoder_out: Tensor of shape (batch, num_pixels, encoder_dim).
            decoder_hidden: Tensor of shape (batch, decoder_dim). A stacked
                state of shape (num_layers, batch, decoder_dim) is also
                accepted; its last layer is used.

        Returns:
            A tuple ``(context, alpha)`` where ``context`` has shape
            (batch, encoder_dim) and ``alpha`` has shape (batch, num_pixels)
            and sums to 1 along dimension 1.
        """
        if decoder_hidden.dim() == 3:
            # Stacked per-layer state: attend with the top layer only.
            decoder_hidden = decoder_hidden[-1]

        att1 = self.encoder_att(encoder_out)  # (batch, num_pixels, attention_dim)
        att2 = self.decoder_att(decoder_hidden)  # (batch, attention_dim)
        # unsqueeze(1) broadcasts the decoder projection across the pixels
        # of the same batch element.
        att = self.full_att(self.tanh(att1 + att2.unsqueeze(1))).squeeze(2)
        alpha = self.softmax(att)
        attention_weighted_encoding = (encoder_out * alpha.unsqueeze(2)).sum(dim=1)
        return attention_weighted_encoding, alpha

### RNN Decoder


In [None]:
class DecoderRNN(nn.Module):
    """LSTM caption decoder that attends over encoder features at every step.

    Args:
        input_size: Size of each encoder feature vector.
        embed_size: Size of the token embeddings.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        vocab_size: Number of tokens in the vocabulary.
        attention_dim: Size of the attention projection space.
    """

    def __init__(
        self, input_size, embed_size, hidden_size, num_layers, vocab_size, attention_dim
    ):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Each step feeds the token embedding concatenated with the attention
        # context, so the LSTM input is embed_size + input_size wide.
        self.lstm = nn.LSTM(
            embed_size + input_size, hidden_size, num_layers, batch_first=False
        )
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.attention = Attention(
            encoder_dim=input_size, decoder_dim=hidden_size, attention_dim=attention_dim
        )
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise the embedding and output weights; zero the bias."""
        torch.nn.init.xavier_uniform_(self.fc.weight)
        torch.nn.init.xavier_uniform_(self.embedding.weight)
        self.fc.bias.data.fill_(0)

    def forward(self, features, captions):
        """Run the decoder over ground-truth caption tokens.

        Args:
            features: Encoder output of shape (batch, num_pixels, input_size).
            captions: Integer token ids of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, vocab_size). Position ``t`` holds
            the scores computed after feeding token ``t``. The loop stops one
            step early, so the last position is left as zeros.
        """
        embeddings = self.embedding(captions)
        batch_size = features.size(0)
        hidden = torch.zeros(
            self.num_layers, batch_size, self.hidden_size, device=features.device
        )
        cell = torch.zeros(
            self.num_layers, batch_size, self.hidden_size, device=features.device
        )
        outputs = torch.zeros(
            embeddings.size(0),
            embeddings.size(1),
            self.fc.out_features,
            device=features.device,
        )

        for t in range(embeddings.size(1) - 1):
            # Attend with the top layer's hidden state: (batch, hidden_size).
            attention_weighted_encoding, _ = self.attention(features, hidden[-1])
            lstm_input = torch.cat(
                (embeddings[:, t], attention_weighted_encoding), dim=1
            )
            # batch_first=False: add a sequence axis of length 1 so the input
            # is (seq_len=1, batch, features) and not an unbatched sequence.
            _, (hidden, cell) = self.lstm(lstm_input.unsqueeze(0), (hidden, cell))
            outputs[:, t, :] = self.fc(hidden[-1])
        return outputs

In [None]:
# Smoke test: push random inputs through the decoder and check the output shape.
embed_size = 300
hidden_size = 512
vocab_size = 100
encoder_dim = 2048

attention_dim = 256

batch_size = 2
num_pixels = 1
max_caption_length = 6

# Randomly generated input data
features = torch.randn(batch_size, num_pixels, encoder_dim).to(device)
captions = torch.randint(0, vocab_size, (batch_size, max_caption_length)).to(device)

# Make sure features are laid out as (batch_size, num_pixels, encoder_dim)
features = features.view(batch_size, num_pixels, -1)
print(features.shape)
# Initialize the model on the same device as its inputs; leaving it on the
# CPU while the inputs are on the GPU raises a device-mismatch error.
decoder = DecoderRNN(
    encoder_dim,
    embed_size,
    hidden_size,
    num_layers=1,
    vocab_size=vocab_size,
    attention_dim=attention_dim,
).to(device)

# Forward pass
outputs = decoder(features, captions)

print(
    "Output shape:", outputs.shape
)  # Expected: (batch_size, max_caption_length, vocab_size)
print("Outputs:", outputs)