In [1]:
import os
import torch
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.models as models
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, random_split
from torch.utils.tensorboard import SummaryWriter
import torchvision.datasets as Datasets
import torchvision.transforms as transforms



In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
TRAIN_RATIO = 0.7
TEST_RATIO = 0.15
VAL_RATIO = 0.15

BATCH_SIZE = 32
WORKERS = 4
LEARNING_RATE=0.01
EMBED_SIZE = 256
HIDDEN_SIZE = 256
NUM_LAYERS = 1
EPOCHS = 2


In [29]:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size, train_CNN=False):
        super(EncoderCNN, self).__init__()
        self.train_CNN = train_CNN
        self.inception = models.inception_v3(weights=True)
        self.inception.fc = nn.Linear(self.inception.fc.in_features, embed_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.batchnorm = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        inception_outputs = self.inception(images)
        features = inception_outputs.logits
        # fine tune
        for name, param in self.inception.named_parameters():
            if "fc.weight" in name or "fc.bias" in name:
                param.requires_grad = True
            else:
                param.requires_grad = self.train_CNN
        return self.batchnorm(self.dropout(self.relu(features)))

In [30]:
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=40):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size, max_seq_length)
        self.dropout = nn.Dropout(0.5)

    def forward(self, features, captions):
        embeddings = self.dropout(self.embed(captions))
        # embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        embeddings = torch.cat((features.unsqueeze(0), embeddings), dim=0)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs


In [31]:
class CNNtoRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(CNNtoRNN, self).__init__()
        self.encoderCNN = EncoderCNN(embed_size).to(device)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)

    def forward(self, images, captions):
        features = self.encoderCNN(images)
        outputs = self.decoderRNN(features, captions)
        return outputs
    
    def caption_image(self, image, vocabulary, max_length=50):
        result_caption = []
        with torch.no_grad():
            x = self.encoderCNN(image).unsqueeze(0) # so that we have a dimention for batch
            states = None
            for _ in range(max_length):
                hiddens, states = self.decoderRNN.lstm(x, states)
                output = self.decoderRNN.linear(hiddens.squeeze(0))
                predicted = output.argmax(1) # take the word with the highest probability

                result_caption.append(predicted.item())
                x = self.decoderRNN.embed(predicted).unsqueeze(0)

                if vocabulary.itos[predicted.item()] == "<EOS>":
                    break
                
        return [vocabulary.itos[idx] for idx in result_caption]

In [32]:
def custom_tokenizer(text):
    # Define patterns for common tokens
    patterns = [
        r"\w+",            # Word tokens
        r"\d+",            # Numeric tokens
        r"\S+"             # Other tokens (non-whitespace)
    ]
    
    # Join patterns with the "|" operator to create a single regex pattern
    pattern = "|".join(patterns)
    
    # Use the regex pattern to tokenize the text
    tokens = re.findall(pattern, text)
    
    tokens = [token.lower() for token in tokens]
    
    return tokens


In [33]:
class Vocabulary:
    def __init__(self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold # Minimum frequency for a word to be included in the vocabulary

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.lower() for tok in custom_tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1

                else:
                    frequencies[word] += 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]

class FlickrDataset(Dataset):
    def __init__(self, root_dir, captions_file, transform=None, freq_threshold=5):
        self.root_dir = root_dir
        self.df = pd.read_csv(captions_file)
        self.transform = transform

        # Get img, caption columns
        self.imgs = self.df["image"]
        self.captions = self.df["caption"]

        # Initialize vocabulary and build vocab
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.captions.tolist())

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        caption = self.captions[index]
        img_id = self.imgs[index]
        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")

        if self.transform is not None:
            img = self.transform(img)

        numericalized_caption = [self.vocab.stoi["<SOS>"]]
        numericalized_caption += self.vocab.numericalize(caption)
        numericalized_caption.append(self.vocab.stoi["<EOS>"])

        return img, torch.tensor(numericalized_caption)

torch.Size([10, 3, 5])

In [42]:
class SelfCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        imgs = [item[0].unsqueeze(0) for item in batch]
        imgs = torch.cat(imgs, dim=0)
        targets = [item[1] for item in batch]
        targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)

        return imgs, targets

def get_loader(
    root_folder,
    annotation_file,
    transform,
    batch_size=BATCH_SIZE,
    num_workers=WORKERS,
    shuffle=True,
    pin_memory=True,
):
    dataset = FlickrDataset(root_folder, annotation_file, transform=transform)
    total_samples = len(dataset)

    pad_idx = dataset.vocab.stoi["<PAD>"]

    train_size = int(TRAIN_RATIO * total_samples)
    val_size = int(TEST_RATIO * total_samples)
    test_size = total_samples - (train_size + val_size)

    train_set, val_set, test_set = random_split(dataset, [train_size, val_size, test_size])

    train_loader = DataLoader(
        dataset=train_set,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=SelfCollate(pad_idx=pad_idx),
    )
    
    val_loader = DataLoader(
        dataset=val_set,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        pin_memory=pin_memory,
        collate_fn=SelfCollate(pad_idx=pad_idx),
    )
    
    test_loader = DataLoader(
        dataset=test_set,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        pin_memory=pin_memory,
        collate_fn=SelfCollate(pad_idx=pad_idx),
    )

    return train_loader, val_loader, test_loader, dataset

In [35]:
transform = transforms.Compose([
        transforms.Resize((356, 356)),
        transforms.RandomCrop((299, 299)), # inception v3 input size
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

train_loader, val_loader, test_loader, dataset = get_loader(
        "../Data/Images/", "../Data/captions.txt", transform=transform, num_workers=WORKERS
    )


In [36]:
vocab_size = len(dataset)
load_model = False
save_model = True
writer = SummaryWriter('runs/flicker')
step = 0

In [37]:
model = CNNtoRNN(embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, vocab_size=vocab_size, num_layers=NUM_LAYERS).to(device)



In [38]:
criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=0.0001)


In [39]:
for image, caption in train_loader:
    print(f'{image.shape=}')
    print(f'{caption.shape=}')
    output = model(image.to(device), caption.to(device)[:-1])
    print(f'{output.shape=}')
    # print(output)

    break

# torch.Size([32, 3, 299, 299])
# torch.Size([34, 32, 40455])

image.shape=torch.Size([32, 3, 299, 299])
caption.shape=torch.Size([27, 32])
output.shape=torch.Size([27, 32, 40455])


In [40]:
def train(model, train_loader, epochs, criterion, optimizer, scheduler):
    train_acc = 0.0
    train_loss = 0.0
    
    for epoch in tqdm(range(epochs), "Train: " ):
        total = 0
        running_loss = 0.0
        running_acc = 0.0
        print(f'Epoch: {epoch +1}')
        model.train()
        for idx, (images, captions) in enumerate(train_loader):
            images, captions = images.to(device), captions.to(device)

            optimizer.zero_grad()
            output = model(images, captions[:-1]) 
            # print(image.shape)
            print(output.shape)
            print(output.reshape(-1, output.shape[2]).shape)
            print(caption.shape)
            print(caption.reshape(-1).shape)
            loss = criterion(output.reshape(-1, output.shape[2]), caption.reshape(-1))
            loss.backward(loss)
            optimizer.step()

            running_loss += loss.item()
            total += caption.size(1)

        scheduler.step()
        print(f'Train Loss: {running_loss/total}')




In [41]:
train(model, train_loader, epochs=EPOCHS,  criterion=criterion, optimizer=optimizer, scheduler=scheduler)

Train:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 1
torch.Size([32, 3, 299, 299])
torch.Size([22, 32, 40455])
torch.Size([704, 40455])
torch.Size([27, 32])
torch.Size([864])


Train:   0%|          | 0/2 [00:00<?, ?it/s]


ValueError: Expected input batch_size (704) to match target batch_size (864).

In [None]:
def train(train_loader, val_loader, epochs, criterion, optimiser, scheduler):
    train_acc = 0.0
    train_loss = 0.0
    # valid_acc = 0.0
    # valid_loss = 0.0
    
    for epoch in tqdm(range(EPOCHS), f"Train:{epoch}"):
        total = 0
        running_loss = 0.0
        running_acc = 0.0

        model.train()
        for idx, (images, captions) in enumerate(train_loader):
            images, captions = images.to(device), captions.to(device)

            optimizer.zero_grad()
            output = model(images, captions[:-1]) # has to predict the last word

            loss = criterion(output.reshape(-1, output.shape[2]), caption.reshape(-1))
            loss.backward(loss)
            optimiser.step()

            running_loss += loss.item()
            total += caption.size(1)

        scheduler.step()

        print(f'Train Loss: {running_loss/total}')
        # count = 0
        # running_val_loss = 0.0
        # running_val_acc = 0.0
        
        # model.eval()
        # with torch.not_grad()
        #     for idx, (images, caption) in enumerate(val_loader):
        #         images, caption = images.to(device), caption.to(devices)

                

                




