In [2]:
import os
import torch
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.models as models
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import DataLoader, Dataset, random_split
from torch.utils.tensorboard import SummaryWriter
import torchvision.datasets as Datasets
import torchvision.transforms as transforms



In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
TRAIN_RATIO = 0.7
TEST_RATIO = 0.15
VAL_RATIO = 0.15

BATCH_SIZE = 32
WORKERS = 4
LEARNING_RATE=0.01
EMBED_SIZE = 256
HIDDEN_SIZE = 256
NUM_LAYERS = 1
EPOCHS = 20


In [5]:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size, encoded_image_size=14 , train_CNN=False):
        super(EncoderCNN, self).__init__()
        self.train_CNN = train_CNN

        # inception = models.inception_v3(weights=True)
        resnet101 = models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V2)
        modules = list(resnet101.children())[:-2]
        self.resnet = nn.Sequential(*modules)

        self.adaptive_pool = nn.AdaptiveAvgPool2d((encoded_image_size, encoded_image_size))
        self.fc = nn.Linear(encoded_image_size, embed_size)
        self.dropout = nn.Dropout(p=0.5)
        self.batchnorm = nn.BatchNorm1d(embed_size, momentum=0.01)
        self.fine_tune()

    def forward(self, images):
        out = self.resnet(images)
        out = self.adaptive_pool(out)
        # out = self.batchnorm(out)
        out = self.fc(out)
        return out
    
    def fine_tune(self, fine_tune=True):

        for p in self.resnet.parameters():
            p.requires_grad = False
            
        for c in list(self.resnet.children())[5:]:
            for p in c.parameters():
                p.requires_grad = fine_tune

        

In [11]:
resnet101 = models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V2)
modules = list(resnet101.children())[:-2]
ad = nn.AdaptiveAvgPool2d(1)
fc = nn.Linear(in_features=1, out_features=256)
batchnorm = nn.BatchNorm1d(256, momentum=0.01)
# print(modules)
inp = torch.randn(32,3,224,224)
inception =  nn.Sequential(*modules)
out = inception(inp)
out = ad(out)
out = out.reshape(out.shape[0], -1)
out = batchnorm(out)
# out = fc(out)
out.shape
# print(inception(inp))


RuntimeError: running_mean should contain 2048 elements not 256

In [14]:
inception = models.inception_v3(weights=True)
inception.fc = nn.Linear(inception.fc.in_features, 256)
relu = nn.ReLU()
dropout = nn.Dropout(0.5)
batchnorm = nn.BatchNorm1d(256, momentum=0.01)
inp = torch.randn(32,3,299,299)
o = inception(inp)
of = o.logits
o = batchnorm(dropout(relu(o)))
o



TypeError: relu(): argument 'input' (position 1) must be Tensor, not InceptionOutputs

In [46]:
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=40):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, features, captions):
        embeddings = self.dropout(self.embed(captions))
        embeddings = torch.cat((features.unsqueeze(0), embeddings), dim=0)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs


In [47]:
class CNNtoRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(CNNtoRNN, self).__init__()
        self.encoderCNN = EncoderCNN(embed_size).to(device)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)

    def forward(self, images, captions):
        features = self.encoderCNN(images)
        outputs = self.decoderRNN(features, captions)
        return outputs
    
    def caption_image(self, image, vocabulary, max_length=50):
        result_caption = []
        with torch.no_grad():
            x = self.encoderCNN(image).unsqueeze(0) # so that we have a dimention for batch
            states = None
            for _ in range(max_length):
                hiddens, states = self.decoderRNN.lstm(x, states)
                output = self.decoderRNN.linear(hiddens.squeeze(0))
                predicted = output.argmax(1) # take the word with the highest probability

                result_caption.append(predicted.item())
                x = self.decoderRNN.embed(predicted).unsqueeze(0)

                if vocabulary.itos[predicted.item()] == "<EOS>":
                    break
                
        return [vocabulary.itos[idx] for idx in result_caption]

In [48]:
def custom_tokenizer(text):
    # Define patterns for common tokens
    patterns = [
        r"\w+",            # Word tokens
        r"\d+",            # Numeric tokens
        r"\S+"             # Other tokens (non-whitespace)
    ]
    
    # Join patterns with the "|" operator to create a single regex pattern
    pattern = "|".join(patterns)
    
    # Use the regex pattern to tokenize the text
    tokens = re.findall(pattern, text)
    
    tokens = [token.lower() for token in tokens]
    
    return tokens


In [49]:
class Vocabulary:
    def __init__(self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold # Minimum frequency for a word to be included in the vocabulary

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.lower() for tok in custom_tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1

                else:
                    frequencies[word] += 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]
    
    def caption_len(self,text):
        tokenized_text = self.tokenizer_eng(text)
        return len(tokenized_text)
    

class FlickrDataset(Dataset):
    def __init__(self, root_dir, captions_file, transform=None, freq_threshold=3):
        self.root_dir = root_dir
        self.df = pd.read_csv(captions_file)
        self.transform = transform

        # Get img, caption columns
        self.imgs = self.df["image"]
        self.captions = self.df["caption"]

        # Initialize vocabulary and build vocab
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.captions.tolist())

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        caption = self.captions[index]
        img_id = self.imgs[index]
        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")

        if self.transform is not None:
            img = self.transform(img)
        caplen = self.vocab.caption_len(caption)
        numericalized_caption = [self.vocab.stoi["<SOS>"]]
        numericalized_caption += self.vocab.numericalize(caption)
        numericalized_caption.append(self.vocab.stoi["<EOS>"])

        return img, torch.tensor(numericalized_caption), torch.LongTensor([caplen])

In [50]:
class SelfCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        imgs = [item[0].unsqueeze(0) for item in batch]
        imgs = torch.cat(imgs, dim=0)
        targets = [item[1] for item in batch]
        targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)
        caplen = [item[2] for item in batch]

        return imgs, targets, caplen

def get_loader(
    root_folder,
    annotation_file,
    transform,
    batch_size=BATCH_SIZE,
    num_workers=WORKERS,
    shuffle=True,
    pin_memory=True,
):
    dataset = FlickrDataset(root_folder, annotation_file, transform=transform)
    total_samples = len(dataset)

    pad_idx = dataset.vocab.stoi["<PAD>"]

    train_size = int(TRAIN_RATIO * total_samples)
    val_size = int(TEST_RATIO * total_samples)
    test_size = total_samples - (train_size + val_size)

    train_set, val_set, test_set = random_split(dataset, [train_size, val_size, test_size])

    train_loader = DataLoader(
        dataset=train_set,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=SelfCollate(pad_idx=pad_idx),
    )
    
    val_loader = DataLoader(
        dataset=val_set,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        pin_memory=pin_memory,
        collate_fn=SelfCollate(pad_idx=pad_idx),
    )
    
    test_loader = DataLoader(
        dataset=test_set,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        pin_memory=pin_memory,
        collate_fn=SelfCollate(pad_idx=pad_idx),
    )

    return train_loader, val_loader, test_loader, dataset

In [51]:
transform = transforms.Compose([
        transforms.Resize((356, 356)),
        transforms.RandomCrop((224, 224)), 
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

train_loader, val_loader, test_loader, dataset = get_loader(
        "../Data/Images/", "../Data/captions.txt", transform=transform, num_workers=WORKERS
    )
vocab_size = len(dataset)


In [52]:
model = CNNtoRNN(embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, vocab_size=vocab_size, num_layers=NUM_LAYERS).to(device)

In [53]:
criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi['<PAD>']).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=0.0001)


In [67]:
for image, caption, caplen in train_loader:
    # print(f'{image.shape=}')
    # print(f'{caption.shape=}')
    # print(f'{ sum([l.item() for l in caplen])}')
    # cap = " ".join(dataset.vocab.itos[i] for i in caption.numpy()[:,-1])
    # print(cap)
    # output = model(image.to(device), caption.to(device)[:-1:])
    # print(f'{output.shape=}')
    embed = nn.Embedding(vocab_size, 256)
    o = embed(caption)
    print(o.shape)
    break

# torch.Size([32, 3, 299, 299])
# torch.Size([34, 32, 40455])

torch.Size([25, 32, 256])


In [61]:
def train(model, train_loader, epochs, criterion, optimizer, scheduler):
    train_loss = []
    
    for epoch in range(epochs):
        total_train_tokens = 0
        running_train_loss = 0.0
        running_acc = 0.0
        print(f'Epoch: {epoch +1}')
        model.train()
        for images, captions, caplen in tqdm(train_loader, desc="Train:\t"):
            images = images.to(device)
            captions = captions.to(device)
           
            output = model(images, captions[:-1]) 
           
            optimizer.zero_grad()
            loss = criterion(output.reshape(-1, output.shape[2]), captions.reshape(-1))
            loss.backward()
            optimizer.step()

            running_train_loss += loss.item()
            total_train_tokens +=  sum([l.item() for l in caplen])

        scheduler.step()

        model.eval()
        total_val_tokens = 0
        running_val_loss = 0.0
        with torch.no_grad():
            for images, captions, caplen in tqdm(val_loader, desc="Validate:\t"):
                images = images.to(device)
                captions = captions.to(device)

                output = model(images, captions[:-1])

                loss = criterion(output.reshape(-1, output.shape[2]), captions.reshape(-1))

                running_val_loss += loss.item()
                total_val_tokens +=  sum([l.item() for l in caplen])

        print(f'Train Loss: {running_train_loss/total_train_tokens}, Validation Loss: {running_val_loss/total_val_tokens}')

In [62]:
train(model, train_loader, epochs=EPOCHS, criterion=criterion, optimizer=optimizer, scheduler=scheduler)
print("Training Complete!!")

Epoch: 1


Train: 100%|██████████| 885/885 [00:40<00:00, 22.09it/s]
Validate:   0%|          | 0/190 [00:00<?, ?it/s]


AttributeError: 'Tensor' object has no attribute 'logits'