In [1]:
import pandas as pd
import numpy as np
from collections import Counter 
import torchvision
from torchvision import transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch.autograd import Variable
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import math
import torch.nn.functional as F
import pickle
import gc
import random
pd.set_option('display.max_colwidth', None)
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [2]:
import cv2

## Read Data.

In [3]:
df = pd.read_csv("../input/image-caption/train_set_1 (1).csv")
print(len(df))
display(df.head(3))

9736


Unnamed: 0,image,Caption
0,110000.jpg,technician holding a snake removed from a house
1,110005.jpg,a city wears a gown for her wedding by the sea .
2,110006.jpg,fox can show us the skills necessary when it comes to handling people .


In [4]:
df2 = pd.read_csv("../input/image-caption/train_set_2 (1).csv")
print(len(df2))
display(df2.head(3))

6550


Unnamed: 0,image,Caption
0,132282.jpg,the stool is available in a range of colours
1,132283.jpg,"once i tried this homemade mayo , i never turned back ."
2,132289.jpg,"this is an edit , but she 'd look nice with that hair color ."


## Preprocessing -> Remove Single Character and non alpha Words. Add <Start>, <end> and <pad> tokens. <pad> token is appended such that length in max_seq_len (maximum length across all captions which is 33 in our case)  

In [5]:
def remove_single_char_word(word_list):
    lst = []
    for word in word_list:
        if len(word)>1:
            lst.append(word)

    return lst

In [6]:
df['cleaned_caption'] = df['Caption'].apply(lambda Caption : ['<start>'] + [word.lower() if word.isalpha() else '' for word in Caption.split(" ")] + ['<end>'])
df['cleaned_caption']  = df['cleaned_caption'].apply(lambda x : remove_single_char_word(x))

In [7]:
df2['cleaned_caption'] = df2['Caption'].apply(lambda Caption : ['<start>'] + [word.lower() if word.isalpha() else '' for word in Caption.split(" ")] + ['<end>'])
df2['cleaned_caption']  = df2['cleaned_caption'].apply(lambda x : remove_single_char_word(x))

In [8]:
df['seq_len'] = df['cleaned_caption'].apply(lambda x : len(x))
max_seq_len = df['seq_len'].max()
print(max_seq_len)

39


In [9]:
df2['seq_len'] = df2['cleaned_caption'].apply(lambda x : len(x))
max_seq_len = df2['seq_len'].max()
print(max_seq_len)

40


In [10]:
df.drop(['seq_len'], axis = 1, inplace = True)
df['cleaned_caption'] = df['cleaned_caption'].apply(lambda caption : caption + ['<pad>']*(max_seq_len-len(caption)) )

In [11]:
df2.drop(['seq_len'], axis = 1, inplace = True)
df2['cleaned_caption'] = df2['cleaned_caption'].apply(lambda caption : caption + ['<pad>']*(max_seq_len-len(caption)) )

In [12]:
display(df.head(2))

Unnamed: 0,image,Caption,cleaned_caption
0,110000.jpg,technician holding a snake removed from a house,"[<start>, technician, holding, snake, removed, from, house, <end>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>]"
1,110005.jpg,a city wears a gown for her wedding by the sea .,"[<start>, city, wears, gown, for, her, wedding, by, the, sea, <end>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>]"


In [13]:
display(df2.head(2))

Unnamed: 0,image,Caption,cleaned_caption
0,132282.jpg,the stool is available in a range of colours,"[<start>, the, stool, is, available, in, range, of, colours, <end>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>]"
1,132283.jpg,"once i tried this homemade mayo , i never turned back .","[<start>, once, tried, this, homemade, mayo, never, turned, back, <end>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>]"


## Create Vocab and mapping of token to ID

In [14]:
word_list = df['cleaned_caption'].apply(lambda x : " ".join(x)).str.cat(sep = ' ').split(' ')
word_dict = Counter(word_list)
word_dict =  sorted(word_dict, key=word_dict.get, reverse=True)

In [15]:
word_list2 = df2['cleaned_caption'].apply(lambda x : " ".join(x)).str.cat(sep = ' ').split(' ')
word_dict2 = Counter(word_list2)
word_dict2 =  sorted(word_dict2, key=word_dict2.get, reverse=True)

In [16]:
print(len(word_dict))
print(word_dict[:5])

8808
['<pad>', '<start>', '<end>', 'the', 'of']


In [17]:
print(len(word_dict2))
print(word_dict2[:5])

7434
['<pad>', '<start>', '<end>', 'the', 'of']


### Vocab size is 8360

In [18]:
vocab_size = len(word_dict)
print(vocab_size)

8808


In [19]:
vocab_size2 = len(word_dict2)
print(vocab_size2)

7434


In [20]:
index_to_word = {index: word for index, word in enumerate(word_dict)}
word_to_index = {word: index for index, word in enumerate(word_dict)}
print(len(index_to_word), len(word_to_index))

8808 8808


In [21]:
index_to_word2 = {index: word for index, word in enumerate(word_dict2)}
word_to_index2 = {word: index for index, word in enumerate(word_dict2)}
print(len(index_to_word2), len(word_to_index2))

7434 7434


### Covert sequence of tokens to IDs

In [22]:
df['text_seq']  = df['cleaned_caption'].apply(lambda caption : [word_to_index[word] for word in caption] )

In [23]:
df2['text_seq']  = df2['cleaned_caption'].apply(lambda caption : [word_to_index2[word] for word in caption] )

In [24]:
display(df.head(2))

Unnamed: 0,image,Caption,cleaned_caption,text_seq
0,110000.jpg,technician holding a snake removed from a house,"[<start>, technician, holding, snake, removed, from, house, <end>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>]","[1, 5150, 277, 2028, 3689, 15, 43, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,110005.jpg,a city wears a gown for her wedding by the sea .,"[<start>, city, wears, gown, for, her, wedding, by, the, sea, <end>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>]","[1, 44, 931, 372, 10, 35, 59, 16, 3, 176, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [25]:
display(df2.head(2))

Unnamed: 0,image,Caption,cleaned_caption,text_seq
0,132282.jpg,the stool is available in a range of colours,"[<start>, the, stool, is, available, in, range, of, colours, <end>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>]","[1, 3, 2816, 13, 785, 5, 374, 4, 1496, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,132283.jpg,"once i tried this homemade mayo , i never turned back .","[<start>, once, tried, this, homemade, mayo, never, turned, back, <end>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>, <pad>]","[1, 464, 1778, 12, 1272, 4023, 297, 1109, 88, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


## Split In Train and validation data. Same Image should not be present in both training and validation data 

In [26]:
df = df.sort_values(by = 'image')
train = df.iloc[:int(0.9*len(df))]
valid = df.iloc[int(0.9*len(df)):]

In [27]:
df2 = df2.sort_values(by = 'image')
train2 = df2.iloc[:int(0.9*len(df2))]
valid2 = df2.iloc[int(0.9*len(df2)):]

print(len(train), train['id'].nunique())


In [28]:
print(len(train), train['image'].nunique())
print(len(valid), valid['image'].nunique())

8762 8762
974 974


In [29]:
print(len(train2), train2['image'].nunique())
print(len(valid2), valid2['image'].nunique())

5895 5895
655 655


## Extract features from Images Using Resnet

In [30]:
train_samples = len(train)
print(train_samples)

8762


In [31]:
train_samples2 = len(train2)
print(train_samples2)

5895


In [32]:
unq_train_imgs = train[['image']].drop_duplicates()
unq_valid_imgs = valid[['image']].drop_duplicates()
print(len(unq_train_imgs), len(unq_valid_imgs))

8762 974


In [33]:
unq_train_imgs2 = train2[['image']].drop_duplicates()
unq_valid_imgs2 = valid2[['image']].drop_duplicates()
print(len(unq_train_imgs2), len(unq_valid_imgs2))

5895 655


In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [35]:
class extractImageFeatureResNetDataSet():
    def __init__(self, data):
        self.data = data 
        self.scaler = transforms.Resize([224, 224])
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
        self.to_tensor = transforms.ToTensor()
    def __len__(self):  
        return len(self.data)

    def __getitem__(self, idx):

        image_name = self.data.iloc[idx]['image']
        img_loc = '../input/image-caption/Train 1-20220219T110320Z-001/Train 1/'+str((image_name))

        img = Image.open(img_loc).convert('RGB')
        t_img = self.normalize(self.to_tensor(self.scaler(img)))

        return image_name, t_img

In [36]:
class extractImageFeatureResNetDataSet2():
    def __init__(self, data):
        self.data = data 
        self.scaler = transforms.Resize([224, 224])
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
        self.to_tensor = transforms.ToTensor()
    def __len__(self):  
        return len(self.data)

    def __getitem__(self, idx):

        image_name = self.data.iloc[idx]['image']
        img_loc = '../input/image-caption/Train 2-20220219T110322Z-001/Train 2/'+str((image_name))
        img = Image.open(img_loc).convert('RGB')
        t_img = self.normalize(self.to_tensor(self.scaler(img)))

        return image_name, t_img

In [37]:
train_ImageDataset_ResNet = extractImageFeatureResNetDataSet(unq_train_imgs)
train_ImageDataloader_ResNet = DataLoader(train_ImageDataset_ResNet, batch_size = 1, shuffle=False)

In [38]:
train_ImageDataset_ResNet2 = extractImageFeatureResNetDataSet2(unq_train_imgs2)
train_ImageDataloader_ResNet2 = DataLoader(train_ImageDataset_ResNet2, batch_size = 1, shuffle=False)

In [39]:
valid_ImageDataset_ResNet = extractImageFeatureResNetDataSet(unq_valid_imgs)
valid_ImageDataloader_ResNet = DataLoader(valid_ImageDataset_ResNet, batch_size = 1, shuffle=False)

In [40]:
valid_ImageDataset_ResNet2 = extractImageFeatureResNetDataSet2(unq_valid_imgs2)
valid_ImageDataloader_ResNet2 = DataLoader(valid_ImageDataset_ResNet2, batch_size = 1, shuffle=False)

In [41]:
resnet18 = torchvision.models.resnet18(pretrained=True).to(device)
resnet18.eval()
list(resnet18._modules)

Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

['conv1',
 'bn1',
 'relu',
 'maxpool',
 'layer1',
 'layer2',
 'layer3',
 'layer4',
 'avgpool',
 'fc']

In [42]:
resNet18Layer4 = resnet18._modules.get('layer4').to(device)

In [43]:
def get_vector(t_img):
    
    t_img = Variable(t_img)
    my_embedding = torch.zeros(1, 512, 7, 7)
    def copy_data(m, i, o):
        my_embedding.copy_(o.data)
    
    h = resNet18Layer4.register_forward_hook(copy_data)
    resnet18(t_img)
    
    h.remove()
    return my_embedding

In [44]:
extract_imgFtr_ResNet_train = {}
for image_name, t_img in tqdm(train_ImageDataloader_ResNet):
    t_img = t_img.to(device)
    embdg = get_vector(t_img)
    
    extract_imgFtr_ResNet_train[image_name[0]] = embdg
    

  0%|          | 0/8762 [00:00<?, ?it/s]

In [45]:
extract_imgFtr_ResNet_valid = {}
for image_name, t_img in tqdm(valid_ImageDataloader_ResNet):
    t_img = t_img.to(device)
    embdg = get_vector(t_img)
 
    extract_imgFtr_ResNet_valid[image_name[0]] = embdg

  0%|          | 0/974 [00:00<?, ?it/s]

In [46]:
a_file = open("./EncodedImageValidResNet.pkl", "wb")
pickle.dump(extract_imgFtr_ResNet_valid, a_file)
a_file.close()

In [47]:
a_file = open("./EncodedImageTrainResNet.pkl", "wb")
pickle.dump(extract_imgFtr_ResNet_train, a_file)
a_file.close()

In [48]:
extract_imgFtr_ResNet_train2 = {}
for image_name, t_img in tqdm(train_ImageDataloader_ResNet2):
    t_img = t_img.to(device)
    embdg = get_vector(t_img)
    
    extract_imgFtr_ResNet_train2[image_name[0]] = embdg
    

  0%|          | 0/5895 [00:00<?, ?it/s]

In [49]:
a_file2 = open("./EncodedImageTrainResNet2.pkl", "wb")
pickle.dump(extract_imgFtr_ResNet_train2, a_file2)
a_file2.close()

In [50]:
extract_imgFtr_ResNet_valid2 = {}
for image_name, t_img in tqdm(valid_ImageDataloader_ResNet2):
    t_img = t_img.to(device)
    embdg = get_vector(t_img)
 
    extract_imgFtr_ResNet_valid2[image_name[0]] = embdg

  0%|          | 0/655 [00:00<?, ?it/s]

In [51]:
a_file2 = open("./EncodedImageValidResNet2.pkl", "wb")
pickle.dump(extract_imgFtr_ResNet_valid2, a_file2)
a_file2.close()

## Create DataLoader which will be used to load data into Transformer Model.
## FlickerDataSetResnet will return caption sequence, 1 timestep left shifted caption sequence which model will predict and Stored Image features from ResNet.

In [52]:
class FlickerDataSetResnet():
    def __init__(self, data, pkl_file):
        self.data = data
        self.encodedImgs = pd.read_pickle(pkl_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
    
        caption_seq = self.data.iloc[idx]['text_seq']
        target_seq = caption_seq[1:]+[0]

        image_name   = self.data.iloc[idx]['image']
        image_tensor = self.encodedImgs[image_name]
        image_tensor = image_tensor.permute(0,2,3,1)
        image_tensor_view = image_tensor.view(image_tensor.size(0), -1, image_tensor.size(3))

        return torch.tensor(caption_seq), torch.tensor(target_seq), image_tensor_view

In [53]:
train_dataset_resnet = FlickerDataSetResnet(train, 'EncodedImageTrainResNet.pkl')
train_dataloader_resnet = DataLoader(train_dataset_resnet, batch_size = 32, shuffle=True)

In [54]:
valid_dataset_resnet = FlickerDataSetResnet(valid, 'EncodedImageValidResNet.pkl')
valid_dataloader_resnet = DataLoader(valid_dataset_resnet, batch_size = 32, shuffle=True)

In [55]:
train_dataset_resnet2 = FlickerDataSetResnet(train2, 'EncodedImageTrainResNet2.pkl')
train_dataloader_resnet2 = DataLoader(train_dataset_resnet2, batch_size = 32, shuffle=True)

In [56]:
valid_dataset_resnet2 = FlickerDataSetResnet(valid2, 'EncodedImageValidResNet2.pkl')
valid_dataloader_resnet2 = DataLoader(valid_dataset_resnet2, batch_size = 32, shuffle=True)

## Create Transformer Decoder Model. This Model will take caption sequence and the extracted resnet image features as input and ouput 1 timestep shifted (left) caption sequence. 
## In the Transformer decoder, lookAhead and padding mask has also been applied

### Position Embedding

In [57]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=max_seq_len):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        

    def forward(self, x):
        if self.pe.size(0) < x.size(0):
            self.pe = self.pe.repeat(x.size(0), 1, 1).to(device)
        self.pe = self.pe[:x.size(0), : , : ]
        
        x = x + self.pe
        return self.dropout(x)

## Transformer Decoder

In [58]:
class ImageCaptionModel(nn.Module):
    def __init__(self, n_head, n_decoder_layer, vocab_size, embedding_size):
        super(ImageCaptionModel, self).__init__()
        self.pos_encoder = PositionalEncoding(embedding_size, 0.1)
        self.TransformerDecoderLayer = nn.TransformerDecoderLayer(d_model =  embedding_size, nhead = n_head)
        self.TransformerDecoder = nn.TransformerDecoder(decoder_layer = self.TransformerDecoderLayer, num_layers = n_decoder_layer)
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(vocab_size , embedding_size)
        self.last_linear_layer = nn.Linear(embedding_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.last_linear_layer.bias.data.zero_()
        self.last_linear_layer.weight.data.uniform_(-initrange, initrange)

    def generate_Mask(self, size, decoder_inp):
        decoder_input_mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
        decoder_input_mask = decoder_input_mask.float().masked_fill(decoder_input_mask == 0, float('-inf')).masked_fill(decoder_input_mask == 1, float(0.0))

        decoder_input_pad_mask = decoder_inp.float().masked_fill(decoder_inp == 0, float(0.0)).masked_fill(decoder_inp > 0, float(1.0))
        decoder_input_pad_mask_bool = decoder_inp == 0

        return decoder_input_mask, decoder_input_pad_mask, decoder_input_pad_mask_bool

    def forward(self, encoded_image, decoder_inp):
        encoded_image = encoded_image.permute(1,0,2)
        

        decoder_inp_embed = self.embedding(decoder_inp)* math.sqrt(self.embedding_size)
        
        decoder_inp_embed = self.pos_encoder(decoder_inp_embed)
        decoder_inp_embed = decoder_inp_embed.permute(1,0,2)
        

        decoder_input_mask, decoder_input_pad_mask, decoder_input_pad_mask_bool = self.generate_Mask(decoder_inp.size(1), decoder_inp)
        decoder_input_mask = decoder_input_mask.to(device)
        decoder_input_pad_mask = decoder_input_pad_mask.to(device)
        decoder_input_pad_mask_bool = decoder_input_pad_mask_bool.to(device)
        

        decoder_output = self.TransformerDecoder(tgt = decoder_inp_embed, memory = encoded_image, tgt_mask = decoder_input_mask, tgt_key_padding_mask = decoder_input_pad_mask_bool)
        
        final_output = self.last_linear_layer(decoder_output)

        return final_output,  decoder_input_pad_mask


##  Train the Model

### The cross entropy loss has been masked at time steps where input token is <'pad'>.

In [59]:
EPOCH = 180

In [60]:
ictModel = ImageCaptionModel(16, 4, vocab_size, 512).to(device)
optimizer = torch.optim.Adam(ictModel.parameters(), lr = 0.00001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = 0.8, patience=2, verbose = True)
criterion = torch.nn.CrossEntropyLoss(reduction='none')
min_val_loss = np.float('Inf')

In [61]:
for epoch in tqdm(range(EPOCH)):
    total_epoch_train_loss = 0
    total_epoch_valid_loss = 0
    total_train_words = 0
    total_valid_words = 0
    ictModel.train()

    ### Train Loop
    for caption_seq, target_seq, image_embed in train_dataloader_resnet:

        optimizer.zero_grad()

        image_embed = image_embed.squeeze(1).to(device)
        caption_seq = caption_seq.to(device)
        target_seq = target_seq.to(device)

        output, padding_mask = ictModel.forward(image_embed, caption_seq)
        output = output.permute(1, 2, 0)

        loss = criterion(output,target_seq)

        loss_masked = torch.mul(loss, padding_mask)

        final_batch_loss = torch.sum(loss_masked)/torch.sum(padding_mask)

        final_batch_loss.backward()
        optimizer.step()
        total_epoch_train_loss += torch.sum(loss_masked).detach().item()
        total_train_words += torch.sum(padding_mask)

 
    total_epoch_train_loss = total_epoch_train_loss/total_train_words
  

    ### Eval Loop
    model = (torch.load("../input/imahecap/BestModel2"))
    model.eval()
    ictModel.eval()
    with torch.no_grad():
        for caption_seq, target_seq, image_embed in valid_dataloader_resnet:

            image_embed = image_embed.squeeze(1).to(device)
            caption_seq = caption_seq.to(device)
            target_seq = target_seq.to(device)

            output, padding_mask = ictModel.forward(image_embed, caption_seq)
            output = output.permute(1, 2, 0)

            loss = criterion(output,target_seq)

            loss_masked = torch.mul(loss, padding_mask)

            total_epoch_valid_loss += torch.sum(loss_masked).detach().item()
            total_valid_words += torch.sum(padding_mask)

    total_epoch_valid_loss = total_epoch_valid_loss/total_valid_words
  
    print("Epoch -> ", epoch," Training Loss -> ", total_epoch_train_loss.item(), "Eval Loss -> ", total_epoch_valid_loss.item() )
  
    if min_val_loss > total_epoch_valid_loss:
        print("Writing Model at epoch ", epoch)
        torch.save(ictModel, './BestModel')
        min_val_loss = total_epoch_valid_loss
  

    scheduler.step(total_epoch_valid_loss.item())

  0%|          | 0/180 [00:00<?, ?it/s]

Epoch ->  0  Training Loss ->  7.286264896392822 Eval Loss ->  6.536016941070557
Writing Model at epoch  0
Epoch ->  1  Training Loss ->  6.434105396270752 Eval Loss ->  6.226938247680664
Writing Model at epoch  1
Epoch ->  2  Training Loss ->  6.171115875244141 Eval Loss ->  6.046815395355225
Writing Model at epoch  2
Epoch ->  3  Training Loss ->  5.995855331420898 Eval Loss ->  5.9235310554504395
Writing Model at epoch  3
Epoch ->  4  Training Loss ->  5.866865158081055 Eval Loss ->  5.8386149406433105
Writing Model at epoch  4
Epoch ->  5  Training Loss ->  5.7616071701049805 Eval Loss ->  5.774142742156982
Writing Model at epoch  5
Epoch ->  6  Training Loss ->  5.669800281524658 Eval Loss ->  5.722581386566162
Writing Model at epoch  6
Epoch ->  7  Training Loss ->  5.585787773132324 Eval Loss ->  5.67850923538208
Writing Model at epoch  7
Epoch ->  8  Training Loss ->  5.507917881011963 Eval Loss ->  5.642116546630859
Writing Model at epoch  8
Epoch ->  9  Training Loss ->  5.43

In [62]:
model = (torch.load("../input/imahecap/BestModel2"))
model.eval()

ImageCaptionModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (TransformerDecoderLayer): TransformerDecoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
    )
    (multihead_attn): MultiheadAttention(
      (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
    )
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
    (dropout3): Dropout(p=0.1, inplace=False)
  )
  (TransformerDecoder): TransformerDecoder(
    (layers): Module

In [63]:
for epoch in tqdm(range(EPOCH)):
    total_epoch_train_loss = 0
    total_epoch_valid_loss = 0
    total_train_words = 0
    total_valid_words = 0
    ictModel.train()

    ### Train Loop
    for caption_seq, target_seq, image_embed in train_dataloader_resnet2:

        optimizer.zero_grad()

        image_embed = image_embed.squeeze(1).to(device)
        caption_seq = caption_seq.to(device)
        target_seq = target_seq.to(device)

        output, padding_mask = ictModel.forward(image_embed, caption_seq)
        output = output.permute(1, 2, 0)

        loss = criterion(output,target_seq)

        loss_masked = torch.mul(loss, padding_mask)

        final_batch_loss = torch.sum(loss_masked)/torch.sum(padding_mask)

        final_batch_loss.backward()
        optimizer.step()
        total_epoch_train_loss += torch.sum(loss_masked).detach().item()
        total_train_words += torch.sum(padding_mask)

 
    total_epoch_train_loss = total_epoch_train_loss/total_train_words
  

    ### Eval Loop
    ictModel.eval()
    with torch.no_grad():
        for caption_seq, target_seq, image_embed in valid_dataloader_resnet2:

            image_embed = image_embed.squeeze(1).to(device)
            caption_seq = caption_seq.to(device)
            target_seq = target_seq.to(device)

            output, padding_mask = ictModel.forward(image_embed, caption_seq)
            output = output.permute(1, 2, 0)

            loss = criterion(output,target_seq)

            loss_masked = torch.mul(loss, padding_mask)

            total_epoch_valid_loss += torch.sum(loss_masked).detach().item()
            total_valid_words += torch.sum(padding_mask)

    total_epoch_valid_loss = total_epoch_valid_loss/total_valid_words
  
    print("Epoch -> ", epoch," Training Loss -> ", total_epoch_train_loss.item(), "Eval Loss -> ", total_epoch_valid_loss.item() )
  
    if min_val_loss > total_epoch_valid_loss:
        print("Writing Model at epoch ", epoch)
        torch.save(ictModel, './BestModel')
        min_val_loss = total_epoch_valid_loss
  

    scheduler.step(total_epoch_valid_loss.item())

  0%|          | 0/180 [00:00<?, ?it/s]

Epoch ->  0  Training Loss ->  6.975173473358154 Eval Loss ->  7.07016134262085
Epoch ->  1  Training Loss ->  6.946278095245361 Eval Loss ->  7.045241832733154
Epoch ->  2  Training Loss ->  6.922368049621582 Eval Loss ->  7.024889945983887
Epoch ->  3  Training Loss ->  6.900909423828125 Eval Loss ->  7.00714111328125
Epoch ->  4  Training Loss ->  6.881504535675049 Eval Loss ->  6.991244316101074
Epoch ->  5  Training Loss ->  6.868131160736084 Eval Loss ->  6.97640323638916
Epoch ->  6  Training Loss ->  6.852450847625732 Eval Loss ->  6.96223783493042
Epoch ->  7  Training Loss ->  6.836029529571533 Eval Loss ->  6.9487223625183105
Epoch ->  8  Training Loss ->  6.827873706817627 Eval Loss ->  6.935393333435059
Epoch ->  9  Training Loss ->  6.809876918792725 Eval Loss ->  6.922924041748047
Epoch ->  10  Training Loss ->  6.803190231323242 Eval Loss ->  6.910210132598877
Epoch ->  11  Training Loss ->  6.789141654968262 Eval Loss ->  6.8980607986450195
Epoch ->  12  Training Loss 

In [64]:
model = torch.load('./BestModel')
start_token = word_to_index['<start>']
end_token = word_to_index['<end>']
pad_token = word_to_index['<pad>']
max_seq_len = 40
print(start_token, end_token, pad_token)
pridiction = []

1 2 0


In [65]:
dft = pd.read_csv("../input/image-caption/sample2.csv")

In [66]:
class extractImageFeatureResNetDataSett():
    def __init__(self, data):
        self.data = data 
        self.scaler = transforms.Resize([224, 224])
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
        self.to_tensor = transforms.ToTensor()
    def __len__(self):  
        return len(self.data)

    def __getitem__(self, idx):

        image_name = self.data.iloc[idx]['id']
        img_loc = '../input/image-caption/Test-20220219T111319Z-001/Test/'+str((image_name))

        img = Image.open(img_loc).convert("RGB")
        t_img = self.normalize(self.to_tensor(self.scaler(img)))

        return image_name, t_img

In [67]:
test_ImageDataset_ResNet = extractImageFeatureResNetDataSett(dft)
test_ImageDataloader_ResNet = DataLoader(test_ImageDataset_ResNet, batch_size = 1, shuffle=False)


In [68]:
extract_imgFtr_ResNet_test = {}
for image_name, t_img in tqdm(test_ImageDataloader_ResNet):
    t_img = t_img.to(device)
    embdg = get_vector(t_img)
    
    extract_imgFtr_ResNet_test[image_name[0]] = embdg

  0%|          | 0/3994 [00:00<?, ?it/s]

In [69]:
a_file = open("./testImageValidResNet.pkl", "wb")
pickle.dump(extract_imgFtr_ResNet_test, a_file)
a_file.close()

In [70]:
test_img_embed = pd.read_pickle('testImageValidResNet.pkl')

In [71]:
def generate_caption(k, img_nm): 

    model.eval() 
    img_embed = test_img_embed[img_nm].to(device)


    img_embed = img_embed.permute(0,2,3,1)
    img_embed = img_embed.view(img_embed.size(0), -1, img_embed.size(3))

    
    input_seq = [pad_token]*max_seq_len
    input_seq[0] = start_token

    input_seq = torch.tensor(input_seq).unsqueeze(0).to(device)
    predicted_sentence = []
    with torch.no_grad():
        for eval_iter in range(0, max_seq_len):

            output, padding_mask = model.forward(img_embed, input_seq)

            output = output[eval_iter, 0, :]

            values = torch.topk(output, k).values.tolist()
            indices = torch.topk(output, k).indices.tolist()

            next_word_index = random.choices(indices, values, k = 1)[0]

            next_word = index_to_word[next_word_index]

            input_seq[:, :eval_iter+1] = next_word_index


            if next_word == '<end>' :
                break

            predicted_sentence.append(next_word)
    listToStr = ' '.join(map(str,predicted_sentence))
    print("\n")
    print(listToStr)
    pridiction.append(listToStr)

In [72]:
let = len(dft)
print(let)

3994


In [73]:
df2 = pd.read_csv("../input/image-caption/sample.csv")


In [74]:
for i in range(let):
    generate_caption(1, dft.iloc[i]['id'])



view of on


how to this is in


the is in


the of


the is in


the is in


the is in


the is in forest on


the is with on


the is in garden in


person in garden in


person in


the is and are and


person in home in


actor at


the is in


person in


the of


the is in


the of


the is in


person in


person in


the of on background with on


the of on


the is in


this is with and on


person in


this is in


the of


the is in


this is of


the is just


this is in


the is in


the of


the is with on


person in


person in


person in


person in


the of house by


the is for


person in


the is in


actor with on


the is in


the is with and


the is with and in


person in


the is with and in


the is of


this is with and


the is in


this is with and


person in


this is with and


person in


this is with and table


the is with and on


person in


the is in


the is in


city over city over


the of and in


the is in


the of


the is in


the is wi

In [75]:
print(pridiction)

['view of on', 'how to this is in', 'the is in', 'the of', 'the is in', 'the is in', 'the is in', 'the is in forest on', 'the is with on', 'the is in garden in', 'person in garden in', 'person in', 'the is and are and', 'person in home in', 'actor at', 'the is in', 'person in', 'the of', 'the is in', 'the of', 'the is in', 'person in', 'person in', 'the of on background with on', 'the of on', 'the is in', 'this is with and on', 'person in', 'this is in', 'the of', 'the is in', 'this is of', 'the is just', 'this is in', 'the is in', 'the of', 'the is with on', 'person in', 'person in', 'person in', 'person in', 'the of house by', 'the is for', 'person in', 'the is in', 'actor with on', 'the is in', 'the is with and', 'the is with and in', 'person in', 'the is with and in', 'the is of', 'this is with and', 'the is in', 'this is with and', 'person in', 'this is with and', 'person in', 'this is with and table', 'the is with and on', 'person in', 'the is in', 'the is in', 'city over city ov

In [76]:
df2["Caption"] = pridiction
df2 = df2[["id", "Caption"]]
df2.to_csv("submission.csv", index=False)
df2.head()


Unnamed: 0,id,Caption
0,140809,view of on
1,140810,how to this is in
2,140811,the is in
3,140813,the of
4,140814,the is in


## Lets Generate Captions !!!

start_token = word_to_index['<start>']
end_token = word_to_index['<end>']
pad_token = word_to_index['<pad>']
max_seq_len = 33
print(start_token, end_token, pad_token)

valid_img_embed = pd.read_pickle('EncodedImageValidResNet.pkl')

### Here in the below function,we are generating caption in beam search. K defines the topK token to look at each time step

def generate_caption(K, img_nm): 
    img_loc = '../input/flickr8k/Images/'+str(img_nm)
    image = Image.open(img_loc).convert("RGB")
    plt.imshow(image)

    model.eval() 
    valid_img_df = valid[valid['image']==img_nm]
    print("Actual Caption : ")
    print(valid_img_df['caption'].tolist())
    img_embed = valid_img_embed[img_nm].to(device)


    img_embed = img_embed.permute(0,2,3,1)
    img_embed = img_embed.view(img_embed.size(0), -1, img_embed.size(3))


    input_seq = [pad_token]*max_seq_len
    input_seq[0] = start_token

    input_seq = torch.tensor(input_seq).unsqueeze(0).to(device)
    predicted_sentence = []
    with torch.no_grad():
        for eval_iter in range(0, max_seq_len):

            output, padding_mask = model.forward(img_embed, input_seq)

            output = output[eval_iter, 0, :]

            values = torch.topk(output, K).values.tolist()
            indices = torch.topk(output, K).indices.tolist()

            next_word_index = random.choices(indices, values, k = 1)[0]

            next_word = index_to_word[next_word_index]

            input_seq[:, eval_iter+1] = next_word_index


            if next_word == '<end>' :
                break

            predicted_sentence.append(next_word)
    print("\n")
    print("Predicted caption : ")
    print(" ".join(predicted_sentence+['.']))

### 1st Example 

generate_caption(1, unq_valid_imgs.iloc[50]['image'])

generate_caption(2, unq_valid_imgs.iloc[50]['image'])

### 2nd Example

generate_caption(1, unq_valid_imgs.iloc[100]['image'])

generate_caption(2, unq_valid_imgs.iloc[100]['image'])

### 3rd Example

generate_caption(1, unq_valid_imgs.iloc[500]['image'])

generate_caption(2, unq_valid_imgs.iloc[500]['image'])

### 4rth Example

generate_caption(1, unq_valid_imgs.iloc[600]['image'])

generate_caption(2, unq_valid_imgs.iloc[600]['image'])

## Thanks for going through the whole work. Please do upvote the notebook if you liked it.