In [1]:
!nvidia-smi

Fri Apr  9 11:43:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 450.66       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 2070    Off  | 00000000:26:00.0  On |                  N/A |
| 28%   34C    P0    56W / 215W |    979MiB /  7979MiB |     27%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
# Packages we need
import os
import torch
import random
import numpy as np
import transformers
import pandas as pd
from PIL import Image
import torch.nn as nn
import torchvision.transforms as T
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torchtext.data.metrics import bleu_score
from sklearn.model_selection import train_test_split
from typing import Callable, Optional
from matplotlib import pyplot as plt
from tqdm import tqdm

## For Reproducibility
def seed_everything(seed):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN so that it selects
    deterministic kernels.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Hash randomisation of the running interpreter is fixed at start-up, so
    # this only takes effect in child processes spawned afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may choose different kernels from run to run.
    torch.backends.cudnn.benchmark = False
# Fix all seeds before anything stochastic runs
seed_everything(42)

## Tokenizer
# Pretrained uncased BERT tokenizer, shared by the dataset and the metric below
tokenizer = transformers.BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

## Device Configuration
# Use the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Sanity check: confirms the cell ran and shows the selected device
print("Everything works", device, sep="\n")

ModuleNotFoundError: No module named 'torchtext'

In [None]:
# Locations of the Flickr30k download: the image folder and the caption CSV
# both live under the same dataset root.
data_dir = './data/flickr30k_images'
image_dir = data_dir + '/flickr30k_images'
csv_file = data_dir + '/results.csv'

In [None]:
# Row 19999 of the caption CSV is malformed, so its comment number and comment
# text are patched by hand right after loading.
df = pd.read_csv(csv_file, delimiter='|')
# Use .loc for the patch: chained indexing (df[col][row] = ...) may write to a
# temporary copy and is silently ignored under pandas copy-on-write.
df.loc[19999, ' comment_number'] = ' 4'
df.loc[19999, ' comment'] = ' A dog runs across the grass .'
# Turn bare file names into paths relative to the notebook
df['image_name'] = image_dir + '/' + df['image_name']

# Reshape from one row per caption to one row per image with 5 caption columns.
# The comment-number labels are read from the first five rows, which are
# assumed to hold the five distinct labels in order.
image_name = {
    'image_name': df.loc[df[' comment_number'] == df.loc[0, ' comment_number'], 'image_name'].values,
}
comments = {
    f'comment_{k}': df.loc[df[' comment_number'] == df.loc[k, ' comment_number'], ' comment'].values
    for k in range(5)
}

image_name_df = pd.DataFrame.from_dict(image_name)
comments_df = pd.DataFrame.from_dict(comments)

# Columns: image_name, comment_0 ... comment_4
df = pd.concat([image_name_df, comments_df], axis=1)
df.head(5)

In [None]:
## Hold out 20% of the images for testing; indexes are reset so each frame
## is addressable by position 0..n-1
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

## Carve a validation set (25% of the remainder) out of the training frame
train, val = (
    part.reset_index(drop=True)
    for part in train_test_split(train, test_size=0.25, random_state=42)
)

## Report the (rows, columns) of each split
print(train.shape, val.shape, test.shape, sep="\n")

In [None]:
class FlickrDataset(Dataset):
    """Image/caption dataset backed by a pandas DataFrame.

    Each row of ``data`` must hold the image path in its first column, named
    ``image_name``, followed by the caption columns.

    Args:
        data: DataFrame with one row per image.
        transforms: Callable applied to the PIL image. Any non-callable value
            (``None``, ``True``, ...) selects the default pipeline: resize to
            256x256, convert to tensor, normalise with mean 0.5 and std 0.5.

    Each item is a dict with:
        ``"image"``: the transformed image, moved to the global ``device``.
        ``"captions"``: 1-D tensor of token ids on ``device`` holding all
            captions of the image, each padded/truncated to 100 tokens and
            concatenated.
    """

    def __init__(self, data,
                 transforms: Optional[Callable] = None) -> None:
        self.data = data
        if callable(transforms):
            # Honour a caller-supplied pipeline instead of discarding it
            self.transforms = transforms
        else:
            self.transforms = T.Compose([
                T.Resize((256,256)),
                T.ToTensor(),
                T.Normalize(mean = [0.5], std = [0.5]),
            ])

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, i: int):
        # Positional row access: O(1) instead of scanning the frame for the name
        row = self.data.iloc[i]

        # Context manager closes the file handle once the pixels are converted
        with Image.open(row['image_name']) as handle:
            image = handle.convert('RGB')
        image = self.transforms(image)

        # Everything after the image_name column is a caption
        comments = row.iloc[1:].tolist()
        encoded_inputs = tokenizer(comments,
                            return_token_type_ids = False,
                            return_attention_mask = False,
                            max_length = 100,
                            padding = "max_length",
                            # Without truncation a caption longer than 100
                            # tokens makes the rows ragged and tensor creation fail
                            truncation = True,
                            return_tensors = "pt")

        sample = {"image":image.to(device),
                  "captions": encoded_inputs["input_ids"].flatten().to(device)}

        return sample

In [None]:
# drop_last=True discards the final, smaller batch so that every batch the
# loaders yield contains exactly batch_size samples
batch_size = 16

# One dataset per split, all built the same way
train_dataset, val_dataset, test_dataset = (
    FlickrDataset(frame, transforms = True) for frame in (train, val, test)
)

# One loader per dataset, sharing batch size and drop_last policy
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size = batch_size, drop_last = True)
    for dataset in (train_dataset, val_dataset, test_dataset)
)

In [None]:
# Visualize one training sample (fixed index 42) together with its captions.
# Fetch the sample once: every access re-reads the image and re-tokenizes.
preview_sample = train_dataset[42]
# Undo the dataset's Normalize(mean=0.5, std=0.5) so pixel values are back in
# [0, 1]; imshow clips float images outside that range.
preview_image = (preview_sample['image'].cpu() * 0.5 + 0.5).clamp(0, 1)
plt.imshow(preview_image.permute(1,2,0))
print(tokenizer.decode(preview_sample['captions'].cpu()))

In [None]:
# The CNN - Based on ResNet
class Encoder(nn.Module):
    """Frozen ResNet-50 feature extractor followed by a trainable projection.

    Args:
        embed_size: Size of the image embedding returned by ``forward``.
    """

    def __init__(self, embed_size):
        super(Encoder, self).__init__()
        model = models.resnet50(pretrained=True)
        # Freeze the resnet
        for param in model.parameters():
            param.requires_grad_(False)

        # Keep only the convolutional trunk: drop resnet's own pooling and
        # classifier and pool explicitly, so the flattened feature size is
        # always fc.in_features whatever the image resolution.
        modules = list(model.children())[:-2]
        self.model = nn.Sequential(*modules)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        # Trainable projection into the embedding space
        self.embed = nn.Linear(model.fc.in_features, embed_size)

    def forward(self, image):
        """Embed a batch of images.

        Args:
            image: Batch of image tensors accepted by ResNet-50.

        Returns:
            Tensor of shape (batch, embed_size).
        """
        features = self.model(image)
        features = self.pool(features)
        # (batch, channels, 1, 1) -> (batch, channels)
        features = features.view(features.size(0), -1)
        features = self.embed(features)

        return features

In [None]:
# The RNN
class Decoder(nn.Module):
    """Single-layer LSTM caption decoder conditioned on image features.

    The image features initialise the LSTM cell state, so their size must
    equal ``hidden_size``.

    Args:
        input_size: Size of the image feature vector. Stored for reference
            only; the LSTM consumes token embeddings of size ``embedding_dim``.
        hidden_size: LSTM hidden/cell state size.
        embedding_dim: Size of the token embeddings fed to the LSTM.
        vocab_size: Number of token ids; also the size of the output logits.
    """

    def __init__(self, input_size, hidden_size,
                embedding_dim, vocab_size):
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(num_embeddings = vocab_size,
                                     embedding_dim = embedding_dim)

        # The LSTM input is the embedded caption, so its width is
        # embedding_dim (identical to before whenever both sizes are equal).
        self.lstm = nn.LSTM(input_size = embedding_dim,
                           hidden_size = hidden_size,
                           batch_first = True)

        self.fc = nn.Linear(hidden_size, vocab_size)

    def init_hidden(self, features):
        """Build the initial LSTM state ``(h_0, c_0)`` for a batch.

        Args:
            features: Image features of shape (batch, hidden_size).

        Returns:
            Tuple ``(h_0, c_0)``, each of shape (1, batch, hidden_size):
            ``h_0`` is all zeros and ``c_0`` is the image features.
        """
        # Batch size, dtype and device all come from the features themselves,
        # so any batch size works and no module-level globals are needed.
        h0 = features.new_zeros(1, features.size(0), self.hidden_size)
        c0 = features.unsqueeze(0)
        return (h0, c0)

    def forward(self, features, captions):
        """Predict vocabulary logits for every caption position.

        Args:
            features: Image features of shape (batch, hidden_size).
            captions: Token ids of shape (batch, seq_len).

        Returns:
            Logits of shape (batch * seq_len, vocab_size).
        """
        state = self.init_hidden(features)
        embed = self.embedding(captions)
        lstm_out, state = self.lstm(embed, state)
        outputs = self.fc(lstm_out)
        # Flatten batch and time so the result can go straight into the loss
        outputs = outputs.reshape(-1, self.vocab_size)

        return outputs

In [None]:
# Targets are tokenizer ids, so the output layer needs exactly as many classes
# as the tokenizer has tokens (including any added ones).
vocab_size = len(tokenizer)
# One epoch = one pass over every batch the training loader yields
steps_per_epoch = len(train_loader)

encoder = Encoder(embed_size = 512).to(device)
decoder = Decoder(input_size = 512,
                  hidden_size = 512,
                  embedding_dim = 512,
                  vocab_size = vocab_size).to(device)

# Captions are padded to a fixed length; padding positions are excluded from
# the loss so they do not dominate it.
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id).to(device)
# Only the decoder and the encoder's projection layer are trained; the ResNet
# backbone stays frozen.
params = list(decoder.parameters()) + list(encoder.embed.parameters())

optimizer = torch.optim.Adam(params, lr = 0.001)

In [None]:
def accuracy(output, truth):
    """Corpus-level BLEU between greedy predictions and reference captions.

    Args:
        output: Logits with the vocabulary on the last axis. After the argmax
            it must hold exactly one prediction per position of ``truth``.
        truth: Reference token ids, either (batch, seq_len) or a single 1-D
            sequence.

    Returns:
        The score returned by torchtext's ``bleu_score`` for the batch.
    """
    truth = truth.detach().cpu()
    if truth.dim() == 1:
        # Treat a single sequence as a batch of one
        truth = truth.unsqueeze(0)
    # Greedy decoding: most likely token at every position, laid out like truth
    predicted = output.detach().argmax(dim=-1).cpu().reshape(truth.shape)

    def to_tokens(ids):
        # Drop [PAD]/[CLS]/[SEP] so padding does not count as matching n-grams
        return tokenizer.decode(ids.tolist(), skip_special_tokens=True).split()

    # bleu_score expects a list of token lists for the candidates and, per
    # candidate, a list of reference token lists.
    candidates = [to_tokens(ids) for ids in predicted]
    references = [[to_tokens(ids)] for ids in truth]
    return bleu_score(candidates, references)

In [None]:
# Per-step training history and (so far unused here) validation history
train_losses = []
train_accs = []
val_losses = []
val_accs = []

for epoch in range(10):
    # Make sure both networks are in training mode at the start of each epoch
    encoder.train()
    decoder.train()

    for i, sample in enumerate(train_loader):
        # Stop after exactly steps_per_epoch batches (indices 0 .. steps-1)
        if i >= steps_per_epoch:
            break

        # Get the info
        image, captions = sample['image'], sample['captions']

        # Teacher forcing: the decoder sees tokens [0, T-1) and has to predict
        # tokens [1, T). Feeding the token it must predict would let it copy.
        decoder_inputs = captions[:, :-1]
        targets = captions[:, 1:]

        # Zero grad (the optimizer holds every trainable parameter)
        optimizer.zero_grad()

        # Forward
        features = encoder(image)
        outputs = decoder(features, decoder_inputs)

        # Loss; the class count is read from the logits themselves
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))
        train_losses.append(loss.item())
        train_accs.append(accuracy(outputs, targets))

        loss.backward()
        optimizer.step()
    