In [1]:
!nvidia-smi

Thu Apr  8 19:57:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 450.66       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 2070    Off  | 00000000:26:00.0  On |                  N/A |
| 31%   35C    P5    36W / 215W |    950MiB /  7979MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [17]:
# Packages we need
import os
import torch
import random
import numpy as np
import transformers
import pandas as pd
from PIL import Image
import torch.nn as nn
import torchvision.transforms as T
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from typing import Callable, Optional


## For Reproducibility
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy's global generator, and PyTorch
    (CPU plus every visible CUDA device), and configures cuDNN to prefer
    deterministic kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for completeness; string hashing is only affected in Python
    # processes launched after this point, not in the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # NumPy's global generator was previously left unseeded.
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run, so switch it off.
    torch.backends.cudnn.benchmark = False
seed_everything(42)

## Tokenizer
# Pretrained uncased BERT tokenizer, configured to lower-case its input.
tokenizer = transformers.BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

## Device Configuration
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print("Everything works")
print(device)

Everything works
cuda


In [8]:
# Dataset locations: the image folder and the caption CSV both live under data_dir.
data_dir = './data/flickr30k_images'
image_dir = '/'.join((data_dir, 'flickr30k_images'))
csv_file = '/'.join((data_dir, 'results.csv'))

In [13]:
# Load the '|'-delimited caption table. Column names are used exactly as they
# appear in the file header, including the leading space in ' comment_number'
# and ' comment'.
df = pd.read_csv(csv_file, delimiter='|')

# Row 19999 of this dataset is malformed, so both of its caption fields are
# overwritten by hand. `.loc[row, column]` writes straight into `df`; the
# chained form `df[column][row] = value` can end up assigning to a temporary
# copy (SettingWithCopyWarning, and a silent no-op under pandas copy-on-write).
df.loc[19999, ' comment_number'] = ' 4'
df.loc[19999, ' comment'] = ' A dog runs across the grass .'

# Turn bare file names into paths under the image directory.
df['image_name'] = image_dir + '/' + df['image_name']

# Reshape the long table (one row per caption) into a wide one with one row per
# image and 5 caption columns. The comment-number labels are taken from the
# first five rows of the file rather than hard-coded, so they match whatever
# whitespace the file uses.
comment_number = df[' comment_number']
caption_ids = [comment_number[k] for k in range(5)]

image_name = {
    'image_name': df.loc[comment_number == caption_ids[0], 'image_name'].values,
}
comments = {
    f'comment_{k}': df.loc[comment_number == caption_id, ' comment'].values
    for k, caption_id in enumerate(caption_ids)
}

image_name_df = pd.DataFrame.from_dict(image_name)
comments_df = pd.DataFrame.from_dict(comments)

# Columns are aligned purely by position: this assumes every image lists its
# captions in the same order in the file -- TODO confirm for other dumps.
df = pd.concat([image_name_df, comments_df], axis=1)
df.head(5)

Unnamed: 0,image_name,comment_0,comment_1,comment_2,comment_3,comment_4
0,./data/flickr30k_images/flickr30k_images/10000...,Two young guys with shaggy hair look at their...,"Two young , White males are outside near many...",Two men in green shirts are standing in a yard .,A man in a blue shirt standing in a garden .,Two friends enjoy time spent together .
1,./data/flickr30k_images/flickr30k_images/10002...,Several men in hard hats are operating a gian...,Workers look down from up above on a piece of...,Two men working on a machine wearing hard hats .,Four men on top of a tall structure .,Three men on a large rig .
2,./data/flickr30k_images/flickr30k_images/10002...,A child in a pink dress is climbing up a set ...,A little girl in a pink dress going into a wo...,A little girl climbing the stairs to her play...,A little girl climbing into a wooden playhouse,A girl going into a wooden building .
3,./data/flickr30k_images/flickr30k_images/10003...,Someone in a blue shirt and hat is standing o...,A man in a blue shirt is standing on a ladder...,A man on a ladder cleans the window of a tall...,man in blue shirt and jeans on ladder cleanin...,a man on a ladder cleans a window
4,./data/flickr30k_images/flickr30k_images/10003...,"Two men , one in a gray shirt , one in a blac...",Two guy cooking and joking around with the ca...,Two men in a kitchen cooking food on a stove .,Two men are at the stove preparing food .,Two men are cooking a meal .


In [15]:
## Training and Test splits
# Hold out 20% of the rows as the test set, then renumber both parts from 0.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, test = train.reset_index(drop=True), test.reset_index(drop=True)

## Split training into training and validation
# A quarter of the remaining rows becomes the validation set; renumber again.
train, val = train_test_split(train, test_size=0.25, random_state=42)
train, val = train.reset_index(drop=True), val.reset_index(drop=True)

## Get sizes
# Print (rows, columns) for train, val and test, in that order.
for split in (train, val, test):
    print(split.shape)

(19069, 6)
(6357, 6)
(6357, 6)


In [30]:
class FlickrDataset(Dataset):
    """Image/caption dataset backed by a table of image paths and captions.

    Each item is a dict holding the transformed image under ``"image"`` and the
    token ids of one caption under ``"captions"``; both are moved to the
    module-level ``device``.

    Args:
        data: Table (DataFrame) with an ``image_name`` column of image file
            paths. The caption that is tokenized is read from the column at
            position 1, i.e. the one right after the first column.
        transforms: Callable applied to the RGB PIL image. Any non-callable
            value (``None``, or the ``True`` that existing call sites pass)
            selects the default pipeline from ``_default_transforms``.
    """

    def __init__(self, data, 
                 transforms: Optional[Callable] = None) -> None:
        self.data = data
        # The argument used to be discarded unconditionally; honor it when it
        # is actually usable and fall back to the default otherwise.
        self.transforms = transforms if callable(transforms) else self._default_transforms()

    @staticmethod
    def _default_transforms():
        """Build the default pipeline: resize to 256x256, to tensor, normalize."""
        return T.Compose([
            T.Resize((256,256)),
            T.ToTensor(),
            T.Normalize(mean = [0.5], std = [0.5]),
        ])
    
    def __len__(self) -> int:
        """Return the number of rows in the underlying table."""
        return len(self.data)
    
    def __getitem__(self, i: int):
        """Load, transform and tokenize the sample at position ``i``.

        Returns:
            dict with ``"image"`` (output of ``self.transforms``) and
            ``"captions"`` (flattened ``input_ids`` of length 100), both on
            ``device``.
        """
        image_name = self.data.image_name.values[i]
        # Close the file handle as soon as the pixels have been decoded;
        # convert() returns an independent in-memory image.
        with Image.open(image_name) as raw_image:
            image = raw_image.convert('RGB')

        image = self.transforms(image)

        # Positional lookup of the caption in this row. This replaces a scan
        # of the whole table for rows with a matching image_name on every call.
        caption = self.data.iloc[i, 1]
        encoded_inputs = tokenizer(caption,
                            return_token_type_ids = False, 
                            return_attention_mask = False, 
                            max_length = 100, 
                            padding = "max_length",
                            # Without truncation an over-long caption yields
                            # more than max_length ids and cannot be batched.
                            truncation = True,
                            return_tensors = "pt")

        # NOTE(review): tensors are moved to `device` per sample, which ties
        # this dataset to single-process loading when `device` is a GPU.
        sample = {"image":image.to(device),
                  "captions": encoded_inputs["input_ids"].flatten().to(device)}
        
        return sample

In [31]:
batch_size = 32

# The datasets use FlickrDataset's default image pipeline. `transforms` expects
# a callable, so the former `transforms = True` flag is no longer passed.
# `drop_last=True` discards the final incomplete batch of each split.

# Training batches are reshuffled every epoch; the order is still reproducible
# when torch has been seeded beforehand.
train_dataset = FlickrDataset(train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Validation and test keep their fixed order.
val_dataset = FlickrDataset(val)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, drop_last=True)

test_dataset = FlickrDataset(test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, drop_last=True)

{'image': tensor([[[ 0.1216,  0.2784,  0.3569,  ..., -0.0980, -0.0196, -0.0431],
          [ 0.0745,  0.0667,  0.0902,  ...,  0.0980,  0.2078,  0.2314],
          [ 0.2941,  0.3412,  0.4118,  ...,  0.1059,  0.1451,  0.2157],
          ...,
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000]],
 
         [[ 0.4510,  0.5490,  0.5804,  ...,  0.3961,  0.3961,  0.3882],
          [ 0.4549,  0.4000,  0.3647,  ...,  0.4471,  0.5255,  0.4902],
          [ 0.6196,  0.6431,  0.6667,  ...,  0.4627,  0.5059,  0.4902],
          ...,
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000]],
 
         [[ 0.4588,  0.5961,  0.6471,  ...,  0.4235,  0.4353,  0.4078],
          [ 0.4275,