In [1]:
import numpy as np
import random
from transformers import ResNetModel
from torch import nn
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms
from torchvision.transforms import v2
import torch
import pandas as pd
import evaluate

ImportError: cannot import name 'v2' from 'torchvision.transforms' (c:\Users\polme\anaconda3\lib\site-packages\torchvision\transforms\__init__.py)

In [53]:
!pip install torchvision

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
DEVICE = 'cuda'

base_path = 'archive/'
img_path = f'{base_path}Images/'
cap_path = f'{base_path}captions.txt'

data = pd.read_csv(cap_path)
partitions = np.load('flickr8k_partitions.npy', allow_pickle=True).item()

In [3]:
chars = ['<SOS>', '<EOS>', '<PAD>', ' ', '!', '"', '#', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

NUM_CHAR = len(chars)
idx2char = {k: v for k, v in enumerate(chars)}
char2idx = {v: k for k, v in enumerate(chars)}

TEXT_MAX_LEN = 201

In [4]:
class Data(Dataset):
    def __init__(self, data, partition):
        self.data = data
        self.partition = partition
        self.num_captions = 5
        self.max_len = TEXT_MAX_LEN
        self.img_proc = torch.nn.Sequential([
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
            v2.Resize((224, 224), antialias=True),
            v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),])

    def __len__(self):
        return len(self.partition)
    
    def __getitem__(self, idx):
        real_idx = self.num_captions*self.partition[idx]
        item = self.data.iloc[real_idx: real_idx+self.num_captions]
        ## image processing
        img_name = item.image.reset_index(drop=True)[0]
        img = Image.open(f'{img_path}{img_name}').convert('RGB')
        img = self.img_proc(img)
    
        ## caption processing
        caption = item.caption.reset_index(drop=True)[random.choice(list(range(self.num_captions)))]
        cap_list = list(caption)
        final_list = [chars[0]]
        final_list.extend(cap_list)
        final_list.extend([chars[1]])
        gap = self.max_len - len(final_list)
        final_list.extend([chars[2]]*gap)
        cap_idx = [char2idx[i] for i in final_list]
        return img, cap_idx

In [5]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.resnet = ResNetModel.from_pretrained('microsoft/resnet-18').to(DEVICE)
        self.gru = nn.GRU(512, 512, num_layers=1)
        self.proj = nn.Linear(512, NUM_CHAR)
        self.embed = nn.Embedding(NUM_CHAR, 512)

    def forward(self, img):
        batch_size = img.shape[0]
        feat = self.resnet(img)
        feat = feat.pooler_output.squeeze(-1).squeeze(-1).unsqueeze(0) # 1, batch, 512
        start = torch.tensor(char2idx['<SOS>']).to(DEVICE)
        start_embed = self.embed(start) # 512
        start_embeds = start_embed.repeat(batch_size, 1).unsqueeze(0) # 1, batch, 512
        inp = start_embeds
        hidden = feat
        for t in range(TEXT_MAX_LEN-1): # rm <SOS>
            out, hidden = self.gru(inp, hidden)
            inp = torch.cat((inp, out[-1:]), dim=0) # N, batch, 512
    
        res = inp.permute(1, 0, 2) # batch, seq, 512
        res = self.proj(res) # batch, seq, 80
        res = res.permute(0, 2, 1) # batch, 80, seq
        return res

In [6]:
'''A simple example to calculate loss of a single batch (size 2)'''
dataset = Data(data, partitions['train'][:10])
img1, caption1 = next(iter(dataset))
img2, caption2 = next(iter(dataset))
caption1 = torch.tensor(caption1)
caption2 = torch.tensor(caption2)
img = torch.cat((img1.unsqueeze(0), img2.unsqueeze(0)))
caption = torch.cat((caption1.unsqueeze(0), caption2.unsqueeze(0)))
img, caption = img.to(DEVICE), caption.to(DEVICE)
model = Model().to(DEVICE)
pred = model(img)
crit = nn.CrossEntropyLoss()
loss = crit(pred, caption)
print(loss)


Some weights of the model checkpoint at microsoft/resnet-18 were not used when initializing ResNetModel: ['classifier.1.bias', 'classifier.1.weight']
- This IS expected if you are initializing ResNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ResNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor(4.3552, device='cuda:0', grad_fn=<NllLoss2DBackward0>)


In [7]:
'''metrics'''
bleu = evaluate.load('bleu')
meteor = evaluate.load('meteor')
rouge = evaluate.load('rouge')

reference = [['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .']]
prediction = ['A girl goes into a wooden building .']

res_b = bleu.compute(predictions=prediction, references=reference)
res_r = rouge.compute(predictions=prediction, references=reference)
res_m = meteor.compute(predictions=prediction, references=reference)

res_b, res_r, res_m

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\polme\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\polme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\polme\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


({'bleu': 0.5946035575013605,
  'precisions': [0.875, 0.7142857142857143, 0.5, 0.4],
  'brevity_penalty': 1.0,
  'length_ratio': 1.0,
  'translation_length': 8,
  'reference_length': 8},
 {'rouge1': 0.8571428571428571,
  'rouge2': 0.6666666666666666,
  'rougeL': 0.8571428571428571,
  'rougeLsum': 0.8571428571428571},
 {'meteor': 0.864795918367347})

In [8]:
ref = [['A child is running in the campus']]
pred1 = ['A child is running']

res_b = bleu.compute(predictions=pred1, references=ref)
res_r = rouge.compute(predictions=pred1, references=ref)
res_m = meteor.compute(predictions=pred1, references=ref)

res_b, res_r, res_m

({'bleu': 0.4723665527410147,
  'precisions': [1.0, 1.0, 1.0, 1.0],
  'brevity_penalty': 0.4723665527410147,
  'length_ratio': 0.5714285714285714,
  'translation_length': 4,
  'reference_length': 7},
 {'rouge1': 0.7272727272727273,
  'rouge2': 0.6666666666666666,
  'rougeL': 0.7272727272727273,
  'rougeLsum': 0.7272727272727273},
 {'meteor': 0.5923507462686567})

In [9]:
ref = [['A child is running in the campus']]
pred1 = ['A child is']

res_b = bleu.compute(predictions=pred1, references=ref)
res_r = rouge.compute(predictions=pred1, references=ref)
res_m = meteor.compute(predictions=pred1, references=ref)

res_b, res_r, res_m

({'bleu': 0.0,
  'precisions': [1.0, 1.0, 1.0, 0.0],
  'brevity_penalty': 0.2635971381157267,
  'length_ratio': 0.42857142857142855,
  'translation_length': 3,
  'reference_length': 7},
 {'rouge1': 0.6, 'rouge2': 0.5, 'rougeL': 0.6, 'rougeLsum': 0.6},
 {'meteor': 0.44612794612794615})

In [10]:
ref = [['A child is running in the campus']]
pred1 = ['A child campus']

res_b = bleu.compute(predictions=pred1, references=ref)
res_r = rouge.compute(predictions=pred1, references=ref)
res_m = meteor.compute(predictions=pred1, references=ref)
res_m_sin = meteor.compute(predictions=pred1, references=ref, gamma=0) # no penalty by setting gamma to 0

res_b, res_r, res_m, res_m_sin

({'bleu': 0.0,
  'precisions': [1.0, 0.5, 0.0, 0.0],
  'brevity_penalty': 0.2635971381157267,
  'length_ratio': 0.42857142857142855,
  'translation_length': 3,
  'reference_length': 7},
 {'rouge1': 0.6, 'rouge2': 0.25, 'rougeL': 0.6, 'rougeLsum': 0.6},
 {'meteor': 0.3872053872053872},
 {'meteor': 0.45454545454545453})

Final metric we use for challenge 3: BLEU1, BLEU2, ROUGE-L, METEOR

In [11]:
ref = [['A child is running in the campus']]
pred1 = ['A child campus']

bleu1 = bleu.compute(predictions=pred1, references=ref, max_order=1)
bleu2 = bleu.compute(predictions=pred1, references=ref, max_order=2)
res_r = rouge.compute(predictions=pred1, references=ref)
res_m = meteor.compute(predictions=pred1, references=ref)

f"BLEU-1:{bleu1['bleu']*100:.1f}%, BLEU2:{bleu2['bleu']*100:.1f}%, ROUGE-L:{res_r['rougeL']*100:.1f}%, METEOR:{res_m['meteor']*100:.1f}%"

'BLEU-1:26.4%, BLEU2:18.6%, ROUGE-L:60.0%, METEOR:38.7%'

Now it is your turn! Try to finish the code below to run the train function

In [12]:
import torch
from torch.utils.data import DataLoader
print(DEVICE)


cuda


In [46]:
def train(EPOCHS):
    data_train = Data(data, partitions['train'][:10])
    data_valid = Data(data, partitions['valid'][:2])
    data_test = Data(data, partitions['test'][:5])
    dataloader_train = DataLoader(data_train, batch_size=8) # '''write a proper dataloader, same for valid and test'''
    dataloader_valid = DataLoader(data_valid, batch_size=4)
    dataloader_test = DataLoader(data_test, batch_size=2)
    model = Model().to(DEVICE)
    model.train()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=10**-5) # '''choose a proper optimizer'''
    crit = nn.CrossEntropyLoss()
    metric = None
    for epoch in range(EPOCHS):
        loss, res = train_one_epoch(model, optimizer, crit, metric, dataloader_train)
        print(f'train loss: {loss:.2f}, metric: {res:.2f}, epoch: {epoch}')
        loss_v, res_v = eval_epoch(model, crit, metric, dataloader_valid)
        print(f'valid loss: {loss:.2f}, metric: {res:.2f}')
    loss_t, res_t = eval_epoch(model, crit, metric, dataloader_test)
    print(f'test loss: {loss:.2f}, metric: {res:.2f}')
    
def train_one_epoch(model, optimizer, crit, metric, dataloader):
    model.train()
    for batch_idx, (imgs, captions) in enumerate(dataloader):
        print(batch_idx)
        print(captions)
        imgs, captions = imgs.to(DEVICE), torch.Tensor(captions).to(DEVICE)
        res = model(imgs)
        print(res.shape, captions.shape)
        loss = crit(res, caption)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    '''finish the code'''
    return loss, res

def eval_epoch(model, crit, metric, dataloader):
    model.eval()
    with torch.no_grad():
        for img, caption in dataloader:
            img, caption = img.to(DEVICE), torch.Tensor(caption).to(DEVICE)
            res = model(img)
            loss = crit(res, caption)
    '''finish the code'''
    return loss, res


In [47]:
train(1)

Some weights of the model checkpoint at microsoft/resnet-18 were not used when initializing ResNetModel: ['classifier.1.bias', 'classifier.1.weight']
- This IS expected if you are initializing ResNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ResNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0
[tensor([0, 0, 0, 0, 0, 0, 0, 0]), tensor([28, 28, 28, 28, 28, 47, 28, 47]), tensor([ 3,  3,  3,  3,  3, 61,  3, 61]), tensor([55, 65, 76, 57, 65, 58, 64, 58]), tensor([71, 62, 61, 68, 62,  3, 62,  3]), tensor([68, 73, 62, 60, 73, 73, 57, 60]), tensor([76, 73, 73,  3, 73, 76,  3, 62]), tensor([67, 65, 58, 56, 65, 68, 76, 71]), tensor([ 3, 58,  3, 61, 58,  3, 62, 65]), tensor([57,  3, 57, 58,  3, 73, 73,  3]), tensor([68, 60, 68, 76, 60, 54, 61, 76]), tensor([60, 62, 60, 72, 62, 67,  3, 62]), tensor([ 3, 71,  3,  3, 71,  3, 54, 73]), tensor([71, 65, 55, 54, 65, 56,  3, 61]), tensor([74,  3, 68, 67,  3, 68, 55,  3]), tensor([67, 62, 74,  3, 76, 65, 71, 60]), tensor([72, 67, 67, 68, 54, 68, 68, 65]), tensor([ 3,  3, 57, 55, 65, 71, 64, 54]), tensor([73, 54, 62, 63, 64, 58, 58, 72]), tensor([61,  3, 67, 58, 62, 57, 67, 72]), tensor([71, 55, 60, 56, 67,  3,  3, 58]), tensor([68, 65,  3, 73, 60, 57, 54, 72]), tensor([74, 74, 73,  3,  3, 68, 71,  3]), tensor([60, 58, 61, 54, 68, 60, 66, 68]

ValueError: only one element tensors can be converted to Python scalars