In [1]:
#Import needed libraries

import numpy as np
import random
from transformers import ResNetModel
from torch import nn
from torch.utils.data import Dataset
from PIL import Image
from torchvision.transforms import v2
import torch
import pandas as pd
import evaluate

In [2]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

base_path = 'C:/Users/Alex/Desktop/Universidad/Third Course/First Semester/Vision & Learning/PROJECT 3//'
img_path = f'{base_path}Images/'
cap_path = f'{base_path}captions.txt'

data = pd.read_csv(cap_path)
partitions = np.load('flickr8k_partitions.npy', allow_pickle=True).item()

cpu


In [3]:
import torch

if torch.cuda.is_available():
    print("CUDA is available!")
else:
    print("CUDA is not available.")

CUDA is not available.


In [5]:
chars = ['<SOS>', '<EOS>', '<PAD>', ' ', '!', '"', '#', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

NUM_CHAR = len(chars)
idx2char = {k: v for k, v in enumerate(chars)}
char2idx = {v: k for k, v in enumerate(chars)}

TEXT_MAX_LEN = 201

In [6]:
class Data(Dataset):
    def __init__(self, data, partition):
        self.data = data
        self.partition = partition
        self.num_captions = 5
        self.max_len = TEXT_MAX_LEN
        self.img_proc = torch.nn.Sequential(
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
            v2.Resize((224, 224), antialias=True),
            v2.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),)

    def __len__(self):
        return len(self.partition)
    
    def __getitem__(self, idx):
        real_idx = self.num_captions*self.partition[idx]
        item = self.data.iloc[real_idx: real_idx+self.num_captions]
        ## image processing
        img_name = item.image.reset_index(drop=True)[0]
        img = Image.open(f'{img_path}{img_name}').convert('RGB')
        img = self.img_proc(img)
    
        ## caption processing
        caption = item.caption.reset_index(drop=True)[random.choice(list(range(self.num_captions)))]
        cap_list = list(caption)
        final_list = [chars[0]]
        final_list.extend(cap_list)
        final_list.extend([chars[1]])
        gap = self.max_len - len(final_list)
        final_list.extend([chars[2]]*gap)
        cap_idx = torch.Tensor([char2idx[i] for i in final_list])
        return img, cap_idx

In [7]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.resnet = ResNetModel.from_pretrained('microsoft/resnet-18').to(DEVICE)
        self.gru = nn.GRU(512, 512, num_layers=1)
        self.proj = nn.Linear(512, NUM_CHAR)
        self.embed = nn.Embedding(NUM_CHAR, 512)

    def forward(self, img):
        batch_size = img.shape[0]
        feat = self.resnet(img)
        feat = feat.pooler_output.squeeze(-1).squeeze(-1).unsqueeze(0) # 1, batch, 512
        start = torch.tensor(char2idx['<SOS>']).to(DEVICE)
        start_embed = self.embed(start) # 512
        start_embeds = start_embed.repeat(batch_size, 1).unsqueeze(0) # 1, batch, 512
        inp = start_embeds
        hidden = feat
        for t in range(TEXT_MAX_LEN-1): # rm <SOS>
            out, hidden = self.gru(inp, hidden)
            inp = torch.cat((inp, out[-1:]), dim=0) # N, batch, 512
    
        res = inp.permute(1, 0, 2) # batch, seq, 512
        res = self.proj(res) # batch, seq, 80
        res = res.permute(0, 2, 1) # batch, 80, seq
        return res

In [8]:
'''A simple example to calculate loss of a single batch (size 2)'''
dataset = Data(data, partitions['train'])
img1, caption1 = next(iter(dataset))
img2, caption2 = next(iter(dataset))
caption1 = torch.tensor(caption1)
caption2 = torch.tensor(caption2)
img = torch.cat((img1.unsqueeze(0), img2.unsqueeze(0)))
caption = torch.cat((caption1.unsqueeze(0), caption2.unsqueeze(0)))
img, caption = img.to(DEVICE), caption.to(DEVICE)
model = Model().to(DEVICE)
pred = model(img)
crit = nn.CrossEntropyLoss()
caption = caption.long()
loss = crit(pred, caption)
print(loss)


  caption1 = torch.tensor(caption1)
  caption2 = torch.tensor(caption2)


tensor(4.3606, grad_fn=<NllLoss2DBackward0>)


In [9]:
'''metrics'''
bleu = evaluate.load('bleu')
meteor = evaluate.load('meteor')
rouge = evaluate.load('rouge')

reference = [['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .']]
prediction = ['A girl goes into a wooden building .']

res_b = bleu.compute(predictions=prediction, references=reference)
res_r = rouge.compute(predictions=prediction, references=reference)
res_m = meteor.compute(predictions=prediction, references=reference)

res_b, res_r, res_m

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


({'bleu': 0.5946035575013605,
  'precisions': [0.875, 0.7142857142857143, 0.5, 0.4],
  'brevity_penalty': 1.0,
  'length_ratio': 1.0,
  'translation_length': 8,
  'reference_length': 8},
 {'rouge1': 0.8571428571428571,
  'rouge2': 0.6666666666666666,
  'rougeL': 0.8571428571428571,
  'rougeLsum': 0.8571428571428571},
 {'meteor': 0.864795918367347})

In [10]:
ref = [['A child is running in the campus']]
pred1 = ['A child is running']

res_b = bleu.compute(predictions=pred1, references=ref)
res_r = rouge.compute(predictions=pred1, references=ref)
res_m = meteor.compute(predictions=pred1, references=ref)

res_b, res_r, res_m

({'bleu': 0.4723665527410147,
  'precisions': [1.0, 1.0, 1.0, 1.0],
  'brevity_penalty': 0.4723665527410147,
  'length_ratio': 0.5714285714285714,
  'translation_length': 4,
  'reference_length': 7},
 {'rouge1': 0.7272727272727273,
  'rouge2': 0.6666666666666666,
  'rougeL': 0.7272727272727273,
  'rougeLsum': 0.7272727272727273},
 {'meteor': 0.5923507462686567})

In [11]:
ref = [['A child is running in the campus']]
pred1 = ['A child is']

res_b = bleu.compute(predictions=pred1, references=ref)
res_r = rouge.compute(predictions=pred1, references=ref)
res_m = meteor.compute(predictions=pred1, references=ref)

res_b, res_r, res_m

({'bleu': 0.0,
  'precisions': [1.0, 1.0, 1.0, 0.0],
  'brevity_penalty': 0.2635971381157267,
  'length_ratio': 0.42857142857142855,
  'translation_length': 3,
  'reference_length': 7},
 {'rouge1': 0.6, 'rouge2': 0.5, 'rougeL': 0.6, 'rougeLsum': 0.6},
 {'meteor': 0.44612794612794615})

In [14]:
ref = [['A child is running in the campus']]
pred1 = ['A child campus']

res_b = bleu.compute(predictions=pred1, references=ref)
res_r = rouge.compute(predictions=pred1, references=ref)
res_m = meteor.compute(predictions=pred1, references=ref)
res_m_sin = meteor.compute(predictions=pred1, references=ref, gamma=0) # no penalty by setting gamma to 0

res_b, res_r, res_m, res_m_sin

({'bleu': 0.0,
  'precisions': [1.0, 0.5, 0.0, 0.0],
  'brevity_penalty': 0.2635971381157267,
  'length_ratio': 0.42857142857142855,
  'translation_length': 3,
  'reference_length': 7},
 {'rouge1': 0.6, 'rouge2': 0.25, 'rougeL': 0.6, 'rougeLsum': 0.6},
 {'meteor': 0.3872053872053872},
 {'meteor': 0.45454545454545453})

Final metric we use for challenge 3: BLEU1, BLEU2, ROUGE-L, METEOR

In [12]:
ref = [['A child is running in the campus']]
pred1 = ['A child campus']

bleu1 = bleu.compute(predictions=pred1, references=ref, max_order=1)
bleu2 = bleu.compute(predictions=pred1, references=ref, max_order=2)
res_r = rouge.compute(predictions=pred1, references=ref)
res_m = meteor.compute(predictions=pred1, references=ref)

f"BLEU-1:{bleu1['bleu']*100:.1f}%, BLEU2:{bleu2['bleu']*100:.1f}%, ROUGE-L:{res_r['rougeL']*100:.1f}%, METEOR:{res_m['meteor']*100:.1f}%"

'BLEU-1:26.4%, BLEU2:18.6%, ROUGE-L:60.0%, METEOR:38.7%'

Now it is your turn! Try to finish the code below to run the train function

In [13]:
from torch.utils.data import DataLoader
import torch.optim as optim

In [14]:
data_train = Data(data, partitions['train'])
dataloader_train = DataLoader(data_train, batch_size=4, shuffle=True)
batch= next(iter(dataloader_train))

In [15]:
type(batch)

list

In [43]:
def train(EPOCHS):
    data_train = Data(data, partitions['train'])
    data_valid = Data(data, partitions['valid'])
    data_test = Data(data, partitions['test'])

    dataloader_train = DataLoader(data_train, batch_size=4, shuffle=True)
    dataloader_valid = DataLoader(data_valid, batch_size=4, shuffle=True)
    dataloader_test = DataLoader(data_test, batch_size=4, shuffle=True)
    
    model = Model().to(DEVICE)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    crit = nn.CrossEntropyLoss()
    metric = "bleu"
    #metric = "bleu2"
    #metric = "rouge"
    #metric = "meteor"
    for epoch in range(EPOCHS):
        print("Entrando en la función")
        loss, res = train_one_epoch(model, optimizer, crit, metric, dataloader_train)
        print(f'train loss: {loss:.2f}, metric: {res:.2f}, epoch: {epoch}')
        loss_v, res_v = eval_epoch(model, crit, metric, dataloader_valid)
        print(f'valid loss: {loss_v:.2f}, metric: {res_v:.2f}')
    loss_t, res_t = eval_epoch(model, crit, metric, dataloader_test)
    print(f'test loss: {loss_t:.2f}, metric: {res_t:.2f}')
    
def train_one_epoch(model, optimizer, crit, metric, dataloader):
    total_loss= 0.0
    total_metric= 0.0
    print("Entrando al batch")
    for batch in dataloader:
        inputs, targets = batch
        inputs = inputs.to(DEVICE) 
        
        targets = targets.to(DEVICE)

        print("Targets y inputs pasados al device")
        
        outputs= model(inputs)
        targets = targets.long()

        print("Targets pasados a long")
        print("Calculando loss")
        loss= crit(outputs,targets)
        print("Loss calculada")
        total_loss += loss.item()

        print(outputs.shape)
        print(outputs)
        print(targets)
        print(targets.shape)
        
        # Convert raw outputs to character indices
        outputs_idx = torch.argmax(outputs, dim=-1)

    # Convert outputs and targets to sentences
        targets_sentence = ["".join(idx2char[idx.item()] for idx in target if idx.item() not in [char2idx['<EOS>'], char2idx['<PAD>'], char2idx['<SOS>']]) for target in targets]
        print(targets_sentence)
        outputs_sentence = ["".join(idx2char[idx.item()] for idx in output if idx.item() not in [char2idx['<EOS>'], char2idx['<PAD>'], char2idx['<SOS>']]) for output in outputs_idx]
        print(outputs_sentence)
        
        #outputs_sentence = ''.join([idx2char[idx.item()] for idx in outputs if idx.item() not in [char2idx['<EOS>'], char2idx['<PAD>'], char2idx['<SOS>']]])
        #targets_sentence = ''.join([idx2char[idx.item()] for idx in targets if idx.item() not in [char2idx['<EOS>'], char2idx['<PAD>'], char2idx['<SOS>']]])

        print("Outputs y targets sentence calculados")
        print("Output sentence:",outputs_sentence, "Target sentence:",targets_sentence)
        
        
        METRIC = evaluate.load(str(metric).lower())

        print("Metrica cargada")

        if metric == "bleu":
                metriccompute = METRIC.compute(predictions=outputs_sentence, references=targets_sentence,max_order=1 )
        elif metric == "bleu2":
                metriccompute = METRIC.compute(predictions=outputs_sentence, references=targets_sentence,max_order=2 )
        else:
                metriccompute = METRIC.compute(predictions=outputs, references=targets )

        print("Metrica calculada")

        if metric == "rouge":
                total_metric += metriccompute['rougeL']
        else:
                total_metric += metriccompute[str(metric).lower()]
        
        print("Metrica añadida")
        
        
    avg_loss = total_loss / len(dataloader)
    avg_metric = total_metric / len(dataloader)
    print(f'train loss: {avg_loss:.2f}, metric: {avg_metric:.2f}')
    print(f'train loss: {avg_loss:.2f}')
    
    return avg_loss

def eval_epoch(model, crit, metric, dataloader):
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for batch in dataloader:
            inputs, targets = batch
            inputs = inputs.to(DEVICE) 
            targets = targets.to(DEVICE)


            outputs = model(inputs)

            targets = targets.long()
            loss = crit(outputs, targets)

            total_loss += loss.item()


        
    avg_loss = total_loss / len(dataloader)

    print(f'valid loss: {avg_loss:.2f}')
    
    return avg_loss

The output of your model is a 3D tensor because it's providing a probability distribution over your vocabulary for each position in each sequence in the batch. This is a common output for models dealing with sequence data like text or time series.

Your reference data (`targets`) is a 2D tensor because it contains the actual character indices (not probabilities) for each position in each sequence.

The line `outputs_idx = torch.argmax(outputs, dim=-1)` is converting the probability distributions in `outputs` to character indices. The `torch.argmax` function returns the index of the maximum value in a tensor along a specified dimension. When `dim=-1`, it's looking at the last dimension of `outputs` (the feature dimension, which represents the vocabulary in your case). 

So for each position in each sequence, it's picking the character with the highest probability according to the model. The result is a 2D tensor of the same shape as `targets`, which allows you to compare the model's predictions with the actual data.

In [44]:
train(1)

Entrando en la función
Entrando al batch
Targets y inputs pasados al device
Targets pasados a long
Calculando loss
Loss calculada
torch.Size([4, 80, 201])
tensor([[[ 0.3113,  0.0835,  0.0691,  ...,  0.0409,  0.0409,  0.0409],
         [ 0.3734,  0.2365, -0.0300,  ..., -0.0036, -0.0036, -0.0036],
         [-0.9380, -0.1694,  0.1842,  ..., -0.0679, -0.0679, -0.0679],
         ...,
         [ 1.5160, -0.8472, -0.3491,  ...,  0.0137,  0.0137,  0.0137],
         [ 0.1962, -0.1848, -0.4888,  ..., -0.0368, -0.0368, -0.0368],
         [ 0.1195,  0.0643,  0.2568,  ..., -0.0571, -0.0571, -0.0571]],

        [[ 0.3113,  0.6412,  0.2011,  ...,  0.0409,  0.0409,  0.0409],
         [ 0.3734, -0.1515, -0.1680,  ..., -0.0036, -0.0036, -0.0036],
         [-0.9380,  0.3387,  0.3100,  ..., -0.0679, -0.0679, -0.0679],
         ...,
         [ 1.5160, -0.5618, -0.1468,  ...,  0.0137,  0.0137,  0.0137],
         [ 0.1962, -0.3008, -0.3848,  ..., -0.0368, -0.0368, -0.0368],
         [ 0.1195, -0.2758,  0.084

KeyboardInterrupt: 

In [32]:
outputs

NameError: name 'outputs' is not defined

In [19]:
batch= next(iter(dataloader_train))

In [22]:
onecaption=batch[1][1]

In [25]:
onecaption_chr = ''.join([idx2char[idx.item()] for idx in onecaption if idx.item() not in [char2idx['<EOS>'], char2idx['<PAD>'], char2idx['<SOS>']]])


'A painting of a man riding a mountain bike on a mountain trail .'