In [2]:
nome = 'Arthur Baia'
print(f'Meu nome é {nome}')

Meu nome é Arthur Baia


#  Exercício: Modelo de Linguagem com auto-atenção

Este exercício é similar ao da aula 4, mas iremos agora treinar uma rede neural *com auto-atenção* para prever a próxima palavra de um texto, data as palavras anteriores como entrada. 

Na camada de auto-atenção, deve-se implementar (vide slide 80):
- Embeddings de posição
- Projeções lineares (WQ, WK, WV, WO)
- Camada de feed forward (2-layer MLP)

Instrucões:
- É necessário fazer duas implementações da camada de auto-atenção: uma usando laços (ineficiente mas fácil de entender) e outra matricial (eficiente mas difícil de entender).

- Fazer um assert para garantir que o resultado das duas implementações é exatamente igual.

- No treinamento, usar apenas a implementação matricial.

## Importação dos pacotes

In [3]:
import collections
import itertools
import functools
import math
import os
import random
import re

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook
from typing import List

In [4]:
# Check which GPU we are using
!nvidia-smi

Wed Sep 28 14:35:17 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   54C    P0    22W /  N/A |   1349MiB /  5944MiB |     26%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
if torch.cuda.is_available(): 
   dev = "cuda:0"
else: 
   dev = "cpu"
device = torch.device(dev)
print('Using {}'.format(device))

Using cuda:0


# Carregamento do dataset 

Primeiro, fazemos download do dataset:

In [6]:
!wget -nc http://files.fast.ai/data/aclImdb.tgz 
!tar -xzf aclImdb.tgz

File ‘aclImdb.tgz’ already there; not retrieving.



## Carregando o dataset

Criaremos uma divisão de treino (80%) e validação (20%) artificialmente.

Nota: Evitar de olhar ao máximo o dataset de teste para não ficar enviseado no que será testado. Em aplicações reais, o dataset de teste só estará disponível no futuro, ou seja, é quando o usuário começa a testar o seu produto.

In [7]:
def load_texts(folder):
    texts = []
    for path in os.listdir(folder):
        with open(os.path.join(folder, path)) as f:
            texts.append(f.read())
    return texts

x_train_pos = load_texts('aclImdb/train/pos')
x_train_neg = load_texts('aclImdb/train/neg')
x_test_pos = load_texts('aclImdb/test/pos')
x_test_neg = load_texts('aclImdb/test/neg')

x_train = x_train_pos + x_train_neg
x_test = x_test_pos + x_test_neg

# Embaralhamos o treino para depois fazermos a divisão treino/valid.
random.shuffle(x_train)

n_train = int(0.8 * len(x_train))

x_valid = x_train[n_train:]
x_train = x_train[:n_train]

print(len(x_train), 'amostras de treino.')
print(len(x_valid), 'amostras de desenvolvimento.')
print(len(x_test), 'amostras de teste.')

print('3 primeiras amostras treino:')
for x in x_train[:3]:
    print(x[:100])

print('3 últimas amostras treino:')
for x in x_train[-3:]:
    print(x[:100])

print('3 primeiras amostras validação:')
for x in x_valid[:3]:
    print(x[:100])

print('3 últimas amostras validação:')
for x in x_valid[-3:]:
    print(x[:100])

20000 amostras de treino.
5000 amostras de desenvolvimento.
25000 amostras de teste.
3 primeiras amostras treino:
I am sad that a period of history that is so fascinating and so rich in material for film can be mad
THE SUNSHINE BOYS was the hilarious 1975 screen adaptation of Neil Simon's play about a retired vaud
this has by far been one of the most beautiful portraits of a person that I've ever seen on screen. 
3 últimas amostras treino:
My daughter gets really put out at me when I refer to Drew Barrymore as looking as if she'd been hit
This is quite a dull movie. Well-shot with realistic performances especially a very good one from De
Too bad neither the animals or Eddie Murphy had anything to say worth saying. this movie is just bla
3 primeiras amostras validação:
The title should have been the walker. The guy expend 90% of the movie walking. He doesn't know what
The movie was a long awaited release, which where a bit disappointing because of the expectation's I
The jokes are obvio

In [8]:
unk = '<UNK>'

In [9]:
class Tokenizer():

    def __init__(self, max_vocab_token=3000):

        self.max_vocab_tokens = max_vocab_token

    def encode(self, text: str):
        # Escreva aqui seu código.
        return [self.vocab[word] if word in self.vocab else unk for word in self.tokenize(text)]

    def decode(self, tokens: List[int]):
        # Escreva aqui seu código.
        return ' '.join([self.vocab_inv[token] for token in tokens])

    def create_vocab(self, texts: List[str]):
        L = [word for phrase in list(map(self.tokenize, texts))
             for word in phrase]
        k = self.max_vocab_tokens
        def vocab(L, k): return {value: key for key, value in enumerate(
            dict(collections.Counter(L).most_common(k)))}
        self.vocab = vocab(L, k)
        self.vocab_inv = {v: k for k, v in self.vocab.items()}
    def tokenize(self, text: str):
        """
        Convert string to a list of tokens (i.e., words).
        This function lower cases everything and removes punctuation.
        """
        # Escreva aqui seu código.
        text = re.sub(re.escape('<br /><br />'), '°', text)
        return re.findall(r"\w+|[^\w\s]", text.lower())


In [10]:
def test_tokenizer():
    phrase = 'a cat walks in the bad.'
    assert Tokenizer().tokenize(phrase) == ['a', 'cat', 'walks', 'in', 'the', 'bad', '.']
    tokenizer = Tokenizer(len(phrase.split())+1)
    vocab_ = tokenizer.create_vocab([phrase])
    assert tokenizer.vocab == {'a': 0, 'cat': 1,
                               'walks': 2, 'in': 3, 'the': 4, 'bad': 5, '.': 6}
    assert tokenizer.vocab_inv == {
        0: 'a', 1: 'cat', 2: 'walks', 3: 'in', 4: 'the', 5: 'bad', 6: '.'}
    assert tokenizer.encode(phrase) == [0, 1, 2, 3, 4, 5, 6]
    assert tokenizer.encode('a cat') == [0, 1]
    assert tokenizer.encode('dog') == [unk]
    phrase = ['a cat walks in the bad.', 'a dog walks in the good.']
    tokenizer = Tokenizer(20)
    vocab_ = tokenizer.create_vocab(phrase)
    assert tokenizer.vocab == {'a': 0,
                               'walks': 1,
                               'in': 2,
                               'the': 3,
                               '.': 4,
                               'cat': 5,
                               'bad': 6,
                               'dog': 7,
                               'good': 8}


test_tokenizer()


In [11]:
class IMDBdataset(torch.utils.data.Dataset):
    def __init__(self, corpus: List[str], tokenizer, context_size=9):
        data = []
        for text in corpus:
            tokens = tokenizer.encode(text)
            data.extend([
                tokens[i:i+context_size+1] for i in range(len(tokens)-context_size)
                if unk not in tokens[i:i+context_size+1] and len(tokens) > context_size
            ])
        self.data = torch.tensor(data)
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx][:-1], self.data[idx][-1].long()


In [12]:
"""Function to test the dataset"""
def test_dataset():
    vocab = Tokenizer()
    test_corpus = ['a cat walks in the bad.', 'a dog walks in the good.']
    vocab.create_vocab(test_corpus)
    unseen_tokens = ['aprendendo sobre o modelo n-grama']
    dataset_ = IMDBdataset(unseen_tokens, vocab, 3)
    assert len(dataset_) == 0
    context_size = 3
    dataset_ = IMDBdataset(test_corpus, vocab, context_size=context_size)
    assert len(dataset_) == 8
    assert len(dataset_.__getitem__(0)[0].tolist()) == context_size
    assert len([dataset_.__getitem__(0)[1].tolist()]) == 1
    assert dataset_.__getitem__(0)[1].tolist() == vocab.encode('in')[0]
    assert dataset_.__getitem__(0)[0].tolist() == vocab.encode('a cat walks')
    assert dataset_.__getitem__(1)[0].tolist() == vocab.encode('cat walks in')
    assert dataset_.__getitem__(1)[1].tolist() == vocab.encode('the')[0]


test_dataset()

In [13]:
"""function to test the dataloader"""
def test_dataloader_():
    tokenizer = Tokenizer()
    test_corpus = ['a cat walks in the bad.', 'a dog walks in the good.']
    tokenizer.create_vocab(test_corpus)
    context_size = 3
    dataset_ = IMDBdataset(test_corpus, tokenizer, context_size=context_size)
    dataloader_ = torch.utils.data.DataLoader(dataset_, batch_size=2, shuffle=True)
    for batch in dataloader_:
        assert len(batch[0].shape) == 2
        assert len(batch[1].shape) == 1
        assert batch[0].shape[0] == batch[1].shape[0]
        assert batch[0].shape[1] == context_size
        break
test_dataloader_()

In [137]:
class LoopedSelfAttention(nn.Module):
    def __init__(self, embedding_dim):
        super(LoopedSelfAttention, self).__init__()

        self.wq = nn.Linear(embedding_dim, embedding_dim)
        self.wk= nn.Linear(embedding_dim, embedding_dim)
        self.wv = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        # x has shape (batch_size, seq_len, embedding_dim)
        Q = self.wq(x)
        K = self.wk(x)
        V = self.wv(x)
        
        new_embeddings = []
        for query in Q:
            scores = []
            for key in K:
                score = (query * key).sum()
                scores.append(score)

            attention_weights = torch.softmax(torch.FloatTensor(scores), dim=0)

            new_embedding = 0
            for weight, value in zip(attention_weights, V):
                new_embedding += weight * value

            new_embeddings.append(new_embedding)
        return torch.stack(new_embeddings)


class MatrixSelfAttention(nn.Module):
    def __init__(self, embedding_dim):
        super(MatrixSelfAttention, self).__init__()

        self.wq = nn.Linear(embedding_dim, embedding_dim)
        self.wk = nn.Linear(embedding_dim, embedding_dim)
        self.wv = nn.Linear(embedding_dim, embedding_dim)


    def forward(self, x):
        # x has shape (batch_size, seq_len, embedding_dim)
        Q = self.wq(x)
        K = self.wk(x)
        V = self.wv(x)
        print('k',K)
        print('k t',torch.transpose(K, -2, -1))
        scores = torch.matmul(Q, torch.transpose(K, -2, -1))
        print(scores)
        attention_weigths = torch.softmax(scores, dim=-1)
        new_embeddings = torch.matmul(attention_weigths, V)
        return new_embeddings


        


In [138]:
def test_self_attention():
    embedding_dim = 2
    seq_len = 2
    batch_size = 2
    x = torch.randn(batch_size, seq_len, embedding_dim)
    torch.manual_seed(42)
    looped_self_attention = LoopedSelfAttention(embedding_dim)
    torch.manual_seed(42)
    matrix_self_attention = MatrixSelfAttention(embedding_dim)
    # print('shapes: ', looped_self_attention(x).shape, matrix_self_attention(x).shape)
    assert looped_self_attention(x).shape == matrix_self_attention(x).shape == (batch_size, seq_len, embedding_dim)

test_self_attention()

k tensor([[[ 0.6590,  0.1529],
         [ 1.6470, -1.4208]],

        [[ 0.0905,  0.9164],
         [ 0.6113,  0.0733]]], grad_fn=<AddBackward0>)
k t tensor([[[ 0.6590,  1.6470],
         [ 0.1529, -1.4208]],

        [[ 0.0905,  0.6113],
         [ 0.9164,  0.0733]]], grad_fn=<TransposeBackward0>)
tensor([[[ 0.2601,  0.0035],
         [ 0.1281, -1.9326]],

        [[-0.4109, -0.1590],
         [-0.0304, -0.3016]]], grad_fn=<UnsafeViewBackward>)


In [70]:
def time_test():
    print('looped Self Attention exectution time:')
    %timeit looped_self_attention(x)
    print('matrix Self Attention exectution time:')
    %timeit matrix_self_attention(x)
time_test()

looped Self Attention exectution time:
303 µs ± 4.98 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
matrix Self Attention exectution time:
161 µs ± 3 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [108]:
class AttentionModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(AttentionModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embedding = nn.Parameter(torch.randn(context_size, embedding_dim)*0.001 - 0.0005)
        self.self_attention = MatrixSelfAttention(embedding_dim)
        self.MLP = nn.Sequential(
            nn.Linear(embedding_dim, embedding_dim),
            nn.ReLU(),
            nn.Linear(embedding_dim, embedding_dim)
        )
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        # x has shape (batch_size, context_size)
        x = self.embedding(x)
      
        x = x + self.pos_embedding
      
        x = self.self_attention(x)

        x = self.MLP(x)

        x = self.output(x[:, -1, :])
        
        return x
    


In [109]:
def test_model():
    vocab_size = 100
    embedding_dim = 10
    context_size = 5
    model = AttentionModel(vocab_size = vocab_size, embedding_dim = embedding_dim, context_size = context_size)
    x = torch.randint(0, vocab_size, (2, context_size))
    # print('x shape: ', x.shape)
    print('final shape:',model(x).shape)
    # assert model(x).shape == (2, vocab_size)
test_model()

final shape: torch.Size([2, 100])


In [115]:
"""Function to train the model, which returns the train loss"""


def train(model, train_dataloader, loss_func, optimizer):
    model.train()
    train_loss = 0
    for x, y in train_dataloader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = loss_func(output, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    return train_loss / len(train_dataloader)


In [116]:
"""Function to evaluate the model, which returns the validation loss and accuracy"""
def evaluate(model, num_examples, valid_dataloader, loss_func):

    correct = 0
    val_loss = 0

    for x, y in valid_dataloader:
        x, y = x.to(device), y.to(device)

        with torch.no_grad():

          logits = model(x)

        preds = logits.argmax(dim=1)

        loss = loss_func(logits, y)

        val_loss += loss.item()

        correct += (preds==y).sum().item()

    return (correct / num_examples), (val_loss / len(valid_dataloader))

In [125]:

def test_train_eval():
    tokenizer = Tokenizer()
    test_corpus = ['a cat walks in the bad.', 'a dog walks in the good.']
    tokenizer.create_vocab(test_corpus)
    context_size = 3
    dataset_ = IMDBdataset(test_corpus, tokenizer, context_size=context_size)
    dataloader_ = torch.utils.data.DataLoader(dataset_, batch_size=2, shuffle=True)
    model = AttentionModel(vocab_size = len(tokenizer.vocab), embedding_dim = 10, context_size = context_size).to(device)
    loss_func = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    train_loss = train(model, dataloader_, loss_func, optimizer)
    acc, val_loss = evaluate(model, len(dataset_), dataloader_, loss_func)
    assert train_loss > 0
    assert acc >= 0
    assert val_loss > 0
    
test_train_eval()


In [127]:
learning_rate = 0.0001
tokenizer = Tokenizer(max_vocab_token=3000)
tokenizer.create_vocab(x_train)
context_size = 9
dataset_train = IMDBdataset(x_train, tokenizer, context_size=context_size)
dataset_valid = IMDBdataset(x_valid, tokenizer, context_size=context_size)
num_examples = len(dataset_valid)
train_dataloader = torch.utils.data.DataLoader(dataset_train, batch_size=128, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(dataset_valid, batch_size=32, shuffle=True)
model = AttentionModel(vocab_size = len(tokenizer.vocab), embedding_dim = 100, context_size = context_size).to(device)
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
model.to(device)
loss_func.to(device)

CrossEntropyLoss()

In [128]:


epochs = 10
train_losses = []
valid_losses = []
accuracy_train = []
perplexities = []
accuracy, valid_loss = evaluate(model, num_examples,valid_dataloader, loss_func)
print(f"Pré treino; Validation Loss: {valid_loss:.3f}; Perplexity: {np.exp(valid_loss):.3f}; Accuracy: {accuracy:.3f}")
for t in range(epochs):
  train_loss = train(model, train_dataloader, loss_func, optimizer)
  train_losses.append(train_loss)
  accuracy, valid_loss = evaluate(model, num_examples, valid_dataloader, loss_func)
  valid_losses.append(valid_loss)
  accuracy_train.append(accuracy)
  perplexities.append(np.exp(train_loss))
  print(f"Epoch: {t+1}; Train Loss: {train_loss:.3f}; Perplexity: {np.exp(train_loss):.3f}; Validation Loss: {valid_loss:.3f}; Accuracy: {accuracy:.3f}")



Pré treino; Validation Loss: 8.018; Perplexity: 3035.908; Accuracy: 0.000
Epoch: 1; Train Loss: 5.509; Perplexity: 246.830; Validation Loss: 5.195; Accuracy: 0.119


KeyboardInterrupt: 