In [9]:
# Store a name and print it in a short sentence.
nome = "Arthur"
print(f"Meu nome é {nome}")


Meu nome é Arthur


#  Exercício: Modelo de Linguagem com auto-atenção (versão eficiente)

Este exercício é similar ao da aula 5, mas iremos agora treinar *eficientemente* uma rede neural com uma ou mais camadas de auto-atenção para prever a próxima palavra de um texto, dadas as palavras anteriores como entrada.

Para tanto, deve-se implementar:
1. A máscara causal de atenção. Ela possibilitará que, durante o treinamento, com apenas uma forward+backward pass na rede, tenhamos as losses para todos os tokens de entrada (slide 117).
2. A máscara de PADs, que permite que usemos sequências de comprimento variável no mesmo batch (slide 118).
3. Múltiplas cabeças.

## Importação dos pacotes

In [10]:
import collections
import itertools
import functools
import math
import os
import random
import re

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook
from typing import List


In [3]:
# Check which GPU we are using
!nvidia-smi

Mon Oct 17 19:23:57 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   55C    P5    10W /  N/A |    725MiB /  5944MiB |     23%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print("Using {}".format(device))


Using cuda:0


# Carregamento do dataset 

Primeiro, fazemos download do dataset:

In [5]:
# Download the aclImdb archive unless the file is already present (-nc = no-clobber),
# then extract it into the current working directory.
!wget -nc http://files.fast.ai/data/aclImdb.tgz 
!tar -xzf aclImdb.tgz

File ‘aclImdb.tgz’ already there; not retrieving.



## Carregando o dataset

Criaremos uma divisão de treino (80%) e validação (20%) artificialmente.

Nota: Evite ao máximo olhar o dataset de teste, para não ficar enviesado pelo que será testado. Em aplicações reais, o dataset de teste só estará disponível no futuro, ou seja, quando o usuário começar a testar o seu produto.

In [11]:
def load_texts(folder):
    """Read every file in ``folder`` and return the contents as a list of strings.

    Args:
        folder: Path of a directory; every entry in it is opened as a text file.

    Returns:
        list[str]: One string per file, ordered by file name.
    """
    texts = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible across runs and machines.
    for path in sorted(os.listdir(folder)):
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale-dependent default encoding.
        with open(os.path.join(folder, path), encoding="utf-8") as f:
            texts.append(f.read())
    return texts


# Fixed seed for the train/validation shuffle so the split is the same on every run.
SPLIT_SEED = 42
# Fraction of the original training reviews kept for training; the rest becomes validation.
TRAIN_FRACTION = 0.8

x_train_pos = load_texts("aclImdb/train/pos")
x_train_neg = load_texts("aclImdb/train/neg")
x_test_pos = load_texts("aclImdb/test/pos")
x_test_neg = load_texts("aclImdb/test/neg")

x_train = x_train_pos + x_train_neg
x_test = x_test_pos + x_test_neg

# Shuffle the training set before the train/valid split so both splits mix
# positive and negative reviews. A dedicated seeded generator is used so the
# shuffle does not depend on (or disturb) the global `random` state.
random.Random(SPLIT_SEED).shuffle(x_train)

n_train = int(TRAIN_FRACTION * len(x_train))

x_valid = x_train[n_train:]
x_train = x_train[:n_train]

print(len(x_train), "amostras de treino.")
print(len(x_valid), "amostras de desenvolvimento.")
print(len(x_test), "amostras de teste.")

# Preview the first 100 characters of a few samples from both ends of each split.
for title, samples in (
    ("3 primeiras amostras treino:", x_train[:3]),
    ("3 últimas amostras treino:", x_train[-3:]),
    ("3 primeiras amostras validação:", x_valid[:3]),
    ("3 últimas amostras validação:", x_valid[-3:]),
):
    print(title)
    for x in samples:
        print(x[:100])


20000 amostras de treino.
5000 amostras de desenvolvimento.
25000 amostras de teste.
3 primeiras amostras treino:
Terrfic film with a slightyly slow start - give it a chance to start cooking. Story builds in intere
my friend bought the movie for 5 (its is not even 1 cent worth), because they wrote it was like Ame
I haven't laughed this hard at a movie in a long time. I got to go to an advance screening, and was 
3 últimas amostras treino:
I saw this film last night following a lot of good reviews from many sources. I would like to point 
Parsifal (1982) Starring Michael Kutter, Armin Jordan, Robert Lloyd, Martin Sperr, Edith Clever, Aag
This is a bit of a puzzle for a lot of the artsy Lynch crowd. They tend to try to write this off as 
3 primeiras amostras validação:
Robot Jox tries hard, but is fundamentally a series of fight scenes strung together -- robot against
On Steve Irwin's show, he's hillarious. He doesn't even try to be funny and he just is but his movie
I didn't expect a m

In [1]:
# Load the pretrained tokenizer published under the name "t5-base".
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [70]:
class IMDBDataset(torch.utils.data.Dataset):
    """Fixed-length next-token-prediction windows cut from a list of texts.

    Every text is tokenized without padding or truncation and then sliced into
    windows of ``max_seq_length + 1`` tokens, taken every ``max_seq_length``
    tokens (so consecutive windows share one token). Only complete windows are
    kept: texts with at most ``max_seq_length`` tokens and the incomplete tail
    of longer texts are dropped.

    Args:
        corpus: Raw texts; the literal ``"<br />"`` is replaced by a space
            before tokenization.
        tokenizer: Object exposing ``batch_encode_plus`` and returning a mapping
            with ``"input_ids"`` and ``"attention_mask"`` lists.
        max_seq_length: Number of input tokens per sample.

    Attributes:
        tokenized: Raw output of ``tokenizer.batch_encode_plus``.
        data: int32 tensor of shape ``(num_windows, 2, max_seq_length + 1)``;
            index 0 of the second axis holds token ids, index 1 the attention mask.
    """

    def __init__(self, corpus: List[str], tokenizer, max_seq_length):

        self.tokenizer = tokenizer

        self.tokenized = self.tokenizer.batch_encode_plus(
            [x.replace("<br />", " ") for x in corpus],
            padding=False,
            truncation=False,
            return_tensors=None,
            return_attention_mask=True,
        )

        # One extra token per window: inputs are window[:-1], targets are window[1:].
        window = max_seq_length + 1

        data = []
        for tokens, attention_masks in zip(
            self.tokenized["input_ids"], self.tokenized["attention_mask"]
        ):
            assert len(tokens) == len(attention_masks)
            # The range stops early enough that every slice has exactly `window` tokens.
            data.extend(
                [tokens[i : i + window], attention_masks[i : i + window]]
                for i in range(0, len(tokens) - max_seq_length, max_seq_length)
            )

        # The explicit reshape keeps the (N, 2, window) layout even when no
        # window was produced (N == 0).
        self.data = torch.tensor(data, dtype=torch.int32).reshape(
            len(data), 2, window
        )

    def __len__(self):
        """Return the number of windows."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, input_mask, target_ids, target_mask)`` for one window.

        Inputs are the first ``max_seq_length`` tokens of the window; targets are
        the same window shifted one position to the right. ``target_ids`` is cast
        to int64; the other three tensors stay int32.
        """
        ids, attention = self.data[index]
        return (
            ids[:-1],
            attention[:-1],
            ids[1:].long(),
            attention[1:],
        )


In [8]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are bias-free linear projections of the same input.
    The projected features are split evenly across ``heads`` heads, attention is
    computed independently per head, and the concatenated heads go through a
    final bias-free output projection.

    Args:
        emb_dim: Size of the input/output feature dimension.
        heads: Number of attention heads; must divide ``emb_dim``.

    Raises:
        ValueError: If ``emb_dim`` is not divisible by ``heads``.
    """

    def __init__(self, emb_dim, heads) -> None:
        super(MultiHeadSelfAttention, self).__init__()
        if emb_dim % heads != 0:
            raise ValueError(
                f"emb_dim ({emb_dim}) must be divisible by heads ({heads})"
            )
        self.heads = heads
        self.Wq = nn.Linear(emb_dim, emb_dim, bias=False)
        self.Wk = nn.Linear(emb_dim, emb_dim, bias=False)
        self.Wv = nn.Linear(emb_dim, emb_dim, bias=False)
        self.Wo = nn.Linear(emb_dim, emb_dim, bias=False)

    def forward(self, input_embeddings, mask=None):
        """Apply self-attention.

        Args:
            input_embeddings: Tensor of shape ``(batch, seq_len, emb_dim)``.
            mask: Optional tensor broadcastable to
                ``(batch, heads, seq_len, seq_len)``. Entries equal to 0 mark
                (query, key) pairs that must not attend to each other.

        Returns:
            Tensor of shape ``(batch, seq_len, emb_dim)``.
        """
        batch_size, seq_len = input_embeddings.shape[0], input_embeddings.shape[1]

        def split_heads(projected):
            # (batch, seq, emb) -> (batch, heads, seq, emb / heads)
            return projected.reshape(batch_size, seq_len, self.heads, -1).transpose(1, 2)

        Q = split_heads(self.Wq(input_embeddings))
        K = split_heads(self.Wk(input_embeddings))
        V = split_heads(self.Wv(input_embeddings))

        # Scale by sqrt(head_dim) to keep the logits' magnitude independent of head size.
        scores = Q @ K.transpose(-2, -1) / math.sqrt(Q.shape[-1])

        if mask is not None:
            # A large finite negative (rather than -inf) keeps the softmax
            # well-defined even for rows where every key is masked.
            scores = scores.masked_fill(mask == 0, -1e8)

        attention_weights = torch.softmax(scores, dim=-1)

        E = attention_weights @ V

        # (batch, heads, seq, head_dim) -> (batch, seq, emb): concatenate the heads.
        E = E.transpose(1, 2).reshape(batch_size, seq_len, -1)

        # Output projection. The layer is registered as `Wo`; no `out_proj`
        # attribute exists on this module.
        return self.Wo(E)


In [None]:
class MyAttentionModel(nn.Module):
    """Single-block causal language model built on multi-head self-attention.

    Token embeddings plus learned positional embeddings go through one
    self-attention layer and one feed-forward layer, each with a skip
    connection, followed by a linear head producing vocabulary logits.

    Args:
        max_seq_length: Longest sequence the model accepts; sizes the positional
            embeddings and the causal mask.
        vocab_size: Number of token ids (embedding rows and output logits).
        embedding_dim: Width of the embeddings and hidden states.
        heads: Number of attention heads.
        eos_token_id: Stored on the instance as ``eos_token_id``; not used by
            ``forward``.

    Attributes:
        generating: When True, ``forward`` returns logits for the last position
            only.
        causal_mask: Lower-triangular ``(max_seq_length, max_seq_length)`` tensor
            of ones, registered as a buffer so it moves with the module across
            devices and is stored in the state dict without being a parameter.
    """

    def __init__(
        self,
        max_seq_length: int,
        vocab_size: int,
        embedding_dim: int = 50,
        heads: int = 5,
        eos_token_id: int = None,
    ):
        super(MyAttentionModel, self).__init__()

        self.generating = False

        self.eos_token_id = eos_token_id

        self.max_seq_length = max_seq_length

        self.heads = heads

        # Constant, non-trainable tensor: a buffer rather than an nn.Parameter.
        self.register_buffer(
            "causal_mask",
            torch.tril(torch.ones(max_seq_length, max_seq_length)),
        )

        self.dropout = nn.Dropout(p=0.15)

        self.tokens_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.positional_embeddings = nn.Parameter(
            data=torch.normal(0, 0.1, size=(max_seq_length, embedding_dim))
        )

        # MultiHeadSelfAttention names its width parameter `emb_dim`.
        self.self_attention = MultiHeadSelfAttention(
            emb_dim=embedding_dim, heads=heads
        )

        self.feed_foward = nn.Sequential(
            nn.Linear(embedding_dim, 4 * embedding_dim),
            nn.ReLU(),
            nn.Linear(4 * embedding_dim, embedding_dim),
        )

        self.language_head = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x, attention_mask=None):
        """Compute next-token logits.

        Args:
            x: Token ids of shape ``(batch, seq_len)`` with
                ``seq_len <= max_seq_length``.
            attention_mask: Optional tensor with ``batch * seq_len`` elements;
                entries equal to 0 mark key positions that no query may attend
                to. It is combined with the causal mask.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)``, or
            ``(batch, vocab_size)`` for the last position only when
            ``self.generating`` is True.

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_seq_length``.
        """
        batch_size = x.shape[0]
        seq_len = x.shape[1]

        if seq_len > self.max_seq_length:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_length "
                f"{self.max_seq_length}"
            )

        input_embeddings = (
            self.tokens_embeddings(x) + self.positional_embeddings[:seq_len, :]
        )

        # Causal part: query position i may only look at key positions <= i.
        mask = (
            self.causal_mask[:seq_len, :seq_len]
            .reshape(1, 1, seq_len, seq_len)
            .expand(batch_size, self.heads, seq_len, seq_len)
        )

        if attention_mask is not None:
            # Per-sample key mask, broadcast over heads and query positions.
            key_mask = attention_mask.reshape(batch_size, 1, 1, seq_len).expand(
                -1, self.heads, seq_len, -1
            )
            mask = key_mask * mask

        # NOTE(review): dropout is applied to the skip branch, not to the
        # sub-layer output as in the original Transformer; left unchanged here,
        # confirm this is intended.
        E = self.self_attention(input_embeddings, mask=mask)
        E = E + self.dropout(input_embeddings)  # skip-connection

        y = self.feed_foward(E)
        y = y + self.dropout(E)  # skip-connection

        if self.generating:
            logits = self.language_head(y[:, -1, :])
        else:
            logits = self.language_head(y)

        return logits
