<a href="https://colab.research.google.com/github/celestinoalan/inteligencia_artificial/blob/main/transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformers

Criamos um transformer e utilizamos para gerar texto similar às obras de Machado de Assis.

In [1]:
from torch.nn import Linear, Module, Dropout, ModuleList, Sequential, LayerNorm, Embedding
from torch.nn import functional as F
import torch

## Dados

In [2]:
# Baixa machado_de_assis.txt no diretório raíz
!wget https://raw.githubusercontent.com/celestinoalan/inteligencia_artificial/main/data/machado_de_assis.txt
with open('machado_de_assis.txt', 'r', encoding='utf-8') as f:
    text = f.read()

len(text)

--2024-10-24 01:37:45--  https://raw.githubusercontent.com/celestinoalan/inteligencia_artificial/main/data/machado_de_assis.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11340314 (11M) [text/plain]
Saving to: ‘machado_de_assis.txt’


2024-10-24 01:37:46 (232 MB/s) - ‘machado_de_assis.txt’ saved [11340314/11340314]



10977697

In [8]:
class Tokenizer:
    def __init__(self, text):
        self.chars = sorted(list(set(text)))
        self.stoi = { ch:i for i,ch in enumerate(self.chars) }
        self.itos = { i:ch for i,ch in enumerate(self.chars) }

    def encode(self, s):
        return torch.tensor([self.stoi[c] for c in s], dtype=torch.int)

    def decode(self, tokens):
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.to("cpu").numpy().squeeze()
        return ''.join([self.itos[t] for t in tokens])

## Transformer

In [108]:
dropout = 0.2
block_size = 256
n_embeds = 384
tokenizer = Tokenizer(text)
data = tokenizer.encode(text)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]
vocab_size = len(tokenizer.stoi)
device = 'cuda' if torch.cuda.is_available() else 'cpu'


class Head(Module):
    def __init__(self, head_size: int):
        super().__init__()
        self.head_size = head_size
        self.query = Linear(n_embeds, self.head_size, bias=False)
        self.key = Linear(n_embeds, self.head_size, bias=False)
        self.value = Linear(n_embeds, self.head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = Dropout(dropout)

    def forward(self, x):
        q = self.query(x)  # (B x T x C) * (C x Hs) = (B x T x Hs)
        k = self.key(x)  # (B x T x C) * (C x Hs) = (B x T x Hs)
        v = self.value(x)  # (B x T x C) * (C x Hs) = (B x T x Hs)
        affinities = q @ k.transpose(-2, -1) / self.head_size ** 0.5  # (B x T x Hs) @ (B x Hs x T) = (B x T x T)
        affinities = affinities.masked_fill(self.tril[:affinities.shape[1], :affinities.shape[2]] == 0, float("-inf"))  # For a given block T might be < than block size
        affinities = affinities.softmax(dim=-1)
        affinities = self.dropout(affinities)
        return affinities @ v  # (B x T x T) @ (B x T x Hs) = (B x T x Hs)


class MultiHeadAttention(Module):
    def __init__(self, num_heads):
        super().__init__()
        head_size = n_embeds // num_heads
        self.heads = ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = Linear(num_heads * head_size, n_embeds)
        self.dropout = Dropout(dropout)

    def forward(self, x):
        concat = torch.cat([head(x) for head in self.heads], dim=-1)  # (B x T x Hs*NH)
        return self.dropout(self.proj(concat))  # (B x T x Hs*NH) * (Hs*NH x C) = (B x T x C)

class FeedForward(Module):
    def __init__(self, n_embeds):
        super().__init__()
        self.net = Sequential(
            Linear(n_embeds, 4 * n_embeds),
            torch.nn.ReLU(),
            Linear(4 * n_embeds, n_embeds),
            Dropout(dropout)
        )
        self.ln1 = Linear(n_embeds, 4 * n_embeds)
        self.ln2 = Linear(4 * n_embeds, n_embeds)
        self.dropout = Dropout(dropout)

    def forward(self, x):
        return self.dropout(self.ln2(torch.relu(self.ln1(x))))


class Block(Module):
    def __init__(self, n_embeds, num_heads):
        super().__init__()
        self.sa = MultiHeadAttention(num_heads)
        self.ffwd = FeedForward(n_embeds)
        self.ln1 = LayerNorm(n_embeds)
        self.ln2 = LayerNorm(n_embeds)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class Transformer(Module):
    def __init__(self, n_blocks: int, num_heads: int):
        super().__init__()
        self.token_embedding_table = Embedding(vocab_size, n_embeds)
        self.position_embedding_table = Embedding(block_size, n_embeds)
        self.blocks = Sequential(*(Block(n_embeds, num_heads) for _ in range(n_blocks)))
        self.ln_f = LayerNorm(n_embeds)
        self.lm_head = Linear(n_embeds, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B x T x C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T x C)
        x = tok_emb + pos_emb  # (B x T x C)
        x = self.blocks(x)  # (B x T x C)
        x = self.ln_f(x)  # (B x T x C)
        logits = self.lm_head(x)  # (B x T x V)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = torch.nn.functional.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]  # Indices we'll use for predictions
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]  # (B, V)
            probs = torch.nn.functional.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

## Output Without Training

In [109]:
model = Transformer(n_blocks=6, num_heads=6)
model = model.to(device)
tokenizer.decode(model.generate(torch.zeros((1, 1), dtype=torch.long, device=device), 1000))

'\n½t ûÔ\'mv4ã$% éÈ!ïû?—:Pºçç–4MwöùmVwí%DmÔî/&ÛãÍ\nc*v(r¿è”Í"»èEÇQÇìõÈ-p»qÊvFG!RhC_;üFËªïl–ÛÍm9’3W2ùdìûêw–\'nSÉ!ÍdìÚuêDõªã21â°+V´?"â"1fKLömÜ½öF§+-púçt=Uúd+F2oëºZª’WWeÓîoI4Pëum8ï(MxJxht;o*«œPHO)“ùJ$Tz)çYä!½ãzÛ9ôVê«1âEËï)*(dÈá°úûù/Aë§?Ë;`“;’_6Jë4èUñ…Deáígº¡üäpÊgSG´xmgw¿+QñÕ=â«qí,5LqòdyM5GY5ñî¡A)66vVèJ:’“äèpöÛ5ù.QnììòS¡è‘vè‘59H’RB2.ÃfèçA!worò1eèyAîì«7úêr7”fœGè´ÉOh\n9;hlèÜ?`½_Gñ¿5eóñé”Í\'?ûâ¿―sÈQkaéu?‘ÀQõ´X»Ã-N`Oâ–§\n=$ExjwîúvÀIkâY’¿CÓ-0JÛõ’1bEÂ“FCAÍòîep’”óJVó?N2Wí–«Ae\'2…JLfâ88wuJdîç…P(Zi3°ÜÔH…z,J§ûäPt1Qd´sZ§Â(1=i\nTÛ5òB+=1é+àOwO´´M"mS¿ZkòL9=óAîhwBík??Xààk¡L:ÇÁq?―TJïìuÕÓ2–é.3nJaYÁìÜ?úî”*1à6j:Ízúfáü5Ww2!ÛÂHj‘lœ´J`4ÛyHÇkM;UVMP–ôJ+.OÂ45uÈEVªuª+–Ê2H¿C4îPÊiwùjÇO,î:òì¿õõáÈót5l«6h14Bõó5cÀ‘é§fay=P.çE\ne¡’Jî:ÂñBHY‘ëKfâmöîzòòœJòO5èn?(LÉ*RœWtn&=_ã:%»è+!Z`ÛûzR…´uq(ÓqÚGO5Qap(wAn–6unM=‘Hvç6hnªUI¡HÂú)ôâ¡Íwò2è’öU=Q?=ÇìïÀY¿´RõÂ2:ë`çë–né½*;eúw"sE?Êk°áWg–j¡aUY¿jHs´à$«œw)\nÊÀ1‘’kùGóP%õœHÜ1fs―2Ó6JoV;ª1ñny_e?sõÃa´smõwaDd&TÍ«‘–ªóù8pWEÛpw;Ó¿y%―\n½¿Vªò/èq1,4‘ëèaÂdìhûâ8néÓF"“wfG4MY‘ylòó¿âñ+ltÍ5W:ò;nìºÂ´c\'ÃºFi´iéì

## Training

In [110]:
def get_random_batch(split, batch_size):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,), device=device)
    x = torch.stack([data[i:i+block_size] for i in ix]).long()
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]).long()
    return x.to(device), y.to(device)


@torch.no_grad()
def estimate_loss(batch_size):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters, device=device)
        for k in range(eval_iters):
            X, Y = get_random_batch(split, batch_size)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [111]:
batch_size = 64
max_iters = 5000
eval_interval = 100
eval_iters = 200

lr = 3e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
x_batch, y_batch = get_random_batch(train_data, batch_size=batch_size)

In [112]:
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

17.938319 M parameters


In [113]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(batch_size)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_random_batch('train', batch_size)

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(tokenizer.decode(model.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 5.1193, val loss 5.1240
step 100: train loss 2.4613, val loss 2.4715
step 200: train loss 2.4176, val loss 2.4274
step 300: train loss 2.3480, val loss 2.3673
step 400: train loss 2.2241, val loss 2.2544
step 500: train loss 2.1138, val loss 2.1473
step 600: train loss 2.0158, val loss 2.0581
step 700: train loss 1.9369, val loss 1.9804
step 800: train loss 1.8651, val loss 1.9126
step 900: train loss 1.8062, val loss 1.8626
step 1000: train loss 1.7549, val loss 1.8142
step 1100: train loss 1.7155, val loss 1.7718
step 1200: train loss 1.6769, val loss 1.7382
step 1300: train loss 1.6474, val loss 1.7137
step 1400: train loss 1.6152, val loss 1.6825
step 1500: train loss 1.5867, val loss 1.6528
step 1600: train loss 1.5662, val loss 1.6318
step 1700: train loss 1.5388, val loss 1.6058
step 1800: train loss 1.5251, val loss 1.5974
step 1900: train loss 1.5049, val loss 1.5764
step 2000: train loss 1.4887, val loss 1.5573
step 2100: train loss 1.4722, val loss 1.5441
