<a href="https://colab.research.google.com/github/arelkeselbri/gsi073/blob/main/mini_optimus_prime.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GSI073 - Tópicos Especiais de Inteligência Artificial (Large Language Models) - Prof. Marcelo Keese Albertini

Este código foi escrito em aula para demonstrar rapidamente como é a arquitetura de um Transformer.

In [None]:
import torch
from torch import nn

from torch.nn import functional as F

In [None]:
class MeuBloco(nn.Module):
  def __init__(self, n_heads, model_dim, vocab_size):
    super().__init__()
    self.norm1 = nn.LayerNorm(model_dim, bias = False)
    self.norm2 = nn.LayerNorm(model_dim, bias = False)

    self.attention = nn.MultiheadAttention(embed_dim = model_dim,
                                           num_heads = n_heads
                                           )
    self.ffn = nn.Sequential(nn.Linear(model_dim, 2*model_dim),
                             nn.ReLU(),
                             nn.Linear(2*model_dim, model_dim))

  def forward(self, x):

    res_atencao, _ = self.attention(x, x, x)
    x = self.norm1(x + res_atencao)

    res_ffn = self.ffn(x)
    x = self.norm2(res_ffn + x)
    return x



class MeuEncoder(nn.Module):
  """Token embedding followed by a stack of ``MeuBloco`` encoder blocks.

  Args:
    n_layers: number of encoder blocks to stack.
    n_heads: number of attention heads inside each block.
    model_dim: embedding width, shared by every block.
    vocab_size: number of rows in the embedding table.
  """

  def __init__(self, n_layers, n_heads, model_dim, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, model_dim)
    # The depth of the stack is n_layers; n_heads only configures the
    # attention inside each block and must not decide how many blocks exist.
    self.layers = nn.ModuleList([
        MeuBloco(n_heads, model_dim, vocab_size)
        for _ in range(n_layers)
    ])

  def forward(self, x):
    """Embed the token ids in ``x`` and pass them through every block in order.

    Args:
      x: integer tensor of token ids, each in ``[0, vocab_size)``.

    Returns:
      Tensor with the shape of ``x`` plus a trailing ``model_dim`` axis.
    """
    x = self.embedding(x)

    for layer in self.layers:
      x = layer(x)

    return x



In [None]:
# Tiny demo encoder: small enough to inspect every weight by eye.
mini_llm = MeuEncoder(
    n_layers=2,
    n_heads=2,
    model_dim=8,
    vocab_size=10,
)

In [None]:
# Forward pass on a 1-D tensor of four token ids; as the last expression in
# the cell, the resulting tensor is displayed by the notebook.
mini_llm(torch.tensor([0,1,3,4]))

In [None]:
# Display the module tree (the nn.Module repr) of the encoder.
mini_llm

In [None]:
# Freeze the embedding table: autograd will no longer compute gradients for it.
mini_llm.embedding.weight.requires_grad = False

In [None]:
# Display the embedding parameter, including its requires_grad flag.
mini_llm.embedding.weight

In [None]:
class MeuBlocoDecoder(nn.Module):
  """Single post-norm Transformer decoder block with embedding and LM head.

  Pipeline: token embedding -> causal self-attention -> cross-attention over
  the encoder output -> feed-forward net -> linear projection to the
  vocabulary -> softmax. Each of the three sub-layers is wrapped in a
  residual connection followed by layer normalization.

  Args:
    model_dim: width of the token representations.
    vocab_size: size of the embedding table and of the output distribution.
    n_heads: number of heads in both attention layers.
    n_layers: accepted as part of the constructor signature but not used;
      this class models exactly one block.
  """

  def __init__(self, model_dim, vocab_size, n_heads, n_layers):
    super().__init__()
    self.norm1 = nn.LayerNorm(model_dim)
    self.norm2 = nn.LayerNorm(model_dim)
    self.norm3 = nn.LayerNorm(model_dim)

    self.embedding = nn.Embedding(vocab_size, model_dim)

    self.att_cros = nn.MultiheadAttention(model_dim, n_heads)
    self.att_self = nn.MultiheadAttention(model_dim, n_heads)

    self.ffn = nn.Sequential(nn.Linear(model_dim, 2*model_dim), nn.ReLU(), nn.Linear(2*model_dim, model_dim))

    self.lm_head = nn.Linear(model_dim, vocab_size)

    # Normalize over the vocabulary axis (the last one) so that each
    # position gets its own probability distribution over tokens.
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x, encoder_output):
    """Return next-token probabilities for every position of ``x``.

    Args:
      x: integer tensor of target token ids with the sequence axis first
        (the attention layers use their default, sequence-first layout).
      encoder_output: encoder representations whose last dimension is
        ``model_dim``; its sequence length may differ from that of ``x``.

    Returns:
      Tensor shaped like ``x`` plus a trailing ``vocab_size`` axis; the
      values along that last axis sum to 1.
    """
    x = self.embedding(x)

    # Causal mask: True marks a forbidden (query, key) pair, so position i
    # may only attend to positions <= i.
    seq_len = x.size(0)
    causal_mask = torch.triu(
        torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
        diagonal=1)

    # MultiheadAttention returns (output, weights); only the output is used.
    res_att_self, _ = self.att_self(x, x, x, attn_mask=causal_mask)
    x = self.norm1(x + res_att_self)

    # Cross-attention: queries come from the decoder, keys and values from
    # the encoder, so the result keeps the decoder's sequence length.
    res_att_cros, _ = self.att_cros(x, encoder_output, encoder_output)
    x = self.norm2(x + res_att_cros)

    res_ffn = self.ffn(x)
    x = self.norm3(x + res_ffn)

    return self.softmax(self.lm_head(x))

