<a href="https://colab.research.google.com/github/alxmarqs/LLMtopics/blob/main/gsi073_aula0_attention_comparative.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparação dos dados

Esta tarefa consiste em inverter sequências de caracteres. Exemplo: **aabcd** vira **dcbaa**.


In [None]:
import torch
import torch.nn as nn
import random
import torch.nn.functional as F

# Alphabet: four letters plus the space, which is also used as the padding symbol
chars = [*"abcd "]
# Every character gets an integer id (its position in `chars`)
vocab = dict(zip(chars, range(len(chars))))
# Reverse table, used to turn ids back into characters
inv_vocab = dict(enumerate(chars))
vocab_size = len(vocab)

def encode(s):
    """Convert a string into a 1-D ``torch.long`` tensor of vocabulary ids.

    Each character is looked up in ``vocab``; a character that is not a key
    of ``vocab`` raises ``KeyError``.
    """
    token_ids = list(map(vocab.__getitem__, s))
    return torch.tensor(token_ids, dtype=torch.long)

def decode(t):
    """Turn an iterable of token ids back into the string they spell."""
    letters = []
    for token_id in t:
        # int() accepts plain ints as well as 0-d tensors
        letters.append(inv_vocab[int(token_id)])
    return ''.join(letters)

def random_seq(n=6):
    """Return a random string of ``n`` letters drawn from the alphabet.

    The last entry of ``chars`` (the space / padding symbol) is never drawn.
    """
    alphabet = chars[:-1]
    picked = [random.choice(alphabet) for _ in range(n)]
    return ''.join(picked)

# Generate the data: 50,000 random strings, each paired with its reversal
pairs = [
    (encode(text), encode(text[::-1]))
    for text in (random_seq() for _ in range(50000))
]

# Length of the longest source sequence
max_len = max(src.size(0) for src, _ in pairs)

def pad(x):
    """Right-pad a 1-D id tensor with the space id up to ``max_len``.

    The filler is created with the same dtype and device as ``x``. Building
    it with ``torch.tensor([...])`` yields an empty float32 tensor whenever
    no padding is needed, and concatenating that with integer ids can
    promote the result to float.

    A sequence that is already ``max_len`` long or longer gets nothing
    appended and is not cropped.
    """
    missing = max(max_len - len(x), 0)
    filler = x.new_full((missing,), vocab[' '])
    return torch.cat([x, filler], dim=0)

# Stack the padded sources and the padded targets into two batch tensors
inputs = torch.stack([pad(src) for src, _ in pairs])
targets = torch.stack([pad(tgt) for _, tgt in pairs])

# Shuffled mini-batches of 128 (source, target) rows
train_ds = torch.utils.data.TensorDataset(inputs, targets)
train_dl = torch.utils.data.DataLoader(dataset=train_ds, batch_size=128, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

## Veja um par

In [None]:
# Show the second (source, target) pair: two id tensors, the target encoding the reversed string
print(pairs[1])

# Definição do modelo Seq2Seq com GRU

In [None]:
class Encoder(nn.Module):
    """Embeds the source tokens and runs them through a batch-first GRU."""

    def __init__(self, vocab_size, emb_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.gru = nn.GRU(input_size=emb_size, hidden_size=hidden_size, batch_first=True)

    def forward(self, x):
        """Return ``(per-step GRU outputs, final hidden state)`` for token ids ``x``."""
        embedded = self.embed(x)
        encoder_states, final_hidden = self.gru(embedded)
        # Both are needed downstream: the per-step outputs feed the attention,
        # the final hidden state initialises the decoder.
        return encoder_states, final_hidden

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class OneHeadAttention(nn.Module):
    """Single-head scaled dot-product attention of a decoder state over the encoder states."""

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Bias-free linear maps (H -> H) producing queries, keys and values
        self.W_q = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_k = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_v = nn.Linear(hidden_size, hidden_size, bias=False)

        # Scaling factor 1 / sqrt(d_k), with d_k = hidden_size
        self.scale = hidden_size ** -0.5

    def forward(self, decoder_hidden, encoder_outputs):
        """
        decoder_hidden: (B, 1, H)
        encoder_outputs: (B, S, H)

        Returns:
          context: (B, 1, H)
          attn_weights: (B, 1, S)
        """
        # Query comes from the decoder state; keys and values from the encoder states
        queries = self.W_q(decoder_hidden)     # (B, 1, H)
        keys = self.W_k(encoder_outputs)       # (B, S, H)
        values = self.W_v(encoder_outputs)     # (B, S, H)

        # Scaled dot-product scores: Q · K^T / sqrt(d_k)
        scaled_scores = self.scale * torch.bmm(queries, keys.transpose(1, 2))  # (B, 1, S)

        # Normalise over the source positions
        attn_weights = scaled_scores.softmax(dim=-1)

        # Context is the attention-weighted sum of the values
        context = torch.bmm(attn_weights, values)  # (B, 1, H)
        return context, attn_weights


In [None]:
class Decoder(nn.Module):
    """GRU decoder that attends over the encoder states at every time step."""

    def __init__(self, vocab_size, emb_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_size)
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True)
        self.attn = OneHeadAttention(hidden_size)

        # The classifier reads [GRU output ; attention context], hence 2 * H inputs
        self.fc = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, x, h, encoder_outputs):
        """
        x: previous (ground-truth) tokens  (B, T)
        h: initial decoder state           (1, B, H)
        encoder_outputs: all encoder h_s   (B, S, H)

        Returns the logits for every step, (B, T, V), and the last hidden state.
        """
        embedded = self.embed(x)  # (B, T, E)
        hidden = h
        step_logits = []

        # Walk the time axis one embedded token at a time
        for emb_t in embedded.unbind(dim=1):
            out_t, hidden = self.gru(emb_t.unsqueeze(1), hidden)  # out_t: (B, 1, H)

            # Attention over the encoder states; the weights are not used here
            context, _ = self.attn(out_t, encoder_outputs)

            # Classify the concatenation [out_t ; context]
            step_logits.append(self.fc(torch.cat((out_t, context), dim=-1)))  # (B, 1, V)

        return torch.cat(step_logits, dim=1), hidden


In [None]:
class Seq2Seq(nn.Module):
    """Couples an encoder and a decoder for teacher-forced training."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        """Encode ``src`` and decode ``tgt`` without its last column; return the logits."""
        memory, state = self.encoder(src)
        teacher_input = tgt[:, :-1]
        logits, _ = self.decoder(teacher_input, state, memory)
        return logits

# Código para usar o modelo treinado: inferência

In [None]:
def decode_step(decoder, token, h, encoder_outputs):
    """
    Run a single greedy decoding step.

    - token: tensor (B, 1)
    - h: decoder hidden state (1, B, H)
    - encoder_outputs: (B, S, H)

    Returns the arg-max token of the last position, shape (B, 1), and the
    hidden state returned by the decoder.
    """
    step_logits, h = decoder(token, h, encoder_outputs)  # (B, 1, V)
    last_position = step_logits[:, -1, :]
    next_token = last_position.argmax(dim=-1, keepdim=True)  # (B, 1)
    return next_token, h


def predict(model, seq, max_len=10):
    """Greedily decode ``max_len`` tokens for the input string ``seq``.

    The ``max_len`` argument only sets the number of decoding steps here;
    ``pad`` keeps using the module-level ``max_len``. The model is switched
    to eval mode and left there.
    """
    model.eval()
    with torch.no_grad():
        # Encode the input as a batch of one
        src = pad(encode(seq))[None].to(device, dtype=torch.long)

        # The encoder yields all of its states plus the final hidden state
        encoder_outputs, h = model.encoder(src)

        # Decoding starts from the space token
        token = torch.tensor([[vocab[' ']]], dtype=torch.long, device=device)

        generated_ids = []
        steps_left = max_len
        while steps_left > 0:
            token, h = decode_step(model.decoder, token, h, encoder_outputs)
            generated_ids.append(token.item())
            steps_left -= 1

        return decode(generated_ids)


# Preparação para treino

In [None]:
# Embeddings and GRU states share the same width
emb_size = hidden_size = 64

encoder = Encoder(vocab_size, emb_size, hidden_size)
decoder = Decoder(vocab_size, emb_size, hidden_size)
model = Seq2Seq(encoder, decoder)
model = model.to(device)

# Targets equal to the space id (the padding symbol) do not contribute to the loss
loss_fn = nn.CrossEntropyLoss(ignore_index=vocab[' '])
opt = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Execução do treino

In [None]:
# Teacher-forced training for 10 epochs.
#
# The decoder has to see a start symbol before the first target token.
# Feeding it y[:-1] and scoring against y[1:] means it is never trained to
# emit the first output character, and the start symbol used by predict()
# never appears as a decoder input during training.
#
# Seq2Seq.forward drops the last column of the target it receives, so passing
# [<start>, y_0 .. y_{T-1}] makes the decoder read [<start>, y_0 .. y_{T-2}]
# and produce exactly one row of logits per target token.
start_id = vocab[' ']  # same start symbol predict() feeds first
for epoch in range(10):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device, dtype=torch.long), yb.to(device, dtype=torch.long)
        start_col = yb.new_full((yb.size(0), 1), start_id)  # (B, 1)
        opt.zero_grad()
        logits = model(xb, torch.cat([start_col, yb], dim=1))  # (B, T, V)
        loss = loss_fn(logits.reshape(-1, vocab_size), yb.reshape(-1))
        loss.backward()
        opt.step()
        total_loss += loss.item()

    # Mean batch loss of the epoch
    print(f"Epoch {epoch+1}: loss={total_loss/len(train_dl):.4f}")

# Exercício
Compare os resultados da atenção de Luong e de uma cabeça de atenção do Transformer.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Configuração dos dados
chars = [*"abcd "]                            # four letters plus the space / padding symbol
vocab = dict(zip(chars, range(len(chars))))   # character -> id
inv_vocab = dict(enumerate(chars))            # id -> character
vocab_size = len(vocab)

def encode(s):
    """Return the vocabulary ids of the characters of ``s`` as a 1-D long tensor."""
    ids = []
    for ch in s:
        ids.append(vocab[ch])
    return torch.tensor(ids, dtype=torch.long)

def decode(t):
    """Map every id in ``t`` through ``inv_vocab`` and join the characters."""
    return ''.join(map(lambda token_id: inv_vocab[int(token_id)], t))

def random_seq(n=6):
    """Build a random ``n``-letter string; the trailing space of ``chars`` is excluded."""
    letters = chars[:-1]
    out = []
    for _ in range(n):
        out.append(random.choice(letters))
    return ''.join(out)

# Generate the data: 50,000 random strings, each paired with its reversal
pairs = [
    (encode(text), encode(text[::-1]))
    for text in (random_seq() for _ in range(50000))
]
# Length of the longest source sequence
max_len = max(src.size(0) for src, _ in pairs)

def pad(x):
    """Right-pad a 1-D id tensor with the space id up to ``max_len``.

    The filler shares ``x``'s dtype and device. A filler built with
    ``torch.tensor([...])`` is an empty float32 tensor whenever no padding
    is needed, and concatenating it with integer ids can promote the result
    to float.

    A sequence that is already ``max_len`` long or longer gets nothing
    appended and is not cropped.
    """
    missing = max(max_len - len(x), 0)
    filler = x.new_full((missing,), vocab[' '])
    return torch.cat([x, filler], dim=0)

# Stack the padded sources and the padded targets into two batch tensors
inputs = torch.stack([pad(src) for src, _ in pairs])
targets = torch.stack([pad(tgt) for _, tgt in pairs])

# Shuffled mini-batches of 128 (source, target) rows
train_ds = torch.utils.data.TensorDataset(inputs, targets)
train_dl = torch.utils.data.DataLoader(dataset=train_ds, batch_size=128, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# ============================================================
# ATENÇÃO DE LUONG (dot, general, concat)
# ============================================================
class LuongAttention(nn.Module):
    """Luong-style attention with a ``dot``, ``general`` or ``concat`` score.

    Args:
        hidden_size: size H of the decoder and encoder states.
        attn_type: ``'dot'``, ``'general'`` or ``'concat'``.

    Raises:
        ValueError: if ``attn_type`` is not one of the three supported names.
    """

    def __init__(self, hidden_size, attn_type='dot'):
        super().__init__()
        # Reject unknown score names up front; otherwise the mistake only
        # shows up inside forward() as an unbound ``attn_scores`` variable.
        if attn_type not in ('dot', 'general', 'concat'):
            raise ValueError(
                f"attn_type must be 'dot', 'general' or 'concat', got {attn_type!r}"
            )
        self.hidden_size = hidden_size
        self.attn_type = attn_type

        # 'dot' has no trainable parameters
        if attn_type == 'general':
            self.W_a = nn.Linear(hidden_size, hidden_size, bias=False)
        elif attn_type == 'concat':
            self.W_a = nn.Linear(hidden_size * 2, hidden_size, bias=False)
            self.v_a = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, decoder_hidden, encoder_outputs):
        """
        decoder_hidden: (B, 1, H)
        encoder_outputs: (B, S, H)

        Returns:
          context: (B, 1, H), the attention-weighted sum of ``encoder_outputs``
          attn_weights: (B, 1, S), softmax of the scores over the source axis
        """
        if self.attn_type == 'dot':
            # score(h_t, h_s) = h_t^T · h_s
            attn_scores = torch.bmm(decoder_hidden, encoder_outputs.transpose(1, 2))

        elif self.attn_type == 'general':
            # score(h_t, h_s) = h_t^T · W_a · h_s
            transformed = self.W_a(encoder_outputs)
            attn_scores = torch.bmm(decoder_hidden, transformed.transpose(1, 2))

        elif self.attn_type == 'concat':
            # score(h_t, h_s) = v_a^T · tanh(W_a · [h_t; h_s])
            seq_len = encoder_outputs.size(1)
            # Repeat the single decoder state along the source axis
            decoder_expanded = decoder_hidden.expand(-1, seq_len, -1)
            combined = torch.cat([decoder_expanded, encoder_outputs], dim=-1)
            attn_scores = self.v_a(torch.tanh(self.W_a(combined)))  # (B, S, 1)
            attn_scores = attn_scores.transpose(1, 2)               # (B, 1, S)

        else:
            # Only reachable if attn_type was reassigned after construction
            raise ValueError(f"unsupported attn_type {self.attn_type!r}")

        attn_weights = F.softmax(attn_scores, dim=-1)
        context = torch.bmm(attn_weights, encoder_outputs)

        return context, attn_weights

# ============================================================
# ATENÇÃO DO TRANSFORMER (Scaled Dot-Product com Q, K, V)
# ============================================================
class TransformerAttention(nn.Module):
    """One head of Transformer-style scaled dot-product attention."""

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Bias-free projections for queries, keys and values (H -> H)
        self.W_q = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_k = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_v = nn.Linear(hidden_size, hidden_size, bias=False)
        # 1 / sqrt(d_k), with d_k = hidden_size
        self.scale = hidden_size ** -0.5

    def forward(self, decoder_hidden, encoder_outputs):
        """
        decoder_hidden: (B, 1, H)
        encoder_outputs: (B, S, H)

        Returns ``(context, attn_weights)``: the weighted sum of the projected
        values and the softmax weights over the source axis.
        """
        queries = self.W_q(decoder_hidden)
        keys = self.W_k(encoder_outputs)
        values = self.W_v(encoder_outputs)

        raw_scores = torch.bmm(queries, keys.transpose(1, 2))
        attn_weights = F.softmax(raw_scores * self.scale, dim=-1)
        return torch.bmm(attn_weights, values), attn_weights

# ============================================================
# ENCODER E DECODER COM ATENÇÃO CONFIGURÁVEL
# ============================================================
class Encoder(nn.Module):
    """Token embedding followed by a batch-first GRU."""

    def __init__(self, vocab_size, emb_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.gru = nn.GRU(input_size=emb_size, hidden_size=hidden_size, batch_first=True)

    def forward(self, x):
        """Return ``(per-step GRU outputs, final hidden state)`` for token ids ``x``."""
        encoder_states, final_hidden = self.gru(self.embed(x))
        return encoder_states, final_hidden

class Decoder(nn.Module):
    """GRU decoder that calls a pluggable attention module at every step.

    ``attention_module`` is called as ``attention_module(out_t, encoder_outputs)``
    and must return ``(context, attn_weights)``.
    """

    def __init__(self, vocab_size, emb_size, hidden_size, attention_module):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_size)
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True)
        self.attn = attention_module
        # The classifier reads [GRU output ; attention context], hence 2 * H inputs
        self.fc = nn.Linear(2 * hidden_size, vocab_size)

    def forward(self, x, h, encoder_outputs):
        """Decode ``x`` step by step.

        Returns the logits of all steps concatenated on dim 1, the last
        hidden state, and a list with the attention weights of each step.
        """
        embedded = self.embed(x)
        hidden = h
        step_logits, attn_weights_list = [], []

        # Walk the time axis one embedded token at a time
        for emb_t in embedded.unbind(dim=1):
            out_t, hidden = self.gru(emb_t.unsqueeze(1), hidden)
            context, weights_t = self.attn(out_t, encoder_outputs)
            attn_weights_list.append(weights_t)
            step_logits.append(self.fc(torch.cat((out_t, context), dim=-1)))

        return torch.cat(step_logits, dim=1), hidden, attn_weights_list

class Seq2Seq(nn.Module):
    """Encoder/decoder pair that also exposes the decoder's attention weights."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        """Encode ``src``, decode ``tgt`` without its last column; return ``(logits, attn_weights)``."""
        memory, state = self.encoder(src)
        teacher_input = tgt[:, :-1]
        logits, _, attn_weights = self.decoder(teacher_input, state, memory)
        return logits, attn_weights

# ============================================================
# FUNÇÃO DE TREINO
# ============================================================
def train_model(model, epochs=10):
    """Train ``model`` with teacher forcing and return the mean loss of each epoch.

    Args:
        model: called as ``model(src, tgt)`` and expected to return
            ``(logits, attn_weights)``. It is assumed to drop the last column
            of ``tgt`` before decoding, as ``Seq2Seq.forward`` does.
        epochs: number of passes over ``train_dl``.

    Returns:
        A list with the average batch loss of every epoch.

    The target passed to the model is prefixed with a start symbol (the space
    id, which is also the first token fed at inference). Without it the
    decoder would read ``y[:-1]`` and be scored on ``y[1:]``: it would never
    learn to emit the first output character and would never see the start
    symbol during training.
    """
    start_id = vocab[' ']
    loss_fn = nn.CrossEntropyLoss(ignore_index=vocab[' '])
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)

    losses = []
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for xb, yb in train_dl:
            xb, yb = xb.to(device, dtype=torch.long), yb.to(device, dtype=torch.long)
            # [<start>, y_0 .. y_{T-1}]; the model drops the last column, so the
            # decoder reads [<start>, y_0 .. y_{T-2}] and predicts y_0 .. y_{T-1}
            start_col = yb.new_full((yb.size(0), 1), start_id)
            opt.zero_grad()
            logits, _ = model(xb, torch.cat([start_col, yb], dim=1))
            loss = loss_fn(logits.reshape(-1, vocab_size), yb.reshape(-1))
            loss.backward()
            opt.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_dl)
        losses.append(avg_loss)
        print(f"Epoch {epoch+1}: loss={avg_loss:.4f}")

    return losses

# ============================================================
# FUNÇÃO DE PREDIÇÃO COM VISUALIZAÇÃO DE ATENÇÃO
# ============================================================
def predict_with_attention(model, seq):
    """Greedily decode ``len(seq)`` tokens and collect the attention weights.

    Args:
        model: object exposing ``encoder`` and ``decoder``; the decoder must
            provide ``embed``, ``gru``, ``attn`` and ``fc``.
        seq: input string over the vocabulary.

    Returns:
        ``(decoded_string, weights)`` where ``weights`` is a NumPy array with
        one row of attention weights per decoding step.

    The model is switched to eval mode and left there.
    """
    model.eval()
    with torch.no_grad():
        src = pad(encode(seq)).unsqueeze(0).to(device, dtype=torch.long)
        encoder_outputs, h = model.encoder(src)

        # Decoding starts from the space token
        token = torch.tensor([[vocab[' ']]], dtype=torch.long, device=device)
        seq_invertida = []
        all_attn_weights = []

        for _ in range(len(seq)):
            x = model.decoder.embed(token)
            out_t, h = model.decoder.gru(x, h)
            context, attn_w = model.decoder.attn(out_t, encoder_outputs)
            # attn_w is expected to be (1, 1, S). Index the batch and query
            # axes explicitly: a bare squeeze() would also drop the source
            # axis when S == 1 and break the 2-D weights matrix.
            all_attn_weights.append(attn_w[0, 0].cpu().numpy())

            combined = torch.cat([out_t, context], dim=-1)
            logits = model.decoder.fc(combined)
            # Greedy choice; the chosen token is fed back at the next step
            token = logits[:, -1, :].argmax(-1, keepdim=True)
            seq_invertida.append(token.item())

        return decode(seq_invertida), np.array(all_attn_weights)

# ============================================================
# TREINAR MODELOS
# ============================================================
# Train three otherwise identical Seq2Seq models that differ only in the
# attention module. The printed text is restored to proper UTF-8: the
# accented letters and the emoji were mojibake.
print("="*70)
print("COMPARAÇÃO: ATENÇÃO DE LUONG vs TRANSFORMER")
print("="*70)

# Model 1: Luong attention, dot score
print("\n📌 Treinando modelo com Atenção de Luong (DOT)...")
emb_size, hidden_size = 64, 64
encoder1 = Encoder(vocab_size, emb_size, hidden_size)
decoder1 = Decoder(vocab_size, emb_size, hidden_size, LuongAttention(hidden_size, 'dot'))
model_luong_dot = Seq2Seq(encoder1, decoder1).to(device)
losses_luong_dot = train_model(model_luong_dot, epochs=10)

# Model 2: Luong attention, general score
print("\n📌 Treinando modelo com Atenção de Luong (GENERAL)...")
encoder2 = Encoder(vocab_size, emb_size, hidden_size)
decoder2 = Decoder(vocab_size, emb_size, hidden_size, LuongAttention(hidden_size, 'general'))
model_luong_general = Seq2Seq(encoder2, decoder2).to(device)
losses_luong_general = train_model(model_luong_general, epochs=10)

# Model 3: Transformer-style scaled dot-product attention
print("\n📌 Treinando modelo com Atenção do Transformer...")
encoder3 = Encoder(vocab_size, emb_size, hidden_size)
decoder3 = Decoder(vocab_size, emb_size, hidden_size, TransformerAttention(hidden_size))
model_transformer = Seq2Seq(encoder3, decoder3).to(device)
losses_transformer = train_model(model_transformer, epochs=10)

# ============================================================
# VISUALIZAÇÃO E COMPARAÇÃO
# ============================================================
# Compare the three trained models on one test string and plot the results.
# Printed text and plot labels are restored to proper UTF-8 (the accented
# letters were mojibake).
test_seq = "abcdba"

print("\n" + "="*70)
print("RESULTADOS NAS PREDIÇÕES")
print("="*70)

# Greedy prediction and per-step attention weights of each model
pred_luong_dot, attn_luong_dot = predict_with_attention(model_luong_dot, test_seq)
pred_luong_general, attn_luong_general = predict_with_attention(model_luong_general, test_seq)
pred_transformer, attn_transformer = predict_with_attention(model_transformer, test_seq)

print(f"\nSequência de entrada: {test_seq}")
print(f"Sequência esperada:   {test_seq[::-1]}")
print(f"\nLuong (dot):      {pred_luong_dot}")
print(f"Luong (general):  {pred_luong_general}")
print(f"Transformer:      {pred_transformer}")

# 2 x 2 figure: learning curves plus one attention heatmap per model
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Learning curves
axes[0, 0].plot(losses_luong_dot, label='Luong (dot)', linewidth=2, marker='o')
axes[0, 0].plot(losses_luong_general, label='Luong (general)', linewidth=2, marker='s')
axes[0, 0].plot(losses_transformer, label='Transformer', linewidth=2, marker='^')
axes[0, 0].set_xlabel('Época', fontsize=12)
axes[0, 0].set_ylabel('Loss', fontsize=12)
axes[0, 0].set_title('Curvas de Aprendizado', fontsize=14, fontweight='bold')
axes[0, 0].legend(fontsize=11)
axes[0, 0].grid(True, alpha=0.3)

# 2. Luong (dot) attention; columns are limited to the real input positions
sns.heatmap(attn_luong_dot[:, :len(test_seq)],
            xticklabels=list(test_seq),
            yticklabels=list(pred_luong_dot),
            cmap='YlOrRd', annot=True, fmt='.2f',
            cbar_kws={'label': 'Peso de Atenção'},
            ax=axes[0, 1])
axes[0, 1].set_title('Atenção de Luong (dot)', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Entrada', fontsize=12)
axes[0, 1].set_ylabel('Saída', fontsize=12)

# 3. Luong (general) attention
sns.heatmap(attn_luong_general[:, :len(test_seq)],
            xticklabels=list(test_seq),
            yticklabels=list(pred_luong_general),
            cmap='YlGnBu', annot=True, fmt='.2f',
            cbar_kws={'label': 'Peso de Atenção'},
            ax=axes[1, 0])
axes[1, 0].set_title('Atenção de Luong (general)', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Entrada', fontsize=12)
axes[1, 0].set_ylabel('Saída', fontsize=12)

# 4. Transformer attention
sns.heatmap(attn_transformer[:, :len(test_seq)],
            xticklabels=list(test_seq),
            yticklabels=list(pred_transformer),
            cmap='RdPu', annot=True, fmt='.2f',
            cbar_kws={'label': 'Peso de Atenção'},
            ax=axes[1, 1])
axes[1, 1].set_title('Atenção do Transformer', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Entrada', fontsize=12)
axes[1, 1].set_ylabel('Saída', fontsize=12)

plt.tight_layout()
plt.savefig('attention_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

# ============================================================
# ANÁLISE COMPARATIVA
# ============================================================
# Print the written comparison of the three attention variants. The text is
# restored to proper UTF-8: accents, bullets, keycap emoji, box-drawing
# characters and the superscripts were mojibake.
print("\n" + "="*70)
print("ANÁLISE COMPARATIVA")
print("="*70)

print("\n1️⃣ ATENÇÃO DE LUONG:")
print("   • DOT: score(h_t, h_s) = h_t^T · h_s")
print("     - Mais simples e rápida")
print("     - Não tem parâmetros treináveis")
print("     - Funciona bem quando encoder e decoder têm mesma dimensão")

print("\n   • GENERAL: score(h_t, h_s) = h_t^T · W_a · h_s")
print("     - Adiciona matriz de peso W_a")
print("     - Mais flexível, aprende transformação")
print("     - Melhor quando dimensões são diferentes")

print("\n2️⃣ ATENÇÃO DO TRANSFORMER:")
print("   • score(Q, K) = (Q · K^T) / sqrt(d_k)")
print("     - Usa projeções separadas Q, K, V")
print("     - Fator de escala previne gradientes instáveis")
print("     - Mais expressiva e flexível")
print("     - Base para multi-head attention")

print("\n3️⃣ DIFERENÇAS PRINCIPAIS:")
print("   ┌─────────────────┬──────────────┬──────────────┬──────────────┐")
print("   │ Característica  │ Luong (dot)  │ Luong (gen)  │ Transformer  │")
print("   ├─────────────────┼──────────────┼──────────────┼──────────────┤")
print("   │ Parâmetros      │ 0            │ H²           │ 3·H²         │")
print("   │ Projeções       │ Não          │ 1            │ 3 (Q,K,V)    │")
print("   │ Escalonamento   │ Não          │ Não          │ Sim (1/√d)   │")
print("   │ Flexibilidade   │ Baixa        │ Média        │ Alta         │")
print("   │ Custo           │ Baixo        │ Médio        │ Alto         │")
print("   └─────────────────┴──────────────┴──────────────┴──────────────┘")

# Final training loss of each model (last entry of its loss history)
print("\n4️⃣ PERFORMANCE NO TREINO:")
print(f"   • Luong (dot):     Loss final = {losses_luong_dot[-1]:.4f}")
print(f"   • Luong (general): Loss final = {losses_luong_general[-1]:.4f}")
print(f"   • Transformer:     Loss final = {losses_transformer[-1]:.4f}")

print("\n✅ CONCLUSÕES:")
print("   • Transformer geralmente aprende representações mais ricas")
print("   • Luong é mais eficiente computacionalmente")
print("   • General oferece bom balanço entre complexidade e performance")
print("   • Escalonamento do Transformer ajuda estabilidade no treino")