<a href="https://colab.research.google.com/github/alxmarqs/LLMtopics/blob/main/gsi073_aula0_seq2seanalisado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparação dos dados

Esta tarefa é inverter sequências de caracteres. Exemplo: transformar **aabcd** em **dcbaa**.


In [None]:
import torch
import torch.nn as nn
import random

# Alphabet of the toy task; the trailing space is the symbol pad() fills with.
chars = [*"abcd "]
# Character -> integer id, numbered in alphabet order.
vocab = dict(zip(chars, range(len(chars))))
# Integer id -> character: the inverse table used when decoding.
inv_vocab = dict(enumerate(chars))
vocab_size = len(chars)

def encode(s):
    """Look up every character of ``s`` in ``vocab`` and return the ids as a 1-D int64 tensor."""
    ids = list(map(vocab.__getitem__, s))
    return torch.tensor(ids, dtype=torch.long)

def decode(t):
    """Translate an iterable of token ids back into the string they spell, via ``inv_vocab``."""
    symbols = [inv_vocab[int(token_id)] for token_id in t]
    return ''.join(symbols)

def random_seq(n=5):
    """Return a random string of ``n`` characters drawn from ``chars`` without its last entry."""
    alphabet = chars[:-1]  # everything except the final symbol (the space)
    picked = [random.choice(alphabet) for _ in range(n)]
    return ''.join(picked)

# Generate the data: 50000 random strings, each paired with its reversal,
# both sides encoded as id tensors.
pairs = [
    (encode(text), encode(text[::-1]))
    for text in (random_seq() for _ in range(50000))
]

# Length of the longest source sequence.
max_len = max(src.size(0) for src, _ in pairs)

def pad(x, length=None, pad_value=None):
    """Right-pad a 1-D id tensor with the padding id up to a target length.

    Args:
        x: 1-D tensor of token ids.
        length: Target length; defaults to the module-level ``max_len``.
        pad_value: Id used as filler; defaults to ``vocab[' ']``.

    Returns:
        A tensor with the same dtype and device as ``x``. An input that is
        already at or beyond the target length comes back with its content
        unchanged (nothing is truncated).
    """
    if length is None:
        length = max_len
    if pad_value is None:
        pad_value = vocab[' ']
    missing = max(length - len(x), 0)
    # Create the filler from x itself so it shares x's dtype: an empty
    # ``torch.tensor([])`` is float32 and can promote the concatenation to float.
    filler = x.new_full((missing,), pad_value)
    return torch.cat([x, filler], dim=0)

# Source batch: every encoded input padded to max_len.
inputs = torch.stack([pad(x) for x, _ in pairs])

# Target batch: each padded target is prefixed with a start-of-sequence id.
# Seq2Seq.forward feeds tgt[:, :-1] to the decoder and the training loop scores
# the logits against tgt[:, 1:], so without this prefix the first output
# character would never be a prediction target. predict() starts decoding from
# vocab[' '], so that same id is used as the start marker here.
sos = torch.tensor([vocab[' ']], dtype=torch.long)
targets = torch.stack([torch.cat([sos, pad(y).long()]) for _, y in pairs])

train_ds = torch.utils.data.TensorDataset(inputs, targets)
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=128, shuffle=True)

# Use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Veja um par

In [None]:
# Show one training pair: (encoded source, encoded reversed target).
sample_src, sample_tgt = pairs[1]
print((sample_src, sample_tgt))

# Definição do modelo Seq2Seq com GRU

In [None]:
class Encoder(nn.Module):
    """Embeds a batch of token ids and summarises it with a single-layer GRU."""

    def __init__(self, vocab_size, emb_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.gru = nn.GRU(input_size=emb_size, hidden_size=hidden_size, batch_first=True)

    def forward(self, x):
        """Return the GRU's final hidden state for the id batch ``x``.

        With ``batch_first=True`` the input is read as [batch, time]; the
        returned state has shape [1, batch, hidden_size].
        """
        embedded = self.embed(x)
        _outputs, final_state = self.gru(embedded)
        return final_state

class Decoder(nn.Module):
    """GRU decoder: embeds input tokens, advances the GRU from a given state and scores the vocabulary."""

    def __init__(self, vocab_size, emb_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.gru = nn.GRU(input_size=emb_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, h):
        """Run the decoder over ``x`` starting from hidden state ``h``.

        Args:
            x: token ids of the already-known preceding part of the output.
            h: hidden state to start from (the encoder state on the first call).

        Returns:
            ``(logits, new_state)``: vocabulary scores for each position of
            ``x`` and the updated hidden state, to be fed back on the next call.
        """
        embedded = self.embed(x)
        states, new_state = self.gru(embedded, h)
        return self.fc(states), new_state

class Seq2Seq(nn.Module):
    """Pairs an encoder with a decoder for teacher-forced training."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        """Encode ``src`` and decode with the ground-truth prefix of ``tgt``.

        The decoder receives ``tgt[:, :-1]`` (every target token but the last),
        so the logits at position ``t`` are the scores for ``tgt[:, t + 1]``.
        Only the logits are returned; the decoder's final state is dropped.
        """
        context = self.encoder(src)
        teacher_inputs = tgt[:, :-1]
        logits, _final_state = self.decoder(teacher_inputs, context)
        return logits

# Código para usar o modelo treinado: inferência

In [None]:
def decode_step(decoder, token, h):
    """Advance the decoder by one call and pick the most likely next token.

    Returns the argmax id at the last position of the logits (kept as a
    [batch, 1] tensor) together with the updated hidden state.
    """
    step_logits, new_state = decoder(token, h)
    last_scores = step_logits[:, -1, :]
    best = last_scores.argmax(dim=-1, keepdim=True)
    return best, new_state

def predict(model, seq, max_len=10):
    """Greedily decode ``max_len`` tokens for the string ``seq`` and return them as text.

    The model is switched to eval mode and gradients are disabled. Decoding
    starts from the space token and feeds each chosen token back in. The
    ``max_len`` argument only sets the number of generated tokens; ``pad``
    still reads the module-level value.
    """
    model.eval()
    start_id = vocab[' ']
    with torch.no_grad():
        source = pad(encode(seq)).unsqueeze(0).to(device, dtype=torch.long)
        state = model.encoder(source)  # encoder state after reading the input

        # `current` carries the step-by-step generation of the reversed sequence
        current = torch.tensor([[start_id]], dtype=torch.long, device=device)
        generated = []
        for _step in range(max_len):
            current, state = decode_step(model.decoder, current, state)
            generated.append(current.item())
    return decode(generated)

# Preparação para treino

In [None]:
# Model hyperparameters.
emb_size, hidden_size = 32, 64

# Assemble encoder + decoder and move the whole model to the selected device.
encoder = Encoder(vocab_size, emb_size, hidden_size)
decoder = Decoder(vocab_size, emb_size, hidden_size)
model = Seq2Seq(encoder=encoder, decoder=decoder).to(device)

# Cross-entropy that skips positions whose target is the padding id (" ").
loss_fn = nn.CrossEntropyLoss(ignore_index=vocab[' '])
opt = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Execução do treino

In [None]:
# Train for ten epochs, reporting the mean batch loss after each one.
for epoch in range(10):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb = xb.to(device, dtype=torch.long)
        yb = yb.to(device, dtype=torch.long)
        opt.zero_grad()
        logits = model(xb, yb)
        # Flatten the per-position scores and the shifted targets for the loss.
        flat_logits = logits.reshape(-1, vocab_size)
        flat_targets = yb[:, 1:].reshape(-1)
        loss = loss_fn(flat_logits, flat_targets)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: loss={total_loss/len(train_dl):.4f}")

# Vamos testar

In [None]:
# Draw ten fresh strings and print each next to the model's output.
for s in (random_seq() for _ in range(10)):
    pred = predict(model, s, max_len=len(s))
    print(f"{s} -> {pred}")


# Exercício
Compare o resultado do uso do encoder em sequências muito similares e muito diferentes. Por exemplo, codifique "aaaabb", "bbaaab", "cbcaccc" e "cccacbc" e depois faça uma figura das 2 componentes principais usando o método Principal Components Analysis (PCA) do pacote `sklearn.decomposition.PCA`.

In [None]:
import torch
import torch.nn as nn
import random
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np

# Data configuration.
# Alphabet of the toy task; the trailing space is the symbol pad() fills with.
chars = [*"abcd "]
# Character -> integer id, numbered in alphabet order.
vocab = dict(zip(chars, range(len(chars))))
# Integer id -> character: the inverse table used when decoding.
inv_vocab = dict(enumerate(chars))
vocab_size = len(chars)

def encode(s):
    """Look up every character of ``s`` in ``vocab`` and return the ids as a 1-D int64 tensor."""
    ids = list(map(vocab.__getitem__, s))
    return torch.tensor(ids, dtype=torch.long)

def decode(t):
    """Translate an iterable of token ids back into the string they spell, via ``inv_vocab``."""
    symbols = [inv_vocab[int(token_id)] for token_id in t]
    return ''.join(symbols)

def random_seq(n=5):
    """Return a random string of ``n`` characters drawn from ``chars`` without its last entry."""
    alphabet = chars[:-1]  # everything except the final symbol (the space)
    picked = [random.choice(alphabet) for _ in range(n)]
    return ''.join(picked)

# Generate the training data: 50000 random strings, each paired with its
# reversal, both sides encoded as id tensors.
pairs = [
    (encode(text), encode(text[::-1]))
    for text in (random_seq() for _ in range(50000))
]
# Length of the longest source sequence.
max_len = max(src.size(0) for src, _ in pairs)

def pad(x, length=None, pad_value=None):
    """Right-pad a 1-D id tensor with the padding id up to a target length.

    Args:
        x: 1-D tensor of token ids.
        length: Target length; defaults to the module-level ``max_len``.
        pad_value: Id used as filler; defaults to ``vocab[' ']``.

    Returns:
        A tensor with the same dtype and device as ``x``. An input that is
        already at or beyond the target length comes back with its content
        unchanged (nothing is truncated).
    """
    if length is None:
        length = max_len
    if pad_value is None:
        pad_value = vocab[' ']
    missing = max(length - len(x), 0)
    # Create the filler from x itself so it shares x's dtype: an empty
    # ``torch.tensor([])`` is float32 and can promote the concatenation to float.
    filler = x.new_full((missing,), pad_value)
    return torch.cat([x, filler], dim=0)

# Source batch: every encoded input padded to max_len.
inputs = torch.stack([pad(x) for x, _ in pairs])

# Target batch: each padded target is prefixed with a start-of-sequence id.
# Seq2Seq.forward feeds tgt[:, :-1] to the decoder and the training loop scores
# the logits against tgt[:, 1:], so without this prefix the first output
# character would never be a prediction target. The space id doubles as the
# start marker, matching the token predict() starts decoding from.
sos = torch.tensor([vocab[' ']], dtype=torch.long)
targets = torch.stack([torch.cat([sos, pad(y).long()]) for _, y in pairs])

train_ds = torch.utils.data.TensorDataset(inputs, targets)
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=128, shuffle=True)

# Use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model definition
class Encoder(nn.Module):
    """Embeds a batch of token ids and summarises it with a single-layer GRU."""

    def __init__(self, vocab_size, emb_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.gru = nn.GRU(input_size=emb_size, hidden_size=hidden_size, batch_first=True)

    def forward(self, x):
        """Return the GRU's final hidden state for the id batch ``x``.

        With ``batch_first=True`` the input is read as [batch, time]; the
        returned state has shape [1, batch, hidden_size].
        """
        embedded = self.embed(x)
        _outputs, final_state = self.gru(embedded)
        return final_state

class Decoder(nn.Module):
    """GRU decoder: embeds input tokens, advances the GRU from a given state and scores the vocabulary."""

    def __init__(self, vocab_size, emb_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.gru = nn.GRU(input_size=emb_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, h):
        """Run the decoder over ids ``x`` from hidden state ``h``.

        Returns ``(logits, new_state)``: vocabulary scores for each position
        of ``x`` and the updated hidden state.
        """
        embedded = self.embed(x)
        states, new_state = self.gru(embedded, h)
        return self.fc(states), new_state

class Seq2Seq(nn.Module):
    """Pairs an encoder with a decoder for teacher-forced training."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        """Encode ``src`` and decode with the ground-truth prefix of ``tgt``.

        The decoder receives ``tgt[:, :-1]`` (every target token but the last),
        so the logits at position ``t`` are the scores for ``tgt[:, t + 1]``.
        Only the logits are returned; the decoder's final state is dropped.
        """
        context = self.encoder(src)
        teacher_inputs = tgt[:, :-1]
        logits, _final_state = self.decoder(teacher_inputs, context)
        return logits

# Build and train the model.
emb_size, hidden_size = 32, 64
encoder = Encoder(vocab_size, emb_size, hidden_size)
decoder = Decoder(vocab_size, emb_size, hidden_size)
model = Seq2Seq(encoder=encoder, decoder=decoder).to(device)

# Cross-entropy that skips positions whose target is the padding id (" ").
loss_fn = nn.CrossEntropyLoss(ignore_index=vocab[' '])
opt = torch.optim.Adam(params=model.parameters(), lr=1e-3)

print("Treinando modelo...")
# Ten epochs, reporting the mean batch loss after each one.
for epoch in range(10):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb = xb.to(device, dtype=torch.long)
        yb = yb.to(device, dtype=torch.long)
        opt.zero_grad()
        logits = model(xb, yb)
        # Flatten the per-position scores and the shifted targets for the loss.
        flat_logits = logits.reshape(-1, vocab_size)
        flat_targets = yb[:, 1:].reshape(-1)
        loss = loss_fn(flat_logits, flat_targets)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: loss={total_loss/len(train_dl):.4f}")

# EXERCISE: PCA analysis of the encoder representations.
print("\n" + "="*50)
print("ANÁLISE PCA DAS REPRESENTAÇÕES DO ENCODER")
print("="*50)

# Sequences to analyse. They are 6 and 7 characters long, whereas training
# used random_seq()'s default length of 5.
sequences = [
    "aaaabb",   # many 'a' followed by 'b'
    "bbaaab",   # starts with 'b', then 'a', ends with 'b'
    "cbcaccc",  # mix with many 'c' at the end
    "cccacbc"   # many 'c' at the start, then a mix
]

# Extract the encoder representation of each sequence.
model.eval()
encodings = []
with torch.no_grad():
    for seq in sequences:
        src = pad(encode(seq)).unsqueeze(0).to(device, dtype=torch.long)
        h = model.encoder(src)  # [1, 1, hidden_size]
        encodings.append(h.squeeze().cpu().numpy())

encodings = np.array(encodings)  # [len(sequences), hidden_size]

# Apply PCA to reduce the representations to 2 dimensions.
pca = PCA(n_components=2)
encodings_2d = pca.fit_transform(encodings)

# Visualisation: PCA scatter on the left, pairwise-distance heat map on the right.
n_seqs = len(sequences)  # drives every loop below instead of a hard-coded 4
plt.figure(figsize=(12, 5))

# Subplot 1: PCA scatter plot.
plt.subplot(1, 2, 1)
colors = ['red', 'blue', 'green', 'orange']
for i, seq in enumerate(sequences):
    # Cycle through the palette so no point is dropped when there are more
    # sequences than colours.
    plt.scatter(encodings_2d[i, 0], encodings_2d[i, 1],
                c=colors[i % len(colors)], s=200, alpha=0.6,
                edgecolors='black', linewidth=2)
    plt.annotate(seq, (encodings_2d[i, 0], encodings_2d[i, 1]),
                 fontsize=12, ha='center', va='bottom', fontweight='bold')

plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variância)', fontsize=12)
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variância)', fontsize=12)
plt.title('Representações do Encoder em 2D (PCA)', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)

# Subplot 2: pairwise distances. Despite its name, `similarities` holds the
# Euclidean distance between the full (un-projected) encoder vectors.
plt.subplot(1, 2, 2)
similarities = np.zeros((n_seqs, n_seqs))
for i in range(n_seqs):
    for j in range(n_seqs):
        similarities[i, j] = np.linalg.norm(encodings[i] - encodings[j])

im = plt.imshow(similarities, cmap='YlOrRd', aspect='auto')
plt.colorbar(im, label='Distância Euclidiana')
plt.xticks(range(n_seqs), sequences, rotation=45)
plt.yticks(range(n_seqs), sequences)
plt.title('Matriz de Distâncias entre Sequências', fontsize=14, fontweight='bold')

# Write each distance inside its heat-map cell.
for i in range(n_seqs):
    for j in range(n_seqs):
        plt.text(j, i, f'{similarities[i, j]:.2f}',
                 ha="center", va="center", color="black", fontsize=10)

plt.tight_layout()
plt.savefig('pca_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

# Textual report. The distances come from `similarities`; the commentary and
# conclusions are fixed text and are not derived from the computed values.
print("\n📊 ANÁLISE DOS RESULTADOS:")
print("\n1. Comparação de sequências SIMILARES:")
print(f"   - '{sequences[0]}' vs '{sequences[1]}':")
print(f"     Distância: {similarities[0, 1]:.3f}")
print("     Análise: Ambas têm muitos 'a' e 'b', mas ordem diferente")

print(f"\n   - '{sequences[2]}' vs '{sequences[3]}':")
print(f"     Distância: {similarities[2, 3]:.3f}")
print("     Análise: Ambas têm muitos 'c', mas distribuição diferente")

print("\n2. Comparação de sequências DIFERENTES:")
print(f"   - '{sequences[0]}' vs '{sequences[2]}':")
print(f"     Distância: {similarities[0, 2]:.3f}")
print("     Análise: Composição muito diferente (a/b vs c/a)")

print(f"\n   - '{sequences[1]}' vs '{sequences[3]}':")
print(f"     Distância: {similarities[1, 3]:.3f}")
print("     Análise: Composição e ordem bem distintas")

print("\n3. Variância explicada pelo PCA:")
print(f"   - PC1: {pca.explained_variance_ratio_[0]:.1%}")
print(f"   - PC2: {pca.explained_variance_ratio_[1]:.1%}")
print(f"   - Total: {sum(pca.explained_variance_ratio_):.1%}")

print("\n💡 CONCLUSÕES:")
print("   ✓ O encoder captura padrões de composição das sequências")
print("   ✓ Sequências com caracteres similares ficam mais próximas")
print("   ✓ A ordem dos caracteres também afeta a representação")
print("   ✓ O PCA consegue separar bem as sequências no espaço 2D")