# 🧠 Seq2Seq LSTM Example: Translating 'thank you' → 'obrigado'

This notebook demonstrates a minimal, well-commented example of a sequence-to-sequence (seq2seq) translation model using LSTM layers.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

## 🔡 Define toy vocabulary

In [2]:
# Minimal vocabulary for English and Portuguese
en_vocab = {'thank': 0, 'you': 1, '<eos>': 2}
pt_vocab = {'<start>': 0, 'obrigado': 1, '<eos>': 2}

inv_pt_vocab = {v: k for k, v in pt_vocab.items()}

## 📦 Prepare training data

In [3]:
# English input: "thank you <eos>"
# Portuguese target: "<start> obrigado <eos>"

X = torch.tensor([[en_vocab['thank'], en_vocab['you'], en_vocab['<eos>']]])  # shape: (1, 3)
Y = torch.tensor([[pt_vocab['<start>'], pt_vocab['obrigado'], pt_vocab['<eos>']]])  # shape: (1, 3)

## 🔁 Define Encoder LSTM

In [4]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, (h, c) = self.lstm(embedded)
        return h, c

## 🔁 Define Decoder LSTM

In [5]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, y, h, c):
        embedded = self.embedding(y)
        output, (h, c) = self.lstm(embedded, (h, c))
        logits = self.fc(output)
        return logits, h, c

In [6]:
input_dim = len(en_vocab)
output_dim = len(pt_vocab)
emb_dim = 8
hidden_dim = 16

encoder = Encoder(input_dim, emb_dim, hidden_dim)
decoder = Decoder(output_dim, emb_dim, hidden_dim)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.01)

## 🏋️ Train the model

In [7]:
for epoch in range(100):
    encoder.train()
    decoder.train()

    h, c = encoder(X)

    decoder_input = Y[:, :-1]  # <start>, obrigado
    target = Y[:, 1:]          # obrigado, <eos>

    logits, _, _ = decoder(decoder_input, h, c)
    loss = loss_fn(logits.squeeze(0), target.squeeze(0))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 0, Loss: 1.0633
Epoch 10, Loss: 0.4118
Epoch 20, Loss: 0.0654
Epoch 30, Loss: 0.0152
Epoch 40, Loss: 0.0070
Epoch 50, Loss: 0.0046
Epoch 60, Loss: 0.0036
Epoch 70, Loss: 0.0030
Epoch 80, Loss: 0.0025
Epoch 90, Loss: 0.0022


## 🔤 Inference: Translate 'thank you'

In [8]:
encoder.eval()
decoder.eval()

with torch.no_grad():
    h, c = encoder(X)
    decoder_input = torch.tensor([[pt_vocab['<start>']]])
    result = []

    for _ in range(3):
        logits, h, c = decoder(decoder_input, h, c)
        next_token = logits.argmax(2)[:, -1]
        token_id = next_token.item()
        result.append(inv_pt_vocab[token_id])
        decoder_input = next_token.unsqueeze(0)

print("Predicted translation:", " ".join(result))

Predicted translation: obrigado <eos> <eos>
