# 🧠 Seq2Seq LSTM Example: 'thank you' → 'obrigado'

This notebook walks through a small, self-contained sequence-to-sequence translation model built from an LSTM encoder and an LSTM decoder, with detailed comments for educational clarity.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

## 🧾 Define a Minimal Vocabulary

In [None]:
# Source (English) and target (Portuguese) vocabularies: token -> integer id.
# Ids are assigned by position, so the literal order below fixes the mapping.
en_vocab = {token: idx for idx, token in enumerate(('thank', 'you', '<eos>'))}
pt_vocab = {token: idx for idx, token in enumerate(('<start>', 'obrigado', '<eos>'))}

# Reverse lookup (id -> token), used to turn predicted ids back into text.
inv_pt_vocab = dict(zip(pt_vocab.values(), pt_vocab.keys()))

## 📦 Prepare Training Data

In [None]:
# A single parallel sentence pair, stored as id tensors with a leading batch
# axis of size 1 (each tensor has shape [1, 3]):
#   source (English):    thank you <eos>
#   target (Portuguese): <start> obrigado <eos>
X = torch.tensor([[en_vocab[token] for token in ('thank', 'you', '<eos>')]])
Y = torch.tensor([[pt_vocab[token] for token in ('<start>', 'obrigado', '<eos>')]])

## 🔁 Encoder LSTM

In [None]:
class Encoder(nn.Module):
    """Embed a source token sequence and summarize it with an LSTM.

    Only the LSTM's final hidden and cell states are returned; the per-step
    outputs are discarded.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim):
        """Create the embedding table and the recurrent layer.

        Args:
            input_dim: Number of rows in the embedding table (one per source token id).
            emb_dim: Size of each token embedding vector.
            hidden_dim: Size of the LSTM hidden and cell states.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)

    def forward(self, x):
        """Encode a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids. The LSTM is built with
                batch_first=True, so the layout is [batch, seq_len].

        Returns:
            Tuple ``(h, c)`` holding the LSTM's final hidden and cell states.
        """
        token_vectors = self.embedding(x)
        # The per-step outputs are not needed here; keep only the final state.
        _, (final_hidden, final_cell) = self.lstm(token_vectors)
        return final_hidden, final_cell

## 🔁 Decoder LSTM

In [None]:
class Decoder(nn.Module):
    """LSTM decoder that turns target token ids into per-position vocabulary logits."""

    def __init__(self, output_dim, emb_dim, hidden_dim):
        """Create the embedding table, the recurrent layer, and the output projection.

        Args:
            output_dim: Target vocabulary size; used both for the embedding
                table and for the width of the output logits.
            emb_dim: Size of each token embedding vector.
            hidden_dim: Size of the LSTM hidden and cell states.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, y, h, c):
        """Run the decoder over ``y`` starting from the state ``(h, c)``.

        Args:
            y: Integer tensor of target token ids, laid out [batch, seq_len]
                (the LSTM is built with batch_first=True). Every position in
                ``y`` is processed in this one call, so ``seq_len`` may be 1
                (step-by-step decoding) or longer (a whole shifted target).
            h: Initial LSTM hidden state.
            c: Initial LSTM cell state.

        Returns:
            Tuple ``(logits, h, c)``: one row of vocabulary logits per input
            position, plus the LSTM state after the last position.
        """
        token_vectors = self.embedding(y)
        step_outputs, (next_h, next_c) = self.lstm(token_vectors, (h, c))
        # Project each position's LSTM output onto the target vocabulary.
        return self.fc(step_outputs), next_h, next_c

## ⚙️ Model Setup

In [None]:
# Fix the RNG seed before any layer is constructed. The embedding, LSTM and
# linear weights are randomly initialized, so without this every
# "Restart & Run All" would start from different weights and print different
# losses.
seed = 0
torch.manual_seed(seed)

# Vocabulary sizes drive the embedding-table and output-layer sizes.
input_dim = len(en_vocab)
output_dim = len(pt_vocab)

# Hyperparameters. The encoder and decoder share hidden_dim because the
# encoder's final (h, c) is passed to the decoder as its initial state.
emb_dim = 8
hidden_dim = 16
learning_rate = 0.01

encoder = Encoder(input_dim, emb_dim, hidden_dim)
decoder = Decoder(output_dim, emb_dim, hidden_dim)

loss_fn = nn.CrossEntropyLoss()

# A single optimizer over both modules, so one step updates encoder and
# decoder weights together.
optimizer = optim.Adam(
    [*encoder.parameters(), *decoder.parameters()],
    lr=learning_rate,
)

## 🏋️ Training Loop

In [None]:
num_epochs = 100
log_every = 10  # print the loss every `log_every` epochs

# Training mode only needs to be set once; nothing inside the loop changes it.
encoder.train()
decoder.train()

# Teacher forcing: the decoder is fed the ground-truth target shifted right
# and must predict the next token at every position. These slices do not
# change between epochs, so they are built once.
decoder_input = Y[:, :-1]  # all tokens except the last
target = Y[:, 1:]          # all tokens except the first

for epoch in range(num_epochs):
    # Encode the source sentence; its final state initializes the decoder.
    h, c = encoder(X)
    logits, _, _ = decoder(decoder_input, h, c)

    # CrossEntropyLoss wants [N, num_classes] logits and [N] targets. Flatten
    # batch and time together instead of squeezing the batch axis: this gives
    # the same tensors when the batch size is 1 and stays correct for larger
    # batches, where squeeze(0) would be a no-op and leave the wrong shapes.
    loss = loss_fn(logits.reshape(-1, logits.size(-1)), target.reshape(-1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % log_every == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

## 🧪 Inference and Decoding
We simulate translation at test time by feeding the decoder's output token back in as its next input.

In [None]:
# Upper bound on generated tokens, in case the model never predicts <eos>.
max_decode_steps = 3
eos_id = pt_vocab['<eos>']

# Switch to evaluation mode. These models contain no dropout or batch-norm
# layers, so this does not change the numbers here; it is kept as the
# conventional guard before inference.
encoder.eval()
decoder.eval()

result = []

# Turn off gradient tracking for inference
with torch.no_grad():
    # Encode the input sentence; its final state initializes the decoder.
    h, c = encoder(X)

    # Decoding starts from the <start> token, shaped [batch=1, seq_len=1].
    decoder_input = torch.tensor([[pt_vocab['<start>']]])

    for _ in range(max_decode_steps):
        # Advance the decoder by one token, carrying the LSTM state forward.
        logits, h, c = decoder(decoder_input, h, c)

        # Greedy choice: most likely token at the last position, shape [batch].
        next_token = logits.argmax(dim=2)[:, -1]
        token_id = next_token.item()  # valid because the batch size is 1

        # Stop at end-of-sequence. Without this check the loop always ran all
        # steps and appended whatever the model emitted after <eos>.
        # <eos> itself is a control token and is left out of the translation.
        if token_id == eos_id:
            break

        result.append(inv_pt_vocab[token_id])

        # Feed the prediction back in as the next input: [batch] -> [batch, 1].
        decoder_input = next_token.unsqueeze(1)

print("Predicted translation:", " ".join(result))