<a href="https://colab.research.google.com/github/a01110946/transformer/blob/main/src/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Attention is all you need

In [1]:
# Import all the required libraries

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from collections import Counter
import math
import numpy as np
import re

In [2]:
torch.manual_seed(23)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
# Define your constants

MAX_SEQ_LEN = 128

In [4]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_seq_len=MAX_SEQ_LEN):
    super().__init__()
    self.pos_embed_matrix = torch.zeros(max_seq_len, d_model, device=device)
    token_pos = torch.arange(0, max_seq_len, dtype = torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float()
                         * (-math.log(10000.0)/d_model))
    self.pos_embed_matrix[:, 0::2] = torch.sin(token_pos * div_term)
    self.pos_embed_matrix[:, 1::2] = torch.cos(token_pos * div_term)
    self.pos_embed_matrix = self.pos_embed_matrix.unsqueeze(0).transpose(0, 1)

  def forward(self, x):
    return x + self.pos_embed_matrix[:x.size(0), :]

In [5]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_model = 512, num_heads = 8):
    super().__init__()
    assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

    self.d_v = d_model // num_heads
    self.d_k = self.d_v
    self.num_heads = num_heads

    self.W_q = nn.Linear(d_model, d_model) # Query matrix
    self.W_k = nn.Linear(d_model, d_model) # Key matrix
    self.W_v = nn.Linear(d_model, d_model) # Value matrix
    self.W_o = nn.Linear(d_model, d_model) # Output matrix

  def forward(self, Q, K, V, mask=None):
    batch_size = Q.size(0)
    '''
    Q, K, V -> [batch_size, seq_len, num_heads*d_k]
    After transpose, Q, K, V -> [batch_size, num_heads, seq_len, d_k]
    '''
    Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
    K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
    V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    weighted_values, attention = self.scale_dot_product_attention(Q, K, V, mask=None)
    weighted_values = weighted_values.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
    weighted_values = self.W_o(weighted_values)
    return weighted_values, attention

  def scale_dot_product_attention(self, Q, K, V, mask=None):
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
    if mask is not None:
      scores = scores.masked_fill(mask == 0, -1e9)
    attention = F.softmax(scores, dim=-1)
    weighted_values = torch.matmul(attention, V)
    return weighted_values, attention


In [6]:
class PositionwiseFeedForward(nn.Module):
  def __init__(self, d_model = 512, d_ff = 2048):
    super().__init__()
    self.linear1 = nn.Linear(d_model, d_ff)
    self.linear2 = nn.Linear(d_ff, d_model)
  def forward(self, x):
    x = F.relu(self.linear1(x))
    x = self.linear2(x)
    return x

In [7]:
class EncoderSublayer(nn.Module):
  def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
    super().__init__()
    self.self_attn = MultiHeadAttention(d_model, num_heads)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)

  def forward(self, x, mask=None):
    attention_score, _ = self.self_attn(x, x, x, mask)
    x = x + self.dropout1(attention_score)
    x = self.norm1(x)
    x = x + self.dropout2(self.feed_forward(x))
    x = self.norm2(x)
    return x

In [8]:
class Encoder(nn.Module):
  def __init__(self, d_model, num_heads, d_ff, num_layers, dropout=0.1):
    super().__init__()
    self.layers = nn.ModuleList([EncoderSublayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
    self.norm = nn.LayerNorm(d_model)

  def forward(self, x, mask=None):
    for layer in self.layers:
      x = layer(x, mask)
    return self.norm(x)

In [9]:
class DecoderSublayer(nn.Module):
  def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
    super().__init__()
    self.self_attn = MultiHeadAttention(d_model, num_heads)
    self.cross_attn = MultiHeadAttention(d_model, num_heads)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.dropout3 = nn.Dropout(dropout)

  def forward(self, x, encoder_output, target_mask=None, encoder_mask=None):
    attention_score, _ = self.self_attn(x, x, x, target_mask)
    x = x + self.dropout1(attention_score)
    x = self.norm1(x)
    encoder_attn, _ = self.cross_attn(x, encoder_output, encoder_output, encoder_mask)
    x = x + self.dropout2(encoder_attn)
    x = self.norm2(x)
    ff_output = self.feed_forward(x)
    x = x + self.dropout3(ff_output)
    x = self.norm3(x)
    return x

In [10]:
class Decoder(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, num_layers, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([DecoderSublayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, encoder_output, target_mask, encoder_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, target_mask, encoder_mask)
        return self.norm(x)

In [11]:
class Transformer(nn.Module):
  def __init__(self, d_model, num_heads, d_ff, num_layers,
               input_vocab_size, target_vocab_size,
               max_seq_len=MAX_SEQ_LEN, dropout=0.1):
    super().__init__()
    self.encoder_embedding = nn.Embedding(input_vocab_size, d_model)
    self.decoder_embedding = nn.Embedding(target_vocab_size, d_model)
    self.positional_encoding = PositionalEncoding(d_model, max_seq_len)
    self.dropout = nn.Dropout(dropout)
    self.encoder = Encoder(d_model, num_heads, d_ff, num_layers)
    self.decoder = Decoder(d_model, num_heads, d_ff, num_layers)
    self.output_layer = nn.Linear(d_model, target_vocab_size)

  def mask(self, source, target):
        source_mask = (source != 0).unsqueeze(1).unsqueeze(2)
        target_mask = (target != 0).unsqueeze(1).unsqueeze(3)
        seq_length = target.size(1)
        nopeak_mask = torch.tril(torch.ones((1, seq_length, seq_length), device=device)).bool()
        target_mask = target_mask & nopeak_mask

        print(f"Source mask shape: {source_mask.shape}")
        print(f"Target mask shape: {target_mask.shape}")

        return source_mask, target_mask

  def forward(self, source, target):
        source_mask, target_mask = self.mask(source, target)
        source = self.encoder_embedding(source) * math.sqrt(self.encoder_embedding.embedding_dim)
        source = self.positional_encoding(source)
        source = self.dropout(source)

        print(f"Source shape after embedding: {source.shape}")

        encoder_output = self.encoder(source, source_mask)

        print(f"Encoder output shape: {encoder_output.shape}")

        target = self.decoder_embedding(target) * math.sqrt(self.decoder_embedding.embedding_dim)
        target = self.positional_encoding(target)
        target = self.dropout(target)

        print(f"Target shape after embedding: {target.shape}")

        output = self.decoder(target, encoder_output, target_mask, source_mask)
        output = self.output_layer(output)
        return output

## Simple Test

In [12]:
seq_len_source = 10
seq_len_target = 10
batch_size = 4
input_vocab_size = 50
target_vocab_size = 5

source = torch.randint(1, input_vocab_size, (batch_size, seq_len_source))
target = torch.randint(1, target_vocab_size, (batch_size, seq_len_target))
d_model = 512
num_heads = 8
d_ff = 2048
num_layers = 6

model = Transformer(d_model, num_heads, d_ff, num_layers,
               input_vocab_size, target_vocab_size,
               max_seq_len=MAX_SEQ_LEN, dropout=0.1)

model = model.to(device)
source = source.to(device)
target = target.to(device)

output = model(source, target)
print(output.shape)

Source mask shape: torch.Size([4, 1, 1, 10])
Target mask shape: torch.Size([4, 1, 10, 10])
Source shape after embedding: torch.Size([4, 10, 512])
Encoder output shape: torch.Size([4, 10, 512])
Target shape after embedding: torch.Size([4, 10, 512])
torch.Size([4, 10, 5])


## Translator ENG-SPA

In [13]:
import requests

url = "https://raw.githubusercontent.com/a01110946/transformer/main/data/en-es.txt"
response = requests.get(url)
content = response.text

# Now you can work with the content
print(content[:500])  # Print first 500 characters

# Optionally, save it to a file in Colab
with open('en-es.txt', 'w', encoding='utf-8') as f:
    f.write(content)

1276	Let's try something.	2481	¡Intentemos algo!
1277	I have to go to sleep.	2482	Tengo que irme a dormir.
1280	Today is June 18th and it is Muiriel's birthday!	2485	¡Hoy es 18 de junio y es el cumpleaños de Muiriel!
1280	Today is June 18th and it is Muiriel's birthday!	1130137	¡Hoy es el 18 de junio y es el cumpleaños de Muiriel!
1282	Muiriel is 20 now.	2487	Ahora, Muiriel tiene 20 años.
1282	Muiriel is 20 now.	1130133	Muiriel tiene 20 años ahora.
1283	The password is "Muiriel".	2488	La c


In [14]:
# Process the content
eng_sentences = []
spa_sentences = []

for line in content.split('\n'):
    parts = line.split('\t')
    if len(parts) >= 4:  # Ensure we have at least 4 parts
        english = parts[1].strip() # strip() removes leading/trailing whitespace including \r
        spanish = parts[3].strip() # strip() removes leading/trailing whitespace including \r
        eng_sentences.append(english)
        spa_sentences.append(spanish)

# Print the first few sentences from each list to verify
print("English sentences:")
print(eng_sentences[:5])
print("\nSpanish sentences:")
print(spa_sentences[:5])

# Print the lengths to ensure they match
print(f"\nNumber of English sentences: {len(eng_sentences)}")
print(f"Number of Spanish sentences: {len(spa_sentences)}")

English sentences:
["Let's try something.", 'I have to go to sleep.', "Today is June 18th and it is Muiriel's birthday!", "Today is June 18th and it is Muiriel's birthday!", 'Muiriel is 20 now.']

Spanish sentences:
['¡Intentemos algo!', 'Tengo que irme a dormir.', '¡Hoy es 18 de junio y es el cumpleaños de Muiriel!', '¡Hoy es el 18 de junio y es el cumpleaños de Muiriel!', 'Ahora, Muiriel tiene 20 años.']

Number of English sentences: 265511
Number of Spanish sentences: 265511


In [15]:
def preprocess_sentence(sentence):
    # Convert to lowercase
    sentence = sentence.lower().strip()

    # Remove special characters
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[á]+", "a", sentence)
    sentence = re.sub(r"[é]+", "e", sentence)
    sentence = re.sub(r"[í]+", "i", sentence)
    sentence = re.sub(r"[ó]+", "o", sentence)
    sentence = re.sub(r"[ú]+", "u", sentence)
    sentence = re.sub(r"[ñ]+", "n", sentence)
    sentence = re.sub(r"[^a-z0-9<>_@.]+", " ", sentence)
    sentence = sentence.strip()
    # Add start and end tokens
    sentence = '<SOS> ' + sentence + ' <EOS>'
    return sentence


In [16]:
# Test the proprocess_sentence function
s1 = 'Hola Alejandro, ¿cómo estás? ¿Tu correo electrónico es alejandro123@gmail.com?'
print(preprocess_sentence(s1))

<SOS> hola alejandro como estas tu correo electronico es alejandro123@gmail.com <EOS>


In [17]:
eng_sentences = [preprocess_sentence(sentence) for sentence in eng_sentences]
spa_sentences = [preprocess_sentence(sentence) for sentence in spa_sentences]

In [18]:
print(eng_sentences[:5])
print(spa_sentences[:5])

['<SOS> let s try something. <EOS>', '<SOS> i have to go to sleep. <EOS>', '<SOS> today is june 18th and it is muiriel s birthday <EOS>', '<SOS> today is june 18th and it is muiriel s birthday <EOS>', '<SOS> muiriel is 20 now. <EOS>']
['<SOS> intentemos algo <EOS>', '<SOS> tengo que irme a dormir. <EOS>', '<SOS> hoy es 18 de junio y es el cumpleanos de muiriel <EOS>', '<SOS> hoy es el 18 de junio y es el cumpleanos de muiriel <EOS>', '<SOS> ahora muiriel tiene 20 anos. <EOS>']


In [19]:
def build_vocab(sentences):
    words = [word for sentence in sentences for word in sentence.split()]
    word_count = Counter(words)
    sorted_word_counts = sorted(word_count.items(), key=lambda x:x[1], reverse=True)
    word2idx = {word: idx for idx, (word, _) in enumerate(sorted_word_counts, 2)}
    word2idx['<PAD>'] = 0
    word2idx['<UNK>'] = 1
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

In [20]:
eng_word2idx, eng_idx2word = build_vocab(eng_sentences)
spa_word2idx, spa_idx2word = build_vocab(spa_sentences)

if '< SOS >' not in spa_word2idx:
    spa_word2idx['< SOS >'] = len(spa_word2idx)
if '<EOS>' not in spa_word2idx:
    spa_word2idx['<EOS>'] = len(spa_word2idx)
spa_idx2word = {idx: word for word, idx in spa_word2idx.items()}

eng_vocab_size = len(eng_word2idx)
spa_vocab_size = len(spa_word2idx)

In [21]:
print(f"English vocabulary size: {eng_vocab_size}")
print(f"Spanish vocabulary size: {spa_vocab_size}")
print(f"First 10 English words: {list(eng_word2idx.items())[:10]}")
print(f"First 10 Spanish words: {list(spa_word2idx.items())[:10]}")

English vocabulary size: 39550
Spanish vocabulary size: 64796
First 10 English words: [('<SOS>', 2), ('<EOS>', 3), ('the', 4), ('i', 5), ('to', 6), ('you', 7), ('a', 8), ('is', 9), ('t', 10), ('tom', 11)]
First 10 Spanish words: [('<SOS>', 2), ('<EOS>', 3), ('de', 4), ('que', 5), ('el', 6), ('la', 7), ('no', 8), ('a', 9), ('en', 10), ('es', 11)]


In [None]:
print("< SOS > in eng_word2idx:", '< SOS >' in eng_word2idx)
print("<EOS> in eng_word2idx:", '<EOS>' in eng_word2idx)
print("< SOS > in spa_word2idx:", '< SOS >' in spa_word2idx)
print("<EOS> in spa_word2idx:", '<EOS>' in spa_word2idx)

In [22]:
class EngSpaDataset(Dataset):
  def __init__(self, eng_sentences, spa_sentences, eng_word2idx, spa_word2idx):
    self.eng_sentences = eng_sentences
    self.spa_sentences = spa_sentences
    self.eng_word2idx = eng_word2idx
    self.spa_word2idx = spa_word2idx

  def __len__(self):
    return len(self.eng_sentences)

  def __getitem__(self, idx):
    eng_sentence = self.eng_sentences[idx]
    spa_sentence = self.spa_sentences[idx]
    eng_tokens = [self.eng_word2idx.get(word, self.eng_word2idx['<UNK>']) for word in eng_sentence.split()]
    spa_tokens = [self.spa_word2idx.get(word, self.spa_word2idx['<UNK>']) for word in spa_sentence.split()]
    return torch.tensor(eng_tokens), torch.tensor(spa_tokens)

In [23]:
def collate_fn(batch):
    eng_sentences, spa_sentences = zip(*batch)
    eng_batch = [seq[:MAX_SEQ_LEN].clone().detach() for seq in eng_sentences]
    spa_batch = [seq[:MAX_SEQ_LEN].clone().detach() for seq in spa_sentences]
    eng_batch = torch.nn.utils.rnn.pad_sequence(eng_batch, batch_first=True, padding_value=0)
    spa_batch = torch.nn.utils.rnn.pad_sequence(spa_batch, batch_first=True, padding_value=0)
    return eng_batch, spa_batch

In [24]:
def train(model, dataloader, loss_function, optimiser, epochs):
  model.train()
  for epoch in range(epochs):
    total_loss = 0
    for i, (eng_batch, spa_batch) in enumerate(dataloader):
      eng_batch = eng_batch.to(device)
      spa_batch = spa_batch.to(device)
      # Decoder preprocessing
      target_input = spa_batch[:, :-1]
      target_output = spa_batch[:, 1:].contiguous().view(-1)
      # Zero grads
      optimiser.zero_grad()
      # Run model
      output = model(eng_batch, target_input)
      output = output.view(-1, output.size(-1))
      # Calculate loss
      loss = loss_function(output, target_output)
      # Backprop
      loss.backward()
      # Gradient Clipping
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
      # Update weights
      optimiser.step()
      total_loss += loss.item()
      print(f"Epoch: {epoch+1}/{epochs}, Loss: {total_loss/(i+1)}")

In [25]:
BATCH_SIZE = 64
dataset = EngSpaDataset(eng_sentences, spa_sentences, eng_word2idx, spa_word2idx)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

print("Number of training examples:", len(dataset))

In [26]:
model = Transformer(d_model=512, num_heads=8, d_ff=2048, num_layers=6,
               input_vocab_size=eng_vocab_size, target_vocab_size=spa_vocab_size,
               max_seq_len=MAX_SEQ_LEN, dropout=0.1)

# Initializing the model weights with a normal distribution
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

model.apply(init_weights)

model = model.to(device)
loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimiser = optim.Adam(model.parameters(), lr=0.001)

In [27]:
def print_model_params(model):
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"{name}: {param.data.mean().item():.4f}")

print("Model parameters before training:")
print_model_params(model)

Model parameters before training:
encoder_embedding.weight: -0.0005
decoder_embedding.weight: -0.0002
encoder.layers.0.self_attn.W_q.weight: 0.0001
encoder.layers.0.self_attn.W_q.bias: -0.0004
encoder.layers.0.self_attn.W_k.weight: 0.0000
encoder.layers.0.self_attn.W_k.bias: 0.0007
encoder.layers.0.self_attn.W_v.weight: 0.0000
encoder.layers.0.self_attn.W_v.bias: -0.0014
encoder.layers.0.self_attn.W_o.weight: -0.0000
encoder.layers.0.self_attn.W_o.bias: -0.0015
encoder.layers.0.feed_forward.linear1.weight: -0.0000
encoder.layers.0.feed_forward.linear1.bias: 0.0005
encoder.layers.0.feed_forward.linear2.weight: -0.0000
encoder.layers.0.feed_forward.linear2.bias: 0.0000
encoder.layers.0.norm1.weight: 1.0000
encoder.layers.0.norm1.bias: 0.0000
encoder.layers.0.norm2.weight: 1.0000
encoder.layers.0.norm2.bias: 0.0000
encoder.layers.1.self_attn.W_q.weight: 0.0000
encoder.layers.1.self_attn.W_q.bias: 0.0005
encoder.layers.1.self_attn.W_k.weight: 0.0000
encoder.layers.1.self_attn.W_k.bias: -0.

In [28]:
train(model, dataloader, loss_function, optimiser, epochs=1)

[1;30;43mSe truncaron las últimas líneas 5000 del resultado de transmisión.[0m
Target shape after embedding: torch.Size([64, 21, 512])
Epoch: 1/1, Loss: 3.7559223471274565
Source mask shape: torch.Size([64, 1, 1, 26])
Target mask shape: torch.Size([64, 1, 28, 28])
Source shape after embedding: torch.Size([64, 26, 512])
Encoder output shape: torch.Size([64, 26, 512])
Target shape after embedding: torch.Size([64, 28, 512])
Epoch: 1/1, Loss: 3.7556011777417213
Source mask shape: torch.Size([64, 1, 1, 16])
Target mask shape: torch.Size([64, 1, 17, 17])
Source shape after embedding: torch.Size([64, 16, 512])
Encoder output shape: torch.Size([64, 16, 512])
Target shape after embedding: torch.Size([64, 17, 512])
Epoch: 1/1, Loss: 3.7551736399809807
Source mask shape: torch.Size([64, 1, 1, 22])
Target mask shape: torch.Size([64, 1, 16, 16])
Source shape after embedding: torch.Size([64, 22, 512])
Encoder output shape: torch.Size([64, 22, 512])
Target shape after embedding: torch.Size([64, 16,

In [29]:
print("\nModel parameters after training:")
print_model_params(model)


Model parameters after training:
encoder_embedding.weight: -0.0005
decoder_embedding.weight: -0.0002
encoder.layers.0.self_attn.W_q.weight: 0.0000
encoder.layers.0.self_attn.W_q.bias: 0.0002
encoder.layers.0.self_attn.W_k.weight: 0.0000
encoder.layers.0.self_attn.W_k.bias: 0.0007
encoder.layers.0.self_attn.W_v.weight: -0.0000
encoder.layers.0.self_attn.W_v.bias: -0.0013
encoder.layers.0.self_attn.W_o.weight: -0.0000
encoder.layers.0.self_attn.W_o.bias: -0.0014
encoder.layers.0.feed_forward.linear1.weight: -0.0000
encoder.layers.0.feed_forward.linear1.bias: -0.0145
encoder.layers.0.feed_forward.linear2.weight: -0.0000
encoder.layers.0.feed_forward.linear2.bias: 0.0000
encoder.layers.0.norm1.weight: 1.0007
encoder.layers.0.norm1.bias: 0.0001
encoder.layers.0.norm2.weight: 1.0020
encoder.layers.0.norm2.bias: -0.0000
encoder.layers.1.self_attn.W_q.weight: 0.0000
encoder.layers.1.self_attn.W_q.bias: 0.0009
encoder.layers.1.self_attn.W_k.weight: 0.0000
encoder.layers.1.self_attn.W_k.bias: -

In [30]:
def sentence_to_indices(sentence, word2idx):
    return [word2idx.get(word, word2idx['<UNK>']) for word in sentence.split()]

def indices_to_sentences(indices, idx2word):
    return ' '.join([idx2word[idx] for idx in indices if idx in idx2word and idx2word[idx] != '<PAD>'])

def translate_sentence(model, sentence, eng_word2idx, spa_idx2word, max_seq_len=MAX_SEQ_LEN, device='cuda'):
    model.eval()
    sentence = preprocess_sentence(sentence)
    input_indices = sentence_to_indices(sentence, eng_word2idx)
    input_tensor = torch.tensor(input_indices).unsqueeze(0).to(device)

    # Initialize the target tensor with <SOS> token
    tgt_indices = [spa_word2idx['<SOS>']]
    tgt_tensor = torch.tensor(tgt_indices).unsqueeze(0).to(device)

    with torch.no_grad():
        for _ in range(max_seq_len):
            output = model(input_tensor, tgt_tensor)
            print(f"Output shape: {output.shape}")
            print(f"Output sample: {output[0, -1, :5]}")
            next_token = output.squeeze(0).argmax(dim=-1).item()
            print(f"Next token id: {next_token}, word: {spa_idx2word.get(next_token, '<UNK>')}")
            tgt_indices.append(next_token)
            tgt_tensor = torch.tensor(tgt_indices).unsqueeze(0).to(device)
            if next_token == spa_word2idx['<EOS>']:
                break
    return indices_to_sentences(tgt_indices, spa_idx2word)

def evaluate_translations(model, sentences, eng_word2idx, spa_idx2word, max_seq_len=MAX_SEQ_LEN, device='cuda'):
    for sentence in sentences:
        translation = translate_sentence(model, sentence, eng_word2idx, spa_idx2word, max_seq_len, device)
        print(f"Input sentences: {sentence}")
        print(f"Translated sentence: {translation}")
        print()

In [31]:
# Example sentences to test the translator
test_sentences = [
    "Hello, how are you?",
    "I am learning artificial intelligence.",
    "Artificial Intelligence is great.",
    "Good night!"
]

In [32]:
# Assuming the model is trained and loaded
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Evaluate translations
evaluate_translations(model, test_sentences, eng_word2idx, spa_idx2word, MAX_SEQ_LEN, device)

Source mask shape: torch.Size([1, 1, 1, 6])
Target mask shape: torch.Size([1, 1, 1, 1])
Source shape after embedding: torch.Size([1, 6, 512])
Encoder output shape: torch.Size([1, 6, 512])
Target shape after embedding: torch.Size([1, 1, 512])
Input sentences: Hello, how are you?
Translated sentence: <SOS> <EOS>

Source mask shape: torch.Size([1, 1, 1, 7])
Target mask shape: torch.Size([1, 1, 1, 1])
Source shape after embedding: torch.Size([1, 7, 512])
Encoder output shape: torch.Size([1, 7, 512])
Target shape after embedding: torch.Size([1, 1, 512])
Input sentences: I am learning artificial intelligence.
Translated sentence: <SOS> <EOS>

Source mask shape: torch.Size([1, 1, 1, 6])
Target mask shape: torch.Size([1, 1, 1, 1])
Source shape after embedding: torch.Size([1, 6, 512])
Encoder output shape: torch.Size([1, 6, 512])
Target shape after embedding: torch.Size([1, 1, 512])
Input sentences: Artificial Intelligence is great.
Translated sentence: <SOS> <EOS>

Source mask shape: torch.Siz