**INF 721 - Final Project**

**Student:** Bruno Alencar Vieira de Rezende

**Enrollment Number:** ES102008

# Introduction

This work implements the Transformer model introduced in "Attention Is All You Need" by Vaswani et al. (2017) using PyTorch, focusing on testing the results achieved on the WMT 2014 English-to-German translation task. The project includes dataset preprocessing, environment setup, and building key Transformer components like multi-head attention and the encoder-decoder structure, while adapting training to resource constraints.


# Imports

In [None]:
!pip install datasets



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import torch.optim as optim
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from concurrent.futures import ThreadPoolExecutor
import threading

nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Use the CUDA device when PyTorch reports one as available; otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loading Dataset

In [None]:
# Load the "de-en" configuration of the "wmt14" dataset through the `datasets` library.
# The result is indexed by split name below ("train", "validation", "test").
dataset = load_dataset("wmt14", "de-en")

In [None]:
# Keep only the first 500,000 training examples (presumably to fit the available
# compute budget -- the other splits are left untouched).
dataset['train'] = dataset['train'].select(range(500000))

# Transformer Architecture

In [None]:
class SelfAttention(nn.Module):
  """Multi-head scaled dot-product attention.

  Values, keys and queries are each passed through their own linear
  projection, split into ``num_heads`` heads of size
  ``embed_size // num_heads``, attended per head and finally merged by an
  output linear layer.

  Args:
    embed_size: size of the last dimension of every input and of the output.
    num_heads: number of attention heads; must divide ``embed_size``.

  Raises:
    ValueError: if ``embed_size`` is not divisible by ``num_heads``.
  """

  def __init__(
    self,
    embed_size,
    num_heads
  ):
    super(SelfAttention, self).__init__()

    # Fail early with a clear message instead of a reshape error in forward().
    if embed_size % num_heads != 0:
      raise ValueError(
        f"embed_size ({embed_size}) must be divisible by num_heads ({num_heads})"
      )

    self.embed_size = embed_size
    self.num_heads = num_heads
    self.head_dim = embed_size // num_heads

    self.values = nn.Linear(embed_size, embed_size)
    self.keys = nn.Linear(embed_size, embed_size)
    self.queries = nn.Linear(embed_size, embed_size)

    self.fc_out = nn.Linear(embed_size, embed_size)

  def forward(
    self,
    values,
    keys,
    queries,
    mask
  ):
    """Attend from ``queries`` over ``keys``/``values``.

    Args:
      values: tensor of shape (n, values_len, embed_size).
      keys: tensor of shape (n, keys_len, embed_size).
      queries: tensor of shape (n, queries_len, embed_size).
      mask: ``None`` or a tensor broadcastable to
        (n, num_heads, queries_len, keys_len); positions where it equals 0
        receive no attention weight.

    Returns:
      Tensor of shape (n, queries_len, embed_size).
    """
    n = queries.shape[0] # number of examples in the batch

    values_len, keys_len, queries_len = values.shape[1], keys.shape[1], queries.shape[1]

    # Project, then split the embedding dimension into (num_heads, head_dim).
    values = self.values(values).reshape(n, values_len, self.num_heads, self.head_dim)
    keys = self.keys(keys).reshape(n, keys_len, self.num_heads, self.head_dim)
    queries = self.queries(queries).reshape(n, queries_len, self.num_heads, self.head_dim)

    # Dot product of every query with every key, per head: (n, heads, q, k).
    scores = torch.einsum("nqhd,nkhd->nhqk", queries, keys)

    # Scale by sqrt(d_k), the per-head key dimension, not by the full embed size.
    scores = scores / (self.head_dim ** 0.5)

    if mask is not None:
      # Use the most negative value representable in the score dtype so the
      # fill also works in reduced precision (a literal -1e16 overflows float16).
      scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

    attention = torch.softmax(scores, dim=-1)

    # Weighted sum of the values for every query position: (n, q, heads, head_dim).
    out = torch.einsum("nhql,nlhd->nqhd", attention, values)

    # Concatenate the heads back into a single embedding dimension.
    out = out.reshape(n, queries_len, self.num_heads * self.head_dim)

    return self.fc_out(out)

In [None]:
class TransformerBlock(nn.Module):
  """Attention sub-layer followed by a position-wise feed-forward sub-layer.

  Each sub-layer output is added to its input (skip connection), layer
  normalised, and then passed through dropout.

  Args:
    embed_size: size of the embedding dimension.
    num_heads: number of attention heads given to the attention module.
    dropout: dropout probability.
    forward_expansion: the feed-forward hidden layer has
      ``forward_expansion * embed_size`` units.
  """

  def __init__(self, embed_size, num_heads, dropout, forward_expansion):
    super().__init__()
    hidden_size = forward_expansion * embed_size

    self.attention = SelfAttention(embed_size, num_heads)
    self.norm1 = nn.LayerNorm(embed_size)
    self.feed_forward = nn.Sequential(
      nn.Linear(embed_size, hidden_size),
      nn.ReLU(),
      nn.Linear(hidden_size, embed_size),
    )
    self.norm2 = nn.LayerNorm(embed_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, values, keys, queries, mask):
    """Run the block; ``queries`` is also the input of the first skip connection."""
    attended = self.attention(values, keys, queries, mask)

    # First residual connection: attention output + queries.
    hidden = self.norm1(attended + queries)
    hidden = self.dropout(hidden)

    # Second residual connection: feed-forward output + its own input.
    transformed = self.feed_forward(hidden)
    result = self.norm2(transformed + hidden)
    return self.dropout(result)

In [None]:
class Encoder(nn.Module):
  """Transformer encoder.

  Sums a token embedding with a learned position embedding, applies dropout
  and runs the result through ``num_layers`` stacked ``TransformerBlock``s.

  Args:
    vocab_size: number of rows of the token embedding table.
    embed_size: embedding dimension.
    num_layers: number of stacked TransformerBlock layers.
    num_heads: attention heads per layer.
    device: stored on the instance as ``self.device``.
    forward_expansion: feed-forward expansion factor of every layer.
    dropout: dropout probability.
    max_length: number of positions in the position embedding table, i.e.
      the longest sequence the encoder accepts.
  """

  def __init__(
    self,
    vocab_size,
    embed_size,
    num_layers,
    num_heads,
    device,
    forward_expansion,
    dropout,
    max_length # for positional embedding
  ):
    super(Encoder, self).__init__()
    self.embed_size = embed_size
    self.device = device
    self.word_embedding = nn.Embedding(vocab_size, embed_size)
    self.position_embedding = nn.Embedding(max_length, embed_size)

    self.layers = nn.ModuleList(
      [TransformerBlock(embed_size, num_heads, dropout=dropout, forward_expansion=forward_expansion) for _ in range(num_layers)]
    )

    self.dropout = nn.Dropout(dropout)

  def forward(
    self,
    x,
    mask
  ):
    """Encode a batch of token ids.

    Args:
      x: integer tensor of shape (n, seq_length).
      mask: attention mask forwarded unchanged to every layer.

    Returns:
      Tensor of shape (n, seq_length, embed_size).

    Raises:
      ValueError: if ``seq_length`` exceeds the position embedding table.
    """
    n, seq_length = x.shape

    # Check explicitly: an out-of-range position would otherwise surface as an
    # opaque embedding index error (a device-side assert on CUDA).
    max_length = self.position_embedding.num_embeddings
    if seq_length > max_length:
      raise ValueError(
        f"sequence length {seq_length} exceeds max_length {max_length}"
      )

    # Build the position ids directly on the input's device so they always
    # match it and no host-to-device copy is needed.
    positions = torch.arange(seq_length, device=x.device).expand(n, seq_length)

    out = self.word_embedding(x) + self.position_embedding(positions)
    out = self.dropout(out)

    for layer in self.layers:
      out = layer(out, out, out, mask) # values, keys and queries are all the same

    return out

In [None]:
class DecoderBlock(nn.Module):
  """One decoder layer: masked self-attention, then a TransformerBlock.

  The self-attention output is added to the layer input, normalised and
  passed through dropout; the result is used as the queries of the inner
  ``TransformerBlock``, whose values and keys are supplied by the caller.

  Args:
    embed_size: embedding dimension.
    num_heads: attention heads for both attention modules.
    forward_expansion: feed-forward expansion factor of the inner block.
    dropout: dropout probability.
    device: accepted for signature compatibility; not used by this class.
  """

  def __init__(self, embed_size, num_heads, forward_expansion, dropout, device):
    super().__init__()
    self.attention = SelfAttention(embed_size, num_heads)
    self.norm = nn.LayerNorm(embed_size)
    self.transformer_block = TransformerBlock(
      embed_size=embed_size,
      num_heads=num_heads,
      dropout=dropout,
      forward_expansion=forward_expansion,
    )
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, value, key, src_mask, trg_mask):
    """Apply the layer to ``x`` using ``value``/``key`` in the inner block."""
    # Self-attention over the decoder input, restricted by the target mask.
    self_attended = self.attention(x, x, x, trg_mask)

    # Skip connection around the self-attention, then norm and dropout.
    query = self.norm(self_attended + x)
    query = self.dropout(query)

    return self.transformer_block(value, key, query, src_mask)

In [None]:
class Decoder(nn.Module):
  """Transformer decoder.

  Sums a token embedding with a learned position embedding, applies dropout,
  runs ``num_layers`` stacked ``DecoderBlock``s over the result and projects
  the final states onto the vocabulary.

  Args:
    vocab_size: size of the token embedding table and of the output logits.
    embed_size: embedding dimension.
    num_layers: number of stacked DecoderBlock layers.
    num_heads: attention heads per layer.
    forward_expansion: feed-forward expansion factor of every layer.
    dropout: dropout probability.
    device: stored on the instance as ``self.device`` and passed to each layer.
    max_length: number of positions in the position embedding table, i.e.
      the longest sequence the decoder accepts.
  """

  def __init__(
    self,
    vocab_size,
    embed_size,
    num_layers,
    num_heads,
    forward_expansion,
    dropout,
    device,
    max_length,
  ):
    super(Decoder, self).__init__()
    self.device = device
    self.word_embedding = nn.Embedding(vocab_size, embed_size)
    self.position_embedding = nn.Embedding(max_length, embed_size)

    self.layers = nn.ModuleList(
      [DecoderBlock(embed_size, num_heads, forward_expansion, dropout, device) for _ in range(num_layers)]
    )
    self.fc_out = nn.Linear(embed_size, vocab_size)
    self.dropout = nn.Dropout(dropout)

  def forward(
    self,
    x,
    enc_out,
    src_mask,
    trg_mask
  ):
    """Decode a batch of target token ids against the encoder output.

    Args:
      x: integer tensor of shape (n, seq_length) with target token ids.
      enc_out: encoder output, used as both values and keys of every layer.
      src_mask: mask forwarded to the encoder-decoder attention of every layer.
      trg_mask: mask forwarded to the self-attention of every layer.

    Returns:
      Logits of shape (n, seq_length, vocab_size).

    Raises:
      ValueError: if ``seq_length`` exceeds the position embedding table.
    """
    n, seq_length = x.shape

    # Check explicitly: an out-of-range position would otherwise surface as an
    # opaque embedding index error (a device-side assert on CUDA).
    max_length = self.position_embedding.num_embeddings
    if seq_length > max_length:
      raise ValueError(
        f"sequence length {seq_length} exceeds max_length {max_length}"
      )

    # Build the position ids directly on the input's device so they always
    # match it and no host-to-device copy is needed.
    positions = torch.arange(seq_length, device=x.device).expand(n, seq_length)
    x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

    for layer in self.layers:
      x = layer(x, enc_out, enc_out, src_mask, trg_mask)

    return self.fc_out(x)

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder Transformer for sequence-to-sequence tasks.

  Builds an ``Encoder`` and a ``Decoder`` sharing the same hyper-parameters
  and derives the attention masks from the raw token id tensors.

  Args:
    src_vocab_size: source vocabulary size (encoder embedding table).
    trg_vocab_size: target vocabulary size (decoder embedding and logits).
    src_pad_idx: source token id treated as padding by ``make_src_mask``.
    trg_pad_idx: target padding id; stored but not used by the masks here.
    embed_size: embedding dimension.
    num_layers: number of encoder layers and of decoder layers.
    forward_expansion: feed-forward expansion factor.
    heads: number of attention heads.
    dropout: dropout probability.
    device: device the masks are moved to; also handed to both sub-modules.
    max_length: number of positions in the position embedding tables.
  """

  def __init__(
    self,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    trg_pad_idx,
    embed_size=512,
    num_layers=6,
    forward_expansion=4,
    heads=8,
    dropout=0,
    device="cpu",
    max_length=100,
  ):
    super().__init__()
    self.encoder = Encoder(
      vocab_size=src_vocab_size,
      embed_size=embed_size,
      num_layers=num_layers,
      num_heads=heads,
      device=device,
      forward_expansion=forward_expansion,
      dropout=dropout,
      max_length=max_length,
    )
    self.decoder = Decoder(
      vocab_size=trg_vocab_size,
      embed_size=embed_size,
      num_layers=num_layers,
      num_heads=heads,
      forward_expansion=forward_expansion,
      dropout=dropout,
      device=device,
      max_length=max_length,
    )
    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx
    self.device = device

  def make_src_mask(self, src):
    """Return a boolean mask of shape (n, 1, 1, src_len), False at padding ids."""
    not_padding = src != self.src_pad_idx
    # Insert singleton head and query dimensions so the mask broadcasts.
    src_mask = not_padding[:, None, None, :]
    return src_mask.to(self.device)

  def make_trg_mask(self, trg):
    """Return a lower-triangular mask of ones, shape (n, 1, trg_len, trg_len).

    Row ``i`` has ones only in columns ``0..i``. Padding ids in ``trg`` are
    not masked here.
    """
    n, trg_len = trg.shape
    lower_triangle = torch.ones(trg_len, trg_len).tril()
    trg_mask = lower_triangle.expand(n, 1, trg_len, trg_len)
    return trg_mask.to(self.device)

  def forward(self, src, trg):
    """Encode ``src`` and decode ``trg`` against it; returns the decoder logits."""
    src_mask = self.make_src_mask(src)
    trg_mask = self.make_trg_mask(trg)

    encoded = self.encoder(src, src_mask)
    return self.decoder(trg, encoded, src_mask, trg_mask)

# Preprocessing

In [None]:
def build_vocab(data, language, vocab_size=10000):
  """Build a token -> id mapping from the most frequent tokens of one language.

  Every entry of ``data["translation"]`` is lower-cased and tokenised with
  ``word_tokenize``. The ``vocab_size - 4`` most frequent tokens get ids
  starting at 4; ids 0-3 are reserved for the special tokens.

  Args:
    data: object whose ``"translation"`` item iterates over dicts keyed by
      language code.
    language: key selecting the text inside each translation dict.
    vocab_size: total vocabulary size including the four special tokens.

  Returns:
    Dict mapping each token (and special token) to its integer id.
  """
  special_tokens = (
    "<PAD>", # padding
    "<SOS>", # start of sequence
    "<EOS>", # end of sequence
    "<UNK>", # unknown
  )

  frequencies = Counter()
  for pair in data["translation"]:
    frequencies.update(word_tokenize(pair[language].lower()))

  vocab = {}
  for token_id, (token, _) in enumerate(frequencies.most_common(vocab_size - 4), start=4):
    vocab[token] = token_id

  # Special tokens are written last, so they take ids 0-3 even if the same
  # string happened to appear among the corpus tokens.
  for token_id, token in enumerate(special_tokens):
    vocab[token] = token_id

  return vocab

In [None]:
# English is the source language and German the target; both vocabularies are
# built from the training split only.
src_vocab, trg_vocab = (
    build_vocab(dataset["train"], language) for language in ("en", "de")
)

In [None]:
# Reverse lookups (id -> token) for turning model output back into text.
id_to_en = {token_id: token for token, token_id in src_vocab.items()}
id_to_de = {token_id: token for token, token_id in trg_vocab.items()}

In [None]:
def preprocess_translation(example, src_vocab, trg_vocab):
  """Convert one en/de example into a pair of token id lists.

  Both sentences are lower-cased, tokenised with ``word_tokenize`` and mapped
  through their vocabulary; tokens missing from a vocabulary become its
  ``<UNK>`` id. Each list is wrapped in that vocabulary's ``<SOS>`` and
  ``<EOS>`` ids.

  Args:
    example: dict with ``example["translation"]["en"]`` and ``["de"]`` strings.
    src_vocab: token -> id mapping for the English side.
    trg_vocab: token -> id mapping for the German side.

  Returns:
    Tuple ``(src_ids, trg_ids)`` of integer lists.
  """
  def encode(text, vocab):
    unknown = vocab["<UNK>"]
    body = [vocab.get(token, unknown) for token in word_tokenize(text.lower())]
    return [vocab["<SOS>"], *body, vocab["<EOS>"]]

  pair = example["translation"]
  return encode(pair["en"], src_vocab), encode(pair["de"], trg_vocab)

# Encode every split once up front; each entry is a (src_ids, trg_ids) tuple.
train_data_pairs, validation_data_pairs, test_data_pairs = (
    [preprocess_translation(example, src_vocab, trg_vocab) for example in dataset[split]]
    for split in ("train", "validation", "test")
)

In [None]:
# Sanity check: show the first encoded training pair next to its raw text.
print(train_data_pairs[0])
dataset["train"][0]

([1, 5090, 7, 4, 1621, 2], [1, 3606, 7, 3137, 2])


{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode',
  'en': 'Resumption of the session'}}

In [None]:
def pad_sequence(seq, max_length, pad_idx):
  """Return a new list that is ``seq`` padded or truncated to ``max_length``.

  Shorter sequences get ``pad_idx`` appended on the right; sequences of
  ``max_length`` items or more are cut to their first ``max_length`` items.
  The input list is not modified.
  """
  missing = max_length - len(seq)
  if missing > 0:
    return seq + [pad_idx] * missing
  return seq[:max_length]

In [None]:
class TranslationDataset(Dataset):
  """Dataset of (source ids, target ids) pairs padded to a fixed length.

  Args:
    data: indexable collection of ``(src_ids, trg_ids)`` list pairs.
    src_pad_idx: id used to pad the source sequences.
    trg_pad_idx: id used to pad the target sequences.
    max_length: length every returned sequence is padded or truncated to.
  """

  def __init__(self, data, src_pad_idx, trg_pad_idx, max_length=50):
    self.data = data
    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx
    self.max_length = max_length

  def __len__(self):
    """Number of sentence pairs."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return pair ``idx`` as two tensors of ``max_length`` ids each."""
    src_ids, trg_ids = self.data[idx]
    padded_src = pad_sequence(src_ids, self.max_length, self.src_pad_idx)
    padded_trg = pad_sequence(trg_ids, self.max_length, self.trg_pad_idx)
    return torch.tensor(padded_src), torch.tensor(padded_trg)

In [None]:
src_pad_idx = src_vocab["<PAD>"]
trg_pad_idx = trg_vocab["<PAD>"]

train_dataset = TranslationDataset(train_data_pairs, src_pad_idx, trg_pad_idx)
validation_dataset = TranslationDataset(validation_data_pairs, src_pad_idx, trg_pad_idx)
test_dataset = TranslationDataset(test_data_pairs, src_pad_idx, trg_pad_idx)

# Only the training data is shuffled. The evaluation loss is an average of
# per-batch means, so keeping the validation/test order fixed keeps the batch
# composition, and therefore the reported loss, identical between runs.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Training The Transformer

In [None]:
def evaluate_model(loader, model, criterion, device):
  """Return the average per-batch loss of ``model`` over ``loader``.

  The model is switched to eval mode and gradients are disabled. For every
  batch the model receives the target without its last position and is
  scored against the target without its first position.

  Args:
    loader: sized iterable yielding ``(src, trg)`` tensor batches.
    model: callable as ``model(src, trg_input)`` returning a 3-D logits tensor.
    criterion: loss called as ``criterion(flat_logits, flat_targets)``.
    device: device the batches are moved to.

  Returns:
    Sum of the batch losses divided by ``len(loader)``.
  """
  model.eval()
  batch_losses = []

  with torch.no_grad():
    for src_batch, trg_batch in loader:
      src_batch = src_batch.to(device)
      trg_batch = trg_batch.to(device)

      # Teacher forcing: feed positions [0, T-1), predict positions [1, T).
      decoder_input = trg_batch[:, :-1]
      expected = trg_batch[:, 1:].reshape(-1)

      logits = model(src_batch, decoder_input)
      flat_logits = logits.reshape(-1, logits.shape[2])

      batch_losses.append(criterion(flat_logits, expected).item())

  return sum(batch_losses) / len(loader)

In [None]:
src_vocab_size = len(src_vocab)
trg_vocab_size = len(trg_vocab)
model = Transformer(
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=src_pad_idx,
    trg_pad_idx=trg_pad_idx,
    device=device,
).to(device)

# The loss is computed over target tokens, so the padding id to ignore is the
# target one (not the source one, even though both vocabularies use 0 here).
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
optimizer = optim.Adam(model.parameters(), lr=3e-4)

EPOCHS = 3

for epoch in range(EPOCHS):
  # evaluate_model() leaves the model in eval mode, so re-enable training
  # mode at the start of every epoch.
  model.train()
  loop = tqdm(train_loader, leave=True)
  epoch_loss = 0
  for batch_idx, (src, trg) in enumerate(loop):
    src, trg = src.to(device), trg.to(device)

    # Teacher forcing: the decoder sees the target without its last position
    # and is scored against the target without its first position.
    output = model(src, trg[:, :-1])
    output = output.reshape(-1, output.shape[2])
    trg = trg[:, 1:].reshape(-1)

    loss = criterion(output, trg)
    epoch_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loop.set_description(f"Epoch [{epoch+1}/{EPOCHS}]")
    loop.set_postfix(loss=loss.item())

  val_loss = evaluate_model(validation_loader, model, criterion, device)
  print(f"Epoch {epoch+1} completed. Training Loss: {epoch_loss / len(train_loader):.4f}, Validation Loss: {val_loss:.4f}")

test_loss = evaluate_model(test_loader, model, criterion, device)
print(f"Test Loss: {test_loss:.4f}")

Epoch [1/3]: 100%|██████████| 7813/7813 [40:38<00:00,  3.20it/s, loss=1.97]


Epoch 1 completed. Training Loss: 2.9196, Validation Loss: 2.5239


Epoch [2/3]: 100%|██████████| 7813/7813 [40:33<00:00,  3.21it/s, loss=2.07]


Epoch 2 completed. Training Loss: 1.9441, Validation Loss: 2.3028


Epoch [3/3]: 100%|██████████| 7813/7813 [40:32<00:00,  3.21it/s, loss=1.57]


Epoch 3 completed. Training Loss: 1.7412, Validation Loss: 2.2023
Test Loss: 2.1899


In [None]:
def translate_sentence(sentence, model, src_vocab, trg_vocab, src_pad_idx, trg_pad_idx, max_length=50, device="cpu"):
  """Greedily translate ``sentence`` with ``model``.

  The sentence is lower-cased, tokenised with ``word_tokenize``, wrapped in
  ``<SOS>``/``<EOS>`` and padded or truncated to ``max_length``. Decoding
  starts from ``<SOS>`` and appends the highest-scoring next token until
  ``<EOS>`` is produced or ``max_length`` tokens have been generated.

  Args:
    sentence: source text.
    model: callable as ``model(src_tensor, trg_tensor)`` returning logits of
      shape (1, trg_len, vocab); must provide ``eval()``.
    src_vocab: source token -> id mapping containing the special tokens.
    trg_vocab: target token -> id mapping containing the special tokens.
    src_pad_idx: id used to pad the source sequence.
    trg_pad_idx: accepted for signature compatibility; not used.
    max_length: padded source length and maximum number of generated tokens.
    device: device the input tensors are created on.

  Returns:
    The generated target tokens joined by single spaces, without the
    ``<SOS>``/``<EOS>`` markers.
  """
  model.eval()

  tokens = word_tokenize(sentence.lower())
  unk_idx = src_vocab["<UNK>"]
  src_ids = [src_vocab["<SOS>"]] + [src_vocab.get(token, unk_idx) for token in tokens] + [src_vocab["<EOS>"]]

  # Truncate, then right-pad to exactly max_length (the length used in training).
  src_ids = src_ids[:max_length]
  src_ids = src_ids + [src_pad_idx] * (max_length - len(src_ids))
  src_tensor = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)

  eos_idx = trg_vocab["<EOS>"]
  trg_ids = [trg_vocab["<SOS>"]]

  with torch.no_grad():
    for _ in range(max_length):
      trg_tensor = torch.tensor(trg_ids, dtype=torch.long, device=device).unsqueeze(0)  # add batch dimension

      output = model(src_tensor, trg_tensor)
      next_token = output.argmax(dim=-1)[:, -1].item()

      # Stop before storing <EOS>, so the result never needs its last token
      # stripped and keeps every real token when the length limit ends the loop.
      if next_token == eos_idx:
        break

      trg_ids.append(next_token)

  # Derive the reverse lookup from the vocabulary that was passed in rather
  # than relying on a module-level table.
  index_to_token = {index: token for token, index in trg_vocab.items()}

  # Skip the leading <SOS>.
  return " ".join(index_to_token[token_id] for token_id in trg_ids[1:])

In [None]:
# Demo: translate a single English sentence with the trained model and print
# the German output.
input_sentence = "Hello, how are you?"

translated_sentence = translate_sentence(
    sentence=input_sentence,
    model=model,
    src_vocab=src_vocab,
    trg_vocab=trg_vocab,
    src_pad_idx=src_vocab["<PAD>"],
    trg_pad_idx=trg_vocab["<PAD>"],
    max_length=50,
    device=device
)

print(f"Translated Sentence: {translated_sentence}")

NameError: name 'translate_sentence' is not defined