This notebook implements a transformer for machine translation, inspired by:

[Attention is all you need](https://arxiv.org/abs/1706.03762)
A Vaswani - Advances in Neural Information Processing Systems, 2017

and 

[Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215)
I Sutskever - arXiv preprint arXiv:1409.3215, 2014 - jeremy-su1.github.io

The data was grabbed from:

[https://colab.research.google.com/drive/1GBC7eLlEM-HqKLUuMcFIQdVuYXzLoS_P?usp=sharing](https://colab.research.google.com/drive/1GBC7eLlEM-HqKLUuMcFIQdVuYXzLoS_P?usp=sharing)

Importantly, it provides the English to Italian data set I use.

In [1]:
!jupyter --version

Selected Jupyter core packages...
IPython          : 8.31.0
ipykernel        : 6.29.5
ipywidgets       : 8.1.5
jupyter_client   : 8.6.3
jupyter_core     : 5.7.2
jupyter_server   : 2.15.0
jupyterlab       : 4.3.4
nbclient         : 0.10.2
nbconvert        : 7.16.5
nbformat         : 5.10.4
notebook         : 7.3.2
qtconsole        : not installed
traitlets        : 5.14.3


In [1]:
# Imports
import itertools
import os

import numpy as np
import requests
import torch
import torch.nn as nn
import torch.optim as optim

from letsbuildmodels.devices import get_device
from nltk.lm.vocabulary import Vocabulary
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import one_hot
from torch.nn.functional import softmax
from torcheval.metrics.functional import bleu_score

In [2]:
def check_memory():
    mem_allocated = torch.mps.current_allocated_memory()
    mem_recomended = torch.mps.recommended_max_memory()
    
    print(f"Memory allocated: {mem_allocated / (1024 ** 2):.2f} MB")
    print(f"Memory recommended: {mem_recomended / (1024 ** 2):.2f} MB")

check_memory()

Memory allocated: 0.00 MB
Memory recommended: 21845.34 MB


In [3]:
# Data pre-processing

# Download the data
local_path = os.path.join(os.getcwd(), "data", "eng_ita_v2.txt")

def download_file_if_not_exists():
    url = "https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/datasets/eng_ita_v2.txt"
    
    directory = os.path.dirname(local_path)
    os.makedirs(directory, exist_ok=True)
    
    if not os.path.exists(local_path):
        print(f"Downloading file from {url} to {local_path}...")
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise an error for bad HTTP responses
        with open(local_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print("Download complete.")
    else:
        print(f"File already exists at {local_path}. No download needed.")

download_file_if_not_exists()


# Read the data
def read_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.read().strip().split('\n')
    pairs = [tuple([s for s in line.split(' -> ')]) for line in lines]
    return pairs

pairs = read_data(local_path)
print(f"File contains {len(pairs)} translations")

PAD_TOKEN = "<PAD>"
EOS_TOKEN = "<EOS>"
SOS_TOKEN = "<SOS>"
UNK_TOKEN = "<UNK>"

# Build vocabularies
def build_vocab(pairs):
    eng_tokens = list(itertools.chain.from_iterable([word_tokenize(eng) for (eng, _) in pairs]))
    ita_tokens = list(itertools.chain.from_iterable([word_tokenize(ita) for (_, ita) in pairs]))
    eng_vocab = Vocabulary(eng_tokens, unk_cutoff=2)
    ita_vocab = Vocabulary(ita_tokens, unk_cutoff=2)
    eng_vocab.update([PAD_TOKEN, EOS_TOKEN])
    ita_vocab.update([PAD_TOKEN, EOS_TOKEN, SOS_TOKEN])
    return eng_vocab, ita_vocab
    
eng_vocab, ita_vocab, = build_vocab(pairs)
eng_vocab_size = len(eng_vocab) + 1 # The + 1 is to represent the "<UNK>" token
ita_vocab_size = len(ita_vocab) + 1

print('English vocabulary size:', eng_vocab_size)
print('Italian vocabulary size:', ita_vocab_size)

# Creating integer <-> word mapping
class WordMapping:
    def __init__(self, vocab):
        self.word_to_int = {}
        self.int_to_word = {}
        word_counts = [(word, vocab[word]) for word in vocab]
        sorted_word_counts = sorted(word_counts, key=lambda t: t[1], reverse=True)
        sorted_word_counts = sorted_word_counts + [(PAD_TOKEN, 1), (EOS_TOKEN, 1), (SOS_TOKEN, 1)]
        for i, (word, _) in enumerate(sorted_word_counts):
            self.word_to_int[word] = i
            self.int_to_word[i] = word

    def __getitem__(self, key):
        if type(key) == str:
            if key in self.word_to_int:
                return self.word_to_int[key]
            elif key.lower() in self.word_to_int:
                return self.word_to_int[key.lower()]
            else:
                return self.word_to_int[UNK_TOKEN]                
        elif type(key) == int:
            return self.int_to_word[key]
        else:
            raise KeyError(f"Invalid key type: {type(key)}")

    def __len__(self):
        return len(self.word_to_int)

eng_mapping = WordMapping(eng_vocab)
ita_mapping = WordMapping(ita_vocab)

File already exists at /Users/jamescataldo/Code/letsbuildmodels/notebooks/transformer/data/eng_ita_v2.txt. No download needed.
File contains 120746 translations
English vocabulary size: 4888
Italian vocabulary size: 9276


In [4]:
check_memory()

Memory allocated: 0.00 MB
Memory recommended: 21845.34 MB


In [5]:
# Creating datasets and loaders
class TranslationDataset(Dataset):
    def __init__(self):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        eng, ita = self.pairs[idx]
        eng_tensor = torch.tensor([eng_mapping[word] for word in word_tokenize(eng)]
                                  + [eng_mapping[EOS_TOKEN]], dtype=torch.long)
        ita_tensor = torch.tensor([ita_mapping[word] for word in word_tokenize(ita)]
                                  + [ita_mapping[EOS_TOKEN]], dtype=torch.long)
        return eng_tensor, ita_tensor

seq_length = 128

# Custom collate function to handle padding
def collate_fn(batch):
    eng_batch, ita_batch = zip(*batch)
    def pad(seq, pad_token):
        if seq.size()[0] < seq_length:
            return torch.cat([seq, torch.full((seq_length - seq.size()[0],), pad_token)])
        else:
            return seq[:, :seq_length]
    eng_batch_padded = torch.stack([pad(x, eng_mapping[PAD_TOKEN]) for x in eng_batch])
    ita_batch_padded = torch.stack([pad(x, ita_mapping[PAD_TOKEN]) for x in eng_batch])
    return eng_batch_padded, ita_batch_padded

# Create the DataLoader
translation_dataset = TranslationDataset()
translations = len(translation_dataset)
indices = list(range(translations))
train_indices = indices[::2]
test_indices = indices[1::2]
train_dataset = Subset(translation_dataset, train_indices)
test_dataset = Subset(translation_dataset, test_indices)
batch_size = 8
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    drop_last=True
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    drop_last=True
)

batches = len(train_dataloader)
print(f"Training translations: {translations}")
print(f"Number of batches: {batches}")

Training translations: 120746
Number of batches: 7546


In [6]:
# Build the models
embed_size = 128

class PositionalEncoding(nn.Module):
    def __init__(self):
        super().__init__()
        self.p = torch.zeros((seq_length, embed_size))
        numerator = torch.arange(seq_length, dtype=torch.float).reshape((-1, 1))
        exponent = torch.arange(embed_size // 2, dtype=torch.float32) * (2 / embed_size)
        denominator = torch.pow(10000, exponent)
        quotient = numerator / denominator
        self.p[:, 0::2] = torch.sin(quotient).reshape(1, seq_length, embed_size // 2)
        self.p[:, 1::2] = torch.cos(quotient).reshape(1, seq_length, embed_size // 2)

    def forward(self, x):
        x = x + self.p.to(x.device)
        return x

class Translator(nn.Module):
    def __init__(self):
        super().__init__()
        self.eng_embedder = nn.Embedding(eng_vocab_size, embed_size)
        self.ita_embedder = nn.Embedding(ita_vocab_size, embed_size)
        self.pos_enc = PositionalEncoding()
        self.transformer = nn.Transformer(
            d_model=embed_size,
            num_encoder_layers=3,
            num_decoder_layers=3,
            batch_first=True)
        self.disembedder = nn.Linear(embed_size, ita_vocab_size)
        self.tgt_subsequent_mask = None

    def get_tgt_subsequent_mask(self, device):
        if self.tgt_subsequent_mask is None:
            self.tgt_subsequent_mask = torch.triu(torch.ones(seq_length, seq_length), diagonal=1).bool().to(device)
        return self.tgt_subsequent_mask

    def forward(self, enc_x, dec_x):
        enc_emb = self.eng_embedder(enc_x)
        enc_pos = self.pos_enc(enc_emb)
        dec_emb = self.ita_embedder(dec_x)
        dec_pos = self.pos_enc(dec_emb)
        src_padding_mask = (enc_x == eng_mapping[PAD_TOKEN])
        tgt_padding_mask = (dec_x == ita_mapping[PAD_TOKEN])
        tgt_subsequent_mask = self.get_tgt_subsequent_mask(enc_x.device)
        print("entering transform")
        check_memory()
        transformed = self.transformer(
            enc_emb,
            dec_pos,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=src_padding_mask,
            tgt_mask=tgt_subsequent_mask
        )
        print("exiting transform")
        output = softmax(self.disembedder(transformed), dim=2)
        return output


model = Translator()
device = get_device()
model.to(device)
print(model)
check_memory()

Translator(
  (eng_embedder): Embedding(4888, 128)
  (ita_embedder): Embedding(9276, 128)
  (pos_enc): PositionalEncoding()
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
          )
          (linear1): Linear(in_features=128, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=128, bias=True)
          (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
    

In [7]:
check_memory()

Memory allocated: 25.81 MB
Memory recommended: 21845.34 MB


In [None]:
def train():
    loss_fn = nn.CrossEntropyLoss(ignore_index=ita_mapping[PAD_TOKEN])
    optimizer = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09)
    num_epochs = 3
    
    model.train()
    
    for epoch in range(num_epochs):
        epoch_loss = 0
        count = 0
        for i, (encoder_input, target) in enumerate(train_dataloader):
            encoder_input = encoder_input.to(device)
            decoder_input = torch.empty_like(target).to(device)
            decoder_input[:, 0] = ita_mapping[SOS_TOKEN]
            decoder_input[:, 1:] = target[:, :-1]
            target = target.to(device)
            
            optimizer.zero_grad()

            check_memory()
            print(f"Target size: {target.size()}")
            output = model(encoder_input, decoder_input)
            check_memory()

            print(f"Output type: {output.dtype}, Target size: {target.dtype}")
            loss = loss_fn(target, output)
            print("Ran once")
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item() / target.size()[1]
            count += 1
            if i % 10 == 0:
                print(f"Epoch {epoch}, Batch {i}/{batches}, Loss: {epoch_loss / count:.4f}", end="\r")
            
        print(f"Epoch {epoch}, Loss: {epoch_loss / count:.4f}                             ")
                
train()

Memory allocated: 25.84 MB
Memory recommended: 21845.34 MB
Target size: torch.Size([8, 128])
entering transform
Memory allocated: 27.92 MB
Memory recommended: 21845.34 MB


In [None]:
specials = {
    PAD_TOKEN,
    EOS_TOKEN,
    SOS_TOKEN,
    UNK_TOKEN,
}

def to_ita_sentence(tensor):
    ita = [ita_mapping[x.item()] for x in tensor]
    strs = [y for y in ita if y not in specials]
    return " ".join(strs)

def to_eng_sentence(tensor):
    eng = [eng_mapping[x.item()] for x in tensor]
    strs = [y for y in eng if y not in specials]
    return " ".join(strs).replace(" '", "'")

def test(print_translations=False):
    model.eval()
    
    epoch_loss = 0
    count = 0
    with torch.no_grad():
        for i, (encoder_input, target) in enumerate(test_dataloader):
            encoder_input, target = encoder_input.to(device), target.to(device)
            
            output = model.translate(encoder_input, target.size()[1])
    
            for batch in range(batch_size):
                input_tokens = encoder_input[batch]
                input_str = to_eng_sentence(input_tokens)
                output_str = to_ita_sentence(output[batch])            
                target_tokens = target[batch]
                target_str = to_ita_sentence(target_tokens) 
                if print_translations:
                    print(f"English: {input_str}")
                    print(f"Desired Italian: {target_str}")
                    print(f"Generated Italian: {output_str}")
                    print()
    
                loss = bleu_score(output_str, [target_str], n_gram=min(2, len(output_str)))
        
                epoch_loss += loss.item()
                count += 1
            if i % 10 == 0:
                print(f"Batch {i}/{batches}, BLEU: {epoch_loss / count:.4f}", end="\r")
        print(f"BLEU: {epoch_loss / count:.4f}                             ")
                
# test()