In [1]:
####### It's commonly used in natural language processing tasks, especially in machine translation and text generation, where it allows for more flexible and efficient tokenization compared to traditional word-based tokenization methods.
# !pip install sentencepiece
####### Function: SacreBLEU is a library for computing the BLEU (Bilingual Evaluation Understudy) score, which is a metric used to evaluate the quality of machine-translated text.
####### Usage: It's widely used in the field of machine translation to assess the performance of machine translation systems. BLEU score compares the output of a machine translation system to one or more human reference translations and assigns a score based on the similarity between the machine translation and the reference translations
# !pip install sacrebleu
####### Function: TorchData is a library for handling datasets and data loading in PyTorch.
####### Usage: It provides utilities for creating custom datasets, data loaders, and data transformations. TorchData simplifies the process of working with datasets in PyTorch, making it easier to load and preprocess data for training machine learning models.
# !pip install torchdata

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.2
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.w

In [2]:
from dataclasses import dataclass          #Dataclasses provide a way to create classes in Python with less boilerplate code for common tasks like initializing objects, adding comparison methods, and more.
import numpy as np
import math
import sacrebleu                         #is a popular metric used to evaluate the quality of machine-translated text.
import sentencepiece as spm              #SentencePiece is a popular library for tokenizing text, especially in natural language processing tasks like machine translation.
import torch                             #a popular deep learning framework for building and training neural networks.
import torch.nn as nn
from torch import utils                  # PyTorch's utils module contains various utility functions and classes for tasks such as data loading and manipulation.
from torchtext.datasets import Multi30k  # Multi30k is a dataset commonly used for machine translation tasks, containing parallel sentences in multiple languages.
from torch.utils.data import Dataset
from tqdm import tqdm                     #package that provides a progress bar during iterations,
from numpy.lib.utils import lookfor       #function in the NumPy library that helps you search for specific keywords in NumPy's documentation.

In [3]:
# This sets the random seed to 7, which is used to initialize random number generators. Setting a random seed ensures reproducibility in experiments involving randomization.
# this means that we need to select a specific  data "known"  to help us if we needed to compare between experiments or any thing like that
seed = 7
torch.manual_seed(seed)
np.random.seed(seed)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")          #This line determines whether a CUDA-compatible GPU is available on the system. If a GPU is available, the code assigns the device to CUDA; otherwise, it assigns it to CPU. This is crucial for running PyTorch code on the appropriate hardware.
print("Device is : ",DEVICE)

SRC = "de"                        # Source Language
TRG = "en"                        # Target Language

train_iter = Multi30k(split='train',language_pair=(SRC,TRG))                      #This line initializes a data iterator for the Multi30k dataset, specifying that it should use the training split and the language pair defined by SRC (German) and TRG (English).

f_de = open('Muti30k_de_text.txt',"w")                        # open a files with a specific names to store data i it
f_en = open('Muti30k_en_text.txt',"w")

for pair in train_iter:
    f_de.write(pair[0] + '\n')
    f_en.write(pair[1] + '\n')

f_de.close()
f_en.close()

Device is :  cuda


In [None]:
# spm.SentencePieceTrainer.train(f'--input=Muti30k_de_text.txt --model_prefix=Muti30k_de --user_defined_symbols=<pad> --vocab_size={de_vocab_size}')
# This line trains a SentencePiece model for the German language. It uses the SentencePieceTrainer.train method to train the model. Here's what each argument means:

# 1-   --input=Muti30k_de_text.txt: Specifies the input text file for training. Adjust this to your actual German text corpus file.

# 2-   --model_prefix=Muti30k_de: Specifies the prefix for the model files generated during training.
# In the context of the --model_prefix parameter in the SentencePiece library, specifying a prefix for the model files generated during training means that all the files related to the trained model will have names starting with that prefix.
# For example, if you set --model_prefix=Muti30k_de, then the files generated during training might include:
# Muti30k_de.model: This file typically contains the model's architecture and parameters.
# Muti30k_de.vocab: This file contains the vocabulary generated by the model. --- represents a token along with its corresponding index and frequency in the training data
# Other auxiliary files related to training or the model might also use the same prefix.
# By setting a prefix, you can easily identify and manage the files associated with a particular trained model, especially if you're working with multiple models or need to organize your files systematically.

# 3- --user_defined_symbols=<pad>: Defines a special symbol <pad> which can be used later for padding sequences during sequence processing tasks like machine translation or text generation.
# 4- --vocab_size={de_vocab_size}: Specifies the vocabulary size for the German language.

In [4]:
# Here we give the tokenizer the vocab size(unique words) we get this numbers Through experimentation to get the best suitable results

en_vocab_size = 8200
de_vocab_size = 10000

vocab_sizes = {"en":en_vocab_size,"de":de_vocab_size}

# make training to extract a model to work on it
spm.SentencePieceTrainer.train(f'--input=Muti30k_de_text.txt --model_prefix=Muti30k_de --user_defined_symbols=<pad> --vocab_size={de_vocab_size}')
spm.SentencePieceTrainer.train(f'--input=Muti30k_en_text.txt --model_prefix=Muti30k_en --user_defined_symbols=<pad> --vocab_size={en_vocab_size}')

# loading and extracting models to make toknization
de_sp = spm.SentencePieceProcessor()
de_sp.load('Muti30k_de.model')

en_sp = spm.SentencePieceProcessor()
en_sp.load('Muti30k_en.model')

True

In [5]:
tokenizers = {"en":en_sp.encode_as_ids,"de":de_sp.encode_as_ids}
detokenizers = {"en":en_sp.decode_ids,"de":de_sp.decode_ids}

print(en_sp.encode_as_pieces("How are you doing?"))                                 # this sentence exicted in our data
print(en_sp.encode_as_ids("How are you doing?"))

print("-----------------------------------------------------")

print(en_sp.decode_pieces(['▁Ho', 'w', '▁are', '▁you', '▁doing', '?']))
print(en_sp.decode_ids([5234, 645, 20, 1277, 185, 0]))

['▁Ho', 'w', '▁are', '▁you', '▁doing', '?']
[5234, 645, 20, 1277, 185, 0]
-----------------------------------------------------
How are you doing?
How are you doing ⁇ 


In [6]:
print([en_sp.id_to_piece(id) for id in range(20)])
print([de_sp.id_to_piece(id) for id in range(20)])

['<unk>', '<s>', '</s>', '<pad>', '▁a', '.', '▁A', '▁in', '▁the', '▁on', '▁is', '▁man', '▁and', '▁of', '▁with', 's', 'ing', '▁', ',', '▁woman']
['<unk>', '<s>', '</s>', '<pad>', '.', '▁eine', '▁Ein', 'm', '▁in', '▁mit', ',', '▁und', '▁auf', '▁ein', '▁Mann', '▁einer', '▁Eine', 'n', '▁der', '▁Frau']


In [7]:
# the variables UNK, BOS, EOS, and PAD are assigned integer values: 0, 1, 2, and 3, respectively. These values likely represent special tokens used in natural language processing tasks, such as machine translation or language modeling. Here's what these tokens typically stand for:

# UNK: Short for "Unknown," this token is often used to represent words that are not found in the vocabulary during tokenization or encoding. When a word in the input text is not present in the model's vocabulary, it gets replaced with the UNK token.
# BOS: Stands for "Beginning of Sentence." It marks the beginning of a sentence in a sequence. This token is often used in sequence-to-sequence models, especially in tasks like machine translation, where it indicates the start of the source or target sentence.
# EOS: Stands for "End of Sentence." It marks the end of a sentence in a sequence. Similar to BOS, it's commonly used in sequence-to-sequence models to denote the end of the source or target sentence.
# PAD: Stands for "Padding." This token is used to pad sequences to a fixed length during training or inference. It ensures that all sequences in a batch have the same length, which is necessary for efficient processing in deep learning models, particularly in tasks like sequence classification or language modeling.
# By assigning integer values to these special tokens, it becomes easier to manipulate and incorporate them into the tokenization and encoding processes within NLP models. These tokens play a crucial role in handling out-of-vocabulary words, marking sentence boundaries, and ensuring consistent sequence lengths across batches.

UNK,BOS,EOS,PAD = 0,1,2,3

In [10]:
train_iter = Multi30k(split='train',language_pair=(SRC,TRG))                      #This line initializes a data iterator for the Multi30k dataset, specifying that it should use the training split and the language pair defined by SRC (German) and TRG (English).
valid_iter = Multi30k(split='valid',language_pair=(SRC,TRG))

# The train_iter iterator is iterated over, and for each pair of sentences (x, y) in the iterator, the rstrip('\n') method is applied to remove trailing newline characters. Additionally, it checks if x is not an empty string (if x!=''), ensuring that only non-empty examples are included in the training set.
train_set = [(x.rstrip('\n'),y.rstrip('\n')) for x,y in train_iter if x!='']
valid_set = [(x.rstrip('\n'),y.rstrip('\n')) for x,y in valid_iter if x!='']

print(len(train_set))
print(len(valid_set))

for i in range(10):
  print(train_set[i])

29000
1014
('Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.', 'Two young, White males are outside near many bushes.')
('Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.', 'Several men in hard hats are operating a giant pulley system.')
('Ein kleines Mädchen klettert in ein Spielhaus aus Holz.', 'A little girl climbing into a wooden playhouse.')
('Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster.', 'A man in a blue shirt is standing on a ladder cleaning a window.')
('Zwei Männer stehen am Herd und bereiten Essen zu.', 'Two men are at the stove preparing food.')
('Ein Mann in grün hält eine Gitarre, während der andere Mann sein Hemd ansieht.', 'A man in green holds a guitar while the other man observes his shirt.')
('Ein Mann lächelt einen ausgestopften Löwen an.', 'A man is smiling at a stuffed lion')
('Ein schickes Mädchen spricht mit dem Handy während sie langsam die Straße entlangschwebt.', 'A trendy girl talking on her cell

In [11]:
# This line sets the maximum sequence length (sentence) to 50 tokens. It's common in machine translation tasks to limit the length of input and output sequences to facilitate model training and inference.
max_seq_len = 50
# This function tokenize_dataset takes a dataset (presumably a list of tuples where each tuple contains a source text and its corresponding target text) as input and tokenizes each example in the dataset. Here's what it does:

# For each (src_text, trg_text) tuple in the dataset:
# It tokenizes the source text (src_text) and target text (trg_text) using the tokenizers associated with the source and target languages (SRC and TRG, respectively).
# It limits the tokenized sequences to max_seq_len - 2 tokens to leave room for the special tokens BOS (Beginning of Sentence) and EOS (End of Sentence).
# It adds the BOS token at the beginning and the EOS token at the end of both the source and target sequences.
# It converts the tokenized sequences into PyTorch tensors >>>> el tensores f pytorch zy el array f numpy
def tokenize_dataset(dataset):
  return[(torch.tensor([BOS]+tokenizers[SRC](src_text)[0:max_seq_len-2]+[EOS]),
          torch.tensor([BOS]+tokenizers[TRG](trg_text)[0:max_seq_len-2]+[EOS]))
            for src_text,trg_text in dataset]


train_tokenized = tokenize_dataset(train_set)
valid_tokenized = tokenize_dataset(valid_set)

In [12]:
# class  TranslationDataset, which inherits from PyTorch's Dataset class. This custom dataset class will be used to encapsulate the translation data.
# The __init__ method is the constructor of the class. It initializes a new instance of the TranslationDataset class with the provided data. The data parameter is expected to be a list-like object containing translation data, such as tokenized sentences.
# The __len__ method is a special method used to determine the length of the dataset. It returns the total number of examples in the dataset by returning the length of the data attribute.

class TraslationDataset(Dataset):
  def __init__(self,data):
    self.data = data
  def __len__(self):
    return len(self.data)
 # if i needed to get an sentence with it's index
  def __getitem__(self, idx):
    return self.data[idx]

In [13]:
def pad_sequence(batch):
  # These lines extract the source sequences (src_seqs) and target sequences (trg_seqs) from the batch of translation data. Each element in batch is a tuple containing a source sequence (src) and its corresponding target sequence (trg).
  src_seqs = [src for src,trg in batch]
  trg_seqs = [trg for src,trg in batch]
  # output tensors should have the batch dimension as the first dimension. The padding_value=PAD argument specifies the value to be used for padding, where PAD is likely a predefined integer value representing a padding token.
  src_padded = torch.nn.utils.rnn.pad_sequence(src_seqs,batch_first=True,padding_value=PAD)
  trg_padded = torch.nn.utils.rnn.pad_sequence(trg_seqs,batch_first=True,padding_value=PAD)

  return src_padded,trg_padded

# Overall, this function takes a batch of translation data, pads the source and target sequences to ensure uniform lengths within the batch, and returns the padded sequences as tensors. It's a common preprocessing step used when working with sequence-to-sequence models, such as those used in machine translation tasks.

In [14]:
batch_size = 128
# class responsible for creating data loaders for training and validation sets ,,, allowing for efficient batch processing during model training and evaluation.
class Dataloaders:
  def __init__(self):
    self.train_dataset = TraslationDataset(train_tokenized)
    self.valid_dataset = TraslationDataset(valid_tokenized)
    # collate_fn=pad_sequence helps handle variable-length sequences in NLP tasks, ensuring that batches of data fed into the model have consistent dimensions, which is necessary for proper training and inference.
    self.train_loader = torch.utils.data.DataLoader(self.train_dataset,batch_size=batch_size,shuffle=True,collate_fn=pad_sequence)
    self.valid_loader = torch.utils.data.DataLoader(self.valid_dataset,batch_size=batch_size,shuffle=True,collate_fn=pad_sequence)

In [15]:
# note that we have 8 Multi-head Attention       >>>>>>>>>   if we have 10 words > (10*512  to 512*64) >> 10*64       >>>    8*64 = 512
# suppose d_k >> 64 ,, d_embed >> 512 , h =8   ,, 10  no of words
class MultiHeadAttention(nn.Module):

    def __init__(self,h,d_embed,dropout=0):
        super().__init__()
        assert d_embed % h == 0                              #This statement ensures that the embedding dimension is divisible by the number of attention heads. This is necessary for later splitting the embedding into multiple heads.
        self.d_k = d_embed // h
        self.h = h
        self.d_embed = d_embed
        self.WQ = nn.Linear(d_embed, d_embed)                           #. It takes input of size d_embed and outputs a tensor of the same size d_embed.
        self.WK = nn.Linear(d_embed, d_embed)
        self.WV = nn.Linear(d_embed, d_embed)
        self.linear = nn.Linear(d_embed,d_embed)                     #This linear layer is used as the final linear transformation after combining the outputs of the attention heads. It takes input of size d_embed and outputs a tensor of the same size d_embed.
        self.dropout = nn.Dropout(dropout)


    def forward(self, x_query, x_key, x_value, mask=None):
        nbatch = x_query.size(0) # get batch size
        # 1) Linear projections to get the multi-head query, key and value tensors
        # x_query, x_key, x_value dimension: nbatch * seq_len * d_embed
        # LHS query, key, value dimensions: nbatch * h * seq_len * d_k
        query = self.WQ(x_query).view(nbatch, -1, self.h, self.d_k).transpose(1,2)
        key   = self.WK(x_key).view(nbatch, -1, self.h, self.d_k).transpose(1,2)
        value = self.WV(x_value).view(nbatch, -1, self.h, self.d_k).transpose(1,2)
        # 2) Attention
        # scores has dimensions: nbatch * h * seq_len * seq_len
        scores = torch.matmul(query, key.transpose(-2, -1))/math.sqrt(self.d_k)
        # 3) Mask out padding tokens and future tokens
        if mask is not None:
            scores = scores.masked_fill(mask, float('-inf'))
        # p_atten dimensions: nbatch * h * seq_len * seq_len
        p_atten = torch.nn.functional.softmax(scores, dim=-1) # attention filter
        p_atten = self.dropout(p_atten)
        # x dimensions: nbatch * h * seq_len * d_k
        x = torch.matmul(p_atten, value)  # filtered values
        # x now has dimensions:nbatch * seq_len * d_embed
        x = x.transpose(1, 2).contiguous().view(nbatch, -1, self.d_embed)
        return self.linear(x) # final linear layer                                                                #In the context of the forward method of multi-head attention or similar operations, the .contiguous() method is often used after reshaping or transposing operations to ensure that the tensor is contiguous before further processing. This helps in avoiding potential issues related to non-contiguous memory layout and ensures efficient computation.

In [None]:
class ResidualConnection(nn.Module):
    def __init__(self, dim, dropout):
        super().__init__()
        self.drop = nn.Dropout(dropout)
        #  Layer normalization helps in stabilizing the distribution of activations across different features. This can lead to faster training and improved convergence properties by reducing the internal covariate shift problem.
        self.norm = nn.LayerNorm(dim)  # (x-M)/std

    def forward(self, x, sublayer):                                   #The sublayer function (e.g., attention, feed-forward) to be applied within the residual connection.
        return x + self.drop(sublayer(self.norm(x)))
#Explaination
# Layer normalization (self.norm(x)): The input tensor is normalized along the feature dimension.
# Sublayer application (sublayer(...)): The normalized tensor is passed through the specified sublayer function (e.g., attention mechanism or feed-forward network).
# Dropout (self.drop(...)): Dropout is applied to the output of the sublayer.
# Residual connection (x + ...): The original input tensor x is added to the output of the dropout layer.


In [None]:
class Encoder(nn.Module):
    '''Encoder = token embedding + positional embedding -> a stack of N EncoderBlock -> layer norm'''
    # config provides a convenient way to pass various configuration settings to the encoder module, allowing for flexibility and easy experimentation with different hyperparameters and settings.
    def __init__(self, config):
        super().__init__()
        self.d_embed = config.d_embed  # 512
        self.tok_embed = nn.Embedding(config.encoder_vocab_size, config.d_embed) # Vocab Dictionary size , Embed size  ,,, layer converts input tokens into continuous vector representations. It's essentially a lookup table where each token is represented by a vector of config.d_embed dimensions.
        self.pos_embed = nn.Parameter(torch.zeros(1, config.max_seq_len, config.d_embed))          #In the context of the pos_embed tensor, which has shape (1, max_seq_len, d_embed), this slicing operation is used to ensure that the positional embeddings have the same length as the input sequence.
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config.N_encoder)])
        self.dropout = nn.Dropout(config.dropout)
        self.norm = nn.LayerNorm(config.d_embed)

    def forward(self, input, mask=None):
        x = self.tok_embed(input) # Vectors
        x_pos = self.pos_embed[:, :x.size(1), :]  # Vectors'
        x = self.dropout(x + x_pos) # update vectors with position information
        for layer in self.encoder_blocks:
            x = layer(x, mask) # (50,512)
        return self.norm(x)


In [None]:
class EncoderBlock(nn.Module):
    '''EncoderBlock: self-attention -> position-wise fully connected feed-forward layer'''
    def __init__(self, config):
        super(EncoderBlock, self).__init__()
        self.atten = MultiHeadAttention(config.h, config.d_embed, config.dropout)
        self.feed_forward = nn.Sequential(
            nn.Linear(config.d_embed, config.d_ff),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(config.d_ff, config.d_embed)
        )
        self.residual1 = ResidualConnection(config.d_embed, config.dropout)
        self.residual2 = ResidualConnection(config.d_embed, config.dropout)

    def forward(self, x, mask=None):
        # self-attention
        x = self.residual1(x, lambda x: self.atten(x, x, x, mask=mask))
        # position-wise fully connected feed-forward layer
        return self.residual2(x, self.feed_forward)


In [None]:
class Decoder(nn.Module):
    '''Decoder = token embedding + positional embedding -> a stack of N DecoderBlock -> fully-connected layer'''
    def __init__(self, config):
        super().__init__()
        self.d_embed = config.d_embed
        self.tok_embed = nn.Embedding(config.decoder_vocab_size, config.d_embed)
        self.pos_embed = nn.Parameter(torch.zeros(1, config.max_seq_len, config.d_embed))
        self.dropout = nn.Dropout(config.dropout)
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config.N_decoder)])
        self.norm = nn.LayerNorm(config.d_embed)
        self.linear = nn.Linear(config.d_embed, config.decoder_vocab_size)


# this function generates a mask that can be used to prevent the decoder from attending to tokens at positions beyond the current position in the sequence during self-attention. This is important for ensuring that the model only attends to tokens that have already been generated during training.
    def future_mask(self, seq_len):
        '''mask out tokens at future positions'''
        mask = (torch.triu(torch.ones(seq_len, seq_len, requires_grad=False), diagonal=1)!=0).to(DEVICE)
        return mask.view(1, 1, seq_len, seq_len)


    def forward(self, memory, src_mask, trg, trg_pad_mask):
        seq_len = trg.size(1)
        trg_mask = torch.logical_or(trg_pad_mask, self.future_mask(seq_len))
        x = self.tok_embed(trg) + self.pos_embed[:, :trg.size(1), :]
        x = self.dropout(x)
        for layer in self.decoder_blocks:
            x = layer(memory, src_mask, x, trg_mask)
        x = self.norm(x)
        logits = self.linear(x)
        return logits



In [None]:
class DecoderBlock(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.atten1 = MultiHeadAttention(config.h,config.d_embed,config.dropout)
        self.atten2 = MultiHeadAttention(config.h,config.d_embed,config.dropout)

        self.feed_forward = nn.Sequential(
                                    nn.Linear(config.d_embed , config.d_ff),                                #d_ff >> feed forward
                                    nn.ReLU(),
                                    nn.Dropout(config.dropout),
                                    nn.Linear(config.d_ff,config.d_embed)
        )
        self.residuals = nn.ModuleList([ResidualConnection(config.d_embed,config.dropout) for i in range(3)])



    def forward(self,memory,src_mask,trg,trg_mask):
        x = memory
        y = trg
        y = self.residuals[0](y,lambda y :self.atten1(y,y,y, mask = trg_mask))
        y = self.residuals[1](y,lambda y :self.atten1(y,x,x, mask =src_mask))
        return self.residuals[2](y,self.feed_forward)

In [None]:
class Transformer(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, src_mask, trg, trg_pad_mask):
        return self.decoder(self.encoder(src, src_mask), src_mask, trg, trg_pad_mask)

In [None]:
@dataclass
class ModelConfig:
    encoder_vocab_size: int
    decoder_vocab_size: int
    d_embed: int
    # d_ff is the dimension of the fully-connected  feed-forward layer
    d_ff: int
    # h is the number of attention head
    h: int
    N_encoder: int
    N_decoder: int
    max_seq_len: int
    dropout: float

In [None]:
def make_model(config):
    model = Transformer(Encoder(config),Decoder(config)).to(DEVICE)
    for p in model.parameters():
        if p.dim()>1:
            nn.init.xavier_uniform_(p)                            #Xavier uniform initialization is a common technique for initializing the weights of neural network layers to ensure stable training.
    return model

In [None]:
def make_batch_input(x, y):
        src = x.to(DEVICE)
        trg_in = y[:, :-1].to(DEVICE)
        trg_out = y[:, 1:].contiguous().view(-1).to(DEVICE)
        src_pad_mask = (src == PAD).view(src.size(0), 1, 1, src.size(-1))
        trg_pad_mask = (trg_in == PAD).view(trg_in.size(0), 1, 1, trg_in.size(-1))
        return src, trg_in, trg_out, src_pad_mask, trg_pad_mask

In [None]:
def train_epoch(model, dataloaders):
    model.train()
    grad_norm_clip = 1.0
    losses, acc, count = [], 0, 0
    num_batches = len(dataloaders.train_loader)
    pbar = tqdm(enumerate(dataloaders.train_loader), total=num_batches)
    for idx, (x, y)  in  pbar:
        optimizer.zero_grad()
        src, trg_in, trg_out, src_pad_mask, trg_pad_mask = make_batch_input(x,y)
        pred = model(src, src_pad_mask, trg_in, trg_pad_mask).to(DEVICE)
        pred = pred.view(-1, pred.size(-1))
        loss = loss_fn(pred, trg_out).to(DEVICE)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm_clip)
        optimizer.step()
        scheduler.step()
        losses.append(loss.item())
        # report progress
        if idx>0 and idx%50 == 0:
            pbar.set_description(f'train loss={loss.item():.3f}, lr={scheduler.get_last_lr()[0]:.5f}')
    return np.mean(losses)


In [None]:
def train(model, dataloaders, epochs):
    global early_stop_count
    best_valid_loss = float('inf')
    train_size = len(dataloaders.train_loader)*batch_size
    for ep in range(epochs):
        train_loss = train_epoch(model, dataloaders)
        valid_loss = validate(model, dataloaders.valid_loader)

        print(f'ep: {ep}: train_loss={train_loss:.5f}, valid_loss={valid_loss:.5f}')
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
        else:
            if scheduler.last_epoch>2*warmup_steps:
                early_stop_count -= 1
                if early_stop_count<=0:
                    return train_loss, valid_loss
    return train_loss, valid_loss


def validate(model, dataloder):
    'compute the validation loss'
    model.eval()
    losses = []
    with torch.no_grad():
        for i, (x, y) in enumerate(dataloder):
            src, trg_in, trg_out, src_pad_mask, trg_pad_mask = make_batch_input(x,y)
            pred = model(src, src_pad_mask, trg_in, trg_pad_mask).to(DEVICE)
            pred = pred.view(-1, pred.size(-1))
            losses.append(loss_fn(pred, trg_out).item())
    return np.mean(losses)

In [None]:
def translate(model, x):
    'translate source sentences into the target language, without looking at the answer'
    with torch.no_grad():
        dB = x.size(0)
        y = torch.tensor([[BOS]*dB]).view(dB, 1).to(DEVICE)
        x_pad_mask = (x == PAD).view(x.size(0), 1, 1, x.size(-1)).to(DEVICE)
        memory = model.encoder(x, x_pad_mask)
        for i in range(max_seq_len):
            y_pad_mask = (y == PAD).view(y.size(0), 1, 1, y.size(-1)).to(DEVICE)
            logits = model.decoder(memory, x_pad_mask, y, y_pad_mask)
            last_output = logits.argmax(-1)[:, -1]
            last_output = last_output.view(dB, 1)
            y = torch.cat((y, last_output), 1).to(DEVICE)
    return y

def remove_pad(sent):
    if sent.count(EOS) > 0:
        sent = sent[0:sent.index(EOS) + 1]
    while sent and sent[-1] == PAD:
        sent = sent[:-1]
    return sent


def decode_sentence(detokenizer,sentence_ids):
    if not isinstance(sentence_ids,list):
        sentence_ids = sentence_ids.tolist()
    sentence_ids = remove_pad(sentence_ids)
    return detokenizer(sentence_ids).replace("<bos>","").replace("<eos>","").strip().replace(" .",".")

In [None]:
def evaluate(model, dataloader, num_batch=None):
    'evaluate the model, and compute the BLEU score'
    model.eval()
    refs, cans, bleus = [], [], []
    with torch.no_grad():
        for idx, (x, y) in enumerate(dataloader):
            src, trg_in, trg_out, src_pad_mask, trg_pad_mask = make_batch_input(x,y)
            translation = translate(model, src)
            trg_out = trg_out.view(x.size(0), -1)
            refs = refs + [decode_sentence(detokenizers[TRG], trg_out[i]) for i in range(len(src))]
            cans = cans + [decode_sentence(detokenizers[TRG], translation[i]) for i in range(len(src))]
            if num_batch and idx>=num_batch:
                break
        print(min([len(x) for x in refs]))
        bleus.append(sacrebleu.corpus_bleu(cans, [refs]).score)
        # print some examples
        for i in range(3):
            print(f'src:  {decode_sentence(detokenizers[SRC], src[i])}')
            print(f'trg:  {decode_sentence(detokenizers[TRG], trg_out[i])}')
            print(f'pred: {decode_sentence(detokenizers[TRG], translation[i])}')
        return np.mean(bleus)

In [None]:
config = ModelConfig(encoder_vocab_size = vocab_sizes[SRC],
                     decoder_vocab_size=vocab_sizes[TRG],
                     d_embed=512,
                     d_ff=512,
                     h=8,
                     N_encoder=3,  # 6
                     N_decoder=3,  # 6
                     max_seq_len=max_seq_len, #50
                     dropout=0.1
                     )

data_loaders = Dataloaders()
train_size = len(data_loaders.train_loader)*batch_size
model = make_model(config)
model_size = sum([p.numel() for p in model.parameters()])
print(f'model_size: {model_size}, train_set_size: {train_size}')

warmup_steps = 3*len(data_loaders.train_loader)
# lr first increases in the warmup steps, and then descreases
lr_fn = lambda step: config.d_embed**(-0.5) * min([(step+1)**(-0.5), (step+1)*warmup_steps**(-1.5)])
optimizer = torch.optim.Adam(model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_fn)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD)
early_stop_count = 2
train_loss, valid_loss = train(model, data_loaders, epochs=10)

print("train set examples:")
train_bleu = evaluate(model, data_loaders.train_loader, 20)
print("validation set examples:")
valid_bleu = evaluate(model, data_loaders.valid_loader)

model_size: 26201096, train_set_size: 29056


train loss=4.001, lr=0.00025: 100%|██████████| 227/227 [00:35<00:00,  6.47it/s]


ep: 0: train_loss=5.62009, valid_loss=3.82575


train loss=2.733, lr=0.00053: 100%|██████████| 227/227 [00:35<00:00,  6.47it/s]


ep: 1: train_loss=3.30335, valid_loss=2.76759


train loss=2.245, lr=0.00082: 100%|██████████| 227/227 [00:34<00:00,  6.49it/s]


ep: 2: train_loss=2.38205, valid_loss=2.18586


train loss=1.664, lr=0.00074: 100%|██████████| 227/227 [00:35<00:00,  6.45it/s]


ep: 3: train_loss=1.84586, valid_loss=1.91751


train loss=1.611, lr=0.00066: 100%|██████████| 227/227 [00:34<00:00,  6.50it/s]


ep: 4: train_loss=1.47627, valid_loss=1.80372


train loss=1.314, lr=0.00060: 100%|██████████| 227/227 [00:34<00:00,  6.50it/s]


ep: 5: train_loss=1.22688, valid_loss=1.78247


train loss=1.124, lr=0.00056: 100%|██████████| 227/227 [00:35<00:00,  6.46it/s]


ep: 6: train_loss=1.04471, valid_loss=1.78388


train loss=1.080, lr=0.00052: 100%|██████████| 227/227 [00:34<00:00,  6.51it/s]


ep: 7: train_loss=0.90393, valid_loss=1.79571
train set examples:
22
src:  Ein junges Mädchen ist im Freien und guckt sich Kleider an.
trg:  A young girl is outdoors looks at gowns.
pred: A young girl is outside looking at clothes.
src:  Der weiß-braune Hund schüttelt seine Ohren.
trg:  The white and brown dog is shaking its ears.
pred: The white and brown dog is shaking its ears.
src:  Ein schickes Mädchen spricht mit dem Handy während sie langsam die Straße entlangschwebt.
trg:  A trendy girl talking on her cellphone while gliding slowly down the street.
pred: A fancy girl talks on her cellphone while gliding slowly down the street.
validation set examples:
20
src:  Eine Person in einem roten Langarmshirt liegt auf sehr ungewöhnliche Art auf einer Mauer vor einem Laternenpfahl.
trg:  A person wearing a red long-sleeved shirt is lying down on a wall in front of a lamp post in a very unusual manner.
pred: A person with a red long-hair is laying on a wall in front of some kinds of odd o

In [None]:
def translate_this_sentence(text: str):
    'translate the source sentence in string formate into target language'
    input = torch.tensor([[BOS] + tokenizers[SRC](text) + [EOS]]).to(DEVICE)
    output = translate(model, input)
    return decode_sentence(detokenizers[TRG], output[0])


In [None]:
translate_this_sentence("Eine Gruppe von Menschen steht vor einem Iglu.")

'A group of people standing in front of an igloo'

In [None]:
translate_this_sentence("Wie geht es dir ")

'Enen walks down just directd by.'