
* The WikiText2 dataset contains ``wiki.train.tokens``, ``wiki.valid.tokens``, and ``wiki.test.tokens``. No processing is needed other than replacing newlines with ``<eos>`` tokens.

In [None]:
import os
import time
from pathlib import Path
import gc
import struct

import math
import copy
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [None]:
def seed_everything(val):
    """Seed every random number generator used in this file with ``val``.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU, the current
    CUDA device, and all CUDA devices), stores ``val`` in the
    ``PYTHONHASHSEED`` environment variable, and switches cuDNN to
    deterministic mode. Prints a confirmation message when done.
    """
    random.seed(val)
    os.environ['PYTHONHASHSEED'] = str(val)

    # The remaining libraries all take the same integer seed.
    for seeder in (np.random.seed,
                   torch.manual_seed,
                   torch.cuda.manual_seed,
                   torch.cuda.manual_seed_all):
        seeder(val)

    torch.backends.cudnn.deterministic = True
    print('Manual seed changed successfully.')

""" DO NOT CHANGE THE SEED! """
# Fixed seed, applied once when this cell/module runs, so that every random
# generator seeded by seed_everything starts from the same state on each run.
seed = 42
seed_everything(seed)

## Corpus loader

In [None]:
class Multi_Corpus_loader:
    """Load the train/test/valid files and build one dictionary shared by all of them.

    The dictionary is built from the words of every loaded file, so words
    that appear only in the test or validation split still get an id.
    This loader is not used in this model.

    Args:
        directory: Directory that holds the dataset files.
        files: Iterable of file names. A file is assigned to a split when its
            name contains ``'train'``, ``'test'`` or ``'valid'``.
        max_seq: Length of the sequences each split is cut into.

    Attributes set by ``__init__``:
        train, test, valid: raw text of each split with newlines replaced by ``<eos>``.
        tokens: sorted unique tokens over all loaded splits (``np.unique`` output).
        dictionary: token -> integer id mapping.
        flat_train / flat_test / flat_valid: 1-D ``torch.long`` id tensors.
        sequences_train / sequences_test / sequences_valid: 2-D tensors of
            shape ``(num_sequences, max_seq)``.
    """

    # Substrings of a file name that identify the split it belongs to.
    _SPLITS = ('train', 'test', 'valid')

    def __init__(self, directory, files, max_seq):
        self.load_texts(directory, files)

        self.dictionary = {}
        self.tokenize()

        self.flat_train = self.FlatTensor(self.train)
        self.flat_test = self.FlatTensor(self.test)
        self.flat_valid = self.FlatTensor(self.valid)

        self.sequences_train = self.sequences_splitter(self.flat_train, max_seq)
        self.sequences_test = self.sequences_splitter(self.flat_test, max_seq)
        self.sequences_valid = self.sequences_splitter(self.flat_valid, max_seq)

    def load_texts(self, directory, files):
        """Read every file, store its text on the matching split attribute, and collect the vocabulary."""
        tokens = []
        for file in files:
            # The context manager closes the file; no explicit close() is needed.
            with open(os.path.join(directory, file), 'r', encoding="utf8") as f:
                text = f.read().replace('\n', '<eos>')

            matched = False
            for split in self._SPLITS:
                if split in file:
                    setattr(self, split, text)
                    matched = True

            # Each split contributes its OWN words to the shared vocabulary
            # (previously the train text was added again for test and valid).
            if matched:
                tokens += text.strip().split()

        self.tokens = np.unique(tokens)

    def tokenize(self):
        """Assign a unique integer id to every token not yet in the dictionary."""
        for token in self.tokens:
            if token not in self.dictionary:
                self.dictionary[token] = len(self.dictionary)

    def FlatTensor(self, text):
        """Return the whitespace-tokenized ``text`` as a 1-D ``torch.long`` tensor of ids.

        Raises:
            KeyError: if a token is missing from the dictionary.
        """
        tokens = text.strip().split()
        return torch.tensor([self.dictionary[token] for token in tokens], dtype=torch.long)

    def sequences_splitter(self, flat_tensor, sequence_length=64):
        """Cut ``flat_tensor`` into rows of ``sequence_length`` ids, dropping the leftover tail.

        Returns a new tensor of shape ``(num_sequences, sequence_length)``;
        the result is empty (zero rows) when the input is shorter than one
        sequence, instead of failing on an empty ``torch.stack``.
        """
        num_sequences = len(flat_tensor) // sequence_length
        usable = flat_tensor[:num_sequences * sequence_length]
        # clone() keeps the result independent of flat_tensor, as stack() did.
        return usable.reshape(num_sequences, sequence_length).clone()

In [None]:
class Corpus_loader:
    """Load one corpus file, map its tokens to ids, and cut it into fixed-length sequences.

    Args:
        directory: Directory path for the dataset.
        file: Dataset file name inside ``directory``.
        max_seq: Length of every produced sequence.
        dictionary: Optional existing token -> id mapping. When it is given
            and non-empty it is reused as-is and unknown tokens are mapped to
            ``'<unk>'``; otherwise a new mapping is built from this file
            (an empty dict passed in is filled in place).

    Attributes:
        text: Raw file content with every newline replaced by ``<eos>``.
        dictionary: Token -> integer id mapping.
        flat_tensor: 1-D ``torch.long`` tensor with the id of every token.
        sequences: 2-D tensor of shape ``(num_sequences, max_seq)``.
    """

    def __init__(self, directory, file, max_seq, dictionary=None):
        # Identity check: None means "no dictionary supplied".
        self.dictionary = {} if dictionary is None else dictionary
        self.sequences = []

        self.load_raw_text(directory, file)
        self.tokenize()
        self.sequences_splitter(max_seq)

    def load_raw_text(self, directory, file):
        """Read the file into ``self.text``, replacing each newline with ``<eos>``."""
        # The context manager closes the file on exit.
        with open(os.path.join(directory, file), 'r', encoding="utf8") as f:
            self.text = f.read().replace('\n', '<eos>')

    def tokenize(self):
        """Split ``self.text`` on whitespace and store the token ids in ``self.flat_tensor``.

        Raises:
            KeyError: if an existing dictionary is used, a token is unknown,
                and the dictionary has no ``'<unk>'`` entry.
        """
        tokens = self.text.strip().split()

        if len(self.dictionary) > 0:
            # Existing dictionary: words it does not know become the unknown token.
            tokens = [token if token in self.dictionary else '<unk>' for token in tokens]
        else:
            # New dictionary: ids are assigned in order of first appearance.
            for token in tokens:
                if token not in self.dictionary:
                    self.dictionary[token] = len(self.dictionary)

        self.flat_tensor = torch.tensor([self.dictionary[token] for token in tokens], dtype=torch.long)

    def sequences_splitter(self, sequence_length=64):
        """Cut ``self.flat_tensor`` into rows of ``sequence_length`` ids, dropping the leftover tail.

        The result is stored in ``self.sequences``. A text shorter than one
        sequence yields an empty ``(0, sequence_length)`` tensor instead of
        failing on an empty ``torch.stack``.
        """
        num_sequences = self.flat_tensor.size(0) // sequence_length
        usable = self.flat_tensor[:num_sequences * sequence_length]
        # clone() keeps the sequences independent of flat_tensor, as stack() did.
        self.sequences = usable.reshape(num_sequences, sequence_length).clone()

## Embedding & positional encoding

In [None]:
class Embed_dict(nn.Module):
    """Embedding lookup table for the corpus dictionary.

    Args:
        dictionary_size: Number of entries in the dictionary (rows of the table).
        d_model: Size of each embedding vector.

    ``forward`` returns the embedding vectors of the token ids in ``x``.
    """

    def __init__(self, dictionary_size, d_model):
        super().__init__()
        # One learnable d_model-sized vector per dictionary entry.
        self.embed = nn.Embedding(num_embeddings=dictionary_size, embedding_dim=d_model)

    def forward(self, x):
        embedded = self.embed(x)
        return embedded

In [None]:
class Positional_Embedding(nn.Module):
    """Add fixed sinusoidal positional encodings to an embedding, then apply dropout.

    Args:
        d_model: Size of the embedding vectors.
        max_len: Number of positions the encoding table is built for.
        dropout: Dropout probability applied after the addition.

    The table is registered as the buffer ``pos_embedding`` so it moves with
    the module (``.to(device)``) but is not a trainable parameter.
    """

    def __init__(self, d_model, max_len=64, dropout=0.1):
        super().__init__()
        # Dropout layer
        self.dropout = nn.Dropout(p=dropout)

        # Constant positional table of shape (max_len, d_model).
        pos_embedding = self.positional_encoding(d_model, max_len)
        self.register_buffer('pos_embedding', pos_embedding)

    @staticmethod  # Callable without an instance: Positional_Embedding.positional_encoding(d_model, seq_length)
    def positional_encoding(d_model, seq_length):
        """Return the ``(seq_length, d_model)`` sinusoidal encoding table.

        For position ``pos`` and pair index ``i`` in ``range(d_model // 2)``::

            table[pos, 2*i]     = sin(pos / 10000 ** (2*i / d_model))
            table[pos, 2*i + 1] = cos(pos / 10000 ** (2*i / d_model))

        When ``d_model`` is odd the last column stays zero.
        """
        pos_encoding = torch.zeros(seq_length, d_model)
        num_pairs = d_model // 2

        # One divisor per (sin, cos) column pair, computed with Python floats
        # exactly as in the formula above.
        divisors = torch.tensor([10000 ** (2 * i / d_model) for i in range(num_pairs)],
                                dtype=torch.float64)
        positions = torch.arange(seq_length, dtype=torch.float64).unsqueeze(1)

        # Broadcast (seq_length, 1) / (num_pairs,) -> (seq_length, num_pairs);
        # this replaces the per-element Python loop with whole-tensor operations.
        angles = (positions / divisors).to(pos_encoding.dtype)

        pos_encoding[:, 0:2 * num_pairs:2] = torch.sin(angles)
        pos_encoding[:, 1:2 * num_pairs:2] = torch.cos(angles)

        return pos_encoding

    def forward(self, x):
        """Add the first ``seq_len`` rows of the table to ``x`` and apply dropout.

        A 2-D input takes its sequence length from dimension 0; any other
        input takes it from dimension 1.
        """
        seq_len = x.size(0) if x.ndim == 2 else x.size(1)
        return self.dropout(x + self.pos_embedding[:seq_len])

## Support functions

In [None]:
def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

In [None]:
def generate_square_subsequent_mask(size):
    """Return a ``(size, size)`` float mask with ``-inf`` strictly above the diagonal and 0 elsewhere."""
    blocked = torch.full((size, size), float('-inf'))
    # Keep only the part above the main diagonal; everything else becomes 0.
    return blocked.triu(diagonal=1)

In [None]:
def sequence_mask(sequence_length, batch_size, index):
    """Return a ``(batch_size, sequence_length)`` boolean mask that is True only at column ``index``.

    Used as the sequence mask for the encoder.
    """
    mask = torch.zeros(batch_size, sequence_length, dtype=torch.bool)
    mask[:, index] = True
    return mask

In [None]:
# Pytorch transformer hyperparameters optimizer https://github.com/dmlc/dgl/blob/master/examples/pytorch/transformer/optims/noamopt.py
class NoamOpt(object):
    """Optimizer wrapper that sets the learning rate from the step count before each update.

    The rate at step ``s`` is::

        factor * model_size ** -0.5 * min(s ** -0.5, s * warmup ** -1.5)

    i.e. it grows linearly for ``warmup`` steps and then decays as ``s ** -0.5``.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        """
        model_size: hidden size
        factor: coefficient
        warmup: warm up steps(step ** (-0.5) == step * warmup ** (-1.5) holds when warmup equals step)
        optimizer: wrapped optimizer; must expose ``param_groups`` and ``step()``
        """
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def rate(self, step=None):
        """Return the learning rate for ``step`` (defaults to the current step count).

        Returns 0.0 for ``step <= 0``: that is the value of the linear warmup
        term there, and evaluating ``0 ** -0.5`` would raise
        ``ZeroDivisionError`` (e.g. when called before the first ``step()``).
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * (
            self.model_size ** (-0.5)
            * min(step ** (-0.5), step * self.warmup ** (-1.5)))

    def step(self):
        """Advance the step count, write the new rate into every param group, and step the optimizer."""
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p["lr"] = rate
        self._rate = rate
        self.optimizer.step()

## Transformer layers & blocks

In [None]:
class EncoderBlock(nn.Module):
    """One encoder block: self-attention, then a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual addition
    and layer normalization.

    Args:
        d_model: Dimensions of the input.
        output_dim: Dimensions of the feed-forward output. It is added to the
            ``d_model``-sized attention result, so the two must be compatible.
        hidden_dim: Dimensions of the inner layer of the feed-forward block.
        num_heads: Number of heads for the multi-head attention (default 8).
        dropout: Dropout probability.
    """

    def __init__(self, d_model, output_dim, hidden_dim, num_heads=8, dropout=0.1):
        super().__init__()

        # Remember the block's dimensions.
        self.input_dim, self.hidden_dim = d_model, hidden_dim
        self.output_dim, self.num_heads = output_dim, num_heads

        # Multi-head self-attention over batch-first inputs.
        self.attention = nn.MultiheadAttention(embed_dim=d_model,
                                               num_heads=num_heads,
                                               dropout=dropout,
                                               batch_first=True)

        # Position-wise feed-forward network: Linear -> ReLU -> Linear.
        ff_layers = [nn.Linear(d_model, hidden_dim),
                     nn.ReLU(),
                     nn.Linear(hidden_dim, output_dim)]
        self.feedforward = nn.Sequential(*ff_layers)

        # Layer norms applied after each residual addition.
        self.attention_norm = nn.LayerNorm(d_model)
        self.output_norm = nn.LayerNorm(output_dim)

        # One dropout per sub-layer.
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)

    def forward(self, input, mask=None):
        """Run the block on ``input``; ``mask`` is passed to attention as ``key_padding_mask``."""
        # Self-attention: the attention weights are discarded.
        attended, _ = self.attention(query=input, key=input, value=input,
                                     key_padding_mask=mask)
        normed_attention = self.attention_norm(input + self.dropout1(attended))

        # Feed-forward sub-layer with its own residual connection.
        ff_out = self.feedforward(normed_attention)
        return self.output_norm(normed_attention + self.dropout2(ff_out))

In [None]:
class DecoderBlock(nn.Module):
    """One decoder block: masked self-attention, encoder attention, feed-forward.

    Each of the three sub-layers is followed by dropout, a residual addition
    and layer normalization.

    Args:
        d_model: Dimensions of the input.
        output_dim: Dimensions of the feed-forward output. It is added to the
            ``d_model``-sized attention result, so the two must be compatible.
        hidden_dim: Dimensions of the inner layer of the feed-forward block.
        num_heads: Number of heads for the multi-head attention (default 8).
        dropout: Dropout probability.
    """

    def __init__(self, d_model, output_dim, hidden_dim, num_heads=8, dropout=0.1):
        super().__init__()

        # Remember the block's dimensions.
        self.input_dim, self.hidden_dim = d_model, hidden_dim
        self.output_dim, self.num_heads = output_dim, num_heads

        # Two attention layers: one over the decoder input, one over the encoder output.
        self.masked_attention = nn.MultiheadAttention(embed_dim=d_model,
                                                      num_heads=num_heads,
                                                      dropout=dropout,
                                                      batch_first=True)
        self.encoder_attention = nn.MultiheadAttention(embed_dim=d_model,
                                                       num_heads=num_heads,
                                                       dropout=dropout,
                                                       batch_first=True)

        # Position-wise feed-forward network: Linear -> ReLU -> Linear.
        ff_layers = [nn.Linear(d_model, hidden_dim),
                     nn.ReLU(),
                     nn.Linear(hidden_dim, output_dim)]
        self.feedforward = nn.Sequential(*ff_layers)

        # Layer norms applied after each residual addition.
        self.masked_attention_norm = nn.LayerNorm(d_model)
        self.encoder_attention_norm = nn.LayerNorm(d_model)
        self.output_norm = nn.LayerNorm(output_dim)

        # One dropout per sub-layer.
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)
        self.dropout3 = nn.Dropout(p=dropout)

    def forward(self, input, encoder_input, sequence_mask, target_mask):
        """Run the block.

        Args:
            input: Decoder-side tensor fed to the masked self-attention.
            encoder_input: Encoder output used as key and value of the second attention.
            sequence_mask: Accepted for interface symmetry but not used by this block.
                NOTE(review): confirm whether it was meant to mask the encoder attention.
            target_mask: Passed to the self-attention as ``attn_mask``.
        """
        # Masked self-attention over the decoder input (weights discarded).
        self_attended, _ = self.masked_attention(query=input, key=input, value=input,
                                                 attn_mask=target_mask)
        self_normed = self.masked_attention_norm(input + self.dropout1(self_attended))

        # Attention with the decoder state as query and the encoder output as key/value.
        cross_attended, _ = self.encoder_attention(query=self_normed,
                                                   key=encoder_input,
                                                   value=encoder_input)
        cross_normed = self.encoder_attention_norm(self_normed + self.dropout2(cross_attended))

        # Feed-forward sub-layer with its own residual connection.
        ff_out = self.feedforward(cross_normed)
        return self.output_norm(cross_normed + self.dropout3(ff_out))

In [None]:
class Encoder(nn.Module):
    """The Transformer encoder: embedding, positional encoding, N chained encoder blocks, final LayerNorm.

    Every block receives the output of the previous block (the first one
    receives the position-encoded embedding), not the original input tensor.

    Args:
        dictionary_size: Dictionary size.
        d_model: Dimensions of the input.
        output_dim: Dimensions of the output of each block's feed-forward part.
        hidden_dim: Dimensions of the inner layer of each block's feed-forward part.
        max_seq: Maximum sequence length.
        N: Number of blocks.
        heads: Number of heads for the multi-head attention blocks.
    """

    def __init__(self, dictionary_size, d_model, output_dim, hidden_dim, max_seq, N, heads):
        super().__init__()
        self.N = N                                               # Number of Encoder blocks
        self.embed = Embed_dict(dictionary_size, d_model)        # Token embedding
        self.pos_embed = Positional_Embedding(d_model, max_seq)  # Positional encoding added to the embedding
        self.norm = nn.LayerNorm(d_model)                        # Final normalization

        # The stack is made of N deep copies of a single prototype block.
        prototype = EncoderBlock(d_model, output_dim, hidden_dim, heads)
        self.layers = get_clones(prototype, N)

    def forward(self, input, sequence_mask):
        """Embed ``input``, run it through every block with ``sequence_mask``, and normalize the result."""
        x = self.pos_embed(self.embed(input))

        # Feed each block with the output of the one before it.
        for block in self.layers:
            x = block(x, sequence_mask)

        return self.norm(x)
  

    
class Decoder(nn.Module):
    """The Transformer decoder: embedding, positional encoding, N chained decoder blocks, final LayerNorm.

    Every block receives the output of the previous block (the first one
    receives the position-encoded embedding), not the original input tensor.

    Args:
        dictionary_size: Dictionary size.
        d_model: Dimensions of the input.
        output_dim: Dimensions of the output of each block's feed-forward part.
        hidden_dim: Dimensions of the inner layer of each block's feed-forward part.
        max_seq: Maximum sequence length.
        N: Number of blocks.
        heads: Number of heads for the multi-head attention blocks.
    """

    def __init__(self, dictionary_size, d_model, output_dim, hidden_dim, max_seq, N, heads):
        super().__init__()
        self.N = N                                               # Number of Decoder blocks
        self.embed = Embed_dict(dictionary_size, d_model)        # Token embedding
        self.pos_embed = Positional_Embedding(d_model, max_seq)  # Positional encoding added to the embedding
        self.norm = nn.LayerNorm(d_model)                        # Final normalization

        # The stack is made of N deep copies of a single prototype block.
        prototype = DecoderBlock(d_model, output_dim, hidden_dim, heads)
        self.layers = get_clones(prototype, N)

    def forward(self, input, encoder_outputs, sequence_mask, target_mask):
        """Embed ``input``, run it through every block together with ``encoder_outputs`` and the masks, and normalize."""
        x = self.pos_embed(self.embed(input))

        # Feed each block with the output of the one before it.
        for block in self.layers:
            x = block(x, encoder_outputs, sequence_mask, target_mask)

        return self.norm(x)

# THE Transformer

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer followed by a linear projection onto the dictionary.

    Unlike the reference architecture there is no softmax layer here, since
    it is included in the loss function.

    Args:
        dictionary_size: Dictionary size (also the size of the output projection).
        d_model: Dimensions of the input.
        output_dim: Dimensions of the output of each block's feed-forward part.
        hidden_dim: Dimensions of the inner layer of each block's feed-forward part.
        max_seq: Maximum sequence length.
        N: Number of blocks in the encoder and in the decoder.
        heads: Number of heads for the multi-head attention blocks.
    """

    def __init__(self, dictionary_size, d_model, output_dim, hidden_dim, max_seq, N, heads):
        super().__init__()
        # Encoder and decoder are built from the same hyperparameters.
        stack_args = (dictionary_size, d_model, output_dim, hidden_dim, max_seq, N, heads)
        self.encoder = Encoder(*stack_args)
        self.decoder = Decoder(*stack_args)
        self.linear = nn.Linear(d_model, dictionary_size)

    def forward(self, src, trg, src_mask, trg_mask):
        """Encode ``src``, decode ``trg`` against the encoding, and return the projected scores."""
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(trg, memory, src_mask, trg_mask)
        return self.linear(decoded)

## Train, Evaluation, Prediction functions

In [None]:
def train(model, train_loader, seq_len, epoch, scheduler, optimizer, criterion, device, chosen_sched):
    '''
    Run one training epoch over ``train_loader`` and log the perplexity periodically.

    Input:
        model               The transformer model
        train_loader        Data loader of the train sequences in batches
        seq_len             Maximum sequence length
        epoch               Number of current epoch
        scheduler           Learning rate scheduler. In the 'Adam' branch it also performs the
                            update (``scheduler.step()``) and owns the optimizer whose gradients
                            are reset (``scheduler.optimizer``)
        optimizer           Optimizer for model's weights (used by the 'SGD' branch)
        Criterion           Loss criterion
        device              Computation device
        chosen_sched        Name of the chosen scheduler: 'SGD' or 'Adam'. Any other value
                            computes the loss without updating the weights

    Output:
        ppl_train_monitor   List of model's training performance - include: ppl, batch number, epoch, learning rate
    '''
    ppl_train_monitor = []
    gc.collect()
    model.train()            # turn on train mode
    total_loss = 0.          # Loss accumulated since the last log line
    batches_since_log = 0    # Number of batches accumulated in total_loss
    log_interval = 200       # Set log interval to measure performance
    start_time = time.time() # Start timer

    for idx, batch in enumerate(train_loader):
        # This deals with last batch, which may include smaller sized sequences
        cor_seq_len = min(seq_len, batch.size()[1])

        data    = batch[:, :-1]   # Decoder input: every token but the last
        targets = batch[:, 1: ]   # Targets: the same tokens shifted by one

        # The masks are built once per batch.
        target_mask = generate_square_subsequent_mask(cor_seq_len-1).to(device) # Generate target mask
        seq_mask = sequence_mask(cor_seq_len, batch.size(0), -1).to(device)     # Generate sequence mask

        # The encoder receives the whole batch, the decoder the shifted input.
        output  = model(batch, data, seq_mask, target_mask)
        loss    = criterion(output.view(-1, output.size(-1)), targets.contiguous().view(-1))

        # back propagation
        if chosen_sched == 'SGD':
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)         # Prevent exploding gradients
            optimizer.step()

        elif chosen_sched == 'Adam':
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)         # Prevent exploding gradients
            scheduler.step()
            scheduler.optimizer.zero_grad()

        # Count total loss
        total_loss += loss.item()
        batches_since_log += 1

        # See performance throughout the model's training
        if idx % log_interval == 0 and idx > 0:
            # Torch schedulers expose get_last_lr(); wrappers without it
            # (e.g. NoamOpt) keep the current rate in _rate.
            try:
                lr = scheduler.get_last_lr()[0]                             # read current learning rate
            except AttributeError:
                lr = scheduler._rate                                        # read current learning rate

            # Average over the batches actually accumulated: the first window
            # holds log_interval + 1 batches (idx 0..log_interval).
            ms_per_batch = (time.time() - start_time) * 1000 / batches_since_log # mean computation time for each batch [ms]
            cur_loss     = total_loss / batches_since_log                   # mean loss over the window
            ppl          = math.exp(cur_loss)                               # perplexity

            # Print performance
            print(f'| epoch {epoch:3d} | {idx:5d}/{len(train_loader):5d} batches | '
                  f'lr {lr:02.4f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f} |')

            total_loss = 0                                                  # reset loss counter
            batches_since_log = 0                                           # reset batch counter
            start_time = time.time()                                        # reset timer

            ppl_train_monitor.append([ppl, idx, epoch, lr])

    return ppl_train_monitor

In [None]:
def evaluate(model, data_loader, seq_len, criterion, device, mode=0):
    '''
    Run the model over ``data_loader`` without gradient tracking.

    Input:
        model               The transformer model
        data_loader         Iterable of evaluation sequences: 2-D batches, or 1-D single
                            sequences (as used by the prediction function)
        seq_len             Maximum sequence length
        Criterion           Loss criterion
        device              computation device
        mode                Mode: 0 - Evaluation of performance, 1 - Get model output.

    Output:
        mode 1              Tensor with the model output of every item, stacked along a new first dimension
        mode 0              Mean of the per-item losses (0.0 when data_loader yields nothing)
    '''
    model.eval()  # turn on evaluation mode
    total_loss  = 0.
    num_batches = 0
    outputs     = []

    with torch.no_grad():
        for batch in data_loader:
            # The following section deals with prediction function, which is not batches based.
            if batch.ndim == 1:
                cor_seq_len = min(seq_len, len(batch))
                data        = batch[:-1]
                targets     = batch[1: ]
                # A single unbatched sequence needs a 1-D mask: build one row and take it.
                seq_mask    = sequence_mask(cor_seq_len, 1, -1)[0].to(device)           # Generate sequence mask
            else:
                cor_seq_len = min(seq_len, batch.size()[1])
                data        = batch[:, :-1]
                targets     = batch[:, 1: ]
                seq_mask    = sequence_mask(cor_seq_len, batch.size(0), -1).to(device)  # Generate sequence mask

            target_mask = generate_square_subsequent_mask(cor_seq_len-1).to(device)     # Generate target mask

            output      = model(batch, data, seq_mask, target_mask)
            total_loss += criterion(output.view(-1, output.size(-1)), targets.contiguous().view(-1)).item()
            num_batches += 1

            if mode == 1:
                outputs.append(output)

    if mode == 1:
        return torch.stack(outputs)

    # Average over the items actually processed. Dividing by
    # len(data_loader) - 1 overstated the loss and failed with
    # ZeroDivisionError for a single-batch loader.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

In [None]:
def predict(model, input, dictionary, seq_len, criterion, device, top_k=2, explore_last=0):
    '''
    Run the model on the given sequences and translate its outputs back into words.

    Input: 
        model                   The transformer model
        input                   The input sequences the model will predict (indexed as input[sequence, word])
        dictionary              The corpus dictionary. Words are recovered by the position of the key,
                                so it is presumably a word -> index mapping whose insertion order
                                matches the indices (TODO confirm against Corpus_loader)
        seq_len                 The sequences length
        criterion               Loss criterion (forwarded to evaluate)
        device                  Computation device
        top_k                   The top k probabilities. Useful to generate random sentences and not only the most likely next word
        explore_last            Flag for exploring only last words or not (1 - engaged, 0 - disengaged)

    Output (explore_last == 1):
        generated_last_words    For each sequence, the k most likely words at the last output position
        real_last_words         For each sequence, the real last word of the input

    Output (explore_last != 1):
        generated_sentences     The generated sequences
        real_sentences          The real sequences
    '''
    model.eval()                # Set model to evaluation mode

    # Build the index -> word table once. Rebuilding list(dictionary.keys()) for every
    # looked-up word costs a full pass over the vocabulary each time.
    index_to_word = list(dictionary.keys())

    # Find the output of the model using the set input
    output = evaluate(model.to(device), 
                      input.to(device), 
                      seq_len, 
                      criterion, 
                      device, 
                      mode=1)

    # If explore_last is engaged, the output will show only the k-predicted words in the end of a sequence
    if explore_last == 1:
        
        generated_last_words = []
        real_last_words      = []
        
        # Iterate over each sequence
        for sequence in range(output.size(0)):
            output_array    = output[sequence, -1, :].cpu().detach().numpy() # Convert output to a numpy array
            word_indices    = output_array.argsort()[-top_k:][::-1]          # Indices of the k-most likely words, best first
    
            # Find the k-most likely words
            predicted_words = [index_to_word[word_index] for word_index in word_indices]

            generated_last_words.append(predicted_words)
            real_last_words.append(index_to_word[input[sequence, -1]])
        return generated_last_words, real_last_words
    
    # If explore_last is disengaged, the output will give generated sentences and real sentences
    generated_sentences = []    # Generated sequences list
    real_sentences      = []    # Real sequences list

    # Iterate over each sequence
    for sequence in range(output.size(0)):

        generated_words = []
        real_words      = []
        
        # Iterate over each word
        for word in range(output.size(1)):
            output_array = output[sequence, word, :].cpu().detach().numpy() # Convert output to a numpy array
            word_indices = output_array.argsort()[-top_k:][::-1]            # Indices of the k-most likely words, best first
            
            # Find the k-most likely words
            predicted_words = [index_to_word[word_index] for word_index in word_indices]

            # Generate randomization (choose one of the k-likely words, or just the most likely word)
            if np.random.randint(2) == 1:
                chosen_word = predicted_words[np.random.randint(top_k)]
                generated_words.append(chosen_word)
            else:
                generated_words.append(predicted_words[0])
            
            # Add the real sequence words: output position `word` is compared with input position word + 1
            real_words.append(index_to_word[input[sequence, word + 1]])
        
        # Stack generated and real sentences in lists
        generated_sentences.append(generated_words)
        real_sentences.append(real_words)
    
    return generated_sentences, real_sentences

## Train and evaluate the model

In [None]:
# Model's dimensions
d_model    = 256
output_dim = d_model
hidden_dim = 4 * d_model
max_seq    = 36
heads      = 8
N          = 6
batch_size = 32
epochs     = 10

# Set device for computation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache() 

# Datasets path and files.
# The path components are joined one by one: a literal 'Dataset\wikitext-2' contains
# the invalid escape sequence '\w' and only forms a valid path on Windows.
dataset_path = os.path.join(os.getcwd(), 'Dataset', 'wikitext-2')
files  = ('wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens')

# Load corpus from data (including dictionary and sequence splitting).
# The validation and test corpora reuse the dictionary built from the training corpus.
train_corpus = Corpus_loader(dataset_path, files[0], max_seq)
valid_corpus = Corpus_loader(dataset_path, files[1], max_seq, dictionary=train_corpus.dictionary)
test_corpus  = Corpus_loader(dataset_path, files[2], max_seq, dictionary=train_corpus.dictionary)

dictionary_size = len(train_corpus.dictionary) 

# Set loss function and choose optimizer
criterion    = nn.CrossEntropyLoss()
chosen_sched = 'Adam'

if chosen_sched == 'SGD':
    # The SGD configuration overrides the number of heads and N set above
    heads     = 4
    N         = 2
    # Define Transformer model
    model     = Transformer(dictionary_size, 
                            d_model, 
                            output_dim, 
                            hidden_dim, 
                            max_seq, 
                            N, 
                            heads).to(device)
                            
    # Set optimizer; StepLR halves the learning rate on every scheduler step
    optimizer = torch.optim.SGD(model.parameters(), lr=5.0)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.5)

elif chosen_sched == 'Adam':
    # Define Transformer model
    model     = Transformer(dictionary_size, 
                            d_model, 
                            output_dim, 
                            hidden_dim, 
                            max_seq, 
                            N, 
                            heads).to(device)

    # Set optimizer. lr starts at 0; presumably NoamOpt sets the actual rate - TODO confirm.
    # NOTE(review): NoamOpt receives hidden_dim (4 * d_model) rather than d_model -
    # confirm this is the intended model size for the schedule.
    optimizer = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    scheduler = NoamOpt(hidden_dim, factor=1, warmup=400, optimizer=optimizer)

# Load data sequences into batches
train_loader = DataLoader(train_corpus.sequences.to(device), batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_corpus.sequences.to(device), batch_size=batch_size, shuffle=True)

In [None]:
## START TRAINING! ##
log_interval = 200                                   # Log interval used to measure performance
best_val_loss, best_model = float('inf'), None       # Best validation loss so far and a copy of the model that reached it
ppl_eval_monitor, ppl_train_monitor = [], []         # Per-epoch validation / training perplexity records

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()

    # One training epoch; the returned monitor is appended to the general training monitor
    ppl_monitor = train(model, train_loader, max_seq, epoch, scheduler, optimizer, criterion, device, chosen_sched)
    ppl_train_monitor.append(ppl_monitor)

    # Validation loss and perplexity of the model in the current epoch
    val_loss = evaluate(model, valid_loader, max_seq, criterion, device)
    val_ppl  = math.exp(val_loss)

    # Validation record: perplexity, then entries 1 and 3 of the last training record
    # of this epoch, with the epoch number in between
    ppl_eval_monitor.append([val_ppl, ppl_monitor[-1][1], epoch, ppl_monitor[-1][3]])

    # Run time of the epoch (training + validation)
    elapsed = time.time() - epoch_start_time

    # Report validation progress
    print('-' * 94)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f} [sec] | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}  | ')
    print('-' * 94)

    # Keep a copy of the model with the lowest validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    # Step the scheduler and empty the CUDA cache
    scheduler.step()
    torch.cuda.empty_cache()

### Save the best model's weights ###
save_path = 'best_trained_model_w_nn_atten.pt'
torch.save(best_model.state_dict(), save_path)

In [None]:
# Plot train and validation perplexity.
# The monitors are converted to tensors so that their columns can be sliced.
ppl_train_monitor = torch.Tensor(ppl_train_monitor)
ppl_eval_monitor  = torch.Tensor(ppl_eval_monitor)

# Columns of both monitors: 0 - perplexity, 1 - batch index, 2 - epoch, 3 - learning rate
ppl_steps_train   = torch.flatten(ppl_train_monitor[:,:,0])
idx_steps_train   = torch.flatten(ppl_train_monitor[:,:,1])
epoch_steps_train = torch.flatten(ppl_train_monitor[:,:,2])
lr_steps_train    = torch.flatten(ppl_train_monitor[:,:,3])

ppl_steps_eval    = torch.flatten(ppl_eval_monitor[:,0])
idx_steps_eval    = torch.flatten(ppl_eval_monitor[:,1])
epoch_steps_eval  = torch.flatten(ppl_eval_monitor[:,2])
lr_steps_eval     = torch.flatten(ppl_eval_monitor[:,3])


# The x position of a point is its batch index shifted by epoch * log_interval * 10
plt.plot(idx_steps_train + epoch_steps_train*log_interval*10, ppl_steps_train, label='Train Perplexity')
plt.plot(idx_steps_eval  + epoch_steps_eval*log_interval*10 , ppl_steps_eval , label='Validation Perplexity')
plt.xlabel('batch number * Epoch * log interval')
plt.ylabel('Perplexity')
plt.title('Perplexity over time: Train vs. Validation datasets')
plt.legend()

# Draw vertical lines at the logged steps where the learning rate differs from the previous step.
# Start from index 1: index 0 has no predecessor, and lr_steps_train[-1] would wrap
# around to the last element and wrongly flag the first point.
lr_change_indexes = [i for i in range(1, len(lr_steps_train))
                     if lr_steps_train[i] != lr_steps_train[i-1]]
for i in lr_change_indexes:
    plt.axvline(x = (idx_steps_train[i] + epoch_steps_train[i]*log_interval*10), 
                color='r', 
                linestyle='--', 
                label='Learning rate change',
                linewidth = 0.5)

plt.show()

In [None]:
# Rebuild the architecture and load the weights saved after training
save_path = 'best_trained_model_w_nn_atten.pt'
new_model = Transformer(dictionary_size, d_model, output_dim, hidden_dim, max_seq, N, heads)
new_model.load_state_dict(torch.load(save_path, map_location=torch.device(device)))

# Evaluation only
new_model.eval()

# Batch the test sequences and time the evaluation on them
test_loader  = DataLoader(test_corpus.sequences.to(device), batch_size=batch_size, shuffle=True)
start_test   = time.time()
test_loss    = evaluate(new_model.to(device), test_loader, max_seq, criterion, device)
elapsed_time = time.time() - start_test

# Perplexity is the exponent of the loss returned by evaluate
test_ppl     = math.exp(test_loss)

print('-' * 60)
print(f'|   Test perplexity {test_ppl:8.2f}   |   Elapsed {elapsed_time:8.2f} [sec]  |')
print('-' * 60)


In [None]:
# Draw 3 random rows of the test sequences and let the model predict them
random_sequences = test_corpus.sequences[random.sample(range(test_corpus.sequences.size(0)), 3),:]

# top_k=1: only the most likely word is available at every position
generated_sentences, real_sentences = predict(
    new_model, random_sequences, train_corpus.dictionary, max_seq, criterion, device, top_k=1)

# Print every generated sentence followed by the real one
for i, (generated, real) in enumerate(zip(generated_sentences, real_sentences)):
    print(f"output of model - Sentence number: {i:3d}\n================")
    print(' '.join(generated))
    print('\n')
    print(f"Actual sequence - Sentence number: {i:3d}\n================")
    print(' '.join(real))
    print('\n')

In [None]:
# Same sequences as before, but only the last position is explored:
# the 5 most likely last words are listed next to the real last word
predicted_words, real_last_words = predict(
    new_model, random_sequences, train_corpus.dictionary, max_seq, criterion, device,
    top_k=5, explore_last=1)

for i, (candidates, real_word) in enumerate(zip(predicted_words, real_last_words)):
    print(f"output of model - Sentence number: {i:3d}\n================")
    print(candidates)
    print('\n')
    print(f"Actual sequence - Sentence number: {i:3d}\n================")
    print(real_word)
    print('\n')


## TRANSFER LEARNING

In [None]:
# Datasets path and files.
# The path components are joined one by one: a literal 'Dataset\PennTreeBank' contains
# the invalid escape sequence '\P' and only forms a valid path on Windows.
Pen_Tree_dataset_path = os.path.join(os.getcwd(), 'Dataset', 'PennTreeBank')
files_Pen_Tree        = ('ptb.train.txt', 'ptb.valid.txt', 'ptb.test.txt')

# Load corpus from data (including dictionary and sequence splitting).
# The validation and test corpora reuse the dictionary built from the Penn Treebank training corpus.
Pen_Tree_train_corpus = Corpus_loader(Pen_Tree_dataset_path, files_Pen_Tree[0], max_seq)
Pen_Tree_valid_corpus = Corpus_loader(Pen_Tree_dataset_path, files_Pen_Tree[1], max_seq, Pen_Tree_train_corpus.dictionary)
Pen_Tree_test_corpus  = Corpus_loader(Pen_Tree_dataset_path, files_Pen_Tree[2], max_seq, Pen_Tree_train_corpus.dictionary)

dictionary_size_trans = len(Pen_Tree_train_corpus.dictionary) 

# Load data sequences into batches.
# NOTE: this rebinds train_loader, valid_loader and test_loader, which held the WikiText2 loaders.
train_loader = DataLoader(Pen_Tree_train_corpus.sequences.to(device), batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(Pen_Tree_valid_corpus.sequences.to(device), batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(Pen_Tree_test_corpus.sequences.to(device) , batch_size=batch_size, shuffle=True)

In [None]:
# Start from a copy of the trained model so that new_model itself stays untouched
modified_model = copy.deepcopy(new_model)

# Freeze layers
# NOTE(review): the frozen layers were trained with the WikiText2 dictionary, while the
# inputs are now indexed with the Penn Treebank dictionary - confirm this is intended.
for params in modified_model.parameters():
    params.requires_grad = False

# Modify last layer so that it outputs the size of the new dictionary.
# It is moved to the computation device before the optimizer is constructed.
modified_model.linear = nn.Linear(d_model, dictionary_size_trans).to(device)
for params in modified_model.linear.parameters():
    params.requires_grad = True

# Only the parameters that are still trainable (the new last layer) are given to the optimizer
trainable_params = [params for params in modified_model.parameters() if params.requires_grad]

# Set optimizer
if chosen_sched == 'SGD':
    # StepLR halves the learning rate on every scheduler step
    optimizer = torch.optim.SGD(trainable_params, lr=5.0)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.5)

else:
    # lr starts at 0; presumably NoamOpt sets the actual rate - TODO confirm
    optimizer = torch.optim.Adam(trainable_params, lr=0, betas=(0.9, 0.98), eps=1e-9)
    scheduler = NoamOpt(hidden_dim, factor=1, warmup=400, optimizer=optimizer)

In [None]:
## START TRAINING! ##
log_interval = 200                                   # Log interval used to measure performance
best_val_loss, best_model = float('inf'), None       # Best validation loss so far and a copy of the model that reached it
ppl_eval_monitor, ppl_train_monitor = [], []         # Per-epoch validation / training perplexity records

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()

    # One training epoch of the modified model; the returned monitor is appended to the general training monitor
    ppl_monitor = train(modified_model.to(device), train_loader, max_seq, epoch,
                        scheduler, optimizer, criterion, device, chosen_sched)
    ppl_train_monitor.append(ppl_monitor)

    # Validation loss and perplexity of the modified model in the current epoch
    val_loss = evaluate(modified_model, valid_loader, max_seq, criterion, device)
    val_ppl  = math.exp(val_loss)

    # Validation record: perplexity, then entries 1 and 3 of the last training record
    # of this epoch, with the epoch number in between
    ppl_eval_monitor.append([val_ppl, ppl_monitor[-1][1], epoch, ppl_monitor[-1][3]])

    # Run time of the epoch (training + validation)
    elapsed = time.time() - epoch_start_time

    # Report validation progress
    print('-' * 94)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f} [sec] | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}  | ')
    print('-' * 94)

    # Keep a copy of the model with the lowest validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model    = copy.deepcopy(modified_model)

    # Step the scheduler and empty the CUDA cache
    scheduler.step()
    torch.cuda.empty_cache()

### Save the best transferred model's weights ###
save_path = 'best_trained_transfered_model_w_nn_atten.pt'
torch.save(best_model.state_dict(), save_path)

In [None]:
# Rebuild the original architecture, then swap the last layer for one sized to the
# transfer dictionary so that the saved weights fit
save_path = 'best_trained_transfered_model_w_nn_atten.pt'
test_the_modified_model = Transformer(dictionary_size, d_model, output_dim, hidden_dim, max_seq, N, heads)
test_the_modified_model.linear = nn.Linear(d_model, dictionary_size_trans)

test_the_modified_model.load_state_dict(torch.load(save_path, map_location=torch.device(device)))

# Evaluation only
test_the_modified_model.eval()

# Time the evaluation on the test loader
start_test   = time.time()
test_loss    = evaluate(test_the_modified_model.to(device), test_loader, max_seq, criterion, device)
elapsed_time = time.time() - start_test

# Perplexity is the exponent of the loss returned by evaluate
test_ppl     = math.exp(test_loss)

print('-' * 60)
print(f'|   Test perplexity {test_ppl:8.2f}   |   Elapsed {elapsed_time:8.2f} [sec]  |')
print('-' * 60)

In [None]:
# Draw 3 random rows of the Penn Treebank test sequences and let the transferred model predict them
random_sequences = Pen_Tree_test_corpus.sequences[random.sample(range(Pen_Tree_test_corpus.sequences.size(0)), 3),:]

# top_k=1: only the most likely word is available at every position
generated_sentences, real_sentences = predict(
    test_the_modified_model, random_sequences, Pen_Tree_train_corpus.dictionary,
    max_seq, criterion, device, top_k=1)

# Print every generated sentence followed by the real one
for i, (generated, real) in enumerate(zip(generated_sentences, real_sentences)):
    print(f"output of model - Sentence number: {i:3d}\n================")
    print(' '.join(generated))
    print('\n')
    print(f"Actual sequence - Sentence number: {i:3d}\n================")
    print(' '.join(real))
    print('\n')