Import packages

In [None]:
import torch
from torch import nn
import torchtext
from torchtext.datasets import Multi30k
from torchtext.utils import download_from_url, extract_archive

import spacy

Download and extract the training, validation, and test data for English and German from the Multi30k dataset.

In [None]:
# Location of the Multi30k task-1 raw data and the `.gz` archive names of each
# split, listed as (German, English).
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')


def _download_and_extract(archive_names):
    """Download every archive under ``url_base`` and return the first extracted path of each."""
    extracted_paths = []
    for archive_name in archive_names:
        archive_path = download_from_url(url_base + archive_name)
        extracted_paths.append(extract_archive(archive_path)[0])
    return extracted_paths


train_filepaths = _download_and_extract(train_urls)
val_filepaths = _download_and_extract(val_urls)
test_filepaths = _download_and_extract(test_urls)

Inspect the downloaded data: in what format were the train, val, and test sets downloaded, and how much data does each contain?

In [None]:
# Show the extracted file paths of each split, one split per line.
for label, paths in (
    ('Train file paths: ', train_filepaths),
    ('Val file paths: ', val_filepaths),
    ('Test file paths: ', test_filepaths),
):
    print(label, paths)

From the above output, there are 6 files in total: 2 training files, 2 validation files, and 2 test files. Next, check what is in the files.

In [None]:
# Open training files. The encoding is given explicitly so that non-ASCII
# characters in the German text decode the same way on every platform
# (the default encoding of open() depends on the locale).
# The handles are deliberately left open: a later cell rewinds them with seek(0).
german_train_file = open(train_filepaths[0], "r", encoding="utf-8")
english_train_file = open(train_filepaths[1], "r", encoding="utf-8")

# Read each file once and reuse the lines for both the count and the preview,
# instead of reading the whole file twice.
german_train_lines = german_train_file.readlines()
english_train_lines = english_train_file.readlines()

# Check length of german and english training data
german_train_length = len(german_train_lines)
english_train_length = len(english_train_lines)
print('German train sentences length: ', german_train_length)
print('English train sentences length: ', english_train_length)

# Visualize first 2 sentences from english and german data
print(german_train_lines[0:2])
print(english_train_lines[0:2])

Repeat the same steps for val and test data. From the above output, we know we have 29,000 training samples. Repeating the steps for val and test below shows that we have 1014 validation samples and 1000 testing samples in the dataset.

In [None]:
# Open val files. The encoding is given explicitly so that non-ASCII characters
# in the German text decode the same way on every platform. The handles are
# deliberately left open: a later cell rewinds them with seek(0).
german_val_file = open(val_filepaths[0], "r", encoding="utf-8")
english_val_file = open(val_filepaths[1], "r", encoding="utf-8")

# Read each file once; the lines serve both the count and the preview.
german_val_lines = german_val_file.readlines()
english_val_lines = english_val_file.readlines()

# Check length of german and english val data
german_val_length = len(german_val_lines)
english_val_length = len(english_val_lines)
print('German val sentences length: ', german_val_length)
print('English val sentences length: ', english_val_length)

# Visualize first 2 sentences from english and german val data
print(german_val_lines[0:2])
print(english_val_lines[0:2])

# Open test files (same explicit encoding, handles also left open for reuse).
german_test_file = open(test_filepaths[0], "r", encoding="utf-8")
english_test_file = open(test_filepaths[1], "r", encoding="utf-8")

german_test_lines = german_test_file.readlines()
english_test_lines = english_test_file.readlines()

# Check length of german and english test data.
# These labels say "test": the counts printed here belong to the test split.
german_test_length = len(german_test_lines)
english_test_length = len(english_test_lines)
print('German test sentences length: ', german_test_length)
print('English test sentences length: ', english_test_length)

# Visualize first 2 sentences from english and german test data
print(german_test_lines[0:2])
print(english_test_lines[0:2])

In [None]:
# Rewind every corpus file to its first byte so that later reads start from
# the beginning again.
for corpus_file in (
    german_train_file,
    english_train_file,
    german_val_file,
    english_val_file,
    german_test_file,
    english_test_file,
):
    corpus_file.seek(0)

Load English and German tokenizers using Spacy.

In [None]:
# spaCy pipelines used for tokenization: one for English, one for German.
# NOTE(review): both model packages presumably have to be installed before
# spacy.load can find them (e.g. `python -m spacy download en_core_web_sm`)
# -- confirm for your environment.
eng_lang = spacy.load("en_core_web_sm")
ger_lang = spacy.load("de_core_news_sm")

Set hyperparameters. I used the same ones as those used in the paper.

In [None]:
# Number of attention heads; presumably passed as `h` to MultiHeadAttention.
num_heads: int = 8
# Model width; MultiHeadAttention uses d_model as the output size of its final linear layer.
d_model: int = 512

Begin building the Transformer. The first step is to build the 'Scaled Dot-Product Attention' block mentioned in the paper. This is still just the first draft; it will probably need some fixes once I get to later stages.

In [None]:
class ScaledDotProduct(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    The module holds no learnable parameters and no input-dependent state:
    ``d_k`` is read from the queries on every call, so one instance works for
    any batch size, sequence length and feature width.
    """

    def __init__(self):
        super(ScaledDotProduct, self).__init__()

        # Normalize over the last axis (the key positions) so that the
        # attention weights of every query sum to 1.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, queries, keys, values):
        """Attend from ``queries`` to ``keys`` and return the weighted ``values``.

        Args:
            queries: tensor of shape ``(..., q_len, d_k)``.
            keys: tensor of shape ``(..., k_len, d_k)``.
            values: tensor of shape ``(..., k_len, d_v)``.

        Returns:
            Tensor of shape ``(..., q_len, d_v)``.
        """
        # d_k is the feature width of the queries/keys; taken from the input
        # because nothing is known about it at construction time.
        dk = queries.size(-1)

        # First batched MatMul: similarity of every query with every key.
        # Only the last two axes of the keys are swapped, never the batch axis.
        compatibility = torch.matmul(queries, keys.transpose(-2, -1))
        # Scale down by sqrt(d_k); dk is a plain int, so use ** rather than torch.sqrt.
        compatibility = compatibility / (dk ** 0.5)
        # Normalize the scores into attention weights.
        compatibility_softmax = self.softmax(compatibility)
        # Final batched MatMul: weighted sum of the values.
        output = torch.matmul(compatibility_softmax, values)

        return output

Build the 'Multi-Head Attention' block.

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built from learned Q/K/V projections.

    Each input is projected to ``d_model`` features, split into ``h`` heads of
    ``d_model // h`` features, attended per head with scaled dot-product
    attention, concatenated again and passed through a final linear layer.

    Args:
        h: number of attention heads; must be positive and divide ``d_model``.
        d_model: width of the projected Q/K/V and of the output.
        queries, keys, values: optional example tensors of shape
            ``(N, seq_len, embed_len)``. Only their last dimension is read, to
            size the input of the matching linear layer. When omitted, inputs
            are expected to have ``d_model`` features.

    Raises:
        ValueError: if ``h`` is not positive or does not divide ``d_model``.
    """

    def __init__(self, h, d_model, queries=None, keys=None, values=None):
        super(MultiHeadAttention, self).__init__()
        if h <= 0 or d_model % h != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive number of heads ({h})"
            )

        self.num_heads = h
        self.d_model = d_model
        # Integer division: head_length is used as a tensor dimension.
        self.head_length = d_model // h

        # Layer sizes depend only on feature widths, never on the batch size or
        # sequence length, so the module works for any batch passed to forward.
        self.q_in = d_model if queries is None else queries.shape[-1]
        self.k_in = d_model if keys is None else keys.shape[-1]
        self.v_in = d_model if values is None else values.shape[-1]

        # One projection per input. Projecting to d_model and then splitting
        # the result into heads is equivalent to h separate per-head projections.
        self.q_linear = nn.Linear(self.q_in, d_model)
        self.k_linear = nn.Linear(self.k_in, d_model)
        self.v_linear = nn.Linear(self.v_in, d_model)

        # Final linear layer applied after the outputs of all heads have been
        # concatenated back to d_model features.
        self.output_linear = nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        """Reshape ``(N, seq_len, d_model)`` to ``(N, num_heads, seq_len, head_length)``."""
        batch_num, seq_len, _ = projected.shape
        per_head = projected.reshape(batch_num, seq_len, self.num_heads, self.head_length)
        return per_head.transpose(1, 2)

    def forward(self, queries, keys, values):
        """Apply multi-head attention.

        Args:
            queries: tensor of shape ``(N, q_len, q_in)``.
            keys: tensor of shape ``(N, k_len, k_in)``.
            values: tensor of shape ``(N, k_len, v_in)``.

        Returns:
            Tensor of shape ``(N, q_len, d_model)``.
        """
        batch_num, q_len = queries.shape[0], queries.shape[1]

        # Project, then slice the feature axis into heads; all heads are
        # processed in parallel along the second axis.
        q_heads = self._split_heads(self.q_linear(queries))
        k_heads = self._split_heads(self.k_linear(keys))
        v_heads = self._split_heads(self.v_linear(values))

        # Scaled dot-product attention, softmax(Q K^T / sqrt(d_k)) V, for every head.
        scores = torch.matmul(q_heads, k_heads.transpose(-2, -1)) / (self.head_length ** 0.5)
        weights = torch.softmax(scores, dim=-1)
        context = torch.matmul(weights, v_heads)

        # Concatenate the heads: (N, h, q_len, head_length) -> (N, q_len, d_model).
        concatenated = context.transpose(1, 2).reshape(batch_num, q_len, self.d_model)

        return self.output_linear(concatenated)
            


Building the Encoder. 

In [None]:
class Encoder(nn.Module):
    """Encoder of the Transformer (work in progress).

    As far as this view shows, the class is a stub: its constructor only runs
    the ``nn.Module`` base-class initialization.
    """

    def __init__(self):
        # Only nn.Module's own setup runs here; no sub-layers are visible yet.
        super(Encoder, self).__init__()
        