In [1]:

import torch
import numpy as np
from torchtext.data.metrics import bleu_score


def get_padding_mask(query, key, pad_idx):
    batch_size, len_query = query.size()
    batch_size, len_key = key.size()
    masking = key.data.eq(pad_idx).unsqueeze(1)
    return masking.expand(batch_size, len_query, len_key)

def get_subsequent_mask(query, device):
    shape = [query.size(0), query.size(1), query.size(1)]
    subsequent_mask = np.tril(np.ones(shape), k=0) == 0
    subsequent_mask = torch.from_numpy(subsequent_mask).to(device)
    return subsequent_mask


# This method translate a sentence to target language
def translate_sentence(model, sentence, src_spacy_model, source_field, targer_field, device, max_length=60):
    
    if type(sentence) == str:
        input_tokens = [token.text.lower() for token in src_spacy_model(sentence)]
    else:
        input_tokens = [token.lower() for token in sentence]


    # Add <sos> and <eos>
    input_tokens.insert(0, source_field.init_token)
    input_tokens.append(source_field.eos_token)

    # List of indices
    source_text_to_indices = [source_field.vocab.stoi[token] for token in input_tokens]
    source_tensor = torch.LongTensor(source_text_to_indices).unsqueeze(0).to(device)
   
    outputs = [targer_field.vocab.stoi["<sos>"]]
    for i in range(max_length):
        target_tensor = torch.LongTensor(outputs).unsqueeze(0).to(device)

        with torch.no_grad():
            output = model(source_tensor, target_tensor)

        best_guess = output[0].argmax(1)[-1].item()
        outputs.append(best_guess)

        if best_guess == targer_field.vocab.stoi["<eos>"]:
            break

    translated_sentence = [targer_field.vocab.itos[idx] for idx in outputs]
    # remove start token
    return translated_sentence[1:]


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def bleu(data, model, spacy_german, german_field, english_field, device):

    targets = []
    outputs = []


    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]
 
        predict = translate_sentence(model, src, spacy_german, german_field, english_field, device)
        predict = predict[:-1]

        targets.append([trg])
        outputs.append(predict)

    return bleu_score(outputs, targets)


In [2]:
from turtle import forward
from unicodedata import bidirectional
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np

class Embedder(nn.Module):
    def __init__(self, input_size, d_model):
        super().__init__()
        self.embedding = nn.Embedding(input_size, d_model)
    def forward(self, x):
        return self.embedding(x)


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_length = 100):
        super().__init__()
        self.d_model = d_model
        positional_data = torch.zeros(max_length, d_model)

        def get_positional_value(pos):
            return [pos / np.power(10000, 2 * (i // 2) / d_model) for i in range(d_model)]

        positional_data = np.array([get_positional_value(pos) for pos in range(max_length)])
        positional_data[:, 0::2] = np.sin(positional_data[:, 0::2])  # dim 2i
        positional_data[:, 1::2] = np.cos(positional_data[:, 1::2])  # dim 2i+1

        positional_data = torch.Tensor(positional_data).unsqueeze(0)
        self.register_buffer('pe', positional_data)
 
    def forward(self, x):
        seq_length = x.size(1)
        x = x + Variable(self.pe[:,:seq_length], requires_grad=False)
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        # Divide vector into equal n_heads part
        d_h = d_model // n_heads
        self.d_h = d_h

        # Following linear models produce Query, Key & Value for words
        # d_model -> d_h * n_heads
        self.WQ = nn.Linear(d_model, d_model)
        self.WK = nn.Linear(d_model, d_model)
        self.WV = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim=-1)

        self.linear = nn.Linear(d_model, d_model)


    def forward(self, Q, K, V, masked=None):
        # Q: (batch_size, seq_length, d_model) -- d_model == embedding_dim
        # K: (batch_size, seq_length, d_model)
        # V: (batch_size, seq_length, d_model)

        batch_size = Q.size(0)
 
        Q = self.WQ(Q).view(batch_size, -1, self.n_heads, self.d_h).transpose(1, 2)
        # Q: (batch_size, seq_length, n_heads, d_h) -> (batch_size, n_heads, seq_length, d_h)
        K = self.WK(K).view(batch_size, -1, self.n_heads, self.d_h).transpose(1, 2)
        # K: (batch_size, seq_length, n_heads, d_h) -> (batch_size, n_heads, seq_length, d_h)
        V = self.WV(V).view(batch_size, -1, self.n_heads, self.d_h).transpose(1, 2)
        # V: (batch_size, seq_length, n_heads, d_h) -> (batch_size, n_heads, seq_length, d_h)

        # Q: (batch_size, n_heads, seq_length, d_h)
        # K: (batch_size, n_heads, seq_length, d_h) -> (batch_size, n_heads, d_h, seq_length)
        # Formula: (a,b,c,d)*(a,b,d,f) -> (a,b,c,f)
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(self.d_h)
        # scores: (batch_size, n_heads, seq_length, seq_length)
        # This is actually calculating word-by-word score. Thats why shape is (-, -, seq_length, seq_length)


        if masked is not None:
            # pad_masked: (batch_size, seq_length, seq_length)
            masked = masked.unsqueeze(1).repeat(1, self.n_heads, 1, 1)
            # pad_masked: (batch_size, n_heads, seq_length, seq_length)
            # The shape of the mask is exactly same as scores.
            scores = scores.masked_fill(masked, -1e9)
            # scores: (batch_size, n_heads, seq_length, seq_length)

        attention = self.softmax(scores)

        # attention: (batch_size, n_heads, seq_length, seq_length)
        # V: (batch_size, n_heads, seq_length, d_h)
        context = torch.matmul(attention, V).transpose(1, 2).contiguous()
        # context: (batch_size, n_heads, seq_length, d_h) -> (batch_size, seq_length, n_heads, d_h)

        # The following part concat several heads into one
        output = context.view(batch_size, -1, self.n_heads * self.d_h)
        # output: (batch_size, seq_length, d_model)
        output = self.linear(output)
        # output: (batch_size, seq_length, d_model)

        return output


class PoswiseFeedForwardNet(nn.Module):

    def __init__(self, d_model, ffn_dim,):
        super(PoswiseFeedForwardNet, self).__init__()
        self.l1 = nn.Linear(d_model, ffn_dim)
        self.l2 = nn.Linear(ffn_dim, d_model)

    def forward(self, x):
        output = self.l1(x)
        output = torch.relu(output)
        output = self.l2(output)
        return output


# Layer of Encoder
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, ffn_dim, dropout=0.1):
        super(EncoderLayer, self).__init__()

        # The first sub-layer of Encoder. Its a multi-head self-attention.
        self.self_attention = MultiHeadAttention(d_model, n_heads)
        self.self_attention_norm = nn.LayerNorm(d_model)
        # The second sub-layer of Encoder. Its a positionwise fully connected feed-forward network.
        self.ff_layer = PoswiseFeedForwardNet(d_model, ffn_dim)
        self.ff_layer_norm = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, source, pad_masked):
        # As we are feeding same input as query, key & value, its called self-attention.
        # source: (batch_size, seq_length, d_model)
        # pad_masked: (batch_size, seq_length, seq_length)
        output_from_attn = self.self_attention(source, source, source, pad_masked)
        output_from_attn = self.dropout(output_from_attn)
        output_from_attn = self.self_attention_norm(source + output_from_attn)
        # output_from_attn: (batch_size, seq_length, d_model)
        output_from_ffn = self.ff_layer(output_from_attn)
        output_from_ffn = self.dropout(output_from_ffn)
        output_from_ffn = self.ff_layer_norm(output_from_attn + output_from_ffn)
        # output_from_ffn: (batch_size, seq_length, d_model)

        return output_from_ffn


# Encoder is responsible to represent
# a source sentence into a context state
class Encoder(nn.Module):
    def __init__(self, source_vocab_size, d_model, n_layers, n_heads, ffn_dim, source_pad_idx, dropout=0.1) :
        super(Encoder, self).__init__()

        self.source_pad_idx = source_pad_idx

        # It's like Word2Vec
        self.embedding = nn.Embedding(source_vocab_size, d_model)
        # This is used to add position data with embedded words.
        self.position_embedding = PositionalEncoding(d_model)

        self.dropout = nn.Dropout(dropout)

        # Series of similar layers.
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, ffn_dim) for _ in range(n_layers)])

    def forward(self, source):
        # This method will add masking to padded position (<pad>).
        # source: (batch_size, seq_length)
        pad_masked = get_padding_mask(source, source, self.source_pad_idx)
        # pad_masked: (batch_size, seq_length, seq_length)

        embedded = self.embedding(source)
        # embedded: (batch_size, seq_length, embedding_dim)
        embedded = self.position_embedding(embedded)
        # embedded: (batch_size, seq_length, embedding_dim)
        embedded = self.dropout(embedded)

        # The first input to the series of layer comes from embedded input.
        # After that, output of one layer is fed into the next layer.
        for layer in self.layers:
            embedded = layer(embedded, pad_masked)

        return embedded


class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, ffn_dim, dropout=0.1):
        super(DecoderLayer, self).__init__()
        
        # The first sub-layer of Decoder. Its a multi-head self-attention.
        self.self_attention = MultiHeadAttention(d_model, n_heads)
        self.self_attention_norm = nn.LayerNorm(d_model)
        # The second sub-layer of Decoder. Its an encoder-decoder multi-head self-attention.
        self.codec_attention= MultiHeadAttention(d_model, n_heads)
        self.codec_attention_norm = nn.LayerNorm(d_model)
        # The third sub-layer of Encoder. Its a positionwise fully connected feed-forward network.
        self.ffn_layer = PoswiseFeedForwardNet(d_model, ffn_dim)
        self.ffn_layer_norm = nn.LayerNorm(d_model)
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, target, source, pad_masked):
        # As we are feeding same input as query, key & value, its called self-attention.
        # target: (batch_size, seq_length, d_model)
        # pad_masked: (batch_size, seq_length, seq_length)
        output_from_attn = self.self_attention(target, target, target, pad_masked)
        output_from_attn = self.dropout(output_from_attn)
        output_from_attn = self.self_attention_norm(target + output_from_attn)
        # output_from_attn
        # output_from_attn : (batch_size, seq_length, d_model)
        # source : (batch_size, seq_length, d_model)
        output_from_codec_attn= self.codec_attention(output_from_attn, source, source, None)
        output_from_codec_attn = self.dropout(output_from_codec_attn)
        output_from_codec_attn = self.codec_attention_norm(output_from_attn + output_from_codec_attn)
        # output_from_codec_attn
        output_ffn = self.ffn_layer(output_from_codec_attn)
        output_ffn = self.dropout(output_ffn)
        output_ffn = self.ffn_layer_norm(output_from_codec_attn + output_ffn)
        # output_ffn

        return output_ffn

# Decoder is responsible to generate
# a target sentence from a context state
class Decoder(nn.Module):
    def __init__(self, target_vocab_size, d_model, n_layers, n_heads, ffn_dim, source_pad_idx, target_pad_idx, device, dropout=0.1):
        super(Decoder, self).__init__()

        self.source_pad_idx = source_pad_idx
        self.target_pad_idx = target_pad_idx
        self.device = device
        
        # It's like Word2Vec
        self.embedding = nn.Embedding(target_vocab_size, d_model)
        # This is used to add position data with embedded words.
        self.position_embedding = PositionalEncoding(d_model)

        self.dropout = nn.Dropout(dropout)

        # Series of similar layers.
        self.layers = nn.ModuleList([DecoderLayer(d_model, n_heads, ffn_dim) for _ in range(n_layers)])
        # This is to project output
        self.fc_out = nn.Linear(d_model, target_vocab_size)

    def forward(self, target, source_output):
        # This method will add masking to padded position (<pad>).
        # target: (batch_size, seq_length)
        target_mask = get_padding_mask(target, target, self.target_pad_idx)
        # This method will add masking to subsequent position.
        subsequent_mask = get_subsequent_mask(target, self.device)
        target_mask = target_mask | subsequent_mask

        embedded = self.embedding(target)
        # embedded: (batch_size, seq_length, embedding_dim)
        embedded = self.position_embedding(embedded)
        # embedded: (batch_size, seq_length, embedding_dim)
        embedded = self.dropout(embedded)

        # The first input to the series of layer comes from embedded input.
        # After that, output of one layer is fed into the next layer. 
        for layer in self.layers:
            embedded = layer(embedded, source_output, target_mask)

        # This is to project output
        output = self.fc_out(embedded)

        return output


# Seq2Seq combines Encoder & Decoder
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target):
        # source: (batch_size, seq_length)
        # target: (batch_size, seq_length)
        encoded = self.encoder(source)
        # encoded: (batch_size, seq_length, hidden_dim)
        output = self.decoder(target, encoded)
        # output: (batch_size, seq_length, target_vocab_size)

        return output


In [3]:
import torch
import time
import math


class Process():
    def __init__(self, model, src_spacy_model, source_field, target_field, optimizer, scheduler, loss_func, test_sentence, clip, device):
        self.model = model
        self.src_spacy_model = src_spacy_model
        self.source_field = source_field
        self.target_field = target_field
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.loss_func = loss_func
        self.test_sentence = test_sentence
        self.clip = clip
        self.device = device
        self.step = 0

    # Train model
    def train(self, iterator):

        epoch_loss = 0

        self.model.train()

        for batch in iterator:
            source = batch.src.to(self.device)
            target = batch.trg.to(self.device)
            # source: (batch_size, seq_length)
            # target: (batch_size, seq_length)

            # Last target token is <eos>. Do no need to pass it.
            output = self.model(source, target[:,:-1])
            # output: (batch_size, seq_length, target_vocab_size)
 
            target_vocab_size = output.shape[-1]
            output = output.contiguous().view(-1, target_vocab_size)
            # output: (batch_size * seq_length, target_vocab_size)

            target = target[:,1:].contiguous().view(-1)
            # target: (batch_size, seq_length)

            self.optimizer.zero_grad()
            loss = self.loss_func(output, target)
            loss.backward()

            # This is used to prevent gradient exploding. Clipping to 1.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)

            # This updates the trainable parameters
            self.optimizer.step()

            self.step += 1

            epoch_loss += loss.item()
            
            
        return epoch_loss / len(iterator)

    def evaluate(self, iterator):

        self.model.eval()

        epoch_loss = 0

        with torch.no_grad():

            for batch in iterator:

                source = batch.src.to(self.device)
                target = batch.trg.to(self.device)

                output = self.model(source, target[:,:-1])
                
                target_vocab_size = output.shape[-1]

                output = output.contiguous().view(-1, target_vocab_size)
                target = target[:,1:].contiguous().view(-1)

                loss = self.loss_func(output, target)

                epoch_loss += loss.item()


        return epoch_loss / len(iterator)

    def run(self, num_epochs, train_iterator, valid_iterator):

        best_lost = 1e10
        no_better = 0

        for epoch in range(num_epochs):

            start_time = time.time()
            
            train_loss = self.train(train_iterator)
            valid_loss = self.evaluate(valid_iterator)

            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            lr = self.scheduler.get_last_lr()[0]

            print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Exp: {math.exp(train_loss):7.3f}')
            print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Exp: {math.exp(valid_loss):7.3f}')
            print(f'\t Learning Rate: {lr:.7f}')

            # This translate the sample sentence.
            translated_sentence = translate_sentence(
                self.model, self.test_sentence,
                self.src_spacy_model, self.source_field, self.target_field,
                self.device, max_length=50,
            )

            print(f"Translated example sentence: \n {' '.join(translated_sentence[:-1])}")
            print()
            self.scheduler.step()

            if valid_loss > best_lost:
              no_better += 1
              if no_better == 3:
                break
            else: 
              best_lost = valid_loss
              no_better=0


In [4]:
# !pip install spacy==v3.2
# !python -m spacy download de_core_news_lg
# !python -m spacy download en_core_web_lg

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
import numpy as np
import spacy
import random


SEED = 1876189809
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# I have used spacy dataset for low volume
# Sentences:
#   Training:   29000
#   Validating: 1014
#   Testing:    1000
# Vocabulary:
#   German:     7853
#   English:    5893
spacy_german = spacy.load("de_core_news_lg")
spacy_english = spacy.load("en_core_web_lg")

def tokenize_ger(text):
    return [tok.text for tok in spacy_german.tokenizer(text)]

def tokenize_eng(text):
    return [tok.text for tok in spacy_english.tokenizer(text)]


# This is used to tokenize and append extra token
german_field = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>", batch_first=True)
english_field = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>", batch_first=True)


# To load data from local file
# fields = {"src": ("src", german_field), "trg": ("trg", english_field)}
# train_data, valid_data, test_data = TabularDataset.splits(
#     path="/content", train="train.txt", validation="val.txt", test="test.txt", format="json", fields=fields
# )


train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(german_field, english_field)
)


# This will generate vocabulary with minimum freq.
german_field.build_vocab(train_data, min_freq=2)
english_field.build_vocab(train_data, min_freq=2)
source_vocab_size = len(german_field.vocab)
target_vocab_size = len(english_field.vocab)

num_epochs = 1000
learning_rate = 0.0005
# Large batch size exceeds free GPU memory
batch_size = 100
# Dimension for all matrix
# I will call it hidden_dim & embedding_dim as well
d_model = 300
# Number of repeated layers
n_layers = 4
# Number of heads
n_heads = 4
ffn_dim = 1200

# Use this to clip gradient norm to avoid exploding
clip = 1
dropout = 0.1

source_pad_idx = german_field.vocab.stoi[german_field.pad_token]
target_pad_idx = english_field.vocab.stoi[english_field.pad_token]

encoder = Encoder(source_vocab_size, d_model, n_layers, n_heads, ffn_dim, source_pad_idx, dropout=dropout).to(device)
decoder = Decoder(target_vocab_size, d_model, n_layers, n_heads, ffn_dim, source_pad_idx, target_pad_idx, device, dropout=dropout).to(device)
# german_vector = torch.FloatTensor(spacy_german.vocab.vectors.data)
# encoder.embedding = nn.Embedding.from_pretrained(german_vector, freeze=False)
# english_vector = torch.FloatTensor(spacy_english.vocab.vectors.data)
# decoder.embedding = nn.Embedding.from_pretrained(english_vector, freeze=False)


model = Seq2Seq(encoder, decoder).to(device)

# This is to use Adam optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


# This is to use CrossEntropy loss function.
# Ignore padding entry
loss_func = nn.CrossEntropyLoss(ignore_index=target_pad_idx)

train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_key = lambda x: len(x.src),
    sort_within_batch=True,
    device=device,
)


test_sentence = "Die alte Frau sieht sich das Fußballspiel an und isst Süßigkeiten."
test_translation = "The old woman is watching the football match and eating candy."

process = Process(model, spacy_german, german_field, english_field, optimizer, scheduler, loss_func, test_sentence, clip, device)

process.run(num_epochs, train_iterator, validation_iterator)

score = bleu(test_data, model, spacy_german, german_field, english_field, device)
print(f"Bleu score {score*100:.2f}")

Epoch: 01 | Time: 0m 15s
	Train Loss: 3.831 | Train Exp:  46.098
	 Val. Loss: 2.854 |  Val. Exp:  17.349
	 Learning Rate: 0.0005000
Translated example sentence: 
 the old woman is looking at the park and holding her face .

Epoch: 02 | Time: 0m 15s
	Train Loss: 2.561 | Train Exp:  12.947
	 Val. Loss: 2.257 |  Val. Exp:   9.553
	 Learning Rate: 0.0004750
Translated example sentence: 
 the old woman is looking at the soccer game .

Epoch: 03 | Time: 0m 15s
	Train Loss: 2.085 | Train Exp:   8.047
	 Val. Loss: 2.009 |  Val. Exp:   7.458
	 Learning Rate: 0.0004512
Translated example sentence: 
 the old woman is looking at the soccer game .

Epoch: 04 | Time: 0m 15s
	Train Loss: 1.786 | Train Exp:   5.965
	 Val. Loss: 1.860 |  Val. Exp:   6.425
	 Learning Rate: 0.0004287
Translated example sentence: 
 the old woman is looking at the soccer game .

Epoch: 05 | Time: 0m 15s
	Train Loss: 1.562 | Train Exp:   4.768
	 Val. Loss: 1.779 |  Val. Exp:   5.925
	 Learning Rate: 0.0004073
Translated exa