# 8.3 Exercise

In the first part of this practical, we developed an RNN-based machine translation algorithm, while in the second part, we tackled the sentiment analysis task using Transformers.

Let's now combine these two practicals and develop a Transformer-based machine translation method.

Remember to use the PyTorch implementation of [Transformers](https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html).
You do not need to change anything else in this notebook, except implementing the neural network.

However, please note that there are several small changes in this notebook compared to the first one. Although there are some comments identifying these in the code, it might be a good idea to study and understand the code again.


In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Dataset

Reading the dataset. Note that now we have `pad` and `unk` tokens.

In [None]:
# Reserved vocabulary indices for the special tokens (same for both languages).
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # SOS = start of sentence
EOS_token = 2  # EOS = end of sentence
UNK_token = 3  # UNK = unknown token

class Lang:
    """Vocabulary of one language.

    Keeps three tables: word -> index, index -> word and word -> number of
    occurrences. Indices 0-3 are pre-assigned to the special tokens PAD,
    SOS, EOS and UNK, so the first real word receives index 4.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # The four special tokens are registered up front.
        self.index2word = {0: "PAD", 1: "SOS", 2: "EOS", 3: "UNK"}
        # Next free index; starts after the special tokens.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every space-separated word of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

In [None]:
def unicodeToAscii(s):
    """Return ``s`` with combining marks (Unicode category 'Mn') removed.

    The string is first decomposed with NFD so that accented letters become
    a base letter followed by a combining mark, which is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """Lowercase, trim and simplify ``s`` into space-separated ASCII tokens.

    Steps: lowercase and strip, remove accents, put a space in front of
    '.', '!' and '?', then collapse every run of characters other than
    ASCII letters, '!' and '?' into a single space (this also drops '.').
    """
    cleaned = s.lower().strip()
    cleaned = unicodeToAscii(cleaned)
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    cleaned = re.sub(r"[^a-zA-Z!?]+", r" ", cleaned)
    return cleaned.strip()

In [None]:
def readLangs(lang1, lang2, reverse=False):
    """Read the tab-separated sentence pairs for ``lang1``/``lang2``.

    The data is loaded from ``lab8-<lang1>-<lang2>.txt`` (UTF-8), one pair
    per line, and every sentence is normalized with ``normalizeString``.

    Args:
        lang1: Name of the language in the first column of the file.
        lang2: Name of the language in the second column of the file.
        reverse: When True, each pair is reversed and ``lang2`` becomes the
            input language.

    Returns:
        ``(input_lang, output_lang, pairs)`` where the two ``Lang`` objects
        are still empty and ``pairs`` is a list of sentence lists.
    """
    print("Reading lines...")

    # Read the file and split into lines; the context manager guarantees the
    # file handle is closed even if reading fails.
    with open('lab8-%s-%s.txt' % (lang1, lang2), encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [None]:
# Sentences must contain fewer than MAX_LENGTH space-separated words
# to be kept by filterPair.
MAX_LENGTH = 10

# Only pairs whose second sentence starts with one of these prefixes are kept.
# The contracted forms ("i m ", "he s ", ...) are written with a space because
# normalizeString replaces apostrophes with spaces.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    """Return True when pair ``p`` is short enough and has an allowed prefix.

    Both sentences must have fewer than ``MAX_LENGTH`` words and the second
    one must start with one of ``eng_prefixes``.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return the list of pairs accepted by ``filterPair``, in order."""
    return list(filter(filterPair, pairs))

In [None]:
def prepareData(lang1, lang2, reverse=False):
    """Load, filter and index the sentence pairs for two languages.

    Reads the raw pairs with ``readLangs``, keeps those accepted by
    ``filterPairs`` and feeds every remaining sentence into the matching
    vocabulary. Progress is reported on stdout.

    Returns:
        ``(input_lang, output_lang, pairs)`` with populated vocabularies and
        the filtered pairs.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))

    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...")
    for entry in pairs:
        input_lang.addSentence(entry[0])
        output_lang.addSentence(entry[1])

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)

    return input_lang, output_lang, pairs

# Build the vocabularies and the filtered pairs. reverse=True swaps each pair,
# so 'fra' becomes the input language and 'eng' the output language.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
# Show one random pair as a sanity check.
print(random.choice(pairs))

In [None]:
def indexesFromSentence(lang, sentence):
    """Convert a space-separated sentence into a list of vocabulary indices.

    Words missing from ``lang.word2index`` are mapped to ``UNK_token``
    instead of raising ``KeyError``, so sentences containing unseen words
    can still be encoded.

    Args:
        lang: Object exposing a ``word2index`` mapping (e.g. a ``Lang``).
        sentence: Normalized sentence whose words are separated by spaces.

    Returns:
        A new list with one integer index per word.
    """
    vocab = lang.word2index
    return [vocab[word] if word in vocab else UNK_token
            for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence, target=False):
    """Encode ``sentence`` as a long tensor of shape (1, length) on ``device``.

    With ``target=False`` an EOS token is appended to the word indices;
    otherwise an SOS token is placed in front of them.
    """
    word_ids = indexesFromSentence(lang, sentence)
    if target is False:
        word_ids = word_ids + [EOS_token]
    else:
        word_ids = [SOS_token] + word_ids
    encoded = torch.tensor(word_ids, dtype=torch.long, device=device)
    return encoded.view(1, -1)

def tensorsFromPair(pair):
    """Encode one sentence pair as ``(input_tensor, target_tensor)``.

    The first sentence is encoded with the input vocabulary, the second with
    the output vocabulary in target mode.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (
        tensorFromSentence(input_lang, source_sentence),
        tensorFromSentence(output_lang, target_sentence, target=True),
    )

def collate_fn(batch):
    """Stack a list of (input, target) samples into padded batch tensors.

    Returns:
        ``(input_text, input_padding_mask, output_text, output_padding_mask)``.
        Both masks are boolean, with True marking PAD positions.
    """
    sources, targets = zip(*batch)

    # Pad every sequence to the longest one in the batch (batch dimension first).
    padded_sources = pad_sequence(sources, batch_first=True, padding_value=PAD_token)
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=PAD_token)

    # True where the token is padding, False elsewhere.
    source_is_pad = padded_sources == PAD_token
    target_is_pad = padded_targets == PAD_token

    return padded_sources, source_is_pad, padded_targets, target_is_pad

def get_dataloader(batch_size):
    """Build the vocabularies and a randomly sampled training DataLoader.

    Every sentence is encoded into a zero-padded row of ``MAX_LENGTH + 1``
    indices (0 is ``PAD_token``): inputs as ``words + EOS`` and targets as
    ``SOS + words + EOS``.

    Args:
        batch_size: Number of sentence pairs per batch.

    Returns:
        ``(input_lang, output_lang, train_dataloader)``.
    """
    input_lang, output_lang, pairs = prepareData('eng', 'fra', True)

    row_width = MAX_LENGTH + 1
    n_pairs = len(pairs)
    input_ids = np.zeros((n_pairs, row_width), dtype=np.int32)
    target_ids = np.zeros((n_pairs, row_width), dtype=np.int32)

    for row, (src_sentence, tgt_sentence) in enumerate(pairs):
        # Input sentence: words followed by EOS.
        src_ids = indexesFromSentence(input_lang, src_sentence) + [EOS_token]
        # Target sentence: SOS, words, then EOS.
        tgt_ids = [SOS_token] + indexesFromSentence(output_lang, tgt_sentence) + [EOS_token]
        input_ids[row, :len(src_ids)] = src_ids
        target_ids[row, :len(tgt_ids)] = tgt_ids

    input_data = torch.LongTensor(input_ids).to(device)
    target_data = torch.LongTensor(target_ids).to(device)
    train_data = TensorDataset(input_data, target_data)

    train_dataloader = DataLoader(
        train_data,
        sampler=RandomSampler(train_data),
        batch_size=batch_size,
        collate_fn=collate_fn,
    )
    return input_lang, output_lang, train_dataloader

In [None]:
# Number of sentence pairs per training batch.
batch_size = 64

# Rebuild the vocabularies and create the training DataLoader.
input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

## Model

Implement your model here!

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder translation model built around ``torch.nn.Transformer``.

    Source and target tokens are embedded with learned word embeddings plus
    learned position embeddings, passed through ``nn.Transformer`` (batch
    first) and projected onto the target vocabulary.

    Args:
        embedding_size: Model dimension shared by embeddings and transformer.
        src_vocab_size: Number of entries in the source vocabulary.
        trg_vocab_size: Number of entries in the target vocabulary.
        src_pad_idx: Index of the padding token in the source vocabulary.
        num_heads: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Hidden size of the feed-forward sub-layers.
        dropout: Dropout probability (embeddings and transformer).
        max_len: Maximum sequence length supported by the position embeddings.
    """

    def __init__(
        self,
        embedding_size=512,
        src_vocab_size=input_lang.n_words,
        trg_vocab_size=output_lang.n_words,
        src_pad_idx=PAD_token,
        num_heads=8,
        num_encoder_layers=4,
        num_decoder_layers=4,
        dim_feedforward=2048,
        dropout=0.1,
        max_len=MAX_LENGTH+1
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)  # word emb
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)  # pos emb

        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)  # word emb
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)  # pos emb

        # batch_first=True because every tensor in this notebook is laid out
        # as (batch, sequence, ...).
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )

        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def forward(self, src, src_pad_mask, trg, trg_pad_mask, train=True):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: Source token indices, shape (N, src_seq_length).
            src_pad_mask: Boolean mask, True at source PAD positions, or None.
            trg: Target (decoder input) token indices, shape (N, trg_seq_length).
            trg_pad_mask: Boolean mask, True at target PAD positions, or None.
            train: When True, a causal mask stops the decoder from attending
                to later target positions.

        Returns:
            Logits of shape (N, trg_seq_length, trg_vocab_size).
        """
        N, src_seq_length = src.shape
        N, trg_seq_length = trg.shape
        # Build helper tensors on the same device as the inputs.
        dev = src.device

        # Position indices 0..L-1 repeated for each batch element: shape (N, L).
        src_positions = (
            torch.arange(src_seq_length, device=dev)
            .unsqueeze(0)
            .expand(N, src_seq_length)
        )
        trg_positions = (
            torch.arange(trg_seq_length, device=dev)
            .unsqueeze(0)
            .expand(N, trg_seq_length)
        )

        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        )

        if train is True:
            # Boolean causal mask (True = not allowed to attend) so the decoder
            # does not "look into the future". A boolean mask matches the
            # dtype of the boolean padding masks.
            trg_mask = torch.triu(
                torch.ones(trg_seq_length, trg_seq_length, device=dev),
                diagonal=1,
            ).bool()
        else:
            # No attention mask is applied during inference.
            # NOTE(review): earlier target positions can then attend to later
            # ones, unlike in training - confirm this is intended.
            trg_mask = None

        out = self.transformer(
            embed_src,
            embed_trg,
            tgt_mask=trg_mask,
            src_key_padding_mask=src_pad_mask,
            tgt_key_padding_mask=trg_pad_mask,
            # The encoder memory has the same padding layout as the source.
            memory_key_padding_mask=src_pad_mask,
        )

        out = self.fc_out(out)
        return out

In [None]:
# Instantiate the model with its default hyper-parameters and move it to `device`.
model = Transformer().to(device)

## Training

In [None]:
import time
import math
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    """Plot ``points`` as a line chart with y-axis ticks every 0.2.

    Args:
        points: Sequence of values to plot (here: averaged losses).
    """
    # plt.subplots() already creates the figure; calling plt.figure() first
    # would leave an additional empty figure behind.
    _, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)

def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """Return elapsed and estimated remaining time as ``'<elapsed> (- <remaining>)'``.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work completed so far (must be non-zero).
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

Note that here, we shift the target sequence to the right to enable teacher forcing.

Suppose the translated target sentence during training is:

`['SOS', 'I', 'love', 'artificial', 'intelligence', 'EOS']`

In this case, the input to the decoder should be:

`['SOS', 'I', 'love', 'artificial', 'intelligence']` (no `EOS`)

While the expected output is:

`['I', 'love', 'artificial', 'intelligence', 'EOS']` (no `SOS`)

Therefore, when the decoder sees the first word `SOS`, it should predict `I`;
when it sees the word `I`, it should predict `love`, and so on.

In [None]:
def train_epoch(dataloader, out_voc_size, model, optimizer, criterion):
    """Run one pass over ``dataloader`` and return the mean batch loss.

    Teacher forcing: the decoder receives the target with its EOS token
    removed, and is trained to predict the target shifted left by one
    position (i.e. without the leading SOS).

    Args:
        dataloader: Yields ``(input, input_pad_mask, target, target_pad_mask)``.
        out_voc_size: Size of the output vocabulary (last dimension of the logits).
        model: Model called as ``model(input, input_pad_mask, trg, trg_pad_mask)``.
        optimizer: Optimizer updating the model parameters.
        criterion: Loss taking ``(logits, expected_indices)``.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    running_loss = 0
    for input_tensor, input_pad_mask, target_tensor, target_pad_mask in dataloader:
        # Keep every position except the EOS token; reshaping back to one row
        # per sentence relies on each row holding the same number of EOS tokens.
        keep = ~(target_tensor == EOS_token)
        decoder_input = target_tensor[keep].view(target_tensor.shape[0], -1)
        decoder_pad_mask = target_pad_mask[keep].view(target_pad_mask.shape[0], -1)

        # Expected outputs: the target shifted by one (drops the first token).
        expected = target_tensor[:, 1:].reshape(-1)

        optimizer.zero_grad()
        outputs = model(input_tensor, input_pad_mask, decoder_input, decoder_pad_mask)

        loss = criterion(outputs.reshape(-1, out_voc_size), expected)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

In [None]:
def train(train_dataloader, out_voc_size, model, n_epochs, learning_rate=0.0001,
               print_every=100, plot_every=100):
    """Train ``model`` for ``n_epochs`` epochs and plot the averaged losses.

    Uses Adam and a cross-entropy loss that ignores ``PAD_token`` targets.

    Args:
        train_dataloader: DataLoader passed to ``train_epoch``.
        out_voc_size: Size of the output vocabulary.
        model: Model to optimize.
        n_epochs: Number of epochs to run.
        learning_rate: Adam learning rate. The value is now passed to the
            optimizer; its default equals the rate that was previously
            hard-coded, so calls without this argument train as before.
        print_every: Print the average loss every this many epochs.
        plot_every: Record a point for the loss plot every this many epochs.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    criterion = nn.CrossEntropyLoss(ignore_index=PAD_token)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, out_voc_size, model, optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
# Train for 100 epochs, printing and recording the average loss every 5 epochs.
train(train_dataloader, output_lang.n_words, model, 100, print_every=5, plot_every=5)

## Evaluate

In [None]:
def evaluate(model, sentence, input_lang, output_lang):
    """Greedily translate ``sentence`` with ``model``.

    Decoding starts from SOS and repeatedly appends the most likely next
    token until EOS is predicted or ``MAX_LENGTH`` tokens were generated.

    Args:
        model: Translation model; it is switched to eval mode.
        sentence: Normalized source sentence (space-separated words).
        input_lang: Source vocabulary used to encode ``sentence``.
        output_lang: Target vocabulary used to decode the prediction.

    Returns:
        List of predicted target words, without the SOS and EOS tokens.
    """
    model.eval()
    # tensorFromSentence already appends EOS to a source sentence, which
    # matches the training inputs (words + EOS). Adding a second EOS here
    # would feed the encoder a sequence it never saw during training.
    input_tensor = tensorFromSentence(input_lang, sentence).to(device)

    outputs = [SOS_token]  # the output/translated sentence only starts with SOS
    with torch.no_grad():
        for _ in range(MAX_LENGTH):
            trg_tensor = torch.tensor(outputs, dtype=torch.long, device=device).unsqueeze(0)

            output = model(input_tensor, None, trg_tensor, None, train=False)

            # Most likely token at the last decoded position.
            best_guess = output.argmax(2)[0, -1].item()
            if best_guess == EOS_token:
                break
            outputs.append(best_guess)

    # Skip the leading SOS token when mapping indices back to words.
    return [output_lang.index2word[idx] for idx in outputs[1:]]

In [None]:
def evaluateRandomly(model, n=10):
    """Translate ``n`` randomly chosen pairs and print the results.

    For each pair, prints the source ('>'), the reference translation ('=')
    and the model output ('<'), followed by an empty line.
    """
    for _ in range(n):
        pair = random.choice(pairs)
        source, reference = pair[0], pair[1]
        print('>', source)
        print('=', reference)
        predicted_words = evaluate(model, source, input_lang, output_lang)
        print('<', ' '.join(predicted_words))
        print('')

In [None]:
# Print translations for 10 randomly chosen sentence pairs.
evaluateRandomly(model)