In [None]:
!pip install -U catalyst torch==1.6 torchtext==0.7.0 youtokentome nltk

In [None]:
import torch
from catalyst.utils import set_global_seed, get_device

# Fix the global random seed so that repeated runs of the notebook match.
set_global_seed(42)
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Seminar

Hi! Today we are going to learn a new tokenization algorithm, seq2seq metrics, and the machine translation task. We will also get acquainted with the attention mechanism.

## BPE. YouTokenToMe

Previously we discussed a text preprocessing pipeline. We used `WordPunctTokenizer`, which splits text into words and punctuation marks. But this tokenization algorithm isn't perfect. Some languages have many word forms, and many languages modify words with prefixes and suffixes. We want to preserve morphological information in the text, but storing every possible word form isn't memory-efficient and makes training harder. Instead, we can build a tokenization mechanism that splits every word into subword units. There is an unsupervised algorithm that does this, called Byte Pair Encoding (BPE). Here is how it works:

![](https://lena-voita.github.io/resources/lectures/seq2seq/bpe/build_merge_table.gif)

1. Split the texts into characters.
2. Count how often each pair of adjacent symbols occurs.
3. Merge the most frequent pair into a new symbol.
4. Repeat steps 2–3 until we reach the given vocabulary size.

It's a simple algorithm, and there are several implementations:
- SentencePiece
- fastBPE
- Tokenizers by 🤗
- YouTokenToMe

The fastest one is YouTokenToMe by the VK team. Let's see how it works:

In [None]:
from typing import List, Tuple

import youtokentome as yttm
from torchtext.utils import download_from_url, extract_archive
from torchtext.vocab import Vocab
from torchtext.experimental.datasets import WMT14

Download the WMT14 dataset. It contains parallel English–German texts, preprocessed by Google Brain.

In [None]:
# Google Drive link to the WMT14 archive used in this seminar.
wmt_url = "https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8"
# Download the archive into the local "wmt14" directory; the call returns the
# path of the downloaded file.
dataset_tar = download_from_url(wmt_url, root="wmt14")
# Unpack the archive. NOTE(review): `extracted` presumably holds the list of
# extracted file paths — confirm against the torchtext version in use.
extracted = extract_archive(dataset_tar)

We will use the `newstest2016` data as the training split in our pipeline. Now we need to train BPE tokenizers for English and German. We set the vocabulary size to 10000 tokens.

In [None]:
# Train the English BPE model on the English side of newstest2016 and store
# it in the file "en.tok".
train_data_en_path = "wmt14/newstest2016.en"
tokenizer_en_path = "en.tok"

# vocab_size is the total number of subword tokens the model may contain.
yttm.BPE.train(
    data=train_data_en_path, vocab_size=10000, model=tokenizer_en_path
)

In [None]:
# Train the German BPE model on the German side of newstest2016 and store
# it in the file "de.tok" (same settings as for the English tokenizer).
train_data_de_path = "wmt14/newstest2016.de"
tokenizer_de_path = "de.tok"

# vocab_size is the total number of subword tokens the model may contain.
yttm.BPE.train(
    data=train_data_de_path, vocab_size=10000, model=tokenizer_de_path
)

The `YTTM` training procedure writes the trained model to the given file. We need to load the tokenizers to work with them:

In [None]:
# Load the trained BPE models from the files written by the training cells.
tokenizer_en = yttm.BPE(model=tokenizer_en_path)
tokenizer_de = yttm.BPE(model=tokenizer_de_path)

Our text example will be:

In [None]:
test_text = "Tinkoff loves VK!"

Try to get tokens, ids, add special tokens:

In [None]:
tokenizer_en.encode([test_text], output_type=yttm.OutputType.SUBWORD)

In [None]:
tokenizer_en.encode([test_text], output_type=yttm.OutputType.ID)

In [None]:
tokenizer_en.encode(
    [test_text], output_type=yttm.OutputType.SUBWORD, bos=True, eos=True
)

In [None]:
tokenizer_en.encode(
    [test_text], output_type=yttm.OutputType.ID, bos=True, eos=True
)

To connect the `YTTM` tokenizers with the `TorchText` dataset abstraction, we need to write a couple of functions:

In [None]:
# Code them

def tokenize_de(text: str) -> List[str]:
    """Split a German sentence into BPE subword strings.

    The sentence is encoded with ``tokenizer_de`` with begin- and
    end-of-sequence tokens requested (``bos=True, eos=True``).
    """
    # encode() works on batches: wrap the sentence into a one-element list
    # and take the single result back out.
    encoded_batch = tokenizer_de.encode(
        [text],
        output_type=yttm.OutputType.SUBWORD,
        bos=True,
        eos=True,
    )
    return encoded_batch[0]


def tokenize_en(text: str) -> List[str]:
    """Split an English sentence into BPE subword strings.

    The sentence is encoded with ``tokenizer_en`` with begin- and
    end-of-sequence tokens requested (``bos=True, eos=True``).
    """
    # encode() works on batches: wrap the sentence into a one-element list
    # and take the single result back out.
    encoded_batch = tokenizer_en.encode(
        [text],
        output_type=yttm.OutputType.SUBWORD,
        bos=True,
        eos=True,
    )
    return encoded_batch[0]


In [None]:
# Build the train/valid/test datasets from the newstest files. The tokenizers
# are passed in (English, German) order, matching the (.en, .de) order of the
# file names, so element 0 of every example is English and element 1 German.
(train_dataset, valid_dataset, test_dataset) = WMT14(
    train_filenames=("newstest2016.en", "newstest2016.de"),
    valid_filenames=("newstest2010.en", "newstest2010.de"),
    test_filenames=("newstest2009.en", "newstest2009.de"),
    tokenizer=(tokenize_en, tokenize_de),
)

Check how `dataset` works:

In [None]:
train_dataset[0]

In [None]:
tokens = [train_dataset.get_vocab()[0].itos[i] for i in train_dataset[0][0]]
"".join(tokens)

In [None]:
tokens = [train_dataset.get_vocab()[1].itos[i] for i in train_dataset[0][1]]
"".join(tokens)

Let's write a helper function that decodes input ids into human-readable text:

In [None]:
# code function to decode input ids to pretty output

def decoding(input_ids: torch.Tensor, vocab: Vocab) -> str:
    """Turn a sequence of token ids into readable text.

    Ids are mapped to subword strings through ``vocab.itos``. ``<BOS>`` ids
    are skipped, decoding stops at the first ``<EOS>`` id, and every BPE
    word-boundary marker ``▁`` is rendered as a space.
    """
    pieces = []
    for token_id in input_ids:
        if token_id == vocab.stoi["<EOS>"]:
            # Everything after the end-of-sequence token is ignored.
            break
        if token_id == vocab.stoi["<BOS>"]:
            continue
        pieces.append(vocab.itos[token_id])
    return "".join(pieces).replace("▁", " ")

In [None]:
decoding(train_dataset[0][0], train_dataset.get_vocab()[0])

In [None]:
decoding(train_dataset[0][1], train_dataset.get_vocab()[1])

Next, we need to write the padding (collate) function:

In [None]:
# Ids used to pad source / target sequences, looked up under the "<PAD>" key
# of the source (index 0) and target (index 1) vocabularies.
# NOTE(review): the tokenizers above only request <BOS>/<EOS>, so confirm that
# "<PAD>" is really a key of these vocabularies (torchtext's default padding
# token is spelled "<pad>"); otherwise this lookup may silently resolve to the
# unknown-token id.
PAD_ID_src = train_dataset.get_vocab()[0].stoi["<PAD>"]
PAD_ID_trg = train_dataset.get_vocab()[1].stoi["<PAD>"]
# Maximum sequence length in tokens: collate_fn truncates longer sequences and
# Seq2Seq uses the same value as the decoding length limit.
max_length = 64 # 128

def collate_fn(batch: Tuple[torch.Tensor]) -> Tuple[torch.Tensor]:
    """Pad a list of (source, target) id tensors into two batch tensors.

    Both outputs are time-major ``(length, batch)`` long tensors. The length
    is that of the longest sequence in the batch, capped at ``max_length``;
    longer sequences are truncated and shorter ones are filled with
    ``PAD_ID_src`` / ``PAD_ID_trg``.
    """
    n_items = len(batch)
    src_rows = min(max(pair[0].size(0) for pair in batch), max_length)
    trg_rows = min(max(pair[1].size(0) for pair in batch), max_length)

    # Start from tensors that contain only padding, then copy each sequence
    # into its own column.
    padded_src = torch.full((src_rows, n_items), PAD_ID_src, dtype=torch.long)
    padded_trg = torch.full((trg_rows, n_items), PAD_ID_trg, dtype=torch.long)

    for column, (src, trg) in enumerate(batch):
        # The row slice is clamped by the tensor height, so an over-long
        # sequence lines up with its truncated right-hand side.
        padded_src[: src.size(0), column] = src[:max_length]
        padded_trg[: trg.size(0), column] = trg[:max_length]
    return padded_src, padded_trg

And now the bucketing sampler! It's a special sampler that reduces the amount of padding in batches. We need to sort our texts by their length in tokens and form batches following that order. We'll implement this with `SortedSampler` and `SubsetRandomSampler`:



In [None]:
from typing import Any, Callable, Iterable

from torch.utils.data import Dataset
from torch.utils.data.sampler import Sampler


class SortedSampler(Sampler):
    """Sampler that yields positions of ``data`` in ascending ``sort_key`` order.

    The order is computed once, in the constructor, by applying ``sort_key``
    to every item of ``data``; ties keep their original relative order.
    """

    def __init__(self, data: Dataset, sort_key: Callable[[Any], Any] = lambda x: x):
        super().__init__(data)
        self.data = data
        self.sort_key = sort_key
        keyed_positions = [
            (position, self.sort_key(item))
            for position, item in enumerate(self.data)
        ]
        # Sort on the key only, so the (stable) sort never compares positions.
        keyed_positions.sort(key=lambda pair: pair[1])
        self.sorted_indexes = [position for position, _ in keyed_positions]

    def __iter__(self) -> Iterable[int]:
        """Iterate over the precomputed sorted positions."""
        return iter(self.sorted_indexes)

    def __len__(self) -> int:
        """Number of items in the underlying data."""
        return len(self.data)

The `BucketBatchSampler` algorithm works as follows:

- Create buckets: large subsets of the data taken in random order.
- Sort the data inside each bucket.
- Generate batches by taking items from the sorted buckets.

In [None]:
import math
from typing import Generator, List

from torch.utils.data.sampler import BatchSampler
from torch.utils.data.sampler import SubsetRandomSampler


class BucketBatchSampler(BatchSampler):
    """Batch sampler that groups items with similar ``sort_key`` values.

    Indices drawn from ``sampler`` are first collected into large buckets of
    ``batch_size * bucket_size_multiplier`` items (at most ``len(sampler)``).
    Each bucket is sorted by ``sort_key`` and cut into batches, and the
    batches of a bucket are then emitted in random order.
    """

    def __init__(
        self,
        sampler: Sampler,
        batch_size: int,
        drop_last: bool,
        sort_key: Callable[[Any], Any] = lambda x: x,
        bucket_size_multiplier: int = 100
    ):
        super().__init__(sampler, batch_size, drop_last)
        self.sort_key = sort_key
        bucket_size = min(batch_size * bucket_size_multiplier, len(sampler))
        # Buckets never drop items; drop_last only applies to the final batches.
        self.bucket_sampler = BatchSampler(sampler, bucket_size, False)

    def __iter__(self) -> Generator[List[int], None, None]:
        """Yield lists of dataset indices, one list per batch."""
        for bucket in self.bucket_sampler:
            # Positions inside the bucket, ordered by the key of the item.
            by_key = SortedSampler(bucket, self.sort_key)
            sorted_batches = list(
                BatchSampler(by_key, self.batch_size, self.drop_last)
            )
            # Shuffle whole batches so their order carries no length signal.
            for positions in SubsetRandomSampler(sorted_batches):
                yield [bucket[position] for position in positions]

    def __len__(self):
        """Number of batches produced in one pass over the sampler."""
        n_items = len(self.sampler)
        if self.drop_last:
            return n_items // self.batch_size
        return math.ceil(n_items / self.batch_size)

And now we just need to create data loaders:

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data.sampler import RandomSampler


# Number of sentence pairs per batch, shared by all three loaders.
batch_size = 128


# Training indices are drawn in random order and then bucketed by length.
train_sampler = RandomSampler(train_dataset)
# The bucket sampler hands dataset indices to sort_key, so look the example
# up and use the length of its source (English) side as the key.
sort_key = lambda row: len(train_dataset[row][0])
train_batch_sampler = BucketBatchSampler(
    train_sampler, 
    batch_size=batch_size,
    drop_last=True,
    sort_key=sort_key
)

# batch_sampler already yields complete batches, so batch_size / shuffle /
# drop_last are not passed to the training DataLoader.
train_loader = DataLoader(
    train_dataset,
    batch_sampler=train_batch_sampler,
    collate_fn=collate_fn
)
# Validation and test data are read sequentially and nothing is dropped.
valid_loader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn
)

## BLEU

In this section we will discuss metrics for Seq2Seq models. There are several of them: BLEU, ROUGE, METEOR, WER, etc. They are used to measure how well a model solves a text generation task by comparing the generated text with the target text. Let's take a look at BLEU.

BLEU stands for "BiLingual Evaluation Understudy". To compute it, we need the n-grams of the predicted text (hypothesis) and of the target text (reference), and we compare them. Roughly speaking, BLEU is based on the fraction of n-grams from the predicted text that appear in the target text. Let's look at an example:

In [None]:
test_target = "Die Prager Börse stürzt gegen Geschäftsschluss ins Minus"
test_predicted = "Das Prager Börse stürzt gegest Geschäftschlus uns Minus"

In [None]:
# Whitespace splitting is the simplest possible way to get tokens.
target_tokens = test_target.split()

# Zipping the token list with shifted copies of itself yields every window of
# n consecutive tokens as a tuple (zip stops at the shortest sequence).
unigrams_target = list(zip(target_tokens))
bigrams_target = list(zip(target_tokens, target_tokens[1:]))
trigrams_target = list(
    zip(target_tokens, target_tokens[1:], target_tokens[2:])
)

In [None]:
# Tokenize the hypothesis the same way as the reference.
predicted_tokens = test_predicted.split()

# n-grams of the predicted text: windows of n consecutive tokens, obtained by
# zipping the token list with shifted copies of itself.
unigrams_predicted = list(zip(predicted_tokens))
bigrams_predicted = list(zip(predicted_tokens, predicted_tokens[1:]))
trigrams_predicted = list(
    zip(predicted_tokens, predicted_tokens[1:], predicted_tokens[2:])
)

Count the fraction of predicted n-grams that appear in the target text:

In [None]:
# n-gram precision: the share of predicted n-grams that also occur somewhere
# in the target. Every predicted n-gram counts on its own, i.e. repeated
# n-grams are not clipped by how often they occur in the target.
count_unigrams = sum(
    uni in unigrams_target for uni in unigrams_predicted
) / len(unigrams_predicted)

# Same statistic for bigrams and trigrams
count_bigrams = sum(
    bi in bigrams_target for bi in bigrams_predicted
) / len(bigrams_predicted)
count_trigrams = sum(
    tri in trigrams_target for tri in trigrams_predicted
) / len(trigrams_predicted)
print(f"Uni: {count_unigrams}\nBi: {count_bigrams}\nTri: {count_trigrams}")

In [None]:
# Toy score: the plain arithmetic mean of the three n-gram precisions. This is
# a simplification for illustration; the standard BLEU definition combines the
# precisions with a geometric mean and applies a brevity penalty.
bleu = (count_unigrams + count_bigrams + count_trigrams) / 3
print(f"Our BLEU: {bleu}")

We don't need to implement BLEU score from scratch. In `nltk` we have algorithms to calculate it:

In [None]:
from nltk.translate.bleu_score import corpus_bleu


def compute_bleu(predicted, target):
    """Compute corpus-level BLEU with one reference per hypothesis.

    Args:
        predicted: hypotheses, one per sentence. Each is either a plain
            string (split on whitespace here) or an already tokenized
            sequence of tokens.
        target: reference translations aligned with ``predicted``, in the
            same formats.

    Returns:
        The score returned by ``nltk``'s ``corpus_bleu``.
    """

    def _as_tokens(sentence):
        # corpus_bleu works on sequences of tokens. A raw string would be
        # iterated character by character, silently producing a
        # character-level score, so split it into words first.
        return sentence.split() if isinstance(sentence, str) else sentence

    hypotheses = [_as_tokens(hypothesis) for hypothesis in predicted]
    # corpus_bleu expects a *list* of references for every hypothesis.
    references = [[_as_tokens(reference)] for reference in target]
    return corpus_bleu(references, hypotheses)

In [None]:
compute_bleu([test_predicted], [test_target])

## Seq2Seq. Translation

Translation is one of the tasks where we need Seq2Seq models, which consist of an Encoder and a Decoder. The encoder should return an informative vector that represents the input text. The decoder should generate the translation based on that vector. We will use a Recurrent Neural Network with an additional component called attention.

### RNN + Attention 

There are two famous attention formulations in NLP. One of them is [by Luong](https://arxiv.org/pdf/1508.04025.pdf). Another one is [by Bahdanau](https://arxiv.org/pdf/1409.0473.pdf). We will implement an approximation of Luong attention, which can be depicted like this:

![](https://lena-voita.github.io/resources/lectures/seq2seq/attention/luong_model-min.png)

Let's code this.

In [None]:
import torch.nn as nn

class Encoder(nn.Module):
    """Source-side encoder: token embedding, dropout and a multi-layer LSTM.

    Args:
        vocab_size: number of rows of the embedding table.
        emb_size: size of a token embedding.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed
            to the LSTM.
    """

    def __init__(
        self,
        vocab_size: int,
        emb_size: int,
        hidden_size: int,
        num_layers: int,
        dropout: float,
    ):
        super().__init__()

        self.vocab_size, self.emb_size = vocab_size, emb_size
        self.hidden_size, self.num_layers = hidden_size, num_layers

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=emb_size
        )
        self.rnn = nn.LSTM(
            input_size=emb_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(
        self, src: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Encode a batch of token ids.

        Returns the per-step LSTM outputs together with the final hidden and
        cell states of the LSTM.
        """
        token_vectors = self.dropout(self.embedding(src))
        outputs, final_state = self.rnn(token_vectors)
        hidden, cell = final_state
        return outputs, hidden, cell


In [None]:
class Attention(nn.Module):
    """Dot-product attention between a decoder state and the encoder outputs.

    Instead of a single bilinear matrix, the encoder outputs and the decoder
    state are each passed through their own linear layer before the dot
    product is taken.
    """

    def __init__(self, hidden_size: int):
        super().__init__()
        self.hidden_size = hidden_size

        # Instead of one matrix we use two linear modules
        self.enc_linear = nn.Linear(hidden_size, hidden_size)
        self.dec_linear = nn.Linear(hidden_size, hidden_size)

    def forward(
        self, last_hidden: torch.Tensor, encoder_outputs: torch.Tensor
    ) -> torch.Tensor:
        """Compute attention weights over the source positions.

        Args:
            last_hidden: decoder state of shape ``(1, batch, hidden)``.
            encoder_outputs: encoder outputs of shape
                ``(seq_len, batch, hidden)``.

        Returns:
            Weights of shape ``(seq_len, batch, 1)`` that sum to one over the
            ``seq_len`` axis for every batch element.
        """
        # Axes must be *transposed*, not reshaped: reshaping a time-major
        # tensor to batch-major would mix values of different batch elements.
        # (seq_len, batch, hidden) -> (batch, seq_len, hidden)
        keys = self.enc_linear(encoder_outputs).transpose(0, 1)
        # (1, batch, hidden) -> (batch, hidden, 1)
        query = self.dec_linear(last_hidden).permute(1, 2, 0)

        # Batched dot products: (batch, seq_len, 1)
        logits = torch.bmm(keys, query)

        # Normalize over the source positions, then go back to time-major.
        attn = torch.softmax(logits, dim=1).transpose(0, 1)
        return attn
        

class DecoderAttn(nn.Module):
    """One-step LSTM decoder that injects an attention context into its state.

    Args:
        vocab_size: size of the output vocabulary.
        emb_size: size of a token embedding.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        attention: module that maps ``(last_hidden, encoder_output)`` to
            attention weights over the source positions.
        dropout: dropout probability, applied to the embeddings and passed
            to the LSTM.
    """

    def __init__(
        self,
        vocab_size: int,
        emb_size: int,
        hidden_size: int,
        num_layers: int,
        attention: Attention,
        dropout: float,
    ):
        super().__init__()

        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers

        self.attn = attention

        self.embedding = nn.Embedding(vocab_size, emb_size)

        self.rnn = nn.LSTM(
            emb_size, hidden_size, num_layers=num_layers, dropout=dropout
        )
        self.out = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        input_: torch.Tensor,
        hidden: torch.Tensor,  # hidden_state from t-1
        cell: torch.Tensor,
        encoder_output: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Run a single decoding step.

        The attention weights are computed from the top-layer hidden state,
        and the resulting weighted sum of encoder outputs replaces the
        top-layer cell state before the LSTM is applied.

        Returns:
            ``(prediction, hidden, cell)``: logits over the vocabulary for
            this step and the updated LSTM state.
        """
        embedded = self.embedding(input_)
        embedded = self.dropout(embedded)

        attn = self.attn(hidden[-1:], encoder_output)
        # Attention-weighted sum of the encoder outputs over source positions.
        context = (encoder_output * attn).sum(0)
        # Use the context as the cell state of the last layer. The new state
        # is built out of place so the caller's `cell` tensor (which is part
        # of the autograd graph) is never modified.
        cell = torch.cat([cell[:-1], context.unsqueeze(0)], dim=0)

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.out(output)

        return prediction, hidden, cell

One important point about training Seq2Seq models is feeding target tokens to the Decoder during training (teacher forcing). While the model is not good enough yet, it generates "trash" tokens that carry no useful information for the next generation steps. That's why we feed the decoder with the ground-truth tokens. However, this alone is not ideal either: at inference time the decoder has to continue from its own generated tokens. For this reason we train the Decoder with a randomly chosen source for the next input token (the target or its own prediction).

In [None]:
from random import random


# Id of the "<BOS>" token in the target vocabulary (index 1 of get_vocab());
# it is used as the very first decoder input.
BOS_IDX = train_dataset.get_vocab()[1].stoi["<BOS>"]

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper with teacher forcing and greedy decoding.

    All tensors are time-major: ``src`` and ``trg`` have shape
    ``(length, batch)``. Work tensors are created on the device of ``src``,
    so the module does not depend on a global device setting.
    """

    def __init__(self, encoder: Encoder, decoder: DecoderAttn):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        # Length limit used by `translate`.
        self.max_len = max_length

    @staticmethod
    def _bos_input(batch_size: int, device_: torch.device) -> torch.Tensor:
        """Build the first decoder input: one <BOS> id per batch element."""
        return torch.full(
            (1, batch_size), BOS_IDX, dtype=torch.long, device=device_
        )

    def forward(
        self,
        src: torch.Tensor,
        trg: torch.Tensor,
        teacher_forcing_ratio: float = 0.1,
    ) -> torch.Tensor:
        """Decode ``trg.shape[0] - 1`` steps and return the logits.

        Args:
            src: source token ids, ``(src_len, batch)``.
            trg: target token ids, ``(trg_len, batch)``.
            teacher_forcing_ratio: probability of feeding the ground-truth
                token (instead of the model's own prediction) at each step.

        Returns:
            Logits of shape ``(trg_len - 1, batch, vocab)``; step ``t`` is
            the prediction for ``trg[t + 1]``.
        """
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.vocab_size

        outputs = torch.zeros(
            max_len, batch_size, trg_vocab_size, device=src.device
        )

        enc_out, hidden, cell = self.encoder(src)

        input_ = self._bos_input(batch_size, src.device)

        for t in range(1, max_len):
            output, hidden, cell = self.decoder(input_, hidden, cell, enc_out)
            # The decoder output has a leading time axis of size one.
            outputs[t] = output[0]
            teacher_force = random() < teacher_forcing_ratio
            top1 = output.max(2)[1]
            input_ = (trg[t] if teacher_force else top1).reshape(1, -1)

        # Row 0 was never filled (it corresponds to <BOS>).
        return outputs[1:]

    @torch.no_grad()
    def translate(self, src: torch.Tensor) -> torch.Tensor:
        """Greedily decode ``self.max_len - 1`` tokens for every source.

        Gradients are disabled: the result is built from argmax indices and
        is not differentiable anyway.

        Returns:
            Float tensor of token ids with shape ``(max_len - 1, batch)``
            (kept as float for compatibility; callers cast it to long).
        """
        batch_size = src.shape[1]
        outputs = torch.zeros(self.max_len, batch_size, device=src.device)

        enc_out, hidden, cell = self.encoder(src)

        input_ = self._bos_input(batch_size, src.device)

        for t in range(1, self.max_len):
            output, hidden, cell = self.decoder(input_, hidden, cell, enc_out)
            top1 = output.max(2)[1].reshape(-1)
            outputs[t] = top1
            input_ = top1.reshape(1, -1)

        return outputs[1:]

Create a model, special runner for Seq2Seq models and train the model!

In [None]:
# get_vocab() returns the vocabularies in (source, target) order.
source_vocab, target_vocab = train_dataset.get_vocab()

# Model hyperparameters. The encoder and the decoder share hidden_size: the
# decoder writes an attention-weighted sum of encoder outputs into its own
# cell state, so the two sizes have to match.
input_size = len(source_vocab)
output_size = len(target_vocab)
src_emb_size = tgt_emb_size = 100
hidden_size = 300
num_layers =  2
dropout_p = 0.1

enc = Encoder(input_size, src_emb_size, hidden_size, num_layers, dropout_p)
attention = Attention(hidden_size)
dec = DecoderAttn(
    output_size, tgt_emb_size, hidden_size, num_layers, attention, dropout_p
)
model = Seq2Seq(enc, dec).to(device)

To train the model, we will compare the generated tokens with the target tokens for each source sentence. For this comparison we use `CrossEntropyLoss`!

In [None]:
from catalyst.dl import Runner


class Seq2SeqRunner(Runner):
    """Catalyst runner for the seq2seq model.

    Every batch is a ``(source, target)`` pair of time-major id tensors. On
    the validation loader the runner additionally stores decoded predictions
    and targets in ``self.batch`` so that text metrics can be computed.
    """

    def __init__(
        self, source_vocab: Vocab, target_vocab: Vocab, *args, **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab

    def predict_batch(self, batch) -> List[str]:
        """Translate the sources of a batch and return the decoded strings."""
        source, _ = batch
        predictions = self.model.translate(source).type(torch.LongTensor)
        # translate() is time-major; transpose to iterate over sentences.
        return [
            decoding(sentence, self.target_vocab)
            for sentence in predictions.t()
        ]

    def handle_batch(self, batch) -> None:
        """Run the model on a batch and fill ``self.batch`` for the callbacks.

        Stores ``source``, the flattened ``target`` (without its first
        token) and the matching flattened ``logits``; on the validation
        loader also ``predicted`` and ``target_decoded`` strings.
        """
        source, target = batch
        self.batch = {}

        if self.is_valid_loader:
            # Use this runner's own state rather than a global `runner`
            # variable, so the class works regardless of the instance name.
            target_decoded = [
                decoding(sentence, self.target_vocab)
                for sentence in target.t()
            ]
            predicted = self.predict_batch(batch)
            self.batch["predicted"] = predicted
            self.batch["target_decoded"] = target_decoded

        logits = self.model(source, target)
        # The model predicts tokens 1..T-1, so drop the first target token
        # and flatten both tensors to (steps * batch, ...) for the loss.
        target = target[1:].reshape(-1)
        logits = logits.reshape(target.size(0), -1)
        self.batch.update(
            **{"source": source, "target": target, "logits": logits}
        )

To calculate the BLEU score in the training loop, we need to write a Callback for it.

In [None]:
import numpy as np

from catalyst.dl import Callback, CallbackOrder


class BLEUCallback(Callback):
    """Metric callback that adds a ``bleu`` batch metric on the valid loader.

    It reads the ``predicted`` and ``target_decoded`` entries of
    ``runner.batch`` and scores them with ``compute_bleu``.
    """

    def __init__(self):
        super().__init__(CallbackOrder.Metric)

    def on_batch_end(self, runner: Runner) -> None:
        """Compute BLEU for the finished batch (validation loader only)."""
        if not runner.is_valid_loader:
            return
        score = compute_bleu(
            runner.batch["predicted"], runner.batch["target_decoded"]
        )
        runner.batch_metrics.update(bleu=score)

In [None]:
from catalyst.contrib.nn import RAdam
from torch.nn.utils import clip_grad_norm_
from catalyst.dl import CriterionCallback, OptimizerCallback


# Learning rate for the optimizer.
lr = 1e-2

optimizer = RAdam(model.parameters(), lr=lr)
# Positions whose target id equals PAD_ID_trg do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID_trg)
callbacks = [
    # "logits" and "target" are the keys Seq2SeqRunner.handle_batch stores in
    # runner.batch; the result is logged under the "loss" metric.
    CriterionCallback("logits", "target", "loss"),
    # Optimize the "loss" metric, clipping the gradient norm to 1.
    OptimizerCallback(
        "loss", grad_clip_fn=clip_grad_norm_, grad_clip_params={"max_norm": 1}
    ),
    BLEUCallback(),
]
loaders = {"train": train_loader, "valid": valid_loader}


runner = Seq2SeqRunner(source_vocab=source_vocab, target_vocab=target_vocab)

In [None]:
from datetime import datetime
from pathlib import Path


logdir = Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S")

In [None]:
# Train for 5 epochs on the "train" loader and evaluate on the "valid" loader;
# logs are written to the timestamped `logdir` created above.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=loaders,
    callbacks=callbacks,
    num_epochs=5,
    verbose=True,
    logdir=logdir,
)

Our model was trained on a small dataset, so we should not expect it to be a good translator. Anyway, let's test the code and the model.

In [None]:
test = "A cat eats a fish"
# transforms[0] is applied to raw source text and yields its token ids.
# NOTE(review): presumably the source-side tokenize -> vocab -> tensor
# pipeline of the dataset — confirm against the torchtext version in use.
test_input_ids = train_dataset.transforms[0](test)
# The model is time-major: make the sentence a (length, 1) batch of one.
test_input_ids = test_input_ids.reshape(-1, 1).to(device)

# Inference: switch off dropout and do not track gradients.
model.eval()
with torch.no_grad():
    prediction = model.translate(test_input_ids)
prediction = prediction.to("cpu").type(torch.LongTensor)
# Drop the batch axis so that decoding iterates over scalar token ids.
decoding(prediction[:, 0], target_vocab)