In [None]:
!pip install -qU datasets transformers catalyst==20.12 sacrebleu

In [None]:
from catalyst.utils import set_global_seed, get_device


# Fix the global random seed to 42 and obtain a device handle through the
# Catalyst helpers imported above.
# NOTE(review): which RNGs get seeded and how the device is selected is decided
# inside catalyst.utils — confirm against the Catalyst documentation.
set_global_seed(42)
device = get_device()

# Homework

Hi! Today we are going to create a Neural Machine Translation system! It will read English sentences and write their translations into German. This is a Seq2Seq task, so we need an Encoder model and a Decoder model. You can use any architecture and any pretrained models, but there are several rules:

- Do not copy your classmates' work.
- If you use pretrained weights, check that these weights were not obtained by training on a Machine Translation task.
- Your model's BLEU score must be higher than `0.15` points in the testing pipeline.

To get 10 points (full score), your model has to reach `0.20` BLEU points on the test dataset: the test part of WMT14.


!! **WARNING** !!

You need several **HOURS** to train models for translation task!

!! **WARNING** !!


Below is a basic model that you can use as a starting point for your solution.

In [None]:
# You can use Google Drive to save the data and the best models.
# This only works in Colab.

# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
from datasets import load_dataset, load_metric

These are the basic datasets: WMT14. For the training process, you can add any additional samples. But you must leave `wmt14_test` and the `sacrebleu` metric unchanged.

In [None]:
# Cache directory handed to every `load_dataset` call below; `None` is passed
# through unchanged.
cache_dir = None

# Training data: the split expression "train[:20%]" requests only a slice of
# the WMT14 de-en train split.
wmt14_train = load_dataset(
    "wmt14", "de-en", split="train[:20%]", cache_dir=cache_dir
)
# Validation data: the full "validation" split of the same dataset/config.
wmt14_valid = load_dataset(
    "wmt14", "de-en", split="validation", cache_dir=cache_dir
)

# DO NOT TOUCH!
# The "test" split is the fixed evaluation set used by the testing pipeline.
wmt14_test = load_dataset("wmt14", "de-en", split="test", cache_dir=cache_dir)

In [None]:
# Load the `sacrebleu` metric object. The same object is reused further down,
# both in the small API demo and in the final testing pipeline.
bleu = load_metric('sacrebleu')
print(bleu)

In this baseline, we encourage you to use pretrained models. The easiest way to get them is the HuggingFace🤗 library. The next code blocks contain the model preparation.

In [None]:
from transformers import (
    AutoTokenizer,
    EncoderDecoderModel,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    default_data_collator
)

Even though our target language is German, we will use a model trained on English texts. Such models are easier to find and to add to the pipeline. And German and English are quite similar languages, too.

In [None]:
# Checkpoint names for the encoder (source) and decoder (target) sides.
# Both currently point at the same checkpoint; they are kept as separate names
# so that either side can be swapped independently.
source_model = "google/bert_uncased_L-6_H-128_A-2"
target_model = "google/bert_uncased_L-6_H-128_A-2"

In [None]:
# One tokenizer per side, each loaded from the matching checkpoint name.
tokenizer_source = AutoTokenizer.from_pretrained(source_model)
tokenizer_target = AutoTokenizer.from_pretrained(target_model)

In [None]:
import typing as tp

from catalyst.utils import get_loader


# Padding collator built from the source tokenizer.
collate_fn_source = DataCollatorWithPadding(tokenizer_source)
# Language-modeling collator built from the target tokenizer, with masked-LM
# turned off (mlm=False).
collate_fn_target = DataCollatorForLanguageModeling(
    tokenizer_target, mlm=False
)

def collate_fn(
        batch: tp.Sequence[tp.Dict[str, tp.Any]]
) -> tp.Tuple[tp.Dict[str, tp.Any], tp.Dict[str, tp.Any]]:
    """Collate tokenized samples into a ``(source, target)`` pair of batches.

    Args:
        batch: samples, each a dict with a ``"source"`` and a ``"target"``
            entry holding the tokenizer output for one sentence.

    Returns:
        Tuple ``(batch_source, batch_target)``. The source entries are
        collated with ``collate_fn_source`` and the target entries with
        ``collate_fn_target``, so each side is padded by the collator that
        was built from its own tokenizer.
    """
    batch_source = collate_fn_source([sample["source"] for sample in batch])
    # The target side must be collated with the *target* collator: padding it
    # with the source tokenizer is only correct while both tokenizers happen
    # to be identical.
    # NOTE(review): with mlm=False this collator is documented to also add a
    # "labels" entry (inputs with padding set to -100) next to the padded
    # "input_ids"/"attention_mask" — confirm for the installed transformers
    # version. Consumers that only read "input_ids"/"attention_mask" are
    # unaffected by the extra key.
    batch_target = collate_fn_target([sample["target"] for sample in batch])
    return batch_source, batch_target

In [None]:
# Maximum number of tokens kept per sentence (also reused as the generation
# length limit elsewhere in the notebook).
max_length = 64

def text_data_transforms(
        row: tp.Dict[str, tp.Any]
) -> tp.Dict[str, tp.Dict[str, tp.Any]]:
    """Tokenize one dataset row into a source and a target encoding.

    Args:
        row: dataset row whose ``"translation"`` entry maps the language
            codes ``"en"`` and ``"de"`` to the sentence in that language.

    Returns:
        Dict with two entries: ``"source"`` (English sentence encoded with
        ``tokenizer_source``, truncated and padded to ``max_length``) and
        ``"target"`` (German sentence encoded with ``tokenizer_target``,
        truncated to ``max_length`` but not padded here).
    """
    pair = row["translation"]
    # Both sides share the same truncation settings; only the source side is
    # additionally padded to the full length at this stage.
    length_limits = {"max_length": max_length, "truncation": True}
    encoded_source = tokenizer_source.encode_plus(
        pair["en"], padding="max_length", **length_limits
    )
    encoded_target = tokenizer_target.encode_plus(pair["de"], **length_limits)
    return {"source": encoded_source, "target": encoded_target}

In [None]:
# Training loader: rows are handed over unchanged (identity `open_fn`),
# tokenized by `text_data_transforms` and batched by `collate_fn`.
# Shuffling is on and the trailing incomplete batch is dropped.
train_dataloader = get_loader(
    wmt14_train,
    open_fn=lambda sample: sample,
    dict_transform=text_data_transforms,
    collate_fn=collate_fn,
    batch_size=128,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

# Validation loader: same preprocessing as the training loader, but evaluation
# must see every sample. Shuffling is therefore off and the trailing
# incomplete batch is kept (drop_last=True would silently skip the last
# samples whenever the dataset size is not a multiple of the batch size).
valid_dataloader = get_loader(
    wmt14_valid,
    open_fn=lambda x: x,
    dict_transform=text_data_transforms,
    batch_size=128,
    num_workers=4,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
)

# Test loader: the reported BLEU must cover the whole test split, so nothing
# is shuffled and the trailing incomplete batch is kept (drop_last=True would
# exclude the last samples from the score whenever the dataset size is not a
# multiple of the batch size).
test_dataloader = get_loader(
    wmt14_test,
    open_fn=lambda x: x,
    dict_transform=text_data_transforms,
    batch_size=128,
    num_workers=4,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
)

Two `BERT` models are used to create one Seq2Seq model. HuggingFace gives us an elegant way to do so. More explanation can be found in the [docs](https://huggingface.co/transformers/model_doc/encoderdecoder.html).

In [None]:
# Build a single encoder-decoder model from two pretrained checkpoints:
# `source_model` initializes the encoder and `target_model` the decoder.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    source_model, target_model
)

In [None]:
# from catalyst.utils import set_requires_grad

# You can freeze some parameters in models to speed up training loops.

# set_requires_grad(model.encoder.embeddings, False)
# set_requires_grad(model.decoder.bert.embeddings, False)

# set_requires_grad(model.encoder.encoder.layer[3:], False)
# set_requires_grad(model.decoder.bert.encoder.layer[3:], False)

Next come some examples of the model API, given without comments:

In [None]:
# Pull one collated batch from the training loader for the API demo below;
# batch[0] is the source side and batch[1] the target side.
batch = next(iter(train_dataloader))

In [None]:
# Demo forward pass: the source batch feeds the encoder, the target batch
# feeds the decoder, and the same target ids are supplied as `labels` so the
# model also returns a loss. `return_dict=True` makes the result addressable
# by key (see `outputs["logits"]` below).
outputs = model(
    input_ids=batch[0]["input_ids"], 
    attention_mask=batch[0]["attention_mask"],
    decoder_input_ids=batch[1]["input_ids"],
    decoder_attention_mask=batch[1]["attention_mask"],
    labels=batch[1]["input_ids"],
    return_dict=True,
)

In [None]:
# Show the returned model output object in the notebook.
outputs

In [None]:
# Per-token prediction scores of the decoder, taken from the output by key.
logits = outputs["logits"]

In [None]:
# Decode the first sample of the demo batch: the reference straight from the
# target token ids, the hypothesis from the model scores.
decoded_reference = tokenizer_target.decode(batch[1]["input_ids"][0])
# `logits[0]` holds one score vector per sequence position (documented shape:
# sequence_length x vocab_size). The token id at each position is the argmax
# over the *vocabulary* (last) axis; reducing over axis 0 would instead return
# sequence positions, which are not token ids.
decoded_hypothesis = tokenizer_target.decode(logits[0].argmax(dim=-1))

Our main metric is BLEU. To measure your performance, we will use SacreBLEU as the common implementation.
It is very slow, which is why we haven't added it to the training loop. Its API works like this:

In [None]:
# Register one hypothesis together with its references. `predictions` is a
# list of strings; `references` is a list holding, per prediction, a list of
# reference strings (here exactly one).
bleu.add_batch(
    predictions=[decoded_hypothesis], references=[[decoded_reference]],
)

In [None]:
# Compute the metric over everything added so far; the testing pipeline reads
# the "score" entry of this result.
bleu.compute()

A typical Catalyst routine is located here:

In [None]:
import torch.nn as nn
from catalyst.dl import SchedulerCallback
from catalyst.contrib.nn import RAdam

from transformers import get_linear_schedule_with_warmup


# Optimizer over all model parameters with a learning rate of 1e-3.
optimizer = RAdam(model.parameters(), 1e-3)
# Linear schedule with 100 warmup steps. The total step count is
# "epochs * batches per epoch" with the epoch count hard-coded to 1 —
# presumably this has to be kept in sync with `num_epochs` in `runner.train`.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=1 * len(train_dataloader),
)
# Passed to `runner.train` below; note that `TranslationRunner` takes the loss
# from the model output rather than calling this criterion itself.
criterion = nn.CrossEntropyLoss()
# Scheduler callback configured with mode="batch".
callbacks = [SchedulerCallback(mode="batch")]

In [None]:
import typing as tp

import torch

from catalyst.dl import Runner


class TranslationRunner(Runner):
    """Catalyst runner for the encoder-decoder translation model.

    Every batch is a ``(source, target)`` pair of tokenizer outputs. Both
    methods call the module-level ``model`` object; ``predict_batch``
    additionally relies on the module-level ``tokenizer_target`` and
    ``max_length``.
    """

    # This function will be used to test your models. Don't forget about it!
    def predict_batch(
        self,
        batch: tp.Tuple[
            tp.Dict[str, torch.Tensor], tp.Dict[str, torch.Tensor]
        ],
    ) -> tp.Dict[str, torch.Tensor]:
        """Generate translations for one batch.

        Args:
            batch: ``(source, target)`` pair; the source ``"input_ids"`` and
                ``"attention_mask"`` are moved to the runner device here.

        Returns:
            Dict with ``"output"`` (generated token ids, not logits) and
            ``"target"`` (ground-truth token ids taken from the batch as-is).
        """
        source = batch[0]
        output = model.generate(
            input_ids=source["input_ids"].to(self.device),
            # The source is padded, so the padding mask has to be passed
            # explicitly; otherwise the encoder may attend to padding tokens
            # at inference time although they were masked during training.
            attention_mask=source["attention_mask"].to(self.device),
            decoder_start_token_id=tokenizer_target.cls_token_id,
            max_length=max_length,
        )
        # output: not the logits, but token ids already chosen by the model
        # target: ground truth
        return {"output": output, "target": batch[1]["input_ids"]}

    def _handle_batch(
        self,
        batch: tp.Tuple[
            tp.Dict[str, torch.Tensor], tp.Dict[str, torch.Tensor]
        ],
    ) -> None:
        """Run one forward pass and, on the train loader, one optimizer step.

        Args:
            batch: ``(source, target)`` pair of padded token batches with
                ``"input_ids"`` and ``"attention_mask"`` entries.

        Side effects:
            Sets ``self.input`` and ``self.output`` (the logits) and stores
            the loss in ``self.batch_metrics`` under ``"loss"``.
        """
        source = batch[0]
        target = batch[1]

        # Padding positions must not contribute to the loss: replace their
        # label by -100, the index the loss is documented to ignore. The
        # decoder inputs keep the real padding ids.
        labels = target["input_ids"].masked_fill(
            target["attention_mask"] == 0, -100
        )

        output = model(
            input_ids=source["input_ids"],
            attention_mask=source["attention_mask"],
            decoder_input_ids=target["input_ids"],
            decoder_attention_mask=target["attention_mask"],
            labels=labels,
            return_dict=True,
        )
        loss, logits = output.loss, output.logits

        self.input = {"source": source, "target": target}
        self.output = logits

        # No optimizer callback is registered, so the parameter update is
        # done by hand and only for the training loader.
        if self.is_train_loader:
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        self.batch_metrics.update({"loss": loss})


In [None]:
from pathlib import Path
from datetime import datetime


# Train for a single epoch on the "train" loader with evaluation on "valid".
# Logs go to a fresh timestamped sub-directory of ./logs for every run.
runner = TranslationRunner()
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    callbacks=callbacks,
    loaders={"train": train_dataloader, "valid": valid_dataloader},
    num_epochs=1,
    logdir=Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S"),
    verbose=True,
)

The next block contains the testing pipeline. You can't change it, but you can use it to check your model's performance.

In [None]:
from tqdm.notebook import tqdm


# Testing pipeline: iterate over the predictions for the test loader, decode
# generated and ground-truth token ids back to text, feed them to the BLEU
# metric batch by batch, and print the final score.
for output in tqdm(
    runner.predict_loader(loader=test_dataloader), total=len(test_dataloader)
):
    # If you change the tokenizer type, you are allowed to change this code
    ####
    # One decoded string per generated sequence (special tokens removed).
    hypothesis = [
        tokenizer_target.decode(o, skip_special_tokens=True)  # here
        for o in output["output"]
    ]
    # One single-element reference list per ground-truth sequence.
    references = [
        [tokenizer_target.decode(o, skip_special_tokens=True)]  # and here
        for o in output["target"]
    ]
    ####
    bleu.add_batch(
        predictions=hypothesis, references=references,
    )
# The final score is read from the "score" entry of the computed metric.
print(f"Test BLEU: {bleu.compute()['score']}")

Here are several ideas that you can use to improve your model:

- Change model type/architecture/config
- Tune hyperparameters
- Change generate process to BEAM search. link: https://en.wikipedia.org/wiki/Beam_search

Feel free to delete almost the entire notebook. However, you **have no permission** to change the testing pipeline.