# Sequence-to-sequence

## Одним из возможных способов улучшения качества системы распознавания рукописных документов является постобработка предсказаний с помощью модели sequence-to-sequence.

## Подготовка данных

### В качестве дополнительных данных для обучения модели можно использовать коллекцию текстов 17 века, которая была предложена организаторами соревнования GramEval2020.

In [2]:
fname = "/home/jovyan/grameval_data/grameval_17_century.txt"


def read_grameval(fname=fname):
    """Read a UTF-8 text file and return its lines without line terminators.

    Args:
        fname: path of the text file to read; defaults to the module-level
            ``fname``.

    Returns:
        A list with one string per line of the file.
    """
    with open(fname, "r", encoding='utf-8') as f:
        # Strip only the newline: slicing with [:-1] would eat the last real
        # character of a final line that has no trailing newline.
        return [line.rstrip("\n") for line in f]

In [3]:
# Load the corpus file as a list of lines.
grameval_texts = read_grameval(fname)

In [4]:
# Display the fifth line from the end of the corpus as a sanity check.
grameval_texts[-5]

'и то явное ихъ съ откащикомъ воровство не поставя столба въ отказные книги за споромъ чювашина бортнички написали'

### Зададим набор правил для аугментации данных с использованием шума и специфики стиля Петра I.

In [5]:
import random


# Each matching occurrence is substituted with probability 1/znam.
znam = 2

# (pattern, replacement) pairs imitating the spelling habits of Peter I.
# NOTE(review): the 'c', 'x' and 's' patterns look like Latin letters, while
# the comments refer to Cyrillic «с», «х» and «ѕ» — confirm which code points
# are intended.
rules = [
    # Peter traditionally writes «з» and «с», word-final «х», and «к» before
    # a wide sweeping «ж», above the line.
    ('з', ''),
    ('c', ''),
    ('x', ''),
    ('кж', 'ж'),
    # Instead of the old «ѧ» he already regularly uses the modern «я».
    ('ѧ', 'я'),
    # He dislikes the letters «s» ("zelo") and «ѵ».
    ('s', ''),
    ('ѵ', ''),
    # He omits the soft sign.
    ('ь', ''),
]


def replace_letters(line, to_replace, replace_by, znam = 2):
    """Randomly replace occurrences of ``to_replace`` in ``line``.

    The line is scanned left to right; every position where ``to_replace``
    starts is replaced by ``replace_by`` with probability ``1/znam``.
    Patterns longer than one character (e.g. ``'кж'``) are supported.

    Args:
        line: input string.
        to_replace: substring to look for; an empty pattern matches nothing.
        replace_by: replacement text (may be empty to delete the match).
        znam: denominator of the substitution probability.

    Returns:
        The rewritten string.
    """
    if not to_replace:
        return line

    width = len(to_replace)
    pieces = []
    pos = 0
    while pos < len(line):
        # The random draw happens only on a match, exactly one draw per match.
        if line.startswith(to_replace, pos) and random.randint(0, znam - 1) == 0:
            pieces.append(replace_by)
            pos += width
        else:
            pieces.append(line[pos])
            pos += 1
    # join() avoids the quadratic cost of repeated string concatenation.
    return ''.join(pieces)
    

def apply_rule(lines, rule, znam = 2):
    """Apply one substitution rule to every line.

    Args:
        lines: iterable of strings.
        rule: ``(to_replace, replace_by)`` pair.
        znam: denominator of the substitution probability; forwarded to
            ``replace_letters`` (it used to be silently ignored).

    Returns:
        A new list with the rewritten lines.
    """
    to_replace, replace_by = rule
    return [replace_letters(line, to_replace, replace_by, znam) for line in lines]

In [6]:
# Show the substitution rules that will be applied to the corpus.
print("Peter's writing rules:")
print(rules)

Peter's writing rules:
[('з', ''), ('c', ''), ('x', ''), ('кж', 'ж'), ('ѧ', 'я'), ('s', ''), ('ѵ', ''), ('ь', '')]


In [7]:
from tqdm.auto import tqdm


# Apply the rules one after another; each pass rewrites the whole corpus,
# so later rules see the output of earlier ones.
for rule in tqdm(rules, total=len(rules), desc="Generating data..."):
    grameval_texts = apply_rule(grameval_texts, rule)

HBox(children=(FloatProgress(value=0.0, description='Generating data...', max=8.0, style=ProgressStyle(descrip…




### Построим словарь для генерации шума.

In [8]:
import os
from collections import Counter


trans_dir = '/home/jovyan/PETR/baseline/train_test_data/train/words'
image_dir = '/home/jovyan/PETR/baseline/train_test_data/train/images'

# Latin letters whose presence disqualifies a transcription.
english = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'm', 'n' ,'o', 'p', 'r', 's', 't', 'u', 'w']


def process_texts(image_dir,trans_dir):
    """Collect transcriptions that have a matching image file.

    For every file in ``image_dir`` the transcription ``<stem>.txt`` is read
    from ``trans_dir``. Empty transcriptions and transcriptions containing
    any letter from ``english`` are skipped. Two summary lines are printed.

    Args:
        image_dir: directory with the image files.
        trans_dir: directory with the ``.txt`` transcriptions.

    Returns:
        ``(names, lines, counter)``: accepted image file names, their
        transcriptions, and a ``Counter`` of all characters in them.
    """
    lens = []
    include_english = 0
    letters = Counter()

    lines = []
    names = []

    english_letters = set(english)
    # A set makes the per-file membership test O(1).
    all_files = set(os.listdir(trans_dir))
    for filename in os.listdir(image_dir):
        # splitext handles extensions of any length, unlike filename[:-3].
        name, _ext = os.path.splitext(filename)
        txt_name = name + '.txt'
        if txt_name not in all_files:
            continue

        txt_filepath = os.path.join(trans_dir, txt_name)
        with open(txt_filepath, 'r', encoding='utf-8') as file:
            data = file.read()

        if not data:
            continue
        if not english_letters.isdisjoint(data):
            # Count the skipped line so the summary below is meaningful.
            include_english += 1
            continue

        lines.append(data)
        names.append(filename)
        lens.append(len(data))
        letters.update(data)

    # default=0 keeps an empty data set from raising ValueError.
    print('Максимальная длина строки:', max(lens, default=0))
    print('Количество строк с английскими буквами ', include_english)

    return names,lines,letters

In [9]:
# Collect accepted image names, transcriptions and character counts.
names, lines, cnt = process_texts(image_dir,trans_dir)

Максимальная длина строки: 63
Количество строк с английскими буквами  0


In [10]:
# Sorted list of every distinct character counted in the transcriptions;
# it is used below as the alphabet for character-level noise.
letters = sorted(list(cnt.keys()))
print('Символы train:', ' '.join(letters))

Символы train:   + 0 1 2 3 4 5 6 7 8 9 [ ] i а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я і ѣ … ⊕ ⊗


### Пример аугментации

* добавление шума из словаря на посимвольном уровне;
* удаление пробелов с определенной вероятностью.

In [11]:
import numpy as np
import pandas as pd


# Target column: train transcriptions followed by the rule-augmented corpus.
df = pd.DataFrame(lines + grameval_texts, columns=["trg"])

def add_noise(text, symbols=letters, znam=4):
    """Corrupt roughly ``1/znam`` of the characters of ``text``.

    ``len(text) // znam`` distinct positions are sampled. A sampled
    non-space character is replaced by a random element of ``symbols``;
    a sampled space is removed with probability 0.3.

    Args:
        text: input string.
        symbols: sequence of replacement characters.
        znam: divisor that controls how many positions are touched.

    Returns:
        The noisy string.
    """
    chars = list(text)
    how_many = len(chars) // znam
    for pos in random.sample(range(0, len(chars)), how_many):
        if chars[pos] == ' ':
            # Spaces are never substituted, only dropped with probability 0.3.
            if np.random.choice([True, False], p=[0.3, 0.7]):
                chars[pos] = chars[pos].replace(" ", "")
        else:
            chars[pos] = random.choice(symbols)
    return ''.join(chars)


# Source column: a noisy copy of every target string.
df["src"] = [add_noise(t, znam=4) for t in df["trg"].tolist()]

In [12]:
# Preview the first target/source pairs.
df.head()

Unnamed: 0,trg,src
0,[а iменно при дворѣ аглинском] былъ,[а iм5нпо при ⊗ворѣ ашлинс8ом] бтлъ
1,по поставълѣнным с ни,мо 5оставълѣнншм с ну
2,неволником ослабу а астраханца жало,+еволнэ[лм о5лабу а жстраханца жало
3,ного не отби,ного ве оцни
4,нее здѣлат и i не допустит до войны,уее здѣлйт и з н7 допустит то ѣойны


### Приведем в пример заранее аугментированные данные, где:

* id – id семпла;
* src – исходная последовательность;
* trg – целевая последовательность;
* cn – длина исходной последовательности в символах.

In [13]:
# Replace the in-memory frame with the pre-augmented data set from disk
# and show five random rows.
df = pd.read_csv("/home/jovyan/grameval_data/augmented_data.csv", sep=",")
df.sample(5)

Unnamed: 0,id,src,trg,cn
372,12750,вѣншгра1⊕ а0 гопльдный уже не 2[даси плгда сір...,виноградѣ многоплодный уже не подаси плода сер...,76
757,50825,всего неметйкихф стяуникфвъ 12… ч,всего неметцкихъ урядниковъ 122 ч,33
512,114776,⊗ахлвлш кч мнѣ иптинной хриэтобо михаил9 арх⊕н...,паслалъ ко мнѣ истинной христосъ михаила архан...,74
15,345702,на поляхъ степенныя степемь 13 5лава 28,на поляхъ степенная степень 13 глава 28,39
595,3048,1 пъшечка нн стаску,1 пушечка на станку,19


### Вариант 1: Базовая модель Encoder-Decoder with Bahdanau Attention

##### На основе https://bastings.github.io/annotated_encoder_decoder/

In [14]:
!python -m pip install --quiet python-Levenshtein

In [15]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from IPython.core.debugger import set_trace
import Levenshtein as lev


USE_CUDA = torch.cuda.is_available()
# Fall back to the CPU so the notebook also runs on machines without a GPU.
DEVICE=torch.device('cuda:0' if USE_CUDA else 'cpu')
print("CUDA:", USE_CUDA)
print(DEVICE)

CUDA: True
cuda:0


In [16]:
def init_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` (used by the augmentation code), NumPy and
    PyTorch (CPU and all CUDA devices).

    Args:
        seed: the seed value.
    """
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without CUDA; covers every visible GPU, not just the
    # current one.
    torch.cuda.manual_seed_all(seed)

In [17]:
# Seed the random number generators with the default seed.
init_seed()

### Архитектура модели

In [18]:
class EncoderDecoder(nn.Module):
    """
    Generic encoder-decoder wrapper: embeds the source, encodes it, then
    runs the decoder over the embedded target.
    """

    def __init__(self, encoder, decoder, src_embed, trg_embed, generator):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder
        self.src_embed, self.trg_embed = src_embed, trg_embed
        self.generator = generator

    def forward(self, src, trg, src_mask, trg_mask, src_lengths, trg_lengths):
        """Encode ``src`` and decode ``trg`` against it.

        ``trg_lengths`` is accepted for interface symmetry but not used.
        """
        memory, last_state = self.encode(src, src_mask, src_lengths)
        return self.decode(memory, last_state, src_mask, trg, trg_mask)

    def encode(self, src, src_mask, src_lengths):
        """Embed the source tokens and pass them through the encoder."""
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask, src_lengths)

    def decode(
        self,
        encoder_hidden,
        encoder_final,
        src_mask,
        trg,
        trg_mask,
        decoder_hidden=None,
    ):
        """Embed the target tokens and pass them through the decoder."""
        embedded_trg = self.trg_embed(trg)
        return self.decoder(
            embedded_trg,
            encoder_hidden,
            encoder_final,
            src_mask,
            trg_mask,
            hidden=decoder_hidden,
        )

In [19]:
class Generator(nn.Module):
    """Bias-free linear projection to the vocabulary followed by log-softmax."""

    def __init__(self, hidden_size, vocab_size):
        super().__init__()
        self.proj = nn.Linear(hidden_size, vocab_size, bias=False)

    def forward(self, x):
        """Return log-probabilities over the vocabulary (last dimension)."""
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

In [20]:
class Encoder(nn.Module):
    """Encodes a sequence of embeddings with a bidirectional GRU."""

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.0):
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        self.rnn = nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )

    def forward(self, x, mask, lengths):
        """
        Applies a bidirectional GRU to sequence of embeddings x.
        The input mini-batch x needs to be sorted by length.
        x should have dimensions [batch, time, dim].

        ``mask`` is accepted for interface compatibility but not used.
        ``lengths`` may be a list or a tensor on any device.

        Returns ``(output, final)`` where ``final`` concatenates the last
        forward and backward states of every layer.
        """
        # pack_padded_sequence requires the lengths on the CPU, while the
        # data iterators may deliver them on the GPU.
        if torch.is_tensor(lengths):
            lengths = lengths.cpu()

        packed = pack_padded_sequence(x, lengths, batch_first=True)
        output, final = self.rnn(packed)
        output, _ = pad_packed_sequence(output, batch_first=True)

        # we need to manually concatenate the final states for both directions
        # (even rows are forward states, odd rows are backward states)
        fwd_final = final[0 : final.size(0) : 2]
        bwd_final = final[1 : final.size(0) : 2]
        # [num_layers, batch, 2*dim]
        final = torch.cat([fwd_final, bwd_final], dim=2)

        return output, final

In [21]:
class Decoder(nn.Module):
    """A conditional GRU decoder with attention over the encoder states.

    Args:
        emb_size: size of the target embeddings fed to the decoder.
        hidden_size: GRU hidden size; encoder states are expected to be
            ``2 * hidden_size`` wide.
        attention: module exposing ``key_layer`` and callable as
            ``attention(query=, proj_key=, value=, mask=)``.
        num_layers: number of stacked GRU layers.
        dropout: dropout probability for the GRU and the pre-output layer.
        bridge: if True, a linear layer maps the final encoder state to the
            initial decoder state; otherwise decoding starts from zeros.
    """

    def __init__(
        self, emb_size, hidden_size, attention, num_layers=1, dropout=0.5, bridge=True
    ):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.attention = attention
        self.dropout = dropout

        self.rnn = nn.GRU(
            emb_size + 2 * hidden_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )

        # to initialize from the final encoder state
        self.bridge = (
            nn.Linear(2 * hidden_size, hidden_size, bias=True) if bridge else None
        )

        self.dropout_layer = nn.Dropout(p=dropout)
        self.pre_output_layer = nn.Linear(
            hidden_size + 2 * hidden_size + emb_size, hidden_size, bias=False
        )

    def forward_step(self, prev_embed, encoder_hidden, src_mask, proj_key, hidden):
        """Perform a single decoder step (1 token).

        Returns ``(output, hidden, pre_output)`` for this step.
        """

        # compute context vector using attention mechanism
        query = hidden[-1].unsqueeze(1)  # [#layers, B, D] -> [B, 1, D]
        context, _attn_probs = self.attention(
            query=query, proj_key=proj_key, value=encoder_hidden, mask=src_mask
        )

        # update rnn hidden state
        rnn_input = torch.cat([prev_embed, context], dim=2)
        output, hidden = self.rnn(rnn_input, hidden)

        pre_output = torch.cat([prev_embed, output, context], dim=2)
        pre_output = self.dropout_layer(pre_output)
        pre_output = self.pre_output_layer(pre_output)

        return output, hidden, pre_output

    def forward(
        self,
        trg_embed,
        encoder_hidden,
        encoder_final,
        src_mask,
        trg_mask,
        hidden=None,
        max_len=None,
    ):
        """Unroll the decoder one step at a time.

        Returns ``(decoder_states, hidden, pre_output_vectors)``; the first
        and last are concatenated along the time dimension.
        """

        # the maximum number of steps to unroll the RNN
        if max_len is None:
            max_len = trg_mask.size(-1)

        # initialize decoder hidden state
        if hidden is None:
            hidden = self.init_hidden(encoder_final)
        if hidden is None:
            # init_hidden signalled "start with zeros": materialize the state
            # because forward_step indexes hidden[-1] for the attention query.
            hidden = encoder_hidden.new_zeros(
                self.num_layers, encoder_hidden.size(0), self.hidden_size
            )

        # pre-compute projected encoder hidden states
        # (the "keys" for the attention mechanism)
        # this is only done for efficiency
        proj_key = self.attention.key_layer(encoder_hidden)

        # here we store all intermediate hidden states and pre-output vectors
        decoder_states = []
        pre_output_vectors = []

        # unroll the decoder RNN for max_len steps
        for i in range(max_len):
            prev_embed = trg_embed[:, i].unsqueeze(1)
            output, hidden, pre_output = self.forward_step(
                prev_embed, encoder_hidden, src_mask, proj_key, hidden
            )
            decoder_states.append(output)
            pre_output_vectors.append(pre_output)

        decoder_states = torch.cat(decoder_states, dim=1)
        pre_output_vectors = torch.cat(pre_output_vectors, dim=1)
        return decoder_states, hidden, pre_output_vectors  # [B, N, D]

    def init_hidden(self, encoder_final):
        """Returns the initial decoder state,
        conditioned on the final encoder state.

        Returns None (meaning "start with zeros") when there is no final
        encoder state or the decoder was built without a bridge layer.
        """

        if encoder_final is None or self.bridge is None:
            return None  # start with zeros

        return torch.tanh(self.bridge(encoder_final))

In [22]:
class BahdanauAttention(nn.Module):
    """Implements Bahdanau (MLP) attention.

    Args:
        hidden_size: size of the internal attention space.
        key_size: size of the encoder states; defaults to ``2 * hidden_size``.
        query_size: size of the decoder state; defaults to ``hidden_size``.
    """

    def __init__(self, hidden_size, key_size=None, query_size=None):
        super(BahdanauAttention, self).__init__()

        # We assume a bi-directional encoder so key_size is 2*hidden_size
        key_size = 2 * hidden_size if key_size is None else key_size
        query_size = hidden_size if query_size is None else query_size

        self.key_layer = nn.Linear(key_size, hidden_size, bias=False)
        self.query_layer = nn.Linear(query_size, hidden_size, bias=False)
        self.energy_layer = nn.Linear(hidden_size, 1, bias=False)

        # to store attention scores
        self.alphas = None

    def forward(self, query=None, proj_key=None, value=None, mask=None):
        """Return ``(context, alphas)`` for one decoder query.

        ``proj_key`` is the already projected encoder states
        (``key_layer(value)``); ``mask`` marks valid source positions with
        non-zero entries.
        """
        assert mask is not None, "mask is required"

        # We first project the query (the decoder state).
        # The projected keys (the encoder states) were already pre-computed.
        query = self.query_layer(query)

        # Calculate scores.
        scores = self.energy_layer(torch.tanh(query + proj_key))
        scores = scores.squeeze(2).unsqueeze(1)

        # Mask out invalid positions: the mask marks valid positions, so
        # entries where `mask == 0` are filled with -inf. The out-of-place
        # masked_fill keeps the operation visible to autograd, unlike an
        # in-place edit of `.data`.
        scores = scores.masked_fill(mask == 0, -float("inf"))

        # Turn scores to probabilities.
        alphas = F.softmax(scores, dim=-1)
        self.alphas = alphas

        # The context vector is the weighted sum of the values.
        context = torch.bmm(alphas, value)

        # context shape: [B, 1, 2D], alphas shape: [B, 1, M]
        return context, alphas

In [23]:
class Batch:
    """Holds one mini-batch together with its masks.

    ``src`` and ``trg`` are ``(tokens, lengths)`` pairs. The target is split
    into decoder input (``trg``, all but the last column) and expected
    output (``trg_y``, all but the first column). Tensors are moved to the
    GPU when ``USE_CUDA`` is set.
    """

    def __init__(self, src, trg, pad_index=0):
        src_tokens, src_lens = src

        self.src = src_tokens
        self.src_lengths = src_lens
        self.src_mask = (src_tokens != pad_index).unsqueeze(-2)
        self.nseqs = src_tokens.size(0)

        # Target-side attributes stay None for source-only batches.
        self.trg = None
        self.trg_y = None
        self.trg_mask = None
        self.trg_lengths = None
        self.ntokens = None

        has_target = trg is not None
        if has_target:
            trg_tokens, trg_lens = trg
            self.trg = trg_tokens[:, :-1]
            self.trg_lengths = trg_lens
            self.trg_y = trg_tokens[:, 1:]
            self.trg_mask = self.trg_y != pad_index
            # Number of non-padding target tokens.
            self.ntokens = (self.trg_y != pad_index).data.sum().item()

        if not USE_CUDA:
            return

        self.src = self.src.cuda()
        self.src_mask = self.src_mask.cuda()
        if has_target:
            self.trg = self.trg.cuda()
            self.trg_y = self.trg_y.cuda()
            self.trg_mask = self.trg_mask.cuda()

In [24]:
class SimpleLossCompute:
    """Applies the generator, computes the loss and optionally steps the optimizer."""

    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        """Return the un-normalized loss value for one batch.

        The loss is divided by ``norm`` before the backward pass and
        multiplied back for the returned number.
        """
        scores = self.generator(x)
        flat_scores = scores.contiguous().view(-1, scores.size(-1))
        flat_targets = y.contiguous().view(-1)
        loss = self.criterion(flat_scores, flat_targets) / norm

        # Without an optimizer this is evaluation only: no parameter update.
        if self.opt is None:
            return loss.data.item() * norm

        loss.backward()
        self.opt.step()
        self.opt.zero_grad()
        return loss.data.item() * norm

In [25]:
def make_model(
    src_vocab, tgt_vocab, emb_size=256, hidden_size=512, num_layers=1, dropout=0.1
):
    "Helper: build the attention encoder-decoder from hyperparameters."

    # Sub-modules are created in the same order as before so that parameter
    # initialization consumes the random stream identically.
    attn = BahdanauAttention(hidden_size)
    encoder = Encoder(emb_size, hidden_size, num_layers=num_layers, dropout=dropout)
    decoder = Decoder(
        emb_size, hidden_size, attn, num_layers=num_layers, dropout=dropout
    )
    src_embedding = nn.Embedding(src_vocab, emb_size)
    trg_embedding = nn.Embedding(tgt_vocab, emb_size)
    generator = Generator(hidden_size, tgt_vocab)

    model = EncoderDecoder(encoder, decoder, src_embedding, trg_embedding, generator)

    if USE_CUDA:
        return model.cuda()
    return model

In [26]:
def print_examples(
    example_iter,
    model,
    n=2,
    max_len=256,
    src_vocab=None,
    trg_vocab=None,
):
    """Print source, reference and greedy prediction for N examples.

    Assumes a batch size of 1: only the first row of every batch is shown.
    NOTE(review): ``greedy_decode`` is not defined in the visible part of
    this file (only ``greedy_decode_batch`` is) — confirm it exists.
    """

    model.eval()
    print()

    have_vocabs = src_vocab is not None and trg_vocab is not None
    src_eos_index = src_vocab.stoi[EOS_TOKEN] if have_vocabs else None
    trg_sos_index = trg_vocab.stoi[SOS_TOKEN] if have_vocabs else 1
    trg_eos_index = trg_vocab.stoi[EOS_TOKEN] if have_vocabs else None

    for number, batch in enumerate(example_iter, start=1):

        src = batch.src.cpu().numpy()[0, :]
        trg = batch.trg_y.cpu().numpy()[0, :]

        # Drop a trailing end-of-sequence token before printing.
        if src[-1] == src_eos_index:
            src = src[:-1]
        if trg[-1] == trg_eos_index:
            trg = trg[:-1]

        result, _ = greedy_decode(
            model,
            batch.src,
            batch.src_mask,
            batch.src_lengths,
            max_len=max_len,
            sos_index=trg_sos_index,
            eos_index=trg_eos_index,
        )
        print("Example #%d" % number)
        print("Src : ", " ".join(lookup_words(src, vocab=src_vocab)))
        print("Trg : ", " ".join(lookup_words(trg, vocab=trg_vocab)))
        print("Pred: ", " ".join(lookup_words(result, vocab=trg_vocab)))
        print()

        if number == n:
            break

In [27]:
from torchtext import data, datasets


def tokenize(text):
    """Split ``text`` into a list of its individual characters."""
    return [symbol for symbol in text]


# Special tokens are single characters because the vocabulary is
# character-level (see tokenize).
PAD_TOKEN = "{"
SOS_TOKEN = "~"
EOS_TOKEN = "^"
UNK_TOKEN = "&"


# Plain (non-sequential, no vocabulary) field for the sample id.
ID = data.Field(sequential=False, use_vocab=False)

# Source side: no start token, only an end-of-sequence token.
SOURCE = data.Field(
    tokenize=tokenize,
    batch_first=True,
    lower=False,
    include_lengths=True,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    init_token=None,
    eos_token=EOS_TOKEN,
)

# Target side: wrapped in start and end tokens.
TARGET = data.Field(
    tokenize=tokenize,
    batch_first=True,
    lower=False,
    include_lengths=True,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    init_token=SOS_TOKEN,
    eos_token=EOS_TOKEN,
)

# Plain field for the "cn" column.
LEN = data.Field(sequential=False, use_vocab=False)

# Column name -> field mapping, in the order the columns are read.
data_fields = [("id", ID), ("src", SOURCE), ("trg", TARGET), ("cn", LEN)]

In [28]:
import os


# Directory that receives the trained encoder-decoder model and its artifacts.
encoder_decoder_model_path = "encoder_decoder_model"


# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(encoder_decoder_model_path, exist_ok=True)

In [29]:
data_path = "/home/jovyan/grameval_data/augmented_data.csv"

# Read the CSV (header row skipped) using the field definitions above.
encoder_decoder_data = data.TabularDataset(
    path=data_path, format="csv", skip_header=True, fields=data_fields
)

# 90/10 train/dev split, stratified by the "cn" column.
train_data, dev_data = encoder_decoder_data.split(
    split_ratio=[0.9, 0.1], stratified=True, strata_field="cn"
)

# Vocabularies are built from the training split only.
SOURCE.build_vocab(train_data.src)
TARGET.build_vocab(train_data.trg)
# NOTE(review): PAD_INDEX comes from the TARGET vocabulary but rebatch() also
# uses it to build the source mask — presumably the padding index is the
# same in both vocabularies; confirm.
PAD_INDEX = TARGET.vocab.stoi[PAD_TOKEN]

In [31]:
BATCH_SIZE = 4

# Training iterator: shuffled, sorted within each batch by
# (source length, target length).
train_iter = data.BucketIterator(
    train_data,
    batch_size=BATCH_SIZE,
    train=True,
    sort_within_batch=True,
    sort_key=lambda x: (len(x.src), len(x.trg)),
    repeat=False,
    device=DEVICE,
    shuffle=True,
)

# Validation iterator: same batch sorting, no shuffling.
valid_iter_batch = data.Iterator(
    dev_data,
    batch_size=BATCH_SIZE,
    train=False,
    sort_within_batch=True,
    sort_key=lambda x: (len(x.src), len(x.trg)),
    repeat=False,
    device=DEVICE,
    shuffle=False,
)

In [32]:
def rebatch(pad_idx, batch):
    """Convert a torchtext batch into the local ``Batch`` wrapper."""
    src_pair, trg_pair = batch.src, batch.trg
    return Batch(src_pair, trg_pair, pad_idx)


def run_epoch(data_iter, model, loss_compute, print_every=50, num_batches=100):
    """Run the model over ``data_iter`` once and log progress.

    Args:
        data_iter: iterable of ``Batch`` objects.
        model: the encoder-decoder model.
        loss_compute: callable ``(pre_output, trg_y, nseqs) -> loss value``.
        print_every: log every this many steps (only while training).
        num_batches: total shown by the progress bar.

    Returns:
        ``(perplexity, loss)`` where loss is the summed loss per target token.
    """

    tick = time.time()
    loss_sum = 0
    tokens_since_log = 0
    tokens_seen = 0

    with tqdm(total=num_batches) as pbar:
        for step, batch in enumerate(data_iter, start=1):

            _, _, pre_output = model.forward(
                batch.src,
                batch.trg,
                batch.src_mask,
                batch.trg_mask,
                batch.src_lengths,
                batch.trg_lengths,
            )
            loss = loss_compute(pre_output, batch.trg_y, batch.nseqs)
            loss_sum += loss
            tokens_since_log += batch.ntokens
            tokens_seen += batch.ntokens

            should_log = model.training and step % print_every == 0
            if should_log:
                elapsed = time.time() - tick
                print(
                    "Epoch Step: %d Loss: %f Tokens per Sec: %f"
                    % (step, loss / batch.nseqs, tokens_since_log / elapsed)
                )
                # Restart the throughput window.
                tick = time.time()
                tokens_since_log = 0

            pbar.update(1)

    loss = loss_sum / float(tokens_seen)
    return math.exp(loss), loss

In [33]:
def translate_batch(batch, vocab, target=True):
    """Turn the index rows of a batch back into strings.

    Each row is cut at its first end-of-sequence index (if any), mapped to
    tokens, joined, stripped of "~" characters and surrounding whitespace.

    Args:
        batch: object with ``trg`` and ``src`` index tensors.
        vocab: vocabulary used for the lookup.
        target: decode ``batch.trg`` when True, otherwise ``batch.src``.

    Returns:
        A list with one string per row.
    """
    eos_index = vocab.stoi[EOS_TOKEN]
    rows = batch.trg.tolist() if target else batch.src.tolist()

    decoded = []
    for row in rows:
        eos_positions = np.where(np.array(row) == eos_index)[0]
        # Keep the whole row when it contains no end-of-sequence index.
        cut = eos_positions[0] if len(eos_positions) > 0 else len(row)
        text = "".join(lookup_words(row[:cut], vocab=vocab))
        decoded.append(text.replace("~", "").strip())
    return decoded

In [34]:
MAX_LEN = 256


def greedy_decode_batch(
    model, src, src_mask, src_lengths, max_len=MAX_LEN, sos_index=1, eos_index=None
):
    """Greedily decode a whole batch for exactly ``max_len`` steps.

    ``eos_index`` is accepted but not used here: decoding never stops early,
    callers truncate at the first end-of-sequence index themselves.

    Returns:
        A NumPy array of predicted indices with one row per input sequence
        and ``max_len`` columns.
    """
    n_seqs = src.size(0)

    with torch.no_grad():
        memory, final_state = model.encode(src, src_mask, src_lengths)
        # Every sequence starts from the start-of-sequence index.
        prev_y = torch.ones(n_seqs, 1).fill_(sos_index).type_as(src)
        trg_mask = torch.ones_like(prev_y)

    steps = []
    hidden = None

    for _ in range(max_len):
        with torch.no_grad():
            _, hidden, pre_output = model.decode(
                memory, final_state, src_mask, prev_y, trg_mask, hidden
            )
            prob = model.generator(pre_output[:, -1])

        next_word = torch.max(prob, dim=1)[1].data
        steps.append(next_word.cpu().numpy())
        # The prediction becomes the next decoder input.
        prev_y = next_word.unsqueeze(dim=1)

    # Collected as [step, batch]; transpose to [batch, step].
    return np.stack(np.array(steps)).T


def predict(
    example_iter,
    model,
    max_len=MAX_LEN,
    src_vocab=None,
    trg_vocab=None,
    num_batches=100,
):
    """Greedily decode every batch and return predictions with references.

    Args:
        example_iter: iterable of ``Batch`` objects.
        model: the encoder-decoder model.
        max_len: number of decoding steps per sequence.
        src_vocab: source vocabulary; the module-level ``SOURCE.vocab`` is
            used when omitted.
        trg_vocab: target vocabulary; the module-level ``TARGET.vocab`` is
            used for the references when omitted.
        num_batches: total shown by the progress bar.

    Returns:
        ``(preds, sources, targets)`` — three lists of strings.
    """
    model.eval()

    if src_vocab is not None and trg_vocab is not None:
        trg_sos_index = trg_vocab.stoi[SOS_TOKEN]
        trg_eos_index = trg_vocab.stoi[EOS_TOKEN]
    else:
        trg_sos_index = 1
        trg_eos_index = None

    # Decode the references with the vocabularies the caller passed in,
    # falling back to the module-level fields only when none were given.
    source_lookup = SOURCE.vocab if src_vocab is None else src_vocab
    target_lookup = TARGET.vocab if trg_vocab is None else trg_vocab

    preds, sources, targets = [], [], []

    with tqdm(total=num_batches) as pbar:
        for batch in example_iter:

            sources.extend(translate_batch(batch, vocab=source_lookup, target=False))
            targets.extend(translate_batch(batch, vocab=target_lookup, target=True))

            output = greedy_decode_batch(
                model,
                batch.src,
                batch.src_mask,
                batch.src_lengths,
                max_len=max_len,
                sos_index=trg_sos_index,
                eos_index=trg_eos_index,
            )

            for pred in output:
                pred = np.asarray(pred)
                if trg_eos_index is not None:
                    first_eos = np.where(pred == trg_eos_index)[0]
                    if len(first_eos) > 0:
                        # Cut the prediction at its first end-of-sequence.
                        pred = pred[: first_eos[0]]
                # Predictions are kept even when no EOS index is known, so
                # the three returned lists always have the same length.
                preds.append("".join(lookup_words(pred, vocab=trg_vocab)))
            pbar.update(1)
    return preds, sources, targets

In [35]:
def compute_metrics(tr_loss, tr_ppl, val_loss, val_ppl):
    """Average each metric series and return the means keyed by display name."""
    labelled_series = (
        ("Train Loss", tr_loss),
        ("Train PPL", tr_ppl),
        ("Validation Loss", val_loss),
        ("Validation PPL", val_ppl),
    )
    return {label: np.mean(series) for label, series in labelled_series}

In [36]:
import json


def save_json(fname, obj):
    """Write ``obj`` to ``fname`` as indented UTF-8 JSON, keeping non-ASCII characters."""
    with open(fname, mode="w", encoding="utf-8") as sink:
        json.dump(obj, sink, indent=4, ensure_ascii=False)


def save_results(all_predictions, all_targets, all_sources, model_path=encoder_decoder_model_path):
    """Store predictions, targets and sources as one JSON file inside ``model_path``."""
    out_file = os.path.join(model_path, "encoder_decoder_results.json")
    payload = dict(
        predictions=all_predictions,
        targets=all_targets,
        sources=all_sources,
    )
    save_json(out_file, payload)

In [37]:
def train(
    model, criterion, optim, source_vocab, target_vocab, num_epochs=10, print_every=500, model_path=encoder_decoder_model_path
):
    """Train the model, validate after every epoch and save the final state.

    After the last epoch the model weights are written to ``model_path`` and
    the validation set is decoded; predictions are stored next to the model.

    Args:
        model: the encoder-decoder model.
        criterion: loss function handed to ``SimpleLossCompute``.
        optim: optimizer used for the training pass.
        source_vocab: source field (its ``.vocab`` is used for decoding).
        target_vocab: target field (its ``.vocab`` is used for decoding).
        num_epochs: number of epochs to run.
        print_every: logging interval (in steps) of the training pass.
        model_path: directory that receives the model and the results.

    Returns:
        ``(train_perplexities, valid_perplexities)``, one value per epoch.
    """
    if USE_CUDA:
        model.cuda()

    train_losses, valid_losses = [], []
    train_perplexities, valid_perplexities = [], []

    for epoch in range(num_epochs):
        epoch = epoch + 1
        print("Epoch", epoch)
        print("Training the model")
        model.train()

        train_perplexity, train_loss = run_epoch(
            (rebatch(PAD_INDEX, b) for b in train_iter),
            model,
            SimpleLossCompute(model.generator, criterion, optim),
            print_every=print_every,
            num_batches=len(train_iter),
        )

        print("Train Loss: %f" % train_loss)
        model.eval()
        with torch.no_grad():
            print("Evaluating the model")
            # No optimizer is passed, so this pass only measures the loss.
            dev_perplexity, dev_loss = run_epoch(
                (rebatch(PAD_INDEX, b) for b in valid_iter_batch),
                model,
                SimpleLossCompute(model.generator, criterion, None),
                num_batches=len(valid_iter_batch),
            )

            train_losses.append(train_loss)
            train_perplexities.append(train_perplexity)
            valid_losses.append(dev_loss)
            valid_perplexities.append(dev_perplexity)

            print("*" * 30)
            print("Epoch metrics\n")
            # "%.3f" (three decimals) instead of "%3.f", which rounded the
            # metrics to whole numbers.
            print("Validation perplexity: %.3f \n" % dev_perplexity)
            print("Validation Loss: %.3f " % dev_loss)

            print("*" * 30)

            if epoch == num_epochs:
                model_name = os.path.join(model_path, "encoder_decoder_model.pt")

                print("Saving model %s" % model_name)

                torch.save(model.state_dict(), model_name)

                preds, sources, targets = predict(
                    (rebatch(PAD_INDEX, x) for x in valid_iter_batch),
                    model,
                    max_len=MAX_LEN,
                    src_vocab=source_vocab.vocab,
                    trg_vocab=target_vocab.vocab,
                    num_batches=len(valid_iter_batch),
                )

                # Store the results next to the model rather than in the
                # default directory.
                save_results(preds, targets, sources, model_path=model_path)

    return train_perplexities, valid_perplexities

In [40]:
def lookup_words(x, vocab=None):
    """Map indices to tokens via ``vocab.itos`` (if given) and stringify every item."""
    tokens = x if vocab is None else [vocab.itos[index] for index in x]
    return list(map(str, tokens))

In [41]:
# Hyperparameters of the encoder-decoder model.
EMB_SIZE = 256
HIDDEN_SIZE = 512
NUM_LAYERS = 2
DROPOUT_RATE = 0.25
LEARNING_RATE = 2*1e-5

model = make_model(
    len(SOURCE.vocab),
    len(TARGET.vocab),
    emb_size=EMB_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT_RATE,
)

# Summed (not averaged) cross-entropy that ignores padding positions.
criterion = nn.CrossEntropyLoss(reduction="sum", ignore_index=PAD_INDEX)
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
NUM_EPOCHS = 15
PRINT_EVERY = 500

# Train the encoder-decoder model and keep the per-epoch perplexities.
train_perplexities, valid_perplexities = train(
    model=model,
    num_epochs=NUM_EPOCHS,
    print_every=PRINT_EVERY,
    criterion=criterion,
    optim=optim,
    source_vocab=SOURCE,
    target_vocab=TARGET,
)

In [43]:
# Per-epoch perplexities of the training run.
ppls = {
    "train": train_perplexities,
    "val": valid_perplexities
}


# Store the perplexities and both token-to-index mappings next to the model.
save_json(
    os.path.join(encoder_decoder_model_path, "ppls.json"), ppls
)

save_json(
    os.path.join(encoder_decoder_model_path, "source_vocab.json"), SOURCE.vocab.stoi
)

save_json(
    os.path.join(encoder_decoder_model_path, "target_vocab.json"), TARGET.vocab.stoi
)

### Вариант 2

### Модель sequence-to-sequence на основе архитектуры трансформера
https://github.com/CyberZHG/keras-transformer

In [44]:
!python -m pip install --quiet keras-transformer

In [45]:
df.sample(5)

Unnamed: 0,id,src,trg,cn
521,29782,о опричн нс,о опричнинѣ,11
737,12885,стряпшiйдмитр6йниепауовъ сынъ голее⊗щевъ,стряпчей дмитрей степановъ сынъ голенищевъ,40
740,172887,отдато граммткв у боахинъ кнть ннъърееви92 хов...,отдать грамотка у боярина кнзь анъдреевича хо...,62
660,8469,игналью т⊕уъа3евском[,игнатью трухачевскому,21
411,164681,показуетъ эвитокъ и рл]ъ0лет4,показуетъ свитокъ и глаголетъ,29


In [48]:
from sklearn.model_selection import train_test_split


# Directory that receives the transformer model and its data splits.
transformer_model_path = "transformer_model"
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(transformer_model_path, exist_ok=True)


# 90/10 split, stratified by the "cn" column.
strata = df["cn"].values
train_df, test_df = train_test_split(df, stratify=strata, train_size=0.9, shuffle=True)


# Persist both splits as tab-separated files without the index column.
train_df.to_csv(os.path.join(transformer_model_path, "train.tsv"), sep="\t", index=False)
test_df.to_csv(os.path.join(transformer_model_path, "test.tsv"), sep="\t", index=False)

In [49]:
def build_token_dict(text_list):
    """Build a token -> index mapping from an iterable of texts.

    Indices 0, 1 and 2 are reserved for '<PAD>', '<START>' and '<END>';
    every other token gets the next free index in order of first appearance.
    """
    token_dict = {'<PAD>': 0, '<START>': 1, '<END>': 2}
    for text in tqdm(text_list, total=len(text_list)):
        for token in text:
            # setdefault leaves tokens that are already known untouched.
            token_dict.setdefault(token, len(token_dict))
    return token_dict


def prepare_data(df, source_token_dict, target_token_dict):
    """Turn the ``src``/``trg`` columns of ``df`` into padded index sequences.

    Adds the columns ``src_tok`` and ``trg_tok`` to ``df`` as a side effect.

    Returns:
        ``(encode_input, decode_input, decode_output)``: encoder inputs and
        decoder inputs are lists of index lists wrapped in '<START>' and
        '<END>'; the decoder outputs are the targets followed by '<END>',
        with every index wrapped in its own one-element list. All sequences
        are padded with '<PAD>' to the longest sequence on their side.
    """
    df["src_tok"] = df["src"].apply(tokenize)
    df["trg_tok"] = df["trg"].apply(tokenize)

    start, end, pad = '<START>', '<END>', '<PAD>'

    src_seqs = df["src_tok"].tolist()
    trg_seqs = df["trg_tok"].tolist()

    enc_seqs = [[start] + seq + [end] for seq in src_seqs]
    dec_seqs = [[start] + seq + [end] for seq in trg_seqs]
    # Same length as the decoder input: the extra '<PAD>' makes up for the
    # missing '<START>'.
    out_seqs = [seq + [end, pad] for seq in trg_seqs]

    source_max_len = max(len(seq) for seq in enc_seqs)
    target_max_len = max(len(seq) for seq in dec_seqs)

    def pad_to(seq, width):
        # Right-pad one sequence to the requested width.
        return seq + [pad] * (width - len(seq))

    enc_seqs = [pad_to(seq, source_max_len) for seq in enc_seqs]
    dec_seqs = [pad_to(seq, target_max_len) for seq in dec_seqs]
    out_seqs = [pad_to(seq, target_max_len) for seq in out_seqs]

    encode_input = [[source_token_dict[tok] for tok in seq] for seq in enc_seqs]
    decode_input = [[target_token_dict[tok] for tok in seq] for seq in dec_seqs]
    decode_output = [[[target_token_dict[tok]] for tok in seq] for seq in out_seqs]

    return encode_input, decode_input, decode_output

In [50]:
# Character dictionaries built from the training split only.
tr_source_token_dict = build_token_dict(train_df.src.tolist())
tr_target_token_dict = build_token_dict(train_df.trg.tolist())

# Inverse (index -> token) mappings.
tr_target_token_dict_inv = {v: k for k, v in tr_target_token_dict.items()}
tr_source_token_dict_inv = {v: k for k, v in tr_source_token_dict.items()}

HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))




In [51]:
# Store both token dictionaries next to the transformer model.
save_json(
    os.path.join(transformer_model_path, "source_token_dict.json"), tr_source_token_dict
)

save_json(
    os.path.join(transformer_model_path, "target_token_dict.json"), tr_target_token_dict
)

In [52]:
from keras.optimizers import Adam
from keras_transformer import get_model
import numpy as np


# Hyperparameters of the transformer model.
EMBED_DIM = 512
HIDDEN_DIM = 256
HEAD_NUM = 4
ENC_NUM = 3
DEC_NUM = 3
DROPOUT_RATE = 0.2
LEARNING_RATE = 0.00001


# A single token_num (the larger of the two vocabulary sizes) is passed,
# while use_same_embed=False is requested for the embeddings.
model = get_model(
    token_num=max(len(tr_source_token_dict), len(tr_target_token_dict)),
    embed_dim=EMBED_DIM,
    encoder_num=ENC_NUM,
    decoder_num=DEC_NUM,
    head_num=HEAD_NUM,
    hidden_dim=HIDDEN_DIM,
    dropout_rate=DROPOUT_RATE,
    use_same_embed=False,
)

# Targets are integer ids (see prepare_data), hence the sparse loss.
model.compile(
    optimizer=Adam(lr=LEARNING_RATE), loss='sparse_categorical_crossentropy'
)

In [None]:
# Training schedule.
BATCH_SIZE = 32
EPOCH_NUM = 15


# Convert the training frame into padded index sequences.
encode_input, decode_input, decode_output = prepare_data(
    train_df, tr_source_token_dict, tr_target_token_dict
)

# The model takes [encoder input, decoder input] and is fitted against the
# shifted decoder output produced by prepare_data.
model.fit(
    x=[np.array(encode_input), np.array(decode_input)],
    y=np.array(decode_output),
    epochs=EPOCH_NUM,
    batch_size=BATCH_SIZE,
    verbose=2
)

In [54]:
# Persist the trained transformer under transformer_model_path.
model.save(os.path.join(transformer_model_path, "transformer_model_base.h5"))

### Оценка моделей на внутренней тестовой выборке

In [55]:
!python -m pip install --quiet editdistance

In [56]:
import editdistance


def evaluate(y_true, y_pred, print_num=50):
    """Print character error rate, word error rate and string accuracy.

    Args:
        y_true: Reference strings, indexable in parallel with ``y_pred``.
        y_pred: Predicted strings; one comparison is made per prediction.
        print_num: Upper bound on the number of sample pairs echoed.

    Each pair is echoed with probability 0.05 (drawn with
    ``np.random.choice``), only while fewer than ``print_num`` pairs have been
    shown and only when the reference is longer than 15 characters.

    The character rate is the summed edit distance divided by the total
    reference length; the word rate does the same over whitespace-split
    words. A rate whose denominator is zero (for example when there are no
    predictions) is reported as NaN instead of raising ``ZeroDivisionError``.

    Returns:
        None. The summary is printed.
    """
    char_errors = 0
    char_total = 0
    exact_matches = 0
    string_total = 0
    printed = 0
    word_errors = 0
    word_total = 0

    for i, pred in enumerate(y_pred):
        true = y_true[i]

        exact_matches += 1 if true == pred else 0
        string_total += 1

        dist = editdistance.eval(pred, true)
        char_errors += dist
        char_total += len(true)

        true_words = true.split()
        word_errors += editdistance.eval(pred.split(), true_words)
        word_total += len(true_words)

        # Echo a small random sample of sufficiently long pairs.
        is_print = np.random.choice([True, False], p=[0.05, 0.95])
        if is_print and printed < print_num and len(true) > 15:
            print('[OK]' if dist==0 else '[ERR:%d]' % dist,'"' + true + '"', '->', '"' + pred + '"')
            printed += 1

    # A ratio with nothing in the denominator is undefined, not an error.
    undefined = float("nan")
    charErrorRate = char_errors / char_total if char_total else undefined
    wordErrorRate = word_errors / word_total if word_total else undefined
    stringAccuracy = exact_matches / string_total if string_total else undefined
    print(
        'Character error rate: %f%%. Word error rate: %f%%. String accuracy: %f%%.' % \
        (charErrorRate*100.0,wordErrorRate*100.0, stringAccuracy*100.0)
    )

#### Encoder-decoder model

In [57]:
class CharTokenizer(object):
    """Character-level tokenizer driven by the vocabularies stored in ``config``.

    ``config["vocab"]`` supplies the ``src_stoi`` / ``trg_stoi`` mappings
    (character -> index) and ``config["tok"]`` the special-token strings.
    """

    def __init__(self, config):
        self.config = config
        self.src_stoi = self.config["vocab"]["src_stoi"]
        self.trg_stoi = self.config["vocab"]["trg_stoi"]
        # Inverse (index -> character) lookups.
        self.src_itos = {v: k for k, v in self.src_stoi.items()}
        self.trg_itos = {v: k for k, v in self.trg_stoi.items()}
        self.eos_token = self.config["tok"]["eos_token"]
        self.unk_token = self.config["tok"]["unk_token"]
        self.pad_token = self.config["tok"]["pad_token"]
        self.sos_token = self.config["tok"]["sos_token"]

    def encode(self, sequence):
        """Map ``sequence`` to source ids, append EOS, return a (1, len+1) tensor.

        Characters missing from the source vocabulary are replaced by the id
        of the unknown token.
        """
        enc = [
            self.src_stoi[char] if char in self.src_stoi else self.src_stoi[self.unk_token]
            for char in sequence
        ]
        enc.append(self.src_stoi[self.eos_token])
        return torch.tensor(enc).unsqueeze(0)

    def create_mask(self, enc):
        """Return a boolean mask that is False at source padding positions."""
        return (enc != self.src_stoi[self.pad_token]).unsqueeze(-2)

    def get_length(self, enc):
        """Return the size of the last dimension of ``enc`` as a 1-element int64 tensor."""
        return torch.tensor(enc.shape[-1], dtype=torch.int64).unsqueeze(0)


def load_model(config, device):
    """Rebuild the encoder-decoder network and load its saved weights.

    Args:
        config: Configuration dict; ``config["model"]`` must provide
            ``model_path``, ``emb_size``, ``hidden_size``, ``num_layers`` and
            ``dropout``.
        device: Device the checkpoint is mapped to and the model is moved to.

    Returns:
        The model with weights loaded, placed on ``device`` and in eval mode.
    """
    model_params = config["model"]
    model_path = model_params["model_path"]
    emb_size = model_params["emb_size"]
    hidden_size = model_params["hidden_size"]
    num_layers = model_params["num_layers"]
    dropout = model_params["dropout"]

    # Map tensors onto the requested device while loading, so a checkpoint
    # saved on a GPU can also be restored on a CPU-only host.
    state_dict = torch.load(model_path, map_location=device)
    # Vocabulary sizes are taken from the first dimension of the embeddings.
    source_dim = state_dict["src_embed.weight"].shape[0]
    target_dim = state_dict["trg_embed.weight"].shape[0]

    model = EncoderDecoder(
        Encoder(emb_size, hidden_size, num_layers=num_layers, dropout=dropout),
        Decoder(emb_size, hidden_size, BahdanauAttention(hidden_size), num_layers=num_layers, dropout=dropout),
        nn.Embedding(source_dim, emb_size),
        nn.Embedding(target_dim, emb_size),
        Generator(hidden_size, target_dim)
    )

    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model


def load_json(fname_path):
    """Read the UTF-8 encoded JSON file at ``fname_path`` and return its content."""
    with open(fname_path, encoding="utf-8") as handle:
        return json.loads(handle.read())


def build_config(encoder_decoder_model_path,
                 model_name="encoder_decoder_model.pt",
                 emb_size=256,
                 hidden_size=512,
                 num_layers=2,
                 dropout_rate=0.2,
                 source_vocab="source_vocab.json",
                 target_vocab="target_vocab.json"
                ):
    """Assemble the configuration dict for the encoder-decoder model.

    The result has three sections: ``"model"`` (checkpoint path and network
    sizes), ``"tok"`` (the special-token constants) and ``"vocab"`` (the
    source and target vocabularies loaded from JSON files located in
    ``encoder_decoder_model_path``).
    """
    def in_model_dir(file_name):
        # Resolve a file name relative to the model directory.
        return os.path.join(encoder_decoder_model_path, file_name)

    model_section = {
        "model_path": in_model_dir(model_name),
        "emb_size": emb_size,
        "hidden_size": hidden_size,
        "num_layers": num_layers,
        "dropout": dropout_rate,
    }
    token_section = {
        "pad_token": PAD_TOKEN,
        "sos_token": SOS_TOKEN,
        "eos_token": EOS_TOKEN,
        "unk_token": UNK_TOKEN,
    }
    vocab_section = {
        "src_stoi": load_json(in_model_dir(source_vocab)),
        "trg_stoi": load_json(in_model_dir(target_vocab)),
    }
    return {"model": model_section, "tok": token_section, "vocab": vocab_section}

In [58]:
def dummy_copy(sequence, output):
    """Extend ``output`` with the tail of ``sequence`` when ``output`` is shorter.

    If ``output`` is at least as long as ``sequence`` it is returned unchanged;
    otherwise the trailing items of ``sequence`` are appended so that the
    result has the same length as ``sequence``.
    """
    if len(sequence) <= len(output):
        return output
    # The missing tail starts exactly where `output` ends.
    return output + sequence[len(output):]


def greedy_decode(sequence, model, tokenizer, device, copy=False, max_len=256):
    """Greedily decode ``sequence`` with ``model`` and return the output string.

    Args:
        sequence: Input text, encoded with ``tokenizer.encode``.
        model: Object exposing ``encode``, ``decode`` and ``generator``.
        tokenizer: Provides ``encode``/``create_mask``/``get_length`` and the
            target vocabulary (``trg_stoi``, ``trg_itos``, ``sos_token``,
            ``eos_token``).
        device: Device the encoded input tensors are moved to.
        copy: Accepted for interface compatibility; not used by this function.
        max_len: Maximum number of decoding steps.

    Returns:
        The decoded characters up to (not including) the first EOS token, or
        all ``max_len`` characters when no EOS is produced.
    """
    src = tokenizer.encode(sequence).to(device)
    src_mask = tokenizer.create_mask(src).to(device)
    src_length = tokenizer.get_length(src).to(device)
    sos_index = tokenizer.trg_stoi[tokenizer.sos_token]
    eos_index = tokenizer.trg_stoi[tokenizer.eos_token]

    output = []
    hidden = None

    with torch.no_grad():
        encoder_hidden, encoder_final = model.encode(src, src_mask, src_length)
        prev_y = torch.ones(1, 1).fill_(sos_index).type_as(src)
        trg_mask = torch.ones_like(prev_y)

        for _ in range(max_len):
            _, hidden, pre_output = model.decode(
                encoder_hidden, encoder_final, src_mask,
                prev_y, trg_mask, hidden)
            prob = model.generator(pre_output[:, -1])

            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()
            # Everything from the first EOS on is discarded anyway, so stop
            # here instead of running the remaining decoder steps.
            if next_word == eos_index:
                break
            output.append(next_word)
            prev_y = torch.ones(1, 1).type_as(src).fill_(next_word)

    return "".join(tokenizer.trg_itos[token_id] for token_id in output)

In [59]:
import json


# Build the config, the character tokenizer and the restored model for the
# encoder-decoder baseline.
encoder_decoder_model_config = build_config(encoder_decoder_model_path=encoder_decoder_model_path)
tokenizer = CharTokenizer(config=encoder_decoder_model_config)
encoder_decoder_model = load_model(config=encoder_decoder_model_config, device=DEVICE)

In [60]:
import json


# Load the stored results of the encoder-decoder model from its model directory.
encoder_decoder_result_path = os.path.join(encoder_decoder_model_path, "encoder_decoder_results.json")
encoder_decoder_result = load_json(encoder_decoder_result_path)

In [61]:
# Split the stored results into predictions, targets and sources.
encoder_decoder_prediction = encoder_decoder_result["predictions"]
encoder_decoder_trgs = encoder_decoder_result["targets"]
encoder_decoder_srcs = encoder_decoder_result["sources"]

#### Пример генерации на аугментированных данных

In [52]:
# Source string of one example.
encoder_decoder_srcs[-100]

'отъсихъ конфшенъпри м0рикоградуидущиобрѣеаютсявертоградысултанскіятакожде иъ другуюсторонуградхвелми'

In [50]:
# Model prediction for the same example.
encoder_decoder_prediction[-100]

'отъ сихъ коняшенъ при морико граду и дущи обрѣзаются вертограды султанскія такожде изъ другую сторону градовелми'

In [51]:
# Target string for the same example.
encoder_decoder_trgs[-100]

'отъ сихъ конюшенъ при мори ко граду идущи обрѣтаются вертограды султанскія такожде и зъ другую сторону града велми'

### Исходные значения метрик

In [85]:
# Baseline: score the unprocessed sources against the targets.
evaluate(
    y_true=encoder_decoder_trgs,
    y_pred=encoder_decoder_srcs
)

[ERR:8] "быша сія  6968го" -> "дыч3сія6м68бо"
[ERR:5] "и невидимъ бысть" -> "инетидимлбыст"
[ERR:6] "повѣсть  полезна" -> "щовѣсщрполена"
[ERR:4] "ноября въ 27й день" -> "ноябрявъ27йден"
[ERR:7] "августа въ 18 день" -> "авгизтащъ18ден"
[ERR:9] "о  взятіи  7071го" -> "орзят2г7071ххо"
[ERR:5] "апрѣля въ 28 день" -> "апрѣлявъ2йденг"
[ERR:3] "ноября въ 24 день" -> "ноябрявъ24день"
[ERR:3] "сенке омельянову" -> "се8кеомелянову"
[ERR:5] "куря подъ лимоны" -> "кузяпо6ълимоныт"
[ERR:2] "оношке савостину" -> "оношкшсавостину"
[ERR:3] "но и то тебѣ мало" -> "но итотебѣ тало"
[ERR:5] "гришкѣ мерзлюкину" -> "гришаѣмецююкину"
[ERR:4] "федке ратманцову" -> "федкер1тм…нцо7у"
[ERR:3] "паки  1й  ангелъ" -> "па7и 1й  ангплъ"
[ERR:5] "власку степанову" -> "влщвкуьтепжнову"
[ERR:1] "5 черевъ болшихъ" -> "5 черевъболшихъ"
[ERR:2] "по 140 четвертей" -> "по 1ю0 четве2тей"
[ERR:4] "марта въ 20 день" -> "⊕а]та въ 20 дрнѣ"
[ERR:5] "павлику захарову" -> "тав1икузахдровук"
[ERR:4] "филипку данилову" -> "филипоу д

### Целевые значения метрик

In [86]:
# Score the encoder-decoder predictions against the same targets.
evaluate(
    y_true=encoder_decoder_trgs,
    y_pred=encoder_decoder_prediction
)

[ERR:3] "авгута въ 8 день" -> "а голта въ 8 день"
[ERR:2] "и соромъ и не смѣю" -> "и соромъ и не мѣню"
[OK] "ноября въ 22 день" -> "ноября въ 22 день"
[ERR:2] "корова рыжа безъ" -> "коромъ рыжа безъ"
[OK] "ноября въ 24 день" -> "ноября въ 24 день"
[ERR:5] "ваше   низлагаетъ" -> "вашенть злагаетъ"
[ERR:2] "онъ же тамерланъ" -> "онъ же тамъ рланъ"
[ERR:2] "куря подъ лимоны" -> "кузя подъ лимоный"
[ERR:2] "апрѣля въ 6 день" -> "апрѣлянъ 6 день"
[ERR:1] "но и то тебѣ мало" -> "но и то тебѣ тало"
[OK] "тренке михаилову" -> "тренке михаилову"
[ERR:2] "о измѣнѣ донской" -> "о имѣнѣ чонской"
[ERR:1] "путилу неболсину" -> "путилу наболсину"
[ERR:2] "майя въ 28й день" -> "майя въ 12й день"
[ERR:4] "177 г іюля въ 22 день" -> "177 г іюля 192 день"
[ERR:2] "на ныхъ узди  шитіе" -> "на ныхъ уздишитіе"
[ERR:1] "селуяшке фатееву" -> "келуяшке фатееву"
[ERR:1] "списокъ съ рѣчей" -> "списокъ съ вѣчей"
[ERR:4] "казаня латынскые" -> "казанъ латеншкіе"
[ERR:4] "такъ   отскочили" -> "дакъ   восковили"
[OK] 

#### Transformer model

In [None]:
import tensorflow.python.util.deprecation as deprecation
# Silence TensorFlow deprecation warnings.
deprecation._PRINT_DEPRECATION_WARNINGS = False


# Encode the test frame with the vocabularies built from the training data.
te_encode_input, te_decode_input, te_decode_output = prepare_data(
    test_df, tr_source_token_dict, tr_target_token_dict
)

In [None]:
from keras_transformer import decode


def transformer_decode(decode_input, vocab=tr_target_token_dict_inv):
    """Convert a sequence of token ids back into a string.

    Every id is looked up in ``vocab`` (index -> token); the special tokens
    ``<PAD>``, ``<END>`` and ``<START>`` are dropped and the remaining tokens
    are concatenated.
    """
    special_tokens = ("<PAD>", "<END>", "<START>")
    pieces = []
    for token_id in decode_input:
        symbol = vocab[token_id]
        if symbol not in special_tokens:
            pieces.append(symbol)
    return "".join(pieces)


# Run the transformer on the encoded test sources.
# NOTE(review): top_k is not passed here, while the validation decode further
# down passes top_k=1 -- confirm the library default matches.
decoded = decode(
    model,
    te_encode_input,
    start_token=tr_target_token_dict['<START>'],
    end_token=tr_target_token_dict['<END>'],
    pad_token=tr_target_token_dict['<PAD>'],
    temperature=1.0,
)

In [191]:
# Reference strings, rebuilt from the encoded decoder inputs.
y_transformer_test_true = [transformer_decode(x) for x in te_decode_input]
y_transformer_test_true[:5]

['при себѣ внука своего нарекъ великимъ княземъ всеа русіи',
 'по нашему указу отпущены были  на соловки и съ соловковъ пришли къ намъ къ москвѣ',
 'покупалъ тѣ лотки приказу тайныхъ дѣлъ подьячей перфирей оловяниковъ',
 'была въ венецы процесія празднуютъ католики тотъ день святому кресту христову',
 'злѣ сотворили есте и невозможно вамъ будетъ предъ старымъ отцемъ отвещати']

In [192]:
# Predicted strings, rebuilt from the decoded token ids.
y_transformer_test_pred = [transformer_decode(x) for x in decoded]
y_transformer_test_pred[:5]

['хри семѣ влука своего наремъ великимъ княземъ всеа русіи',
 'по нашемъ указу отпущены были  на соловки и съ половкова пришли коламѣ къ москвѣ',
 'покупалъ тѣ лотки приказу тайныхъ дѣлъ подьячей пертирей оловяниковъ',
 'бобравъ вентцы прицесія празануютъ затолаки томъ день святому кресту пристовъ',
 'злѣсотворбли естеи нево можно вамъ будетъ предъ старымъ омцемъ отвощати']

In [171]:
# Baseline on the test set: the raw source strings scored as predictions.
evaluate(
    y_true=y_transformer_test_true,
    y_pred=test_df["src"].tolist()
)

[ERR:15] "а твоего государева денежнаго и хлѣбнаго жалованяя окладъ мнѣ холопу твоему не учиненъ" -> "я твоего го4[д⊕кева денешнаго и хлѣбна[о жалованяяокл8дъ м]ѣ хол6п2 твоему н2 учиферъ"
[ERR:11] "а имена  стрелцомъ и росписка въ росходномъ столпу" -> "аимена  стреицомъ и 8осп7ика въ твсх6дномъ 8тоѣпх"
[ERR:10] "и абіе нападе на нихъ ужасъ велій и посрамлени побегоша" -> "и аыіе 9впазе9а нищъ ужажъ велай и посрамлени побегдша"
[ERR:11] "зде государь  въ ангилове еровова досталь дасеваемъ" -> "здегосуда5ввъанеиловеерововадостальдксеваемъ"
[ERR:8] "писана на москвѣ  7141го апрѣля въ 15 день" -> "писана нф москвѣ  7141го апзв]я вн 1ъ дешс"
[ERR:19] "ноября въ 27 день велѣно быти на михайлове на княжъ васильево мѣсто морткина резанцу дею иванову сыну рохманинову" -> "ноябрявъ27денвелѣнобытинамихайловенакняжъвасилевомѣстоморткинарезанцудеюивановусынурохманинову"
[ERR:1] "да въ одоеве жъ кропивенское  съ одоевскимъ вмѣстѣ подъ соборную церковь въ каменныхъ полаткехъ" -> "да въ одоеве жъ кр

In [193]:
# Transformer predictions on the test set.
evaluate(
    y_true=y_transformer_test_true,
    y_pred=y_transformer_test_pred
)

[ERR:7] "а твоего государева денежнаго и хлѣбнаго жалованяя окладъ мнѣ холопу твоему не учиненъ" -> "и твоего государева денешнаго и хлѣбнаго жалованся окладъ мнѣ холопъ твоему на учитеръ"
[OK] "и казанцы били челомъ на царя шигъалѣя" -> "и казанцы били челомъ на царя шигъалѣя"
[ERR:6] "къ  росписи игнатъ еремѣевъ руку приложилъ" -> "къ росписи понать зеремѣехъ руку приложилъ"
[ERR:2] "и изъ тѣхъ 3 пищали худы стрѣлять нелзя" -> "и изъ тѣхъ 3 пищали суды стрѣлять неля"
[ERR:3] "срокъ троицынъ день" -> "свокъ троицынъ бегь"
[ERR:10] "двъ данилка кощіева померлъ въ прошломъ во 192 году" -> "двъ данилца короска прехолъ въ прошхомъ во 192 году"
[ERR:5] "на коломнѣ князь иванъ княжъ ивановъ сынъ шеховской" -> "на колочнѣ князь аванъ княжъ ивановъ сынъ нехосикой"
[ERR:6] "головы стрелецкіе всѣхъ приказовъ" -> "полоды стрялецкіе вся зъ приказовъ"
[ERR:2] "іюня въ 9 день пріѣхалъ въ дорогобужъ" -> "іюня въ 9 день пріѣхалъ въ дерогобутъ"
[ERR:7] "о посланіи царя кучюма сына своего маметкула на 

### Оценка моделей на валидационной выборке

In [73]:
!head val_prediction.txt

[ERR:11] "[а iменно при дворѣ аглинском] былъ" -> "га iмен но приддрѣ ат лин кком ]былъ"
[ERR:10] "iз лагору 11 д iюля 1711 петръ" -> "iзлагоруд iюля 1окюх етръ"
[ERR:10] "которым позволяется всѣ" -> "ко то ромм по оляотя пѣ"
[ERR:6] "в милости i призрѣнi" -> "вмимости i празрык"
[ERR:8] "і двины не можно л слюзоф здѣлат там" -> "i двиныио мож нол сло зоф здѣлат там"
[ERR:2] "ли" -> "а"
[ERR:6] "колко офицероф наперет" -> "солко офицеродна перб"
[ERR:13] "от речи жижи въ 15 д iюля петръ" -> "отрiнжиживъя д iюля ует еу"
[ERR:6] "буде же ск" -> "вудецс ве"
[ERR:6] "хто такого сыщет iли возвестит тому от" -> "пхто такого сыет iли возветит то муут"


In [76]:
import re


def parse(data):
    """Extract (reference, recognized) pairs from ``"true" -> "pred"`` lines.

    For every line the span from the first to the last double quote is taken,
    split on ``->`` into exactly two parts, and each part is stripped of
    surrounding spaces and double quotes.

    Returns a tuple ``(y_true, y_pred)`` of two parallel lists.
    """
    pairs = []
    for record in data:
        quoted = re.findall('".+"', record)[0]
        expected, predicted = quoted.split("->")
        pairs.append((expected.strip(' "'), predicted.strip(' "')))

    y_true = [expected for expected, _ in pairs]
    y_pred = [predicted for _, predicted in pairs]
    return y_true, y_pred

        
def read_validation_data(fname):
    """Read ``fname`` (UTF-8), drop empty lines and parse the rest with ``parse``."""
    with open(fname, "r", encoding="utf-8") as handle:
        content = handle.read()
    records = [row for row in content.split("\n") if row]
    return parse(records)

In [195]:
# Reference and recognized strings of the validation set.
y_val_true, y_val_pred = read_validation_data("val_prediction.txt")

In [199]:
# Show the first parsed pair.
print(y_val_true[0], "->", y_val_pred[0])

[а iменно при дворѣ аглинском] былъ -> га iмен но приддрѣ ат лин кком ]былъ


### Исходные показатели на валидационной выборке

In [200]:
# Baseline: the recognized strings before any post-processing.
evaluate(y_val_true, y_val_pred)

[ERR:8] "в помераниi [которой вѣдомости" -> "в померанич ра саторой водомоси"
[ERR:10] "купъцофъ рукомесленных i духовъных" -> "к упъдофо гукомесленныхх чдох явъных"
Character error rate: 27.366609%. Word error rate: 88.963964%. String accuracy: 0.000000%.


In [201]:
# Encoder-decoder model

# Post-process every recognized validation string with greedy decoding.
encoder_decoder_generated = [
    greedy_decode(cv_pred, model=encoder_decoder_model, tokenizer=tokenizer, device=DEVICE) for cv_pred in y_val_pred
]

# Score the post-processed strings against the references.
evaluate(
    y_true=y_val_true,
    y_pred=encoder_decoder_generated
)

[ERR:25] "линею остъвестъ чрез тѣ мѣста гдѣ доткнулис вы" -> "и отъ оста асибъ призовъ мѣтя гдѣ дот князнѣ"
[ERR:17] "ежели б возможно чтоб свадббѣ сына моего быт" -> "еделе вом моно чтоб сваглѣ о намого его"
[ERR:8] "не iзнужит но держат во въся" -> "не iзну жаи но держать двѣся"
[ERR:2] "так как нам самому" -> "тай как нах самому"
[ERR:16] "тит низом верхъ надлежит же знат что у указу" -> "и на том в ебрая надехлит же знать что у уко ]"
[ERR:12] "рыя въ 60 пушек то б на нижней полубѣ полукар" -> "рыя въ то  пушек той нанихъ ней полу бѣ полуку"
[ERR:14] "трактъ от киева до старадуба [а ска" -> "тристь отки ева доста доду на галка"
Character error rate: 32.573150%. Word error rate: 89.189189%. String accuracy: 0.000000%.


In [202]:
# Transformer model

# Pair each recognized string (src) with its reference (trg) in the column
# layout that prepare_data expects.
val_df = pd.DataFrame([(y, y_val_true[i]) for i, y in enumerate(y_val_pred)], columns=["src", "trg"])
val_df.head()

Unnamed: 0,src,trg
0,га iмен но приддрѣ ат лин кком ]былъ,[а iменно при дворѣ аглинском] былъ
1,iзлагоруд iюля 1окюх етръ,iз лагору 11 д iюля 1711 петръ
2,ко то ромм по оляотя пѣ,которым позволяется всѣ
3,вмимости i празрык,в милости i призрѣнi
4,i двиныио мож нол сло зоф здѣлат там,і двины не можно л слюзоф здѣлат там


In [203]:
# Encode the validation frame with the vocabularies built from the training data.
va_encode_input, va_decode_input, va_decode_output = prepare_data(
    val_df, tr_source_token_dict, tr_target_token_dict
)

In [215]:
# Run the transformer on the encoded validation sources (top_k=1).
va_decoded = decode(
    model,
    va_encode_input,
    start_token=tr_target_token_dict['<START>'],
    end_token=tr_target_token_dict['<END>'],
    pad_token=tr_target_token_dict['<PAD>'],
    temperature=1.0,
    top_k=1
)

In [216]:
# Reference strings, rebuilt from the encoded decoder inputs.
y_transformer_val_true = [transformer_decode(x) for x in va_decode_input]
y_transformer_val_true[:5]

['[а iменно при дворѣ аглинском] былъ',
 'iз лагору 11 д iюля 1711 петръ',
 'которым позволяется всѣ',
 'в милости i призрѣнi',
 'і двины не можно л слюзоф здѣлат там']

In [217]:
# Predicted strings, rebuilt from the decoded token ids.
y_transformer_val_pred = [transformer_decode(x) for x in va_decoded]
y_transformer_val_pred[:5]

['на iмен но придарѣ ат линском обылъ',
 'iзлагоруда іюля покладеръ',
 'ко то ромъ по олтотя пѣ',
 'вмимости i празры',
 'i двины то можиною сло золоздѣлат там']

In [213]:
# Baseline again, for comparison with the transformer result.
evaluate(y_val_true, y_val_pred)

[ERR:7] "линно i дайте нам знат дабы" -> "и но iдахтенам знатдабя"
Character error rate: 27.366609%. Word error rate: 88.963964%. String accuracy: 0.000000%.


In [218]:
# Transformer predictions on the validation set.
evaluate(
    y_true=y_transformer_val_true,
    y_pred=y_transformer_val_pred
)

[ERR:6] "колко офицероф наперет" -> "солко офицеродна перб"
[ERR:9] "нами сужъдено i вина" -> "нае мми бу ржено вина"
[ERR:14] "а когъда похочеш протиф норда часы дѣлат" -> "а к года подо че протифь норданы а садѣлять"
[ERR:11] "как дѣлат наклоном часы" -> "андрѣат на помом на 1"
[ERR:19] "повѣi ради мололюдства афицерофъ" -> "погдо ржи многи  ни завицемъ"
Character error rate: 32.228916%. Word error rate: 90.315315%. String accuracy: 0.000000%.
