# Baseline: Sequence-to-sequence

### Одним из возможных способов улучшения качества системы распознавания рукописных документов является пост-обработка предиктов с помощью модели sequence-to-sequence.

## Подготовка данных

### В качестве дополнительных данных для обучения модели можно использовать коллекцию текстов 17 века, которая была предложена организаторами соревнования GramEval2020.

In [3]:
fname = "/home/jovyan/grameval_17_century.txt"


def read_grameval(fname=fname):
    with open(fname, "r", encoding='utf-8') as f:
        lines = [x[:-1] for x in f.readlines()]
        return lines

In [4]:
grameval_texts = read_grameval(fname)

In [5]:
grameval_texts[-5]

'и то явное ихъ съ откащикомъ воровство не поставя столба въ отказные книги за споромъ чювашина бортнички написали'

### Зададим набор правил для аугментации данных с использованием шума и специфики стиля Петра I.

In [6]:
import random


# p of substitution = 1/znam
znam = 2

rules = []

#Традиционно над строкой Петр пишет «з» и «с», конечное «х», также «к» перед широкой размашистой «ж»
rules.append(('з',''))
rules.append(('c',''))
rules.append(('x',''))
rules.append(('кж', 'ж'))
#вместо старого «ѧ» уже регулярно употребляет вполне современное «я»
rules.append(('ѧ', 'я'))
#Не любит буквы «s» («зело») и «ѵ» 
rules.append(('s', ''))
rules.append(('ѵ', ''))
#Мягкий знак пропускает
rules.append(('ь', ''))


def replace_letters(line, to_replace, replace_by, znam=2):
    new_line = ''
    for letter in line:
        if letter == to_replace and random.randint(0, znam - 1) % znam == 0:
            new_line = new_line + replace_by
        else:
            new_line = new_line + letter
    return new_line
    

def apply_rule(lines, rule, znam=2):
    to_replace, replace_by = rule
    res = [
        replace_letters(line, to_replace, replace_by) for line in lines
    ]
    return res

In [7]:
print("Peter's writing rules:")
print(rules)

Peter's writing rules:
[('з', ''), ('c', ''), ('x', ''), ('кж', 'ж'), ('ѧ', 'я'), ('s', ''), ('ѵ', ''), ('ь', '')]


In [None]:
from tqdm.auto import tqdm


for rule in tqdm(rules, total=len(rules), desc="Generating data..."):
    grameval_texts = apply_rule(grameval_texts, rule)

### Скачаем обучающую выборку

In [1]:
!wget -c https://storage.yandexcloud.net/datasouls-ods/materials/46b7bb85/datasets.zip

--2020-10-21 18:31:12--  https://storage.yandexcloud.net/datasouls-ods/materials/46b7bb85/datasets.zip
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 640106308 (610M) [application/zip]
Saving to: ‘datasets.zip’


2020-10-21 18:31:28 (43.4 MB/s) - ‘datasets.zip’ saved [640106308/640106308]



In [9]:
!mkdir aij_data

import zipfile


with zipfile.ZipFile("datasets.zip", "r") as f:
    f.extractall("aij_data")

### Построим словарь для генерации шума.

In [9]:
import os
from collections import Counter


trans_dir = '/home/jovyan/aij_data/train/words'
image_dir = '/home/jovyan/aij_data/train/images'

english = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'm', 'n' ,'o', 'p', 'r', 's', 't', 'u', 'w']


def process_texts(image_dir, trans_dir):
    lens = []
    include_english = 0
    letters = ''

    lines = []
    names = []
    
    all_files = os.listdir(trans_dir)
    for filename in os.listdir(image_dir):
        if filename[:-3]+'txt' in all_files:
            name, ext = os.path.splitext(filename)
            txt_filepath = os.path.join(trans_dir, name + '.txt')
            with open(txt_filepath, 'r') as file:
                data = file.read()
                if len(data)==0:
                    continue
                if len(set(data).intersection(english))>0:
                    continue

                lines.append(data)
                names.append(filename)
                lens.append(len(data))
                letters += data

    print('Максимальная длина строки:', max(lens))
    print('Количество строк с английскими буквами ', include_english)

    return names, lines, Counter(letters)

In [10]:
names, lines, cnt = process_texts(image_dir,trans_dir)

Максимальная длина строки: 71
Количество строк с английскими буквами  0


In [11]:
letters = sorted(list(cnt.keys()))
print('Символы train:', ' '.join(letters))

Символы train:   ) + / 0 1 2 3 4 5 6 7 8 9 [ ] i k l | × ǂ а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я і ѣ – … ⊕ ⊗


### Пример аугментации

* добавление шума из словаря на посимвольном уровне;
* удаление пробелов с определенной вероятностью.

In [13]:
import numpy as np
import pandas as pd


df = pd.DataFrame(lines + grameval_texts, columns=["trg"])

def add_noise(text, symbols=letters, znam=4):
    text = list(text)
    num = len(text) // znam
    indexes = random.sample(range(0, len(text)), num)
    for i in indexes:
        if text[i]!=' ':
            text[i] = random.choice(symbols)
        else:
            del_space = np.random.choice([True, False], p=[0.3, 0.7])
            if del_space:
                text[i] = text[i].replace(" ", "")
    return ''.join(text)


df["src"] = [add_noise(t, znam=4) for t in df["trg"].tolist()]

In [14]:
df.head()

Unnamed: 0,trg,src
0,развѣ кромѣ на то развѣ на наше вой,8агвѣ кромѣ ѣа то ржзвѣ на н–ше 7щй
1,такой когда бывает осен то по феся годы выду,тад0й когда бы[ае3 осзн то по фесп гнеы выду
2,г подпол,г м]дпол
3,дению есть [i чтоб не iзволил слабѣт в при,иению есть [г 7тоб ме i⊕вмлилслдбѣ7 в пби
4,по крылам перебиратца,по крыла+ пиаебирітжа


### Используем заранее аугментированные данные, где:

* id – id семпла;
* src – исходная последовательность;
* trg – целевая последовательность;
* cn – длина исходной последовательности в символах.

Датафрейм включает в себя аугментацию предиктов бейзлайна на валидационном сете (```baseline.ipynb```) и предложений из ```grameval_17_century.txt```.

In [14]:
import pandas as pd


df = pd.read_csv("/home/jovyan/augmented_data.csv", sep=",")
df.sample(5)

Unnamed: 0,id,src,trg,cn
304337,400047,по указу великаго государя т иъ моностырского ...,по указу великаго государя т изъ моностырьског...,98
182797,256232,потиръ олотъ гладкой вѣсу въ нем 3 гривенки,потиръ золотъ гладкой вѣсу въ нем 3 гривенки,43
309834,406537,и борисъ съ ними ѣхали на подхожей станъ и не ...,и борисъ съ ними ѣхали на подхожей станъ и не ...,80
28951,40569,с достпканца божечко9,2 достоканца бочечкою,22
204573,281979,отъ перваго же авраамля и до исходъ моисеева ...,отъ перваго же авраамля и до исхода моисеева ...,65


### Вариант 1: Базовая модель Encoder-Decoder with Bahdanau Attention

##### На основе https://bastings.github.io/annotated_encoder_decoder/

In [14]:
!python -m pip install --quiet python-Levenshtein

In [16]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from IPython.core.debugger import set_trace
import Levenshtein as lev


USE_CUDA = torch.cuda.is_available()
DEVICE=torch.device('cuda:0')
print("CUDA:", USE_CUDA)
print(DEVICE)

CUDA: True
cuda:0


In [17]:
def init_seed(seed: int = 42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

In [18]:
init_seed()

### Архитектура модели

In [19]:
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many
    other models.
    """

    def __init__(self, encoder, decoder, src_embed, trg_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.trg_embed = trg_embed
        self.generator = generator

    def forward(self, src, trg, src_mask, trg_mask, src_lengths, trg_lengths):
        """Take in and process masked src and target sequences."""
        encoder_hidden, encoder_final = self.encode(src, src_mask, src_lengths)
        return self.decode(encoder_hidden, encoder_final, src_mask, trg, trg_mask)

    def encode(self, src, src_mask, src_lengths):
        return self.encoder(self.src_embed(src), src_mask, src_lengths)

    def decode(
        self,
        encoder_hidden,
        encoder_final,
        src_mask,
        trg,
        trg_mask,
        decoder_hidden=None,
    ):
        return self.decoder(
            self.trg_embed(trg),
            encoder_hidden,
            encoder_final,
            src_mask,
            trg_mask,
            hidden=decoder_hidden,
        )

In [20]:
class Generator(nn.Module):
    """Define standard linear + softmax generation step."""

    def __init__(self, hidden_size, vocab_size):
        super(Generator, self).__init__()
        self.proj = nn.Linear(hidden_size, vocab_size, bias=False)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

In [21]:
class Encoder(nn.Module):
    """Encodes a sequence of word embeddings"""

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.0):
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        self.rnn = nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )

    def forward(self, x, mask, lengths):
        """
        Applies a bidirectional GRU to sequence of embeddings x.
        The input mini-batch x needs to be sorted by length.
        x should have dimensions [batch, time, dim].
        """
        packed = pack_padded_sequence(x, lengths, batch_first=True)
        output, final = self.rnn(packed)
        output, _ = pad_packed_sequence(output, batch_first=True)

        # we need to manually concatenate the final states for both directions
        fwd_final = final[0 : final.size(0) : 2]
        bwd_final = final[1 : final.size(0) : 2]
        # [num_layers, batch, 2*dim]
        final = torch.cat([fwd_final, bwd_final], dim=2)

        return output, final

In [22]:
class Decoder(nn.Module):
    """A conditional RNN decoder with attention."""

    def __init__(
        self, emb_size, hidden_size, attention, num_layers=1, dropout=0.5, bridge=True
    ):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.attention = attention
        self.dropout = dropout

        self.rnn = nn.GRU(
            emb_size + 2 * hidden_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )

        # to initialize from the final encoder state
        self.bridge = (
            nn.Linear(2 * hidden_size, hidden_size, bias=True) if bridge else None
        )

        self.dropout_layer = nn.Dropout(p=dropout)
        self.pre_output_layer = nn.Linear(
            hidden_size + 2 * hidden_size + emb_size, hidden_size, bias=False
        )

    def forward_step(self, prev_embed, encoder_hidden, src_mask, proj_key, hidden):
        """Perform a single decoder step (1 word)"""

        # compute context vector using attention mechanism
        query = hidden[-1].unsqueeze(1)  # [#layers, B, D] -> [B, 1, D]
        context, attn_probs = self.attention(
            query=query, proj_key=proj_key, value=encoder_hidden, mask=src_mask
        )

        # update rnn hidden state
        rnn_input = torch.cat([prev_embed, context], dim=2)
        output, hidden = self.rnn(rnn_input, hidden)

        pre_output = torch.cat([prev_embed, output, context], dim=2)
        pre_output = self.dropout_layer(pre_output)
        pre_output = self.pre_output_layer(pre_output)

        return output, hidden, pre_output

    def forward(
        self,
        trg_embed,
        encoder_hidden,
        encoder_final,
        src_mask,
        trg_mask,
        hidden=None,
        max_len=None,
    ):
        """Unroll the decoder one step at a time."""

        # the maximum number of steps to unroll the RNN
        if max_len is None:
            max_len = trg_mask.size(-1)

        # initialize decoder hidden state
        if hidden is None:
            hidden = self.init_hidden(encoder_final)

        # pre-compute projected encoder hidden states
        # (the "keys" for the attention mechanism)
        # this is only done for efficiency
        proj_key = self.attention.key_layer(encoder_hidden)

        # here we store all intermediate hidden states and pre-output vectors
        decoder_states = []
        pre_output_vectors = []

        # unroll the decoder RNN for max_len steps
        for i in range(max_len):
            prev_embed = trg_embed[:, i].unsqueeze(1)
            output, hidden, pre_output = self.forward_step(
                prev_embed, encoder_hidden, src_mask, proj_key, hidden
            )
            decoder_states.append(output)
            pre_output_vectors.append(pre_output)

        decoder_states = torch.cat(decoder_states, dim=1)
        pre_output_vectors = torch.cat(pre_output_vectors, dim=1)
        return decoder_states, hidden, pre_output_vectors  # [B, N, D]

    def init_hidden(self, encoder_final):
        """Returns the initial decoder state,
        conditioned on the final encoder state."""

        if encoder_final is None:
            return None  # start with zeros

        return torch.tanh(self.bridge(encoder_final))

In [23]:
class BahdanauAttention(nn.Module):
    """Implements Bahdanau (MLP) attention"""

    def __init__(self, hidden_size, key_size=None, query_size=None):
        super(BahdanauAttention, self).__init__()

        # We assume a bi-directional encoder so key_size is 2*hidden_size
        key_size = 2 * hidden_size if key_size is None else key_size
        query_size = hidden_size if query_size is None else query_size

        self.key_layer = nn.Linear(key_size, hidden_size, bias=False)
        self.query_layer = nn.Linear(query_size, hidden_size, bias=False)
        self.energy_layer = nn.Linear(hidden_size, 1, bias=False)

        # to store attention scores
        self.alphas = None

    def forward(self, query=None, proj_key=None, value=None, mask=None):
        assert mask is not None, "mask is required"

        # We first project the query (the decoder state).
        # The projected keys (the encoder states) were already pre-computated.
        query = self.query_layer(query)

        # Calculate scores.
        scores = self.energy_layer(torch.tanh(query + proj_key))
        scores = scores.squeeze(2).unsqueeze(1)

        # Mask out invalid positions.
        # The mask marks valid positions so we invert it using `mask & 0`.
        scores.data.masked_fill_(mask == 0, -float("inf"))

        # Turn scores to probabilities.
        alphas = F.softmax(scores, dim=-1)
        self.alphas = alphas

        # The context vector is the weighted sum of the values.
        context = torch.bmm(alphas, value)

        # context shape: [B, 1, 2D], alphas shape: [B, 1, M]
        return context, alphas

In [24]:
class Batch:
    def __init__(self, src, trg, pad_index=0):

        src, src_lengths = src

        self.src = src
        self.src_lengths = src_lengths
        self.src_mask = (src != pad_index).unsqueeze(-2)
        self.nseqs = src.size(0)

        self.trg = None
        self.trg_y = None
        self.trg_mask = None
        self.trg_lengths = None
        self.ntokens = None

        if trg is not None:
            trg, trg_lengths = trg
            self.trg = trg[:, :-1]
            self.trg_lengths = trg_lengths
            self.trg_y = trg[:, 1:]
            self.trg_mask = self.trg_y != pad_index
            self.ntokens = (self.trg_y != pad_index).data.sum().item()

        if USE_CUDA:
            self.src = self.src.cuda()
            self.src_mask = self.src_mask.cuda()

            if trg is not None:
                self.trg = self.trg.cuda()
                self.trg_y = self.trg_y.cuda()
                self.trg_mask = self.trg_mask.cuda()

In [25]:
class SimpleLossCompute:
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(
            x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)
        )
        loss = loss / norm

        if self.opt is not None:
            loss.backward()
            self.opt.step()
            self.opt.zero_grad()

        return loss.data.item() * norm

In [26]:
def make_model(
    src_vocab, tgt_vocab, emb_size=256, hidden_size=512, num_layers=1, dropout=0.1
):
    "Helper: Construct a model from hyperparameters."

    attention = BahdanauAttention(hidden_size)

    model = EncoderDecoder(
        Encoder(emb_size, hidden_size, num_layers=num_layers, dropout=dropout),
        Decoder(
            emb_size, hidden_size, attention, num_layers=num_layers, dropout=dropout
        ),
        nn.Embedding(src_vocab, emb_size),
        nn.Embedding(tgt_vocab, emb_size),
        Generator(hidden_size, tgt_vocab),
    )

    return model.cuda() if USE_CUDA else model

In [27]:
def print_examples(
    example_iter,
    model,
    n=2,
    max_len=256,
    src_vocab=None,
    trg_vocab=None,
):
    """Prints N examples. Assumes batch size of 1."""

    model.eval()
    count = 0
    print()

    if src_vocab is not None and trg_vocab is not None:
        src_eos_index = src_vocab.stoi[EOS_TOKEN]
        trg_sos_index = trg_vocab.stoi[SOS_TOKEN]
        trg_eos_index = trg_vocab.stoi[EOS_TOKEN]
    else:
        src_eos_index = None
        trg_sos_index = 1
        trg_eos_index = None

    for i, batch in enumerate(example_iter):

        src = batch.src.cpu().numpy()[0, :]
        trg = batch.trg_y.cpu().numpy()[0, :]

        src = src[:-1] if src[-1] == src_eos_index else src
        trg = trg[:-1] if trg[-1] == trg_eos_index else trg

        result, _ = greedy_decode(
            model,
            batch.src,
            batch.src_mask,
            batch.src_lengths,
            max_len=max_len,
            sos_index=trg_sos_index,
            eos_index=trg_eos_index,
        )
        print("Example #%d" % (i + 1))
        print("Src : ", " ".join(lookup_words(src, vocab=src_vocab)))
        print("Trg : ", " ".join(lookup_words(trg, vocab=trg_vocab)))
        print("Pred: ", " ".join(lookup_words(result, vocab=trg_vocab)))
        print()

        count += 1
        if count == n:
            break

In [26]:
from torchtext import data, datasets


def tokenize(text):
    return list(text)


PAD_TOKEN = "{"
SOS_TOKEN = "~"
EOS_TOKEN = "^"
UNK_TOKEN = "&"


ID = data.Field(sequential=False, use_vocab=False)

SOURCE = data.Field(
    tokenize=tokenize,
    batch_first=True,
    lower=False,
    include_lengths=True,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    init_token=None,
    eos_token=EOS_TOKEN,
)

TARGET = data.Field(
    tokenize=tokenize,
    batch_first=True,
    lower=False,
    include_lengths=True,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    init_token=SOS_TOKEN,
    eos_token=EOS_TOKEN,
)

LEN = data.Field(sequential=False, use_vocab=False)

data_fields = [("id", ID), ("src", SOURCE), ("trg", TARGET), ("cn", LEN)]

In [21]:
import os


encoder_decoder_model_path = "encoder_decoder_model"


if not os.path.exists(encoder_decoder_model_path):
    os.makedirs(encoder_decoder_model_path)

In [30]:
data_path = "/home/jovyan/augmented_data.csv"

encoder_decoder_data = data.TabularDataset(
    path=data_path, format="csv", skip_header=True, fields=data_fields
)

train_data, dev_data = encoder_decoder_data.split(
    split_ratio=[0.9, 0.1], stratified=True, strata_field="cn"
)

SOURCE.build_vocab(train_data.src)
TARGET.build_vocab(train_data.trg)
PAD_INDEX = TARGET.vocab.stoi[PAD_TOKEN]

In [31]:
BATCH_SIZE = 16

train_iter = data.BucketIterator(
    train_data,
    batch_size=BATCH_SIZE,
    train=True,
    sort_within_batch=True,
    sort_key=lambda x: (len(x.src), len(x.trg)),
    repeat=False,
    device=DEVICE,
    shuffle=True,
)

valid_iter_batch = data.Iterator(
    dev_data,
    batch_size=BATCH_SIZE,
    train=False,
    sort_within_batch=True,
    sort_key=lambda x: (len(x.src), len(x.trg)),
    repeat=False,
    device=DEVICE,
    shuffle=False,
)

In [32]:
def rebatch(pad_idx, batch):
    """Wrap torchtext batch into our own Batch class for pre-processing"""
    return Batch(batch.src, batch.trg, pad_idx)


def run_epoch(data_iter, model, loss_compute, print_every=50, num_batches=100):
    """Standard Training and Logging Function"""

    start = time.time()
    total_loss = 0
    print_tokens = 0
    total_tokens = 0

    with tqdm(total=num_batches) as pbar:
        for i, batch in enumerate(data_iter, 1):

            out, _, pre_output = model.forward(
                batch.src,
                batch.trg,
                batch.src_mask,
                batch.trg_mask,
                batch.src_lengths,
                batch.trg_lengths,
            )
            loss = loss_compute(pre_output, batch.trg_y, batch.nseqs)
            total_loss += loss
            print_tokens += batch.ntokens
            total_tokens += batch.ntokens

            if model.training and i % print_every == 0:
                elapsed = time.time() - start
                print(
                    "Epoch Step: %d Loss: %f Tokens per Sec: %f"
                    % (i, loss / batch.nseqs, print_tokens / elapsed)
                )
                start = time.time()
                print_tokens = 0

            pbar.update(1)

    loss = total_loss / float(total_tokens)
    perplexity = math.exp(loss)

    return perplexity, loss

In [33]:
def translate_batch(batch, vocab, target=True):
    res = []
    eos_index = vocab.stoi[EOS_TOKEN]
    batch = batch.trg.tolist() if target else batch.src.tolist()
    for s in batch:
        first_eos = np.where(np.array(s) == eos_index)[0]
        if len(first_eos) > 0:
            res.append(
                "".join(lookup_words(s[: first_eos[0]], vocab=vocab))
                .replace("~", "")
                .strip()
            )
        else:
            res.append(
                "".join(lookup_words(s[:], vocab=vocab)).replace("~", "").strip()
            )
    return res

In [34]:
MAX_LEN = 128


def greedy_decode_batch(
    model, src, src_mask, src_lengths, max_len=MAX_LEN, sos_index=1, eos_index=None
):
    """Greedily decode a sentence."""
    batch_size = src.size(0)

    with torch.no_grad():
        encoder_hidden, encoder_final = model.encode(src, src_mask, src_lengths)
        prev_y = torch.ones(batch_size, 1).fill_(sos_index).type_as(src)
        trg_mask = torch.ones_like(prev_y)

    output, hidden = [], None

    for i in range(max_len):
        with torch.no_grad():
            out, hidden, pre_output = model.decode(
                encoder_hidden, encoder_final, src_mask, prev_y, trg_mask, hidden
            )
            prob = model.generator(pre_output[:, -1])

        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data
        output.append(next_word.cpu().numpy())
        prev_y = next_word.unsqueeze(dim=1)

    output = np.array(output)
    output = np.stack(output).T

    return output


def predict(
    example_iter,
    model,
    max_len=MAX_LEN,
    src_vocab=None,
    trg_vocab=None,
    num_batches=100,
):
    model.eval()

    if src_vocab is not None and trg_vocab is not None:
        trg_sos_index = trg_vocab.stoi[SOS_TOKEN]
        trg_eos_index = trg_vocab.stoi[EOS_TOKEN]
    else:
        trg_sos_index = 1
        trg_eos_index = None

    preds, sources, targets = [], [], []

    with tqdm(total=num_batches) as pbar:
        for i, batch in enumerate(example_iter):

            source_batch = translate_batch(batch, vocab=SOURCE.vocab, target=False)
            target_batch = translate_batch(batch, vocab=TARGET.vocab, target=True)

            sources.extend(source_batch)
            targets.extend(target_batch)

            output = greedy_decode_batch(
                model,
                batch.src,
                batch.src_mask,
                batch.src_lengths,
                max_len=max_len,
                sos_index=trg_sos_index,
                eos_index=trg_eos_index,
            )

            if trg_eos_index is not None:
                for pred in output:
                    if type(pred) == list:
                        pred = np.array(pred)
                    first_eos = np.where(pred == trg_eos_index)[0]
                    if len(first_eos) > 0:
                        # produce sentences
                        preds.append(
                            "".join(lookup_words(pred[: first_eos[0]], vocab=trg_vocab))
                        )
                    else:
                        preds.append("".join(lookup_words(pred[:], vocab=trg_vocab)))
            pbar.update(1)
    return preds, sources, targets

In [35]:
def compute_metrics(tr_loss, tr_ppl, val_loss, val_ppl):
    res = {
        "Train Loss": np.mean(tr_loss),
        "Train PPL": np.mean(tr_ppl),
        "Validation Loss": np.mean(val_loss),
        "Validation PPL": np.mean(val_ppl)
    }
    return res

In [22]:
import json


def save_json(fname, obj):
    with open(fname, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=4)


def save_results(all_predictions, all_targets, all_sources, model_path=encoder_decoder_model_path):
    res = {
        "predictions": all_predictions,
        "targets": all_targets,
        "sources": all_sources,
    }

    save_json(os.path.join(model_path, "encoder_decoder_results.json"), res)

In [37]:
def train(
    model, criterion, optim, source_vocab, target_vocab, num_epochs=10, print_every=500, model_path=encoder_decoder_model_path
):
    if USE_CUDA:
        model.cuda()

    train_losses, valid_losses = [], []
    train_perplexities, valid_perplexities = [], []

    for epoch in range(num_epochs):
        epoch = epoch + 1
        print("Epoch", epoch)
        print("Training the model")
        model.train()

        train_perplexity, train_loss = run_epoch(
            (rebatch(PAD_INDEX, b) for b in train_iter),
            model,
            SimpleLossCompute(model.generator, criterion, optim),
            print_every=print_every,
            num_batches=len(train_iter),
        )

        print("Train Loss: %f" % train_loss)
        model.eval()
        with torch.no_grad():
            print("Evaluating the model")
            # print_examples((rebatch(PAD_INDEX, x) for x in valid_iter), model, n=3, src_vocab=source_vocab.vocab, trg_vocab=target_vocab.vocab)
            dev_perplexity, dev_loss = run_epoch(
                (rebatch(PAD_INDEX, b) for b in valid_iter_batch),
                model,
                SimpleLossCompute(model.generator, criterion, None),
                num_batches=len(valid_iter_batch),
            )

            train_losses.append(train_loss)
            train_perplexities.append(train_perplexity)
            valid_losses.append(dev_loss)
            valid_perplexities.append(dev_perplexity)

            print("*" * 30)
            print("Epoch metrics\n")
            print("Validation perplexity: %3.f \n" % dev_perplexity)
            print("Validation Loss: %3.f " % dev_loss)

            print("*" * 30)

            if epoch == num_epochs:
                model_name = os.path.join(model_path, "encoder_decoder_model.pt")

                print("Saving model %s" % model_name)

                torch.save(model.state_dict(), model_name)

                preds, sources, targets = predict(
                    (rebatch(PAD_INDEX, x) for x in valid_iter_batch),
                    model,
                    max_len=MAX_LEN,
                    src_vocab=source_vocab.vocab,
                    trg_vocab=target_vocab.vocab,
                    num_batches=len(valid_iter_batch),
                )

                save_results(preds, targets, sources)

    return train_perplexities, valid_perplexities

In [38]:
def lookup_words(x, vocab=None):
    if vocab is not None:
        x = [vocab.itos[i] for i in x]
    return [str(t) for t in x]

In [41]:
EMB_SIZE = 256
HIDDEN_SIZE = 512
NUM_LAYERS = 2
DROPOUT_RATE = 0.25
LEARNING_RATE = 2*1e-5

model = make_model(
    len(SOURCE.vocab),
    len(TARGET.vocab),
    emb_size=EMB_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT_RATE,
)

criterion = nn.CrossEntropyLoss(reduction="sum", ignore_index=PAD_INDEX)
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
NUM_EPOCHS = 15
PRINT_EVERY = 20000

train_perplexities, valid_perplexities = train(
    model=model,
    num_epochs=NUM_EPOCHS,
    print_every=PRINT_EVERY,
    criterion=criterion,
    optim=optim,
    source_vocab=SOURCE,
    target_vocab=TARGET,
)

In [48]:
train_perplexities

[1.890741110764172,
 1.4811114213728331,
 1.3909569295283717,
 1.3182854119263536,
 1.2850568909004818,
 1.2535985835721168,
 1.2308219660135085,
 1.2078453773353905,
 1.1877415031559226,
 1.1736294333979227,
 1.160788853441537,
 1.1484450470626477,
 1.1384484172610505,
 1.1279865213512126,
 1.1190539476046744]

In [49]:
ppls = {
    "train": train_perplexities,
    "val": valid_perplexities
}


save_json(
    os.path.join(encoder_decoder_model_path, "ppls.json"), ppls
)

save_json(
    os.path.join(encoder_decoder_model_path, "source_vocab.json"), SOURCE.vocab.stoi
)

save_json(
    os.path.join(encoder_decoder_model_path, "target_vocab.json"), TARGET.vocab.stoi
)

### Вариант 2

### Модель sequence-to-sequence на основе архитектуры трансформера
https://github.com/CyberZHG/keras-transformer

In [12]:
!python -m pip install --quiet keras-transformer

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [15]:
df.sample(5)

Unnamed: 0,id,src,trg,cn
251876,338008,въ лото 6852 преставися княь 1рославъ алеэсанд...,въ лѣто 6852 преставися князь ярославъ алексан...,60
196760,272746,полмехб лисего хребтоваго длиною 2 аршъи ширин...,полмеха лисего хребтоваго длиною 2 аршъ ширино...,76
82414,116372,9тъ 2огч убо за г⊗ѣхи порабощеъи,отъ бога убо за грѣхи порабощени,32
60381,84973,ис[оріи съиускіъ,исторіи скифскія,16
182205,255556,2пэаыкарубковыешитсолотоми,2 платка рубковые шиты золотомъ,26


In [16]:
from sklearn.model_selection import train_test_split


transformer_model_path = "transformer_model"
if not os.path.exists(transformer_model_path):
    os.makedirs(transformer_model_path)


strata = df["cn"].values
train_df, test_df = train_test_split(df, stratify=strata, train_size=0.9, shuffle=True)


train_df.to_csv(os.path.join(transformer_model_path, "train.tsv"), sep="\t", index=False)
test_df.to_csv(os.path.join(transformer_model_path, "test.tsv"), sep="\t", index=False)

In [17]:
def build_token_dict(text_list):
    token_dict = {
        '<PAD>': 0,
        '<START>': 1,
        '<END>': 2,
    }
    for text in tqdm(text_list, total=len(text_list)):
        for token in text:
            if token not in token_dict:
                token_dict[token] = len(token_dict)
    return token_dict


def prepare_data(df, source_token_dict, target_token_dict):
    df["src_tok"] = df["src"].apply(tokenize)
    df["trg_tok"] = df["trg"].apply(tokenize)

    source_tokens = df["src_tok"].tolist()
    target_tokens = df["trg_tok"].tolist()

    encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
    decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
    output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]
    
    source_max_len = max(map(len, encode_tokens))
    target_max_len = max(map(len, decode_tokens))
    
    encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
    decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
    output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]
    
    encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
    decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
    decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]
    
    return encode_input, decode_input, decode_output

In [None]:
tr_source_token_dict = build_token_dict(train_df.src.tolist())
tr_target_token_dict = build_token_dict(train_df.trg.tolist())

tr_target_token_dict_inv = {v: k for k, v in tr_target_token_dict.items()}
tr_source_token_dict_inv = {v: k for k, v in tr_source_token_dict.items()}

In [23]:
save_json(
    os.path.join(transformer_model_path, "source_token_dict.json"), tr_source_token_dict
)

save_json(
    os.path.join(transformer_model_path, "target_token_dict.json"), tr_target_token_dict
)

In [24]:
from keras.optimizers import Adam
from keras_transformer import get_model
import numpy as np


EMBED_DIM = 512
HIDDEN_DIM = 256
HEAD_NUM = 4
ENC_NUM = 3
DEC_NUM = 3
DROPOUT_RATE = 0.2
LEARNING_RATE = 0.00001


model = get_model(
    token_num=max(len(tr_source_token_dict), len(tr_target_token_dict)),
    embed_dim=EMBED_DIM,
    encoder_num=ENC_NUM,
    decoder_num=DEC_NUM,
    head_num=HEAD_NUM,
    hidden_dim=HIDDEN_DIM,
    dropout_rate=DROPOUT_RATE,
    use_same_embed=False,
)

model.compile(
    optimizer=Adam(lr=LEARNING_RATE), loss='sparse_categorical_crossentropy'
)

In [27]:
BATCH_SIZE = 32
EPOCH_NUM = 15


encode_input, decode_input, decode_output = prepare_data(
    train_df, tr_source_token_dict, tr_target_token_dict
)

model.fit(
    x=[np.array(encode_input), np.array(decode_input)],
    y=np.array(decode_output),
    epochs=EPOCH_NUM,
    batch_size=BATCH_SIZE,
    verbose=2
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Epoch 1/15


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


9170/9170 - 477s - loss: 1.1074
Epoch 2/15
9170/9170 - 477s - loss: 0.6864
Epoch 3/15
9170/9170 - 477s - loss: 0.3320
Epoch 4/15
9170/9170 - 477s - loss: 0.2509
Epoch 5/15
9170/9170 - 477s - loss: 0.2169
Epoch 6/15
9170/9170 - 477s - loss: 0.1975
Epoch 7/15
9170/9170 - 477s - loss: 0.1849
Epoch 8/15
9170/9170 - 478s - loss: 0.1752
Epoch 9/15
9170/9170 - 478s - loss: 0.1674
Epoch 10/15
9170/9170 - 478s - loss: 0.1609
Epoch 11/15
9170/9170 - 478s - loss: 0.1553
Epoch 12/15
9170/9170 - 479s - loss: 0.1503
Epoch 13/15
9170/9170 - 478s - loss: 0.1456
Epoch 14/15
9170/9170 - 478s - loss: 0.1415
Epoch 15/15
9170/9170 - 478s - loss: 0.1376


<tensorflow.python.keras.callbacks.History at 0x7f5a95442a58>

In [28]:
model.save(os.path.join(transformer_model_path, "transformer_model_base.h5"))

### Оценка моделей на внутренней тестовой выборке

In [55]:
!python -m pip install --quiet editdistance

In [62]:
import editdistance


def evaluate(y_true, y_pred, print_num=50):
    numCharErr = 0
    numCharTotal = 0
    numStringOK = 0
    numStringTotal = 0
    counter = 0

    word_eds, word_true_lens = [], []
    
    for i, pred in enumerate(y_pred):
        true = y_true[i]
        
        numStringOK += 1 if true == pred else 0
        
        numStringTotal += 1
        dist = editdistance.eval(pred, true)
        
        numCharErr += dist
        numCharTotal += len(true)
        
        pred_words = pred.split()
        true_words = true.split()
        word_eds.append(editdistance.eval(pred_words, true_words))
        word_true_lens.append(len(true_words))
        
        is_print = np.random.choice([True, False], p=[0.05, 0.95])
        if is_print and counter < print_num and len(true) > 15:
            print('[OK]' if dist==0 else '[ERR:%d]' % dist,'"' + true + '"', '->', '"' + pred + '"')
            counter += 1

    charErrorRate = numCharErr / numCharTotal
    wordErrorRate = sum(word_eds) / sum(word_true_lens) 
    stringAccuracy = numStringOK / numStringTotal
    print(
        'Character error rate: %f%%. Word error rate: %f%%. String accuracy: %f%%.' % \
        (charErrorRate*100.0,wordErrorRate*100.0, stringAccuracy*100.0)
    )

#### Encoder-decoder model

In [50]:
class CharTokenizer(object):

    def __init__(self, config):
        self.config = config
        self.src_stoi = self.config["vocab"]["src_stoi"]
        self.trg_stoi = self.config["vocab"]["trg_stoi"]
        self.src_itos = {v: k for k, v in self.src_stoi.items()}
        self.trg_itos = {v: k for k, v in self.trg_stoi.items()}
        self.eos_token = self.config["tok"]["eos_token"]
        self.unk_token = self.config["tok"]["unk_token"]
        self.pad_token = self.config["tok"]["pad_token"]
        self.sos_token = self.config["tok"]["sos_token"]
    
    def encode(self, sequence):
        enc = [self.src_stoi[char] if char in self.src_stoi else self.stoi[self.unk_token_id] for char in list(sequence)] + [self.src_stoi[self.eos_token]]
        return torch.tensor(enc).unsqueeze(0)
    
    def create_mask(self, enc):
        return (enc != self.src_stoi[self.pad_token]).unsqueeze(-2)
    
    def get_length(self, enc):
        return torch.tensor(enc.shape[-1], dtype=torch.int64).unsqueeze(0)


def load_model(config, device):

    model_params = config["model"]
    model_path = model_params["model_path"]
    emb_size = model_params["emb_size"]
    hidden_size = model_params["hidden_size"]
    num_layers = model_params["num_layers"]
    dropout = model_params["dropout"]

    state_dict = torch.load(model_path)
    source_dim = state_dict["src_embed.weight"].shape[0]
    target_dim = state_dict["trg_embed.weight"].shape[0]

    model = EncoderDecoder(
        Encoder(emb_size, hidden_size, num_layers=num_layers, dropout=dropout),
        Decoder(emb_size, hidden_size, BahdanauAttention(hidden_size), num_layers=num_layers, dropout=dropout),
        nn.Embedding(source_dim, emb_size),
        nn.Embedding(target_dim, emb_size),
        Generator(hidden_size, target_dim)
    )

    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model


def load_json(fname_path):
    with open(fname_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data


def build_config(encoder_decoder_model_path,
                 model_name="encoder_decoder_model.pt",
                 emb_size=256,
                 hidden_size=512,
                 num_layers=2,
                 dropout_rate=0.2,
                 source_vocab="source_vocab.json",
                 target_vocab="target_vocab.json"
                ):
    config = {
        "model": {
            "model_path": os.path.join(encoder_decoder_model_path, model_name),
            "emb_size": emb_size,
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "dropout": dropout_rate
        },
        "tok": {
            "pad_token": PAD_TOKEN,
            "sos_token": SOS_TOKEN,
            "eos_token": EOS_TOKEN,
            "unk_token": UNK_TOKEN
        },
        "vocab": {
            "src_stoi": load_json(
                os.path.join(encoder_decoder_model_path, source_vocab)
            ),
            "trg_stoi": load_json(
                os.path.join(encoder_decoder_model_path, target_vocab)
            )
        }
    }
    return config

In [51]:
def dummy_copy(sequence, output):
    diff = len(sequence) - len(output)
    if diff > 0:
        return output + sequence[-diff:]
    return output


def greedy_decode(sequence, model, tokenizer, device, copy=False, max_len=256):
    src = tokenizer.encode(sequence).to(device)
    src_mask = tokenizer.create_mask(src).to(device)
    src_length = tokenizer.get_length(src).to(device)
    sos_index = tokenizer.trg_stoi[tokenizer.sos_token]
    eos_index = tokenizer.trg_stoi[tokenizer.eos_token]

    with torch.no_grad():
        encoder_hidden, encoder_final = model.encode(src, src_mask, src_length)
        prev_y = torch.ones(1, 1).fill_(sos_index).type_as(src)
        trg_mask = torch.ones_like(prev_y)

    output = []
    hidden = None

    for i in range(max_len):
        with torch.no_grad():
            out, hidden, pre_output = model.decode(
              encoder_hidden, encoder_final, src_mask,
              prev_y, trg_mask, hidden)
            prob = model.generator(pre_output[:, -1])

        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data.item()
        output.append(next_word)
        prev_y = torch.ones(1, 1).type_as(src).fill_(next_word)
    
    output = np.array(output)
    if eos_index is not None:
        first_eos = np.where(output==eos_index)[0]
        if len(first_eos) > 0:
            output = output[:first_eos[0]]      
    
    output = "".join([tokenizer.trg_itos[token_id] for token_id in output.tolist()])
    return output

In [52]:
import json


encoder_decoder_model_config = build_config(encoder_decoder_model_path=encoder_decoder_model_path)
tokenizer = CharTokenizer(config=encoder_decoder_model_config)
encoder_decoder_model = load_model(config=encoder_decoder_model_config, device=DEVICE)

In [53]:
import json


encoder_decoder_result_path = os.path.join(encoder_decoder_model_path, "encoder_decoder_results.json")
encoder_decoder_result = load_json(encoder_decoder_result_path)

In [54]:
encoder_decoder_prediction = encoder_decoder_result["predictions"]
encoder_decoder_trgs = encoder_decoder_result["targets"]
encoder_decoder_srcs = encoder_decoder_result["sources"]

#### Пример генерации на аугментированных данных

In [58]:
encoder_decoder_srcs[-50]

'потомй жх воздгилнуша церков съ ткапезою др1вяну во имя святыхъ ве8имихн гучедикъ 40 иже въ севаттіи'

In [59]:
encoder_decoder_prediction[-50]

'потомъ же воздвилнуша церковь съ трапезою древяну во имя святыхъ великихъ гученикъ 40 иже въ севастіи'

In [60]:
encoder_decoder_trgs[-50]

'потомъ же воздвигнуша церковь съ трапезою древяну во имя святыхъ великихъ мученикъ 40 иже въ севастіи'

### Исходные значения метрик

In [63]:
evaluate(
    y_true=encoder_decoder_trgs,
    y_pred=encoder_decoder_srcs
)

[ERR:6] "марта въ 31 день" -> "ма1тавъ3ыден"
[ERR:6] "ничто же бо имѣю" -> "вичтожебосмѣт"
[ERR:4] "такъ   отскочили" -> "такъотскодили"
[ERR:2] "1 чаршка яшмовая" -> "1чаршкаяшмовая"
[ERR:5] "сигъ подо зваромъ" -> "сиъьпошоваромъ"
[ERR:3] "пирогъ росольной" -> "пиобгъ росолной"
[ERR:3] "марта въ 21 день" -> "марта вл 21 ьен"
[ERR:3] "трошке васильеву" -> "трршкевасил4еву"
[ERR:2] "иванъ вельяминовъ" -> "иванъвеляминовъ"
[ERR:6] "2 бочки полуперцу" -> "2бопкиколуйенцу"
[ERR:6] "степанку лазареву" -> "стъцансулафреву"
[ERR:4] "7102го  въ 26 день" -> "7102говг 26 ден"
[ERR:5] "иду жъ я въ дорогу" -> "идужъзвъ дор0гу"
[ERR:2] "пирогъ колобобой" -> "пирогъ щплобобой"
[ERR:4] "апрѣля въ 7 день" -> "ам]ѣл7 въ 7 д⊗нь"
[ERR:3] "петръ сандетцкой" -> "петръ жандетцсог"
[OK] "игоне офонасьеву" -> "игоне офонасьеву"
[ERR:3] "поѣхали на войну" -> "поѣхала н2 войнч"
[ERR:4] "исторіи скифскія" -> "ис[оріи съиускіъ"
[ERR:3] "степану щоголеву" -> "йтепуну щоголе2у"
[ERR:4] "гуляй золотаревъ" -> "гуляй 

### Целевые значения метрик

In [64]:
evaluate(
    y_true=encoder_decoder_trgs,
    y_pred=encoder_decoder_prediction
)

[OK] "ноября въ 22 день" -> "ноября въ 22 день"
[OK] "декабря въ 1 день" -> "декабря въ 1 день"
[ERR:7] "и збыся во время се" -> "и бысть возвресяся"
[OK] "марта въ 31 день" -> "марта въ 31 день"
[OK] "олешке кричалеву" -> "олешке кричалеву"
[OK] "ондрей подлесовъ" -> "ондрей подлесовъ"
[OK] "7102го  въ 26 день" -> "7102го  въ 26 день"
[OK] "сентября въ 20 день" -> "сентября въ 20 день"
[OK] "сентября въ 19 день" -> "сентября въ 19 день"
[ERR:1] "4440 четъ ячмени" -> "4440 четъ тячмени"
[ERR:2] "пятой григорьевъ" -> "пятка григорьевъ"
[ERR:2] "тѣ  тамъ погибли" -> "тѣ  таты погибли"
[OK] "богданъ поздеевъ" -> "богданъ поздеевъ"
[OK] "лучѣ бы повѣсить" -> "лучѣ бы повѣсить"
[OK] "беляйку федорову" -> "беляйку федорову"
[OK] "марта въ 31 день" -> "марта въ 31 день"
[ERR:1] "бабарике фролову" -> "бадарике фролову"
[OK] "тренке зиновьеву" -> "тренке зиновьеву"
[OK] "о измѣнѣ донской" -> "о измѣнѣ донской"
[ERR:4] "у другого столпа" -> "у лучного столпа"
[ERR:1] "первуше обрамову" -> "перву

#### Transformer model

In [None]:
import tensorflow.python.util.deprecation as deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False


te_encode_input, te_decode_input, te_decode_output = prepare_data(
    test_df, tr_source_token_dict, tr_target_token_dict
)

In [None]:
from keras_transformer import decode


def transformer_decode(decode_input, vocab=tr_target_token_dict_inv):
    decode_input = [x for x in decode_input if not vocab[x] in ("<PAD>", "<END>", "<START>")]
    return "".join(map(lambda x: vocab[x], decode_input))


decoded = decode(
    model,
    te_encode_input,
    start_token=tr_target_token_dict['<START>'],
    end_token=tr_target_token_dict['<END>'],
    pad_token=tr_target_token_dict['<PAD>'],
    temperature=1.0,
)

In [33]:
y_transformer_test_true = [transformer_decode(x) for x in te_decode_input]
y_transformer_test_true[:5]

['богь отець богь сынь правда истинна милосердіе мирь',
 'куплены въ полату въ казенную на дверь',
 'у подлинной памяти припись діака григорья нечаева справка подьячево юрья собакина',
 'у главы жъ 2 жемчюга да 2 яхонты въ гнѣздехъ',
 'на оборотѣ столникъ донской']

In [34]:
y_transformer_test_pred = [transformer_decode(x) for x in decoded]
y_transformer_test_pred[:5]

['того ототь бого сынь правда истанна миласердіе синь',
 'куплены въ солату въ казеннуюнцоверь',
 'у подлинной памяти припись діака григоря нечаева справка подьячевою рясобакина',
 'у главе жъ и жежзюга да и яхостыхъ гнѣздехъ',
 'на оборотѣ похлоикъ понакой']

### Исходные значения метрик

In [38]:
evaluate(
    y_true=y_transformer_test_true,
    y_pred=test_df["src"].tolist()
)

[ERR:11] "но изгубилъ многое воинство отыде со срамомъ многимъ" -> "но итгубилъ хногоу во]нстщо отцде 8о срамоьс внокимъ"
[ERR:8] "подать емельяну игнатевичю украинцову" -> "податг имесьюнуигнатемичю украингооу"
[ERR:7] "стольники были на службѣ безъсъезду" -> "стольни6и э5ли 7а слу]бѣ жезъс езду"
[ERR:9] "а по осмотру въ томъ сундукѣ бѣлаго платя" -> "6 по осмутру вч тцмъ …у0дукѣ бѣла0о шлакя"
[ERR:11] "на оборотѣ  172 октября въ 16 д подалъ бобыль гаврилка" -> "на обо2оиѣ  172 оi3ябдя въ 10 н подалъ бо5ыль гжвѣилк…"
[OK] "а у подлинной грамоты назади пишетъ" -> "а у подлинной грамоты назади пишетъ"
[ERR:13] "потребно ю съясти изрядно разрѣзавши на мелкія части" -> "2нткефнф ю сьязти 1⊗р1дно разрѣзавшина меніія части"
[OK] "и сведши турки три брани съ волохи на нихъ же всюду поражаеми бяху" -> "и сведши турки три брани съ волохи на нихъ же всюду поражаеми бяху"
[ERR:1] "богъ же по   и исцелиль ихъ" -> "богъ же по   и исцелил ихъ"
[ERR:13] "и были въ войскѣ запорожскомъ у гетмана у бог

### Целевые значения метрик

In [39]:
evaluate(
    y_true=y_transformer_test_true,
    y_pred=y_transformer_test_pred
)

[ERR:9] "о войнѣ на греки како посылалъ велики князь ярославъ сына своего воевати царьградъ" -> "о вридѣ за греки како посылалъ велики кназь ярославъ сына своего воечаты цареградѣ"
[ERR:5] "а переславское драгунское и салдатцкое  устроить въ переславле  резанскомъ" -> "а переславское драгу нское и салдатцкое устроить въ переславле реренскомъ"
[ERR:8] "двѣ пари ножей съ вилки съ серебреною оправою" -> "двѣ парино жей съ вилки съ серебрея обрававою"
[ERR:8] "по той дорогѣ много горъ по которымъ растутъ мелкіе лѣса" -> "по все дорскѣ много годъ по потерымъ растутъ мелкіе лѣса"
[ERR:2] "князь петръ канъмурзинъ сынъ урусовъ" -> "князь петръ канъ муринъ сынъ урусовъ"
[ERR:7] "и черкасы тотчасъ пошли съ ними подъ астрахань" -> "2 черка сытотча съ пошлитъ ними полъ астрахань"
[ERR:4] "деисусъ 7 образовъ писанъ на стѣнѣ на золотѣ" -> "черсусъ 7 отразовъ писанъ на стѣнѣ на молотѣ"
[ERR:4] "буди же по глаголу вашему" -> "буди же поглаго оувашему"
[ERR:12] "пускай ево испекли  хлѣбъ сладокъ святѣй