In [1]:
import matplotlib.pyplot as plt
import itertools
import numpy as np
import random
import re

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
# from unidecode import unidecode

%matplotlib inline

# Choose text samples and corporas

In [2]:
SAMPLE_TEXTS = {
    'EN': '''On the other hand, we denounce with righteous
    indignation and dislike men who are so beguiled and
    demoralized by the charms of pleasure of the moment, so
    blinded by desire, that they cannot foresee the pain and
    trouble that are bound to ensue; and equal blame belongs to
    those who fail in their duty through weakness of will, which
    is the same as saying through shrinking from toil and pain.
    These cases are perfectly simple and easy to distinguish.
    In a free hour, when our power of choice is untrammelled
    and when nothing prevents our being able to do what we
    like best, every pleasure is to be welcomed and every
    pain avoided. But in certain circumstances and owing to
    the claims of duty or the obligations of business it will
    frequently occur that pleasures have to be repudiated and
    annoyances accepted. The wise man therefore always holds
    in these matters to this principle of selection: he
    rejects pleasures to secure other greater pleasures, or
    else he endures pains to avoid worse pains.''',
    
    'RU': '''С учетом сложившейся международной обстановки,
    семантический разбор внешних противодействий является
    качественно новой ступенью экспериментов, поражающих по
    своей масштабности и грандиозности. С другой стороны,
    дальнейшее развитие различных форм деятельности не дает
    нам иного выбора, кроме определения благоприятных
    перспектив. Наше дело не так однозначно, как может
    показаться: семантический разбор внешних противодействий
    представляет собой интересный эксперимент проверки
    глубокомысленных рассуждений. Но явные признаки победы
    институционализации представлены в исключительно
    положительном свете. Но некоторые особенности внутренней
    политики призывают нас к новым свершениям, которые, в
    свою очередь, должны быть смешаны с не уникальными
    данными до степени совершенной неузнаваемости, из-за
    чего возрастает их статус бесполезности.'''
}

In [3]:
CORPORA_PATHS = {
    'EN': './corpora/WarAndPeaceEng.txt',
    'RU': './corpora/WarAndPeace.txt',
}

# Replacement coding

In [4]:
class ReplacementCoding:
    def __init__(self, mapping):
        self._mapping = mapping
        self._inverse_mapping = {v: k for k, v in mapping.items()}
        
    def encode(self, text):
        return [self._mapping[elem] for elem in text]
        
    def decode(self, text):
        return [self._inverse_mapping[elem] for elem in text]
    
    def encodes(self, text):
        return ''.join(self.encode(text))
    
    def decodes(self, text):
        return ''.join(self.decode(text))
    
    def inversed(self):
        return ReplacementCoding(self._inverse_mapping)
    
    @staticmethod
    def composition(*codings):
        assert len(codings) > 0
        mapping = dict(codings[0]._mapping)
        for i, coding in enumerate(codings[1:]):
            mapping = {k: coding._mapping[v] for k, v in mapping.items()}
        return ReplacementCoding(mapping)
    
    @staticmethod
    def shuffle(vocabulary, order):
        src = list(vocabulary)
        dst = [src[i] for i in order]
        return ReplacementCoding(dict(zip(src, dst)))
    
    @staticmethod
    def random_shuffle(vocabulary):
        src = list(vocabulary)
        dst = list(src)
        random.shuffle(dst)
        return ReplacementCoding(dict(zip(src, dst)))

# Text wrapper

In [5]:
class Text:
    def __init__(self, s, preprocess=True):  
        if preprocess:
            self.preprocessed = re.sub(r"[\W\s]+", ' ', s, flags=re.MULTILINE | re.UNICODE).lower()
        else:
            self.preprocessed = s
        
        self.count_vectorizers = {
            'char': CountVectorizer(analyzer='char', ngram_range=(1, 1)),
            'bigram': CountVectorizer(analyzer='char', ngram_range=(2, 2)),
        }
        
        self.stats = {
            mode: vectorizer.fit_transform([self.preprocessed]).toarray().reshape(-1)
            for mode, vectorizer in self.count_vectorizers.items()
        }
        
        self.inds_coding = {
            mode: ReplacementCoding(vectorizer.vocabulary_)
            for mode, vectorizer in self.count_vectorizers.items()
        }
        
    @staticmethod
    def load(path):
        with open(path) as f:
            return Text(f.read())

In [6]:
corporas = {lang: Text.load(path) for lang, path in CORPORA_PATHS.items()}
sample_texts = {lang: Text(text) for lang, text in SAMPLE_TEXTS.items()}

# Cryptoanalysis

## Gready most common char matching 

In [7]:
def guess_coding_by_gready_most_common_char_matching(corpora_text, encoded_text):
    encoded_stats = encoded_text.stats['char']
    corpora_stats = corpora_text.stats['char']
    
    assert np.any(np.unique(corpora_stats, return_counts=True)[1] > 1)
    
    return ReplacementCoding.composition(
        encoded_text.inds_coding['char'],
        ReplacementCoding.shuffle(np.arange(encoded_stats.shape[0]), np.argsort(-encoded_stats)).inversed(),
        ReplacementCoding.shuffle(np.arange(corpora_stats.shape[0]), np.argsort(-corpora_stats)),
        corpora_text.inds_coding['char'].inversed()
    ).inversed()

In [8]:
def ansi_colored(s, color):
    ESC = '\x1b'
    RED_BG = ESC + '[41m'
    GREEN_BG  = ESC + '[42m'
    
    # beginc = {'red': '\033[91m', 'green': '\033[92m'}
    beginc = {'red': RED_BG, 'green': GREEN_BG}
    endc = '\x1b[0m'
    return beginc[color] + s + endc
    

def print_matching(message, s, matching_s):
    colored_s = ''.join(
        ansi_colored(char, 'red' if char != match else 'green')
        for char, match in zip(s, matching_s)
    )
    print(message, colored_s, end='\n\n')


def report_guess_accuracy(true_coding, pred_coding, text_preprocesed):
    pred = pred_coding.decode(true_coding.encode(text_preprocesed))
    print_matching('Pred:', pred, text_preprocesed)
    print_matching('True:', text_preprocesed, pred)
    
    score = accuracy_score(list(text_preprocesed), pred)
    print(f'CER: {100 * (1 - score):.2f}%')


def evaluate_guesser(coding_guesser, corpora_text, sample_text):
    true_coding = ReplacementCoding.random_shuffle(
        corpora_text.count_vectorizers['char'].vocabulary_.keys()
    )
    encoded = true_coding.encodes(sample_text.preprocessed)
    pred_coding = coding_guesser(corpora_text, Text(encoded))
    report_guess_accuracy(true_coding, pred_coding, sample_text.preprocessed)

In [9]:
evaluate_guesser(guess_coding_by_gready_most_common_char_matching, corporas['EN'], sample_texts['EN'])

Pred: [41ma[0m[42mn[0m[42m [0m[42mt[0m[41mr[0m[42me[0m[42m [0m[41ma[0m[42mt[0m[41mr[0m[42me[0m[41ms[0m[42m [0m[41mr[0m[41mo[0m[42mn[0m[42md[0m[42m [0m[41mc[0m[42me[0m[42m [0m[42md[0m[42me[0m[42mn[0m[41ma[0m[42mu[0m[42mn[0m[41mm[0m[42me[0m[42m [0m[41mc[0m[41mh[0m[42mt[0m[41mr[0m[42m [0m[41ms[0m[41mh[0m[41my[0m[41mr[0m[42mt[0m[42me[0m[41ma[0m[42mu[0m[41mi[0m[42m [0m[41mh[0m[42mn[0m[42md[0m[41mh[0m[41my[0m[42mn[0m[41mo[0m[42mt[0m[41mh[0m[41ma[0m[42mn[0m[42m [0m[41mo[0m[42mn[0m[42md[0m[42m [0m[42md[0m[41mh[0m[41mi[0m[42ml[0m[41mh[0m[42mk[0m[42me[0m[42m [0m[41mg[0m[42me[0m[42mn[0m[42m [0m[41mc[0m[41mr[0m[41ma[0m[42m [0m[41mo[0m[41ms[0m[42me[0m[42m [0m[41mi[0m[41ma[0m[42m [0m[41mf[0m[42me[0m[41my[0m[42mu[0m[41mh[0m[42ml[0m[42me[0m[42md[0m[42m [0m[41mo[0m[42mn[0m[42md[0m[42m [0m[42md[0m[42me[0m[41

In [10]:
evaluate_guesser(guess_coding_by_gready_most_common_char_matching, corporas['RU'], sample_texts['RU'])

Pred: [41mн[0m[42m [0m[41mз[0m[42mч[0m[41mо[0m[42mт[0m[41mа[0m[41mп[0m[42m [0m[41mн[0m[41mд[0m[41mа[0m[41mш[0m[42mи[0m[42mв[0m[41mй[0m[41mо[0m[41mя[0m[41mн[0m[41mы[0m[42m [0m[41mп[0m[41mо[0m[41mш[0m[41mм[0m[41mз[0m[41mе[0m[41mс[0m[41mл[0m[41mа[0m[41mм[0m[41mе[0m[41mа[0m[41mя[0m[42m [0m[41mа[0m[41mь[0m[41mн[0m[42mт[0m[41mс[0m[41mе[0m[41mа[0m[42mв[0m[42mк[0m[42mи[0m[42m [0m[41mн[0m[41mо[0m[41mп[0m[41mс[0m[41mе[0m[42mт[0m[42mи[0m[42mч[0m[41mо[0m[41mн[0m[42mк[0m[42mи[0m[41mя[0m[42m [0m[41mл[0m[41mс[0m[41mг[0m[41mь[0m[41mа[0m[41mл[0m[42m [0m[42mв[0m[41mе[0m[41mо[0m[41mй[0m[41mе[0m[42mи[0m[41mж[0m[42m [0m[41mр[0m[41mл[0m[41mа[0m[42mт[0m[42mи[0m[42mв[0m[41mа[0m[41mм[0m[41mо[0m[41mя[0m[41mн[0m[42mт[0m[42mв[0m[42mи[0m[41mя[0m[42m [0m[41mы[0m[42mв[0m[41mд[0m[41mы[0m[41mо[0m[42mт[0m[41mн[0m[41mы[0m[42

In [11]:
PROBLEM_TEXT = '←⇠⇒↟↹↷⇊↹↷↟↤↟↨←↹↝⇛⇯↳⇴⇒⇈↝⇊↾↹↟⇒↟↹⇷⇛⇞↨↟↹↝⇛⇯↳⇴⇒⇈↝⇊↾↹↨←⇌⇠↨↹⇙↹⇸↨⇛↙⇛↹⇠⇛⇛↲⇆←↝↟↞↹⇌⇛↨⇛⇯⇊↾↹⇒←↙⇌⇛↹⇷⇯⇛⇞↟↨⇴↨⇈↹⇠⇌⇛⇯←←↹↷⇠←↙⇛↹↷⇊↹↷⇠←↹⇠↤←⇒⇴⇒↟↹⇷⇯⇴↷↟⇒⇈↝⇛↹↟↹⇷⇛⇒⇙⇞↟↨←↹↳⇴⇌⇠↟↳⇴⇒⇈↝⇊↾↹↲⇴⇒⇒↹⇰⇴↹⇷⇛⇠⇒←↤↝←←↹⇞←↨↷←⇯↨⇛←↹⇰⇴↤⇴↝↟←↹⇌⇙⇯⇠⇴↹↘⇛↨↞↹⇌⇛↝←⇞↝⇛↹↞↹↝↟⇞←↙⇛↹↝←↹⇛↲←⇆⇴⇏'

for lang in ('RU', 'EN'):
    coding = guess_coding_by_gready_most_common_char_matching(corporas[lang], Text(PROBLEM_TEXT, preprocess=False))
    print(coding.decodes(PROBLEM_TEXT))
    print()

алте рд реьеса иовынтуидя ете помсе иовынтуидя саклс б eсого лоозжаиеч косовдя тагко пвомеснсу лковаа рлаго рд рла льатнте пвнретуио е потбмеса ынклеынтуидя знтт йн полтаьиаа масравсоа йньниеа кбвлн шосч коиамио ч иемаго иа озажнх

tsia du dayaht oerpnicouf aia wemha oerpnicouf htlsh v 1hege seebjtoak leheruf itgle wremahnhc slertt dstge du dst sytinia wrndaicoe a weivmaht pnlsapnicouf bnii xn wesityott mthdtrhet xnynoat lvrsn zehk leotmoe k oamtge ot ebtjnq



## Gready most common bigram matching 

In [12]:
def get_bigrams(text):
    if len(text) & 1:
        text = text[:-1]
    return [tok for tok in re.split(r'(.{2})', text) if tok]


def guess_coding_by_gready_most_common_bigram_matching(corpora_text, encoded_text):
    encoded_stats = encoded_text.stats['bigram']
    corpora_stats = corpora_text.stats['bigram']

    assert np.any(np.unique(corpora_stats, return_counts=True)[1] > 1)
    
    return ReplacementCoding.composition(
        encoded_text.inds_coding['bigram'],
        ReplacementCoding.shuffle(np.arange(encoded_stats.shape[0]), np.argsort(-encoded_stats)).inversed(),
        ReplacementCoding.shuffle(np.arange(corpora_stats.shape[0]), np.argsort(-corpora_stats)),
        corpora_text.inds_coding['bigram'].inversed()
    ).inversed()


coding = guess_coding_by_gready_most_common_bigram_matching(corporas['EN'], corporas['EN'])

In [13]:
def evaluate_bigram_guesser(coding_guesser, corpora_text, sample_text):
    true_coding = ReplacementCoding.random_shuffle(
        corpora_text.count_vectorizers['bigram'].vocabulary_.keys()
    )
    encoded = true_coding.encodes(get_bigrams(sample_text.preprocessed))
    pred_coding = guess_coding_by_gready_most_common_bigram_matching(corpora_text, Text(encoded))
    report_guess_accuracy(true_coding, pred_coding, get_bigrams(sample_text.preprocessed))
    
    
evaluate_bigram_guesser(guess_coding_by_gready_most_common_bigram_matching, corporas['EN'], sample_texts['EN'])

Pred: [41m m[0m[42m t[0m[41m s[0m[41m a[0m[41mhe[0m[41mo [0m[41mpe[0m[41mat[0m[41m w[0m[41m r[0m[41my [0m[41m c[0m[41mof[0m[41mro[0m[42me [0m[41ma [0m[41mhe[0m[41mom[0m[41mee[0m[41meg[0m[41mlu[0m[41mew[0m[41mle[0m[41m h[0m[41mee[0m[41msy[0m[41mde[0m[41m m[0m[41md [0m[41m h[0m[41my [0m[41mce[0m[41mut[0m[41msu[0m[41mwe[0m[41m c[0m[41mre[0m[41mti[0m[41md [0m[41mt [0m[41mne[0m[41mnd[0m[41mad[0m[41mnb[0m[41mno[0m[41mte[0m[41md [0m[41m h[0m[41my [0m[41mdu[0m[41mra[0m[41msb[0m[41mry[0m[41mte[0m[41med[0m[41mis[0m[41mhe[0m[42me [0m[41mow[0m[41mht[0m[41mwi[0m[41m a[0m[41mon[0m[41m g[0m[41m e[0m[41mse[0m[41mt [0m[41m a[0m[41mon[0m[41mhe[0m[42me [0m[41mhy[0m[41mlo[0m[41mct[0m[41mne[0m[41mnd[0m[41mve[0m[41man[0m[41mey[0m[41m w[0m[41mgu[0m[41my [0m[41mer[0m[41mar[0m[42me [0m[41mhe[0m[41mto[0m[42m t[0m[41m s[0m[41mis[0m[41

In [14]:
evaluate_bigram_guesser(guess_coding_by_gready_most_common_bigram_matching, corporas['RU'], sample_texts['RU'])

Pred: [41mно[0m[41mжу[0m[41mл [0m[41m т[0m[41mна[0m[41mкр[0m[41mбу[0m[41mык[0m[41mен[0m[41mрш[0m[41mом[0m[41mев[0m[41mд [0m[41mпо[0m[41mст[0m[41mue[0m[41mть[0m[41mаб[0m[41mнс[0m[41mка[0m[41mо [0m[41m r[0m[41mе [0m[41mнн[0m[41mль[0m[41mра[0m[41mбы[0m[41mпр[0m[41mта[0m[41m с[0m[41mан[0m[41m к[0m[41mя [0m[41mне[0m[41mал[0m[41mид[0m[41mыл[0m[41mи [0m[41mст[0m[41mго[0m[41m о[0m[41mь [0m[41mны[0m[41mси[0m[41mу [0m[41mзм[0m[41mих[0m[41m г[0m[41mсс[0m[41mли[0m[41mю [0m[41mых[0m[41mа [0m[41mво[0m[41mос[0m[41m и[0m[41mо [0m[41m о[0m[41m с[0m[41mа [0m[41mиж[0m[41m н[0m[41mмп[0m[41mоб[0m[41mми[0m[41mол[0m[41mза[0m[41mей[0m[41mра[0m[41mle[0m[41mи [0m[41mя [0m[41m ш[0m[41mче[0m[41mpr[0m[41m ч[0m[41mок[0m[41mна[0m[41m о[0m[41mен[0m[41mом[0m[41m м[0m[41mыт[0m[41mзн[0m[41mо [0m[41mа [0m[41mе [0m[41mе [0m[41mес[0m[41mог[0m[41

## MCMC top bigram matching 

Идея:
- представим статистику частотности биграм в виде матрицы (строка -- первый символ биграма, колонка -- второй, значение -- доля вхождений)
- будем семплировать матрицы, соответствующие случайным одновременным перестановкам в столбцах и строках исходной (в поисках наилучшего приближения "целевой" матрицы)
- т.к. это не работает, будем семплировать субоптимальные перестановки матрицы
- для части случайных шифров этот подход показывает результаты лучше, чем униграмный guesser (результаты в этом смысле нестабильны)

In [15]:
def gen_pairs(n):
    xv, yv = np.meshgrid(np.arange(n), np.arange(n))
    xv, yv = xv.flatten(), yv.flatten()
    mask = xv < yv
    return xv[mask], yv[mask]

def get_swap_scores(src, order, dst):
    i, j = gen_pairs(len(order))
    assert np.all(i < j)
    
    oi = order[i]
    oj = order[j]
    
    old_loss = (
        np.square(src[oi] - dst[i]).sum(axis=1)
        + np.square(src[:,oi] - dst[:,i]).sum(axis=0)
        - np.square(src[oi,oi] - dst[i,i])
    ) + (
        np.square(src[oj] - dst[j]).sum(axis=1)
        + np.square(src[:,oj] - dst[:,j]).sum(axis=0)
        - np.square(src[oj,oj] - dst[j,j])
    )
    new_loss = (
        np.square(src[oi] - dst[j]).sum(axis=1)
        + np.square(src[:,oi] - dst[:,j]).sum(axis=0)
        - np.square(src[oi,oi] - dst[j,j]) 
    ) + (
        np.square(src[oj] - dst[i]).sum(axis=1)
        + np.square(src[:,oj] - dst[:,i]).sum(axis=0)
        - np.square(src[oj,oj] - dst[i,i])
    )

    return i, j, old_loss - new_loss


def do_best_swap(src, order, dst):
    old_score = np.square(src[order][:,order] - dst).mean()
    i, j, scores = get_swap_scores(src, order, dst)
    best_swap_idx = np.argmax(scores)
    
    order[[i[best_swap_idx],j[best_swap_idx]]] = np.copy(order[[j[best_swap_idx],i[best_swap_idx]]])

In [16]:
def sample_suboptimal_reorder(src, dst, max_swaps=200, order=None):
    assert src.ndim == dst.ndim
    assert src.shape[0] == src.shape[1]
    assert src.shape == dst.shape
    
    if order is None:
        order = np.arange(src.shape[0])
        np.random.shuffle(order)

    for i in range(max_swaps):        
        do_best_swap(src, order, dst)

    return order


def bootstrap_suboptimal_matrix_reorder(src, dst, num_samples=2000):
    assert src.ndim == dst.ndim
    assert src.shape[0] == src.shape[1]
    assert src.shape == dst.shape
    
    best = np.arange(src.shape[0])
    np.random.shuffle(best)
    best_loss = np.square(src[best][:,best] - dst).sum()
    print(best_loss)

    for _ in tqdm(range(num_samples)):
        candidate = sample_suboptimal_reorder(src, dst)
        candidate_loss = np.square(src[candidate][:,candidate] - dst).sum()
        if candidate_loss < best_loss:
            best, best_loss = candidate, candidate_loss
            print(best_loss)
            
        candidate = sample_suboptimal_reorder(src, dst, order=np.argsort(candidate))
        candidate_loss = np.square(src[candidate][:,candidate] - dst).sum()
        if candidate_loss < best_loss:
            best, best_loss = candidate, candidate_loss
            print(best_loss)
            
    return best


In [23]:
def get_char_vocab(text, top_k=None):
    top_inds = np.argsort(-text.stats['char'])
    if top_k is not None:
        top_inds = top_inds[:top_k]
    top_inds = top_inds
    
    new_vocab = {}
    v = []
    for top_ind in top_inds: # O(char_vocab^2) seems irrelevant
        for char, old_ind in text.count_vectorizers['char'].vocabulary_.items():
            if old_ind == top_ind:
                new_vocab[char] = len(new_vocab)
                v.append(char)

    return new_vocab


def get_bigram_stats_matrix(vocab, text):
    matrix = np.zeros((len(vocab), len(vocab)))
                      
    stats = text.stats['bigram'].flatten()
    stats = stats / (len(text.preprocessed) - 1)
                      
    for (c1, c2), i in text.count_vectorizers['bigram'].vocabulary_.items():
        if c1 in vocab and c2 in vocab:
            matrix[vocab[c1], vocab[c2]] = stats[i]
    return matrix
            

def guess_coding_by_mcmc_bigram_matching(corpora_text, encoded_text, num_samples=10):
    # if corpora vocabulary is richer, exclude least frequent chars
    dst_vocab = get_char_vocab(encoded_text)
    src_vocab = get_char_vocab(corpora_text, top_k=len(dst_vocab))
    
    # normalized bigram frequency is stored in matrix[bigram[0]][bigram[1]]
    dst = get_bigram_stats_matrix(dst_vocab, corpora_text)
    src = get_bigram_stats_matrix(src_vocab, encoded_text)

    reorder = bootstrap_suboptimal_matrix_reorder(src, dst, num_samples)
    
    return ReplacementCoding.composition(
        ReplacementCoding(src_vocab),
        ReplacementCoding.shuffle(np.arange(len(src_vocab)), reorder).inversed(),
        ReplacementCoding(dst_vocab).inversed()
    )


coding = guess_coding_by_mcmc_bigram_matching(corporas['EN'], sample_texts['EN'])
print(coding._mapping)

0.019772555431209256


  0%|          | 0/10 [00:00<?, ?it/s]

0.0015629173104991737
0.0015625970672635257
{' ': ' ', 'e': 'e', 't': 't', 'a': 'a', 'o': 'o', 'n': 'n', 'i': 'i', 'h': 'h', 's': 's', 'r': 'r', 'd': 'd', 'l': 'l', 'u': 'u', 'm': 'm', 'c': 'c', 'w': 'w', 'f': 'f', 'g': 'g', 'y': 'y', 'p': 'p', 'b': 'b', 'v': 'v', 'k': 'k', 'x': 'q', 'j': 'j', 'z': 'z'}


In [26]:
def evaluate_mcmc_guesser(coding_guesser, corpora_text, sample_text):
    sample_vocab = get_char_vocab(sample_text)
    corpora_vocab = get_char_vocab(corpora_text, top_k=len(sample_vocab))
    true_coding = ReplacementCoding.composition(
        ReplacementCoding(sample_vocab),
        ReplacementCoding.random_shuffle(np.arange(len(sample_vocab))),
        ReplacementCoding(corpora_vocab).inversed()
    )

    encoded = true_coding.encodes(sample_text.preprocessed)
    pred_coding = coding_guesser(corpora_text, Text(encoded)).inversed()
    report_guess_accuracy(true_coding, pred_coding, sample_text.preprocessed)

In [29]:
evaluate_mcmc_guesser(guess_coding_by_mcmc_bigram_matching, corporas['EN'], sample_texts['EN'])

0.01663710188653929


  0%|          | 0/10 [00:00<?, ?it/s]

0.004892027973919156
0.004664570795264694
Pred: [41mr[0m[41ml[0m[42m [0m[42mt[0m[42mh[0m[42me[0m[42m [0m[41mr[0m[42mt[0m[42mh[0m[42me[0m[41mn[0m[42m [0m[42mh[0m[42ma[0m[41ml[0m[42md[0m[42m [0m[42mw[0m[42me[0m[42m [0m[42md[0m[42me[0m[41ml[0m[41mr[0m[42mu[0m[41ml[0m[41mp[0m[42me[0m[42m [0m[42mw[0m[41mo[0m[42mt[0m[42mh[0m[42m [0m[41mn[0m[41mo[0m[41mf[0m[42mh[0m[42mt[0m[42me[0m[41mr[0m[42mu[0m[42ms[0m[42m [0m[41mo[0m[41ml[0m[42md[0m[41mo[0m[41mf[0m[41ml[0m[42ma[0m[42mt[0m[41mo[0m[41mr[0m[41ml[0m[42m [0m[42ma[0m[41ml[0m[42md[0m[42m [0m[42md[0m[41mo[0m[42ms[0m[41mi[0m[41mo[0m[41mx[0m[42me[0m[42m [0m[42mm[0m[42me[0m[41ml[0m[42m [0m[42mw[0m[42mh[0m[41mr[0m[42m [0m[42ma[0m[41mn[0m[42me[0m[42m [0m[42ms[0m[41mr[0m[42m [0m[41mc[0m[42me[0m[41mf[0m[42mu[0m[41mo[0m[41mi[0m[42me[0m[42md[0m[42m [0m[42ma[0m[41ml[0m[

In [27]:
evaluate_mcmc_guesser(guess_coding_by_mcmc_bigram_matching, corporas['RU'], sample_texts['RU'])

0.011489882367471772


  0%|          | 0/10 [00:00<?, ?it/s]

0.004600861471532155
0.004594584665688642
0.004517875339750004
0.004477532662100582
0.004131373650261073
Pred: [41mп[0m[42m [0m[41mч[0m[41mш[0m[42mе[0m[42mт[0m[42mо[0m[41mл[0m[42m [0m[41mп[0m[41mр[0m[42mо[0m[41mх[0m[42mи[0m[42mв[0m[41mы[0m[42mе[0m[41mя[0m[41mп[0m[41mм[0m[42m [0m[41mл[0m[42mе[0m[41mх[0m[41mб[0m[41mч[0m[42mн[0m[42mа[0m[41mг[0m[42mо[0m[41mб[0m[42mн[0m[42mо[0m[41mя[0m[42m [0m[42mо[0m[41mз[0m[41mп[0m[42mт[0m[42mа[0m[42mн[0m[42mо[0m[42mв[0m[42mк[0m[42mи[0m[42m [0m[41mп[0m[42mе[0m[41mл[0m[42mа[0m[42mн[0m[42mт[0m[42mи[0m[41mш[0m[42mе[0m[41mп[0m[42mк[0m[42mи[0m[41mя[0m[42m [0m[41mг[0m[42mа[0m[41mд[0m[41mз[0m[42mо[0m[41mг[0m[42m [0m[42mв[0m[42mн[0m[42mе[0m[41mы[0m[42mн[0m[42mи[0m[41mь[0m[42m [0m[41mс[0m[41mг[0m[42mо[0m[42mт[0m[42mи[0m[42mв[0m[42mо[0m[41mб[0m[42mе[0m[41mя[0m[41mп[0m[42mт[0m[42mв[0m[42mи[0