<h2>1. Доп. ранжирование по вероятности<h2>

In [8]:
pip install textdistance

Note: you may need to restart the kernel to use updated packages.


In [137]:
import numpy as np
import pandas as pd
import textdistance
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

In [138]:
corpus = open('wiki_data.txt', encoding='utf8').read()
vocab = Counter(re.findall('\w+', corpus.lower()))

word2id = list(vocab.keys())
id2word = {i:word for i, word in enumerate(vocab)}


vec = CountVectorizer(analyzer='char', ngram_range=(1,3), max_features=1000)
X = vec.fit_transform(vocab)

In [139]:
def get_closest_match_vec(text, X, vec, topn=20):
    v = vec.transform([text])
    
    similarities = cosine_distances(v, X)[0]
    topn = similarities.argsort()[:topn] 
    
    return [(id2word[top], similarities[top]) for top in topn]

In [140]:
def get_closest_match_with_metric(text, lookup,topn=20, metric=textdistance.levenshtein):
    similarities = Counter()
    
    for word in lookup:
        similarities[word] = metric.normalized_similarity(text, word) 
    
    return similarities.most_common(topn)

In [141]:
def get_closest_hybrid_match(text, X, vec, topn=3, metric=textdistance.damerau_levenshtein):
    candidates = get_closest_match_vec(text, X, vec, topn*4)
    lookup = [cand[0] for cand in candidates]
    closest = get_closest_match_with_metric(text, lookup, topn, metric=metric)

    
    return closest

N = sum(vocab.values())

def P(word, N=N):
    return vocab[word] / N

def predict_mistaken(word, vocab):
    return 0 if word in vocab else 1

In [142]:
get_closest_hybrid_match('сонце', X, vec)

[('солнце', 0.8333333333333334), ('соне', 0.8), ('донце', 0.8)]

In [143]:
from difflib import get_close_matches

In [144]:
def get_closest_hybrid_match(text, X, vec, topn=1, metric=textdistance.damerau_levenshtein):
    candidates = get_closest_match_vec(text, X, vec, topn*4)
    lookup = [cand[0] for cand in candidates]
    closest = get_closest_match_with_metric(text, lookup, topn, metric=metric)

    return closest

In [145]:
get_closest_hybrid_match('сонце', X, vec)

[('солнце', 0.8333333333333334)]

<h2>2. Symspell<h2>

In [146]:
from string import punctuation
from nltk import sent_tokenize
punctuation += "«»—…“”"
punct = set(punctuation)
from sklearn.metrics import classification_report, accuracy_score
from string import punctuation

In [147]:
def correction(word): 
    return max(candidates(word), key=P)

def candidates(word): 
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    return set(w for w in words if w in vocab)

def edits1(word):
    letters    = 'йцукенгшщзхъфывапролджэячсмитьбюё'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    return set(deletes)

def edits2(word): 
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [148]:
bad = open('sents_with_mistakes.txt', encoding='utf8').read().splitlines()
true = open('correct_sents.txt', encoding='utf8').read().splitlines()

len(true)

915

In [149]:
def align_words(sent_1, sent_2):
    tokens_1 = sent_1.lower().split()
    tokens_2 = sent_2.lower().split()
    
    tokens_1 = [token.strip(punctuation) for token in tokens_1]
    tokens_2 = [token.strip(punctuation) for token in tokens_2]
    
    tokens_1 = [token for token in tokens_1 if token]
    tokens_2 = [token for token in tokens_2 if token]
    
    assert len(tokens_1) == len(tokens_2)
    
    return list(zip(tokens_1, tokens_2))

In [150]:
word = 'сонце'
splits = [(word[:i], word[i:])    for i in range(len(word) + 1)]

In [151]:
mistakes = []
total_mistaken = 0
mistaken_fixed = 0

total_correct = 0
correct_broken = 0

total = 0
correct = 0

cashed = {}
for i in range(len(true)):
    word_pairs = align_words(true[i], bad[i])
    for pair in word_pairs:
        if predict_mistaken(pair[1], vocab):
            pred = cashed.get(pair[1], get_closest_hybrid_match(pair[1], X, vec)[0][0])
            cashed[pair[1]] = pred
        else:
            pred = pair[1]
        
            
        if pred == pair[0]:
            correct += 1
        else:
            mistakes.append((pair[0], pair[1], pred))
        total += 1
            
        if pair[0] == pair[1]:
            total_correct += 1
            if pair[0] != pred:
                correct_broken += 1
        else:
            total_mistaken += 1
            if pair[0] == pred:
                mistaken_fixed += 1
    
    if not i % 100:
        print(i)

0
100
200
300
400
500
600
700
800
900


**очень долго работает**

In [152]:
print(correct/total)
print(mistaken_fixed/total_mistaken)
print(correct_broken/total_correct)

0.8490245122561281
0.43711180124223603
0.09004249454461927


In [159]:
%%time
correction('хочеться')

Wall time: 0 ns


'хочется'

In [161]:
%%time
correction('приветб')

Wall time: 0 ns


'привет'

In [162]:
[(wt[0], wt[1], correction(wt[1])) for wt, _ in Counter(mistakes).most_common(10)]

[('сегодня', 'седня', 'сеня'),
 ('вообще', 'ваще', 'аще'),
 ('кстати', 'кстате', 'кате'),
 ('очень', 'ооочень', 'очень'),
 ('что-то', 'что-то', 'тото'),
 ('как-то', 'както', 'като'),
 ('очень', 'оооочень', 'оооочень'),
 ('это', 'ето', 'то'),
 ('ничего', 'ничо', 'нио'),
 ('что-то', 'чтото', 'тото')]