In [18]:
import pandas as pd
import numpy as np
import nltk, re

cwd = 'C:/Users/klouc/Desktop/slovcho/spell_checking'

sents = pd.read_csv(rf"{cwd}/sents.csv",encoding='utf-8')
valid_words = pd.read_csv(rf"{cwd}/single_words_bg.csv",encoding='utf-8')

valid_words_dict = {}
for word in list(valid_words['word']):
    valid_words_dict[word] = word

print("Number of sentences:")
print(len(sents.values))
print("Number of words:")
print(len(valid_words.values))

Number of sentences:
1758815
Number of words:
736456


In [19]:
# Pre-process sents
# Sents vocab АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя1234567890!@#$%^&*()-_=+`~[]{}|;':"\,./<>

# TODO: Break sentences up more as some 'sents' include multiple sentences
# TODO: Lowers first letter of sent but what if first word has multiple capitalised letters
# TODO: Tokenise sentences in a more clever way

def process_sent(sentence):

    replace_with_space_chars = '[!@#$%^&*()-_=+`~{}\|;\[\]\':\"\,./<>]'

    # CLEAN SENTENCES
    # Replace some chars with space
    sentence = re.sub(replace_with_space_chars, ' ', sentence)
    # Replace consecutive spaces with single space
    sentence = re.sub(r'\s+', ' ', sentence)

    # If sent below 10 characters do not consider
    if(len(sentence) < 5): return np.nan

    # Remove first charatcer if space
    if(sentence[0] == ' '): sentence = sentence[1:]
    # Lower letter if non-capitlised word exists in dict
    first_word = ''
    if(sentence.find(' ')):
        first_word = sentence[:sentence.find(' ')]
    else:
        first_word = sentence
    if(ord(first_word[0]) >= ord('А') and ord(first_word[0]) <= ord('Я') and (first_word[0].lower() + first_word[1:]) in valid_words_dict):
        sentence = sentence[0].lower() + sentence[1:]

    # ADD BEGINNING AND CLOSING TAG
    sentence = f'<s> {sentence} </s>'

    return sentence
    
cleaned_sents = sents["sent"].apply(lambda x : process_sent(x))
# Drop rows with no content
cleaned_sents.dropna(inplace=True)
cleaned_sents.reset_index(inplace=True, drop=True)

tokenised_sents = cleaned_sents.apply(lambda x : x.split(' '))
tokenised_sents.head(10)

0    [<s>, по, какво, четеш, Квантова, физика, Ама,...
1    [<s>, на, тила, му, е, сцената, от, древногръц...
2    [<s>, декември, г, Европейският, съвет, в, Люк...
3    [<s>, АД, е, различно, дружество, от, Лукойл, ...
4    [<s>, АД, от, агенция, Митници, С, решение, на...
5    [<s>, Абе, вече, забравихме, руският, украинск...
6    [<s>, в, тази, насока, препоръчвам, филмчето, ...
7    [<s>, вярвате, ли, че, халифата, ще, приключи,...
8    [<s>, Габи, ще, си, оправи, ли, леглото, не, о...
9    [<s>, Европа, е, най, западният, полуостров, н...
Name: sent, dtype: object

In [20]:
# Create unigrams and bigrams

unigrams = nltk.FreqDist()
bigrams = []

for tokenised_sent in tokenised_sents:
    unigrams.update(nltk.FreqDist(tokenised_sent))
    bigrams.extend(nltk.trigrams(tokenised_sent))

bigrams = nltk.ConditionalFreqDist(bigrams)


In [59]:
# TODO: Tokenise input in clever way
# TODO: Dealing with non-cyrillic characters
# TODO: How to deal if first word (capitalised but only cause start of sent) is wrong? As in, discerning capital letter

import re

sent_to_check = 'той обича да играе с неговата копка'

# PRE-PROCESS INPUT

# Clear punctuation
sent_to_check = sent_to_check
sent_to_check = re.sub("[?!.,:]", "", sent_to_check)

# Tokenise
words_to_check = sent_to_check.split(' ')

# Lower first character
if(ord(words_to_check[0][0]) >= ord('А') and ord(words_to_check[0][0]) <= ord('Я')):
     words_to_check[0] = words_to_check[0].lower()
# # If beginning word is capitalised and a non-capitalised version exists in the dict, lower it (might be a name)
# # Else do not lower
# if(ord(words_to_check[0][0]) >= ord('А') and ord(words_to_check[0][0]) <= ord('Я')):
#     if(words_to_check[0].lower() in valid_words.values):
#         words_to_check[0] = words_to_check[0].lower()

# Add <s> and </s>
words_to_check.insert(0, '<s>')
words_to_check.append('</s>')

In [60]:
def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'абвгдежзийклмнопрстуфхцчшщъьюя'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

unigram_sum = sum(unigrams.values())
unigram_count = len(unigrams.keys())
trust_valid_word_is_correct = 0.95

for i in range(len(words_to_check)):
    word = words_to_check[i]
    if word != '<s>' and word != '</s>':
        prev_token = words_to_check[i - 1]
        next_token = words_to_check[i + 1]
        candidates = edits1(word)

        # Get candidates which are valid words
        valid_candidates = []
        for candidate in candidates:
            if(candidate in valid_words.values):
                valid_candidates.append(candidate)

        # Non-word errors
        if word not in valid_words.values:
            # Calculate probabilty distribution
            prob_distr = []
            for valid_candidate in valid_candidates:
                # [MISSING] = P(prob of making mistake)

                # P(candidate) with smooting = probability of word appering
                # candidate count in word unigram + 1 / count of all words + |V|
                pc = unigrams[valid_candidate] + 1 / float(unigram_sum + unigram_count)

                # P(candidate|prev_token) with smoothing = probability of findging candidate followed by prev_token
                # count of bigrams (prev_token, candidate) + 1 / count of prev_token + |V|
                pcp = bigrams[prev_token][valid_candidate] + 1 / float(unigrams[prev_token] + unigram_count)

                # P(next_token|candidate) with smoothing = probability of findging next_token followed by candidate
                # count of bigrams (candidate, next_token) + 1 / count of candidate + |V|
                pnc = bigrams[valid_candidate][next_token] + 1 / float(unigrams[valid_candidate] + unigram_count)

                prob_distr.append(pc * pcp * pnc)
            words_with_probs = [(word, prob) for word, prob in zip(valid_candidates, prob_distr)]
            words_with_probs.sort(key = lambda x: x[1], reverse=True)
            print(words_with_probs)

        # Real world errors
        else:
            # Calculate probabilty distribution
            prob_distr = []
            # valid_candidates.append(word)
            for valid_candidate in valid_candidates:
                # [MISSING] = P(prob of making mistake)

                # P(candidate) with smooting = probability of word appering
                # candidate count in word unigram + 1 / count of all words + |V|
                pc = unigrams[valid_candidate] + 1 / float(unigram_sum + unigram_count)

                # P(candidate|prev_token) with smoothing = probability of findging candidate followed by prev_token
                # count of bigrams (prev_token, candidate) + 1 / count of prev_token + |V|
                pcp = bigrams[prev_token][valid_candidate] + 1 / float(unigrams[prev_token] + unigram_count)

                # P(next_token|candidate) with smoothing = probability of findging next_token followed by candidate
                # count of bigrams (candidate, next_token) + 1 / count of candidate + |V|
                pnc = bigrams[valid_candidate][next_token] + 1 / float(unigrams[valid_candidate] + unigram_count)

                trust = 0
                # Apply trust for valid word
                if(word == valid_candidate):
                    trust = trust_valid_word_is_correct
                else:
                    trust = 1 - trust_valid_word_is_correct / len(valid_candidate)
                prob_distr.append(pc * pcp * pnc * trust)
            words_with_probs = [(word, prob) for word, prob in zip(valid_candidates, prob_distr)]
            words_with_probs.sort(key = lambda x: x[1], reverse=True)
            print(words_with_probs)


[('той', 108969293193.83888), ('кой', 49530756.711005874), ('то', 89.84784705398206), ('тъй', 29.55516694564533), ('тоя', 3.2994693843514007), ('том', 0.05035934456456159), ('бой', 0.03630500200295024), ('топ', 0.03537061389426542), ('ток', 0.022001525423675075), ('мой', 0.016556561360464267), ('стой', 0.005889572229933068), ('тон', 0.0031671801105727483), ('тор', 0.00209582757745562), ('тай', 0.0006517089714673821), ('рой', 0.00039304047912952543), ('вой', 0.00029059986621005393), ('твой', 0.0002124791092182442), ('пой', 2.6423277424972485e-05), ('дой', 8.807872709330106e-06), ('ой', 8.458766553309802e-06), ('сой', 8.328624467136766e-12), ('лой', 6.015166131127083e-12), ('гой', 2.3135552629981814e-12)]
[('обича', 47147243.48175534), ('обичал', 14720.754954256308), ('облича', 901.4266372831771), ('обичат', 0.3280397610255443), ('обичам', 0.07225827384472802), ('обичай', 0.008100882758406834), ('обичаш', 0.004631149802103834), ('обичая', 0.001024708526527512), ('обида', 0.00091513369496