In [None]:
import re
import string
from collections import Counter
import numpy as np

In [None]:
def read_corpus(filename, encoding="utf-8"):
    """
    Read a text file and return all of its words, lower-cased, in reading order.

    A word is a maximal run of "word" characters (letters, digits and
    underscore; accented letters count as letters). Any other character,
    such as a space, an apostrophe or a hyphen, separates words, so
    "l'amour" yields "l" and "amour".

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.
    encoding : str, optional
        Text encoding of the file. It is passed explicitly so that accented
        characters are decoded the same way on every platform instead of
        depending on the locale default.

    Returns
    -------
    list of str
        Every word occurrence of the file (duplicates kept).
    """
    words = []
    with open(filename, "r", encoding=encoding) as file:
        # Iterate lazily over the file rather than loading every line first.
        for line in file:
            words.extend(re.findall(r'\w+', line.lower()))

    return words

In [None]:
# Load the whole corpus as one flat list of lower-cased word tokens.
words = read_corpus("data/les_miserables_victor_hugo.txt")
print(f"There are {len(words)} total words in the corpus")

In [None]:
# Distinct tokens of the corpus; passed to the corrector below as the set of known words.
vocabs = set(words)
print(f"There are {len(vocabs)} unique words in the vocabulary")

In [None]:
# Number of occurrences of each token. A Counter returns 0 for a word it has never seen.
word_counts = Counter(words)
print(word_counts["amour"])

In [None]:
# Turn raw counts into relative frequencies: occurrences of a word divided
# by the total number of tokens in the corpus.
total_word_count = float(sum(word_counts.values()))
word_probas = {
    word: count / total_word_count
    for word, count in word_counts.items()
}
print(word_probas["amour"])

In [None]:
# Relative frequency of the token "le"; raises KeyError if it never occurs in the corpus.
print(word_probas["le"])

In [None]:
def split(word):
    """
    Return every way of cutting `word` into a (prefix, suffix) pair.

    The cut position runs from 0 to len(word) inclusive, so the first pair
    has an empty prefix and the last pair has an empty suffix.
    """
    pairs = []
    for cut in range(len(word) + 1):
        prefix, suffix = word[:cut], word[cut:]
        pairs.append((prefix, suffix))
    return pairs

In [None]:
# Show every (prefix, suffix) cut of a sample word, including the two cuts with an empty side.
print(split("amour"))

In [None]:
def delete(word):
    """
    Return the strings obtained by removing exactly one character from `word`.

    One result per character position, in left-to-right order; duplicates
    are kept (e.g. a doubled letter yields the same string twice).
    """
    shortened = []
    for head, tail in split(word):
        if not tail:
            # Nothing left to remove after the last character.
            continue
        shortened.append(head + tail[1:])
    return shortened

In [None]:
# Each printed string is "amour" with exactly one character removed.
print(delete("amour"))

In [None]:
def swap(word):
    """
    Return the strings obtained by exchanging two adjacent characters of `word`.

    One result per pair of neighbouring positions, in left-to-right order.
    """
    transposed = []
    for head, tail in split(word):
        if len(tail) < 2:
            # Fewer than two characters remain: no adjacent pair to exchange.
            continue
        first, second, rest = tail[0], tail[1], tail[2:]
        transposed.append(head + second + first + rest)
    return transposed

In [None]:
# Each printed string is "amour" with one pair of adjacent characters exchanged.
print(swap("amour"))

In [None]:
# The 26 unaccented lowercase letters; this is the alphabet replace() and insert() draw from.
string.ascii_lowercase

In [None]:
def replace(word, letters=string.ascii_lowercase):
    """
    Return the strings obtained by replacing one character of `word`.

    Every position of `word` is replaced in turn by every character of
    `letters`. A character may be replaced by itself, so `word` appears in
    the result whenever one of its characters belongs to `letters`.

    Parameters
    ----------
    word : str
        The word to edit.
    letters : iterable of str, optional
        Replacement alphabet. Defaults to the 26 unaccented lowercase
        letters; pass an extended alphabet to also generate accented
        candidates, which the default cannot produce.

    Returns
    -------
    list of str
        len(word) * len(letters) strings, duplicates kept.
    """
    return [
        head + letter + tail[1:]
        for head, tail in split(word)
        if tail  # the final split has an empty tail: nothing to replace
        for letter in letters
    ]

In [None]:
# Every single-letter substitution of "les"; a letter can replace itself, so "les" is listed too.
print(replace("les"))

In [None]:
def insert(word, letters=string.ascii_lowercase):
    """
    Return the strings obtained by inserting one character into `word`.

    Every character of `letters` is inserted at every position of `word`,
    including before the first and after the last character.

    Parameters
    ----------
    word : str
        The word to edit.
    letters : iterable of str, optional
        Alphabet to insert from. Defaults to the 26 unaccented lowercase
        letters; pass an extended alphabet to also generate accented
        candidates, which the default cannot produce.

    Returns
    -------
    list of str
        (len(word) + 1) * len(letters) strings, duplicates kept.
    """
    return [
        head + letter + tail
        for head, tail in split(word)
        for letter in letters
    ]

In [None]:
# Every way to insert one letter into "amour", at any of its 6 positions (both ends included).
print(insert("amour"))

In [None]:
def edit1(word):
    """
    Return the set of distinct strings produced by one single edit of `word`.

    The four edit kinds are applied in turn (deletion, adjacent swap,
    replacement, insertion) and their results merged, so duplicates
    generated by different edits appear only once.
    """
    candidates = set()
    for edit in (delete, swap, replace, insert):
        candidates.update(edit(word))
    return candidates

In [None]:
# All distinct strings one edit away from "amour"; it is a set, so the printed order is arbitrary.
print(edit1("amour"))

In [None]:
def edit2(word):
    """
    Return the set of distinct strings produced by two successive edits of `word`.

    Each string one edit away from `word` is edited once more and all the
    results are merged into a single set.
    """
    two_edits_away = set()
    for one_edit_away in edit1(word):
        two_edits_away.update(edit1(one_edit_away))
    return two_edits_away

In [None]:
# All distinct strings reachable from "amour" with two successive edits.
# NOTE(review): this set is very large (likely tens of thousands of strings) and is
# printed in full; consider showing its size or a small sample instead.
print(edit2("amour"))

In [None]:
def spelling_corrections_proposal(word, vocabulary, word_probabilities):
    """
    Suggest known words that `word` may be a misspelling of.

    Known words one edit away from `word` are preferred; known words two
    edits away are considered only when there is none at one edit.

    Parameters
    ----------
    word : str
        The word to check.
    vocabulary : container of str
        Known, correctly spelt words.
    word_probabilities : mapping of str to float
        Probability of each vocabulary word, used to rank the suggestions.

    Returns
    -------
    list of (str, float) or None
        None (after printing a message) when `word` is already in
        `vocabulary`. Otherwise the (suggestion, probability) pairs sorted
        by decreasing probability, ties broken alphabetically; the list is
        empty when no known word lies within two edits.
    """
    if word in vocabulary:
        print(f"{word} is already correctly spelt")
        return None

    def known(candidates):
        # Keep only the candidates that are real vocabulary words.
        return [candidate for candidate in candidates if candidate in vocabulary]

    # The vocabulary filter must come before the `or`: the raw one-edit
    # candidate set is never empty, so without it the two-edit candidates
    # would never be looked at.
    best_guesses = known(edit1(word)) or known(edit2(word))
    words_with_probas = [(w, word_probabilities[w]) for w in best_guesses]
    # The secondary alphabetical key makes the order of ties deterministic
    # instead of depending on set iteration order.
    words_with_probas.sort(key=lambda tup: (-tup[1], tup[0]))
    return words_with_probas

In [None]:
# Run the corrector on a sample input word.
word = "famile"
corrections = spelling_corrections_proposal(word, vocabs, word_probas)

# `corrections` is falsy both when the word is already in the vocabulary (None)
# and when no suggestion was found (empty list); nothing more is printed then.
if corrections:
    print(corrections)
    # Suggestions are sorted by decreasing probability, so the first one is the best guess.
    correct = corrections[0][0]
    print(f"{correct} is suggested for {word}")