## The Spelling Corrector from NLP Course 1 - fast.ai

In [63]:
from collections import Counter
import re
import nltk

def words(text):
    '''
    Tokenise `text`: lower-case it and return every maximal run of
    word characters (letters, digits, underscore) as a list of strings.
    '''
    lowered = text.lower()
    token_pattern = re.compile(r'\w+')
    return token_pattern.findall(lowered)

# Load the corpus and tokenise it. The `with` block guarantees the file
# handle is closed once the contents have been read.
with open("words_set_spelling.txt") as corpus_file:
    text = words(corpus_file.read())

# Unigram table: token -> number of occurrences in the corpus.
words_vocab = Counter(text)
# Bigram table: (first_token, second_token) -> number of occurrences.
word_bigrams = nltk.bigrams(text)
word_bigrams_vocab = Counter(word_bigrams)

In [18]:
def prob(word):
    '''
    Relative frequency of `word` in the corpus: its count in
    `words_vocab` divided by the total count of all words.
    '''
    occurrences = words_vocab[word]
    corpus_size = sum(words_vocab.values())
    return occurrences / corpus_size

In [28]:
def max_prob(words):
    '''
    Return the entry of `words` whose `prob` value is the largest.
    On a tie the entry that appears first in `words` is returned.
    '''
    scores = {}
    for candidate in words:
        scores[candidate] = prob(candidate)
    return max(scores, key=scores.get)

In [30]:
def known(words):
    '''
    Keep only the entries of `words` that occur in `words_vocab`,
    preserving their original order (duplicates included).
    '''
    in_vocab = []
    for candidate in words:
        if candidate in words_vocab:
            in_vocab.append(candidate)
    return in_vocab

In [57]:
def possibilities(word):
    '''
    All strings that are one edit away from `word`, plus `word` itself.

    An edit is one of: deleting a character, swapping two adjacent
    characters, inserting a lowercase ASCII letter at any position
    (including after the last character), or substituting a character
    with a lowercase ASCII letter.

    Returns a list (it may contain duplicates) laid out as
    [word] + deletes + swaps + inserts + substitutes.
    '''
    letters = "abcdefghijklmnopqrstuvwxyz"
    # range(len(word) + 1) also yields the split (word, ""), which is
    # required to insert a letter at the very end, e.g. "cat" -> "cats".
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # Deletes and substitutes need a character to act on, so skip the
    # split whose right-hand side is empty.
    deletes = [L + R[1:] for L, R in splits if R]
    substitutes = [L + c + R[1:] for L, R in splits if R for c in letters]
    swaps = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) >= 2]
    inserts = [L + c + R for L, R in splits for c in letters]
    return [word] + deletes + swaps + inserts + substitutes


In [58]:
def correction(word):
    '''
    Return the most probable vocabulary word among the one-edit
    candidates of `word` (the candidate set includes `word` itself).

    If none of the candidates is in the vocabulary, `word` is returned
    unchanged rather than raising on an empty candidate list.
    '''
    candidates = known(possibilities(word))
    if not candidates:
        return word
    return max_prob(candidates)

In [59]:
correction("chegk")

'cheek'

In [56]:
correction("substtute")

'substitute'

In [66]:
word_bigrams_vocab[("the","project")]

95

In [81]:
# Correction with bigram context: rank candidates by how well they follow the previous word.

# Build the unigram and bigram frequency tables from the token list `text`.
words_vocab = Counter(text)  # token -> occurrence count
word_bigrams = nltk.bigrams(text)  # pairs of adjacent tokens (presumably a lazy iterator; confirm in the NLTK docs)
word_bigrams_vocab = Counter(word_bigrams)  # (first_token, second_token) -> occurrence count

def prob_context(word,prev_word):
    '''
    Probability of `word` given that the word before it is `prev_word`.

    P(word | prev_word) = count(prev_word, word) / count(prev_word)

    Bigram keys are (first_word, second_word), so the pair is looked up
    as (prev_word, word). Returns 0.0 when `prev_word` is not in the
    vocabulary, instead of dividing by zero.
    '''
    prev_count = words_vocab[prev_word]
    if not prev_count:
        return 0.0
    return word_bigrams_vocab[(prev_word, word)] / prev_count

def max_prob_context(words,prev_word):
    '''
    Out of `words`, return the one most likely to follow `prev_word`.
    On a tie (including when every probability is 0) the entry that
    appears first in `words` is returned.
    '''
    P = {word: prob_context(word, prev_word) for word in words}
    return max(P, key=P.get)

def correction_context(word,prev_word):
    '''
    Return the vocabulary word among the one-edit candidates of `word`
    that is most likely to follow `prev_word`.

    If none of the candidates is in the vocabulary, `word` is returned
    unchanged rather than raising on an empty candidate list.
    '''
    candidates = known(possibilities(word))
    if not candidates:
        return word
    return max_prob_context(candidates, prev_word)

In [82]:
correction_context("hear",prev_word="my")

'dear'

In [90]:
correction_context("waant",prev_word="i")

'want'