In [1]:
import re
import string
from collections import Counter
import numpy as np

In [4]:
def read_corpus(filename):
  """Read a text file and return its lowercased word tokens in file order.

  Args:
    filename: Path of the corpus file to read.

  Returns:
    A list with every run of regex word characters found in the file,
    lowercased, duplicates included.
  """
  tokens = []
  with open(filename, "r") as corpus:
    # Iterating the handle yields the same lines that readlines() would.
    for raw_line in corpus:
      tokens.extend(re.findall(r'\w+', raw_line.lower()))
  return tokens

In [5]:
# Tokenize the corpus file once; the following cells reuse `words`.
words = read_corpus("./corpus.txt")

In [6]:
# Report the corpus size in tokens (repeated words are counted every time).
print(f"There are {len(words)} total words in the corpus.")

There are 1033894 total words in the corpus.


In [7]:
# The vocabulary is the set of distinct tokens seen in the corpus.
vocabs = set(words)
# len(vocabs) counts distinct words, so the message says "unique" rather than "total".
print(f"There are {len(vocabs)} unique words in the corpus.")

There are 42432 total words in the corpus.


In [8]:
# Tally how many times each token occurs in the corpus.
word_counts = Counter(words)
# Spot-check a single entry.
print(word_counts["moon"])

68


In [9]:
# Relative frequency of each word: its count divided by the total number of tokens.
total_word_count = float(sum(word_counts.values()))
word_probas = {
  token: occurrences / total_word_count
  for token, occurrences in word_counts.items()
}

In [10]:
# Spot-check: relative frequency of "moon" in the corpus.
print(word_probas["moon"])

6.57707656684341e-05


In [11]:
def split(word):
  """Return every (prefix, suffix) cut of ``word``.

  The cut position runs from 0 to len(word) inclusive, so both the
  ('', word) and (word, '') pairs are part of the result.
  """
  pairs = []
  for cut in range(len(word) + 1):
    pairs.append((word[:cut], word[cut:]))
  return pairs

In [12]:
# Demo: every prefix/suffix cut of "moon".
print(split("moon"))

[('', 'moon'), ('m', 'oon'), ('mo', 'on'), ('moo', 'n'), ('moon', '')]


In [13]:
def delete(word):
  """Return the strings obtained by dropping exactly one character of ``word``.

  One result per character position, in left-to-right order; duplicates
  are kept (e.g. removing either 'o' of "moon" gives "mon" twice).
  """
  results = []
  for head, tail in split(word):
    if not tail:
      # Nothing left to remove once the cut is at the end of the word.
      continue
    results.append(head + tail[1:])
  return results

In [14]:
# Demo: all single-character deletions of "moon".
print(delete("moon"))

['oon', 'mon', 'mon', 'moo']


In [15]:
def swap(word):
  """Return the strings obtained by exchanging two adjacent characters of ``word``.

  One result per adjacent pair, in left-to-right order; duplicates are kept.
  """
  results = []
  for head, tail in split(word):
    if len(tail) < 2:
      # A transposition needs two characters to the right of the cut.
      continue
    results.append(head + tail[1] + tail[0] + tail[2:])
  return results

In [16]:
# Demo: all adjacent-character transpositions of "moon".
print(swap("moon"))

['omon', 'moon', 'mono']


In [17]:
# The alphabet that the replace and insert edits draw letters from.
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [18]:
def replace(word):
  """Return the strings obtained by substituting one character of ``word``.

  Each position is replaced by every lowercase ASCII letter in turn
  (including the letter already there), positions left to right.
  """
  alphabet = string.ascii_lowercase
  results = []
  for head, tail in split(word):
    if tail:
      rest = tail[1:]
      results.extend(head + letter + rest for letter in alphabet)
  return results

In [19]:
# Demo: all single-character substitutions of "moon".
print(replace("moon"))

['aoon', 'boon', 'coon', 'doon', 'eoon', 'foon', 'goon', 'hoon', 'ioon', 'joon', 'koon', 'loon', 'moon', 'noon', 'ooon', 'poon', 'qoon', 'roon', 'soon', 'toon', 'uoon', 'voon', 'woon', 'xoon', 'yoon', 'zoon', 'maon', 'mbon', 'mcon', 'mdon', 'meon', 'mfon', 'mgon', 'mhon', 'mion', 'mjon', 'mkon', 'mlon', 'mmon', 'mnon', 'moon', 'mpon', 'mqon', 'mron', 'mson', 'mton', 'muon', 'mvon', 'mwon', 'mxon', 'myon', 'mzon', 'moan', 'mobn', 'mocn', 'modn', 'moen', 'mofn', 'mogn', 'mohn', 'moin', 'mojn', 'mokn', 'moln', 'momn', 'monn', 'moon', 'mopn', 'moqn', 'morn', 'mosn', 'motn', 'moun', 'movn', 'mown', 'moxn', 'moyn', 'mozn', 'mooa', 'moob', 'mooc', 'mood', 'mooe', 'moof', 'moog', 'mooh', 'mooi', 'mooj', 'mook', 'mool', 'moom', 'moon', 'mooo', 'moop', 'mooq', 'moor', 'moos', 'moot', 'moou', 'moov', 'moow', 'moox', 'mooy', 'mooz']


In [20]:
def insert(word):
  """Return the strings obtained by adding one lowercase ASCII letter to ``word``.

  Every letter is inserted at every cut position, including before the
  first and after the last character, positions left to right.
  """
  alphabet = string.ascii_lowercase
  results = []
  for head, tail in split(word):
    results.extend(head + letter + tail for letter in alphabet)
  return results

In [21]:
# Demo: all single-letter insertions into "moon".
print(insert("moon"))

['amoon', 'bmoon', 'cmoon', 'dmoon', 'emoon', 'fmoon', 'gmoon', 'hmoon', 'imoon', 'jmoon', 'kmoon', 'lmoon', 'mmoon', 'nmoon', 'omoon', 'pmoon', 'qmoon', 'rmoon', 'smoon', 'tmoon', 'umoon', 'vmoon', 'wmoon', 'xmoon', 'ymoon', 'zmoon', 'maoon', 'mboon', 'mcoon', 'mdoon', 'meoon', 'mfoon', 'mgoon', 'mhoon', 'mioon', 'mjoon', 'mkoon', 'mloon', 'mmoon', 'mnoon', 'mooon', 'mpoon', 'mqoon', 'mroon', 'msoon', 'mtoon', 'muoon', 'mvoon', 'mwoon', 'mxoon', 'myoon', 'mzoon', 'moaon', 'mobon', 'mocon', 'modon', 'moeon', 'mofon', 'mogon', 'mohon', 'moion', 'mojon', 'mokon', 'molon', 'momon', 'monon', 'mooon', 'mopon', 'moqon', 'moron', 'moson', 'moton', 'mouon', 'movon', 'mowon', 'moxon', 'moyon', 'mozon', 'mooan', 'moobn', 'moocn', 'moodn', 'mooen', 'moofn', 'moogn', 'moohn', 'mooin', 'moojn', 'mookn', 'mooln', 'moomn', 'moonn', 'mooon', 'moopn', 'mooqn', 'moorn', 'moosn', 'mootn', 'mooun', 'moovn', 'moown', 'mooxn', 'mooyn', 'moozn', 'moona', 'moonb', 'moonc', 'moond', 'moone', 'moonf', 'moong', 

In [22]:
def edit1(word):
  """Return the set of strings one edit away from ``word``.

  An edit is a single deletion, adjacent transposition, substitution or
  insertion, as produced by delete(), swap(), replace() and insert().

  Args:
    word: The string to generate variants of.

  Returns:
    A set of candidate strings (duplicates across edit kinds collapse).
  """
  # swap() must be part of the union; listing delete() twice silently
  # dropped every transposition candidate.
  return set(delete(word) + swap(word) + replace(word) + insert(word))

In [23]:
# Demo: the distinct one-edit variants of "moon".
print(edit1("moon"))

{'mkon', 'mown', 'tmoon', 'nmoon', 'mgon', 'doon', 'modon', 'monon', 'hmoon', 'mooc', 'mhon', 'eoon', 'mojon', 'mocn', 'motn', 'imoon', 'mqon', 'loon', 'moov', 'moqn', 'mvoon', 'mofn', 'moocn', 'kmoon', 'mooy', 'mood', 'moosn', 'roon', 'mlon', 'mooa', 'moonm', 'mool', 'mson', 'rmoon', 'moo', 'meoon', 'moovn', 'moong', 'moaon', 'maon', 'mgoon', 'mokon', 'moobn', 'pmoon', 'molon', 'mofon', 'moozn', 'meon', 'woon', 'mooe', 'moson', 'mloon', 'moopn', 'mpoon', 'moonb', 'mmoon', 'moofn', 'mozon', 'mfon', 'mooyn', 'mook', 'moun', 'mooni', 'coon', 'mkoon', 'mooqn', 'mooz', 'mdoon', 'mion', 'smoon', 'hoon', 'mooln', 'ioon', 'moyn', 'mojn', 'moos', 'mdon', 'moqon', 'moony', 'msoon', 'myoon', 'qoon', 'moona', 'mon', 'oon', 'mnoon', 'mooj', 'moonl', 'maoon', 'mcoon', 'wmoon', 'jmoon', 'xoon', 'poon', 'omoon', 'moomn', 'moodn', 'moonv', 'muoon', 'moogn', 'mooh', 'muon', 'toon', 'moou', 'movon', 'mqoon', 'moonh', 'moop', 'moton', 'moonz', 'lmoon', 'moonf', 'mpon', 'momn', 'moow', 'moion', 'vmoon', '

In [24]:
def edit2(word):
  """Return the set of strings reachable from ``word`` by applying edit1 twice."""
  two_away = set()
  for near in edit1(word):
    two_away.update(edit1(near))
  return two_away

In [25]:
def correct_spelling(word, vocabulary, word_probabilities):
  """Suggest vocabulary words close to a (possibly misspelled) ``word``.

  Args:
    word: The word to check.
    vocabulary: Container of known words (membership is tested with ``in``).
    word_probabilities: Mapping from each known word to its probability.

  Returns:
    None (after printing a notice) when ``word`` is already in the
    vocabulary. Otherwise a list of (candidate, probability) pairs: the
    known words one edit away, or, if there are none, the known words two
    edits away. The list is empty when neither distance yields a known word.
  """
  if word in vocabulary:
    print(f"{word} is already correctly spelt")
    return None

  # Filter by vocabulary *before* falling back: the raw edit1() set is never
  # empty (insert() always yields candidates), so `edit1(word) or edit2(word)`
  # could never reach the two-edit candidates.
  best_guesses = [w for w in edit1(word) if w in vocabulary]
  if not best_guesses:
    best_guesses = [w for w in edit2(word) if w in vocabulary]
  return [(w, word_probabilities[w]) for w in best_guesses]

In [26]:
def check(self, word):
    """Return known candidate corrections for ``word``, most probable first.

    Args:
        self: Object providing ``_level_one_edits(word)``,
            ``_level_two_edits(word)``, a ``vocabs`` container and a
            ``word_probas`` mapping.
        word: The word to look up.

    Returns:
        A list of (candidate, probability) pairs sorted by descending
        probability. Candidates one edit away are preferred; candidates two
        edits away are used only when no one-edit candidate is known. The
        list is empty when neither distance yields a known word.
    """
    # Filter by vocabulary before falling back. Chaining the raw edit sets
    # with `or` always selected the (non-empty) one-edit set, so two-edit
    # candidates were never considered.
    known = [w for w in self._level_one_edits(word) if w in self.vocabs]
    if not known:
        known = [w for w in self._level_two_edits(word) if w in self.vocabs]
    return sorted(((w, self.word_probas[w]) for w in known), key=lambda pair: pair[1], reverse=True)

In [27]:
class SpellChecker(object):
  """Corpus-based spelling suggester.

  Builds a vocabulary and per-word relative frequencies from a text file and
  proposes known words that are one (or, failing that, two) edits away from
  a given word.

  Attributes set by __init__:
    vocabs: set of distinct lowercase tokens found in the corpus.
    word_counts: Counter mapping each token to its number of occurrences.
    word_probas: dict mapping each token to count / total token count.
  """

  def __init__(self, corpus_file_path):
    """Tokenize the file at ``corpus_file_path`` and build the frequency tables."""
    words = []
    with open(corpus_file_path, "r") as file:
      for line in file:
        words += re.findall(r'\w+', line.lower())

    self.vocabs = set(words)
    self.word_counts = Counter(words)
    total_words = float(sum(self.word_counts.values()))
    self.word_probas = {word: self.word_counts[word] / total_words for word in self.vocabs}

  def _level_one_edits(self, word):
    """Return the set of strings one edit away from ``word``.

    Edits are single deletions, adjacent transpositions, substitutions and
    insertions using the lowercase ASCII letters.
    """
    letters = string.ascii_lowercase
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l, r in splits if r]
    swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r) > 1]
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    inserts = [l + c + r for l, r in splits for c in letters]

    return set(deletes + swaps + replaces + inserts)

  def _level_two_edits(self, word):
    """Return the set of strings reachable by two successive level-one edits."""
    return set(e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edits(e1))

  def check(self, word):
    """Return known candidate corrections for ``word``, most probable first.

    Returns:
      A list of (candidate, probability) pairs sorted by descending
      probability. One-edit candidates are preferred; two-edit candidates
      are used only when no one-edit candidate is in the vocabulary. The
      list is empty when neither distance yields a known word.
    """
    # Filter by vocabulary before falling back: the raw one-edit set is never
    # empty (insertions always exist), so an `or` chain over the raw sets
    # would never reach the two-edit candidates.
    valid_candidates = [w for w in self._level_one_edits(word) if w in self.vocabs]
    if not valid_candidates:
      valid_candidates = [w for w in self._level_two_edits(word) if w in self.vocabs]
    return sorted([(c, self.word_probas[c]) for c in valid_candidates], key=lambda tup: tup[1], reverse=True)


In [28]:
# Build a checker from the same corpus file that was tokenized earlier.
checker =  SpellChecker("./corpus.txt")

In [29]:
import re
import string
from collections import Counter
import ipywidgets as widgets
from IPython.display import display, clear_output

class SpellChecker(object):
    """Corpus-based spelling suggester that ranks candidates by raw count.

    Attributes set by __init__:
        vocabs: set of distinct lowercase tokens found in the corpus.
        word_counts: Counter mapping each token to its number of occurrences.
    """

    def __init__(self, corpus_file_path):
        """Tokenize the file at ``corpus_file_path`` and count its words."""
        words = []
        with open(corpus_file_path, "r") as file:
            for line in file:
                words += re.findall(r'\w+', line.lower())

        self.vocabs = set(words)
        self.word_counts = Counter(words)

    def _level_one_edits(self, word):
        """Return the set of strings one edit away from ``word``.

        Edits are single deletions, adjacent transpositions, substitutions
        and insertions using the lowercase ASCII letters.
        """
        letters = string.ascii_lowercase
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [l + r[1:] for l, r in splits if r]
        swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r) > 1]
        replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
        inserts = [l + c + r for l, r in splits for c in letters]

        return set(deletes + swaps + replaces + inserts)

    def _level_two_edits(self, word):
        """Return the set of strings reachable by two successive level-one edits."""
        return set(e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edits(e1))

    def check(self, word):
        """Return known candidate corrections for ``word``, most frequent first.

        Returns:
            A list of vocabulary words sorted by descending corpus count.
            One-edit candidates are preferred; two-edit candidates are used
            only when no one-edit candidate is in the vocabulary. The list
            is empty when neither distance yields a known word.
        """
        # Filter by vocabulary before falling back: the raw one-edit set is
        # never empty (insertions always exist), so an `or` chain over the raw
        # sets would never reach the two-edit candidates.
        valid_candidates = [w for w in self._level_one_edits(word) if w in self.vocabs]
        if not valid_candidates:
            valid_candidates = [w for w in self._level_two_edits(word) if w in self.vocabs]
        return sorted(valid_candidates, key=lambda w: self.word_counts[w], reverse=True)

def handle_submit(sender):
    """Show up to five spelling suggestions for the word in the text box.

    Reads the module-level ``text`` widget, queries ``spell_checker`` and
    writes the result into the module-level ``output`` widget.

    Args:
        sender: The object passed by the widget callback; not used.
    """
    word = text.value.strip().lower()
    # Skip the lookup for blank input; otherwise every single letter that
    # occurs in the corpus would be offered as a "correction".
    corrections = spell_checker.check(word) if word else []
    with output:
        # Always clear first so suggestions from a previous query never linger
        # when the current query produces nothing.
        clear_output()
        if not word:
            return
        if not corrections:
            print(f'No suggestions found for "{word}".')
            return
        print("Did you mean:")
        for suggestion in corrections[:5]:
            print(suggestion.capitalize())

# Checker instance that handle_submit looks up by name.
spell_checker = SpellChecker("corpus.txt")

# Input box for the word to check.
text = widgets.Text(
    value='',
    placeholder='Type a word',
    description='Word:',
    disabled=False,
    layout=widgets.Layout(width='300px'),
)

# Area that handle_submit prints its suggestions into.
output = widgets.Output()
output.layout.padding = '10px'
output.layout.border = '2px solid #1E90FF'
output.layout.border_radius = '5px'

# Register handle_submit as the text box's submit callback.
text.on_submit(handle_submit)

# Row holding the prompt label and the input box.
text_box = widgets.HBox(
    [widgets.Label("Enter a word:", layout=widgets.Layout(width='100px')), text]
)
text_box.layout.align_items = 'center'
text_box.layout.padding = '10px'

# Stack the input row above the output area and render the whole panel.
display(
    widgets.VBox(
        [text_box, output],
        layout=widgets.Layout(
            justify_content='center',
            align_items='center',
            margin='20px',
            padding='20px',
            border='2px solid #1E90FF',
            border_radius='10px',
        ),
    )
)



VBox(children=(HBox(children=(Label(value='Enter a word:', layout=Layout(width='100px')), Text(value='', descr…

In [30]:
# Example lookup for a misspelled word; the result lists (word, probability) pairs, most probable first.
checker.check("samd")

[('said', 0.0018967128158205774),
 ('same', 0.0006635109595374381),
 ('sam', 8.027902280117692e-05),
 ('sad', 3.38525999763999e-05),
 ('sand', 2.8049297123302777e-05),
 ('saud', 1.9344342843657085e-06)]