In [1]:
import string
import re
import numpy as np


In [2]:
def read_corpus(filename, encoding=None):
  """Read a text file and return its lower-cased word tokens in file order.

  Args:
    filename: Path of the text file to tokenise.
    encoding: Text encoding handed to ``open``. ``None`` (the default) keeps
      the platform default, which is what the function used before this
      parameter existed.

  Returns:
    A list of every word-character run (regex word class, one or more
    characters) found in each lower-cased line. Duplicates are kept.

  Raises:
    OSError: If the file cannot be opened.
  """
  words = []
  with open(filename, 'r', encoding=encoding) as file:
    # Iterate the file object directly so the corpus is streamed line by line
    # instead of being loaded in full by readlines().
    for line in file:
      words.extend(re.findall(r'\w+', line.lower()))
  return words

In [3]:
# Tokenise the corpus once; `words` is the full token list reused by later cells.
# NOTE(review): hard-coded absolute path that looks like a Colab Drive mount —
# presumably Drive must be mounted first; confirm or make the path configurable.
words = read_corpus('/content/drive/MyDrive/AI/DL/spell_check/big.txt')

In [4]:
# Report the corpus size in tokens (a repeated word is counted every time it occurs).
print(f"There are {len(words)} words in big.txt")

There are 1115585 words in big.txt


In [5]:
# Collapse the token list into the set of distinct words (the vocabulary).
vocab = set(words)
# Say "unique" so this figure is not mistaken for the total token count.
print(f"There are {len(vocab)} unique words in big.txt")

There are 32198 words in big.txt


In [6]:
from collections import Counter

In [7]:
# Frequency table: maps each token to the number of times it occurs in `words`.
word_count = Counter(words)
# Spot-check a single entry of the table.
word_count["the"]

79809

In [9]:
# Sum of all per-word counts, i.e. the total number of tokens that were counted.
tot_word_count = sum(word_count.values())
tot_word_count

1115585

In [None]:
# Relative frequency of every word: its count divided by the total token count.
word_prob = {
  token: occurrences/tot_word_count
  for token, occurrences in word_count.items()
}
word_prob

In [17]:
def split(word):
  """Return every way of cutting ``word`` into a prefix and a suffix.

  The result has ``len(word) + 1`` entries, each a two-element list
  ``[prefix, suffix]``, ordered from the empty prefix to the empty suffix.
  """
  pieces = []
  for cut in range(len(word)+1):
    pieces.append([word[:cut], word[cut:]])
  return pieces

In [18]:
# Example: a four-letter word has five cut positions, from empty prefix to empty suffix.
split('test')

[['', 'test'], ['t', 'est'], ['te', 'st'], ['tes', 't'], ['test', '']]

In [21]:
def delete(word):
  """Return the strings obtained by removing exactly one character from ``word``.

  One result per character position, in left-to-right order; an empty
  ``word`` yields an empty list.
  """
  shortened = []
  for head, tail in split(word):
    if not tail:
      continue  # the final cut has an empty suffix, so there is nothing to drop
    shortened.append(head + tail[1:])
  return shortened

In [22]:
# Example: dropping each of the four characters of 'test' in turn.
delete('test')

['est', 'tst', 'tet', 'tes']

In [24]:
def swap(word):
  """Return the strings obtained by transposing one pair of adjacent characters.

  One result per adjacent pair, in left-to-right order; words shorter than
  two characters yield an empty list.
  """
  transposed = []
  for head, tail in split(word):
    if len(tail) < 2:
      continue  # need at least two characters after the cut to exchange them
    transposed.append(head + tail[1] + tail[0] + tail[2:])
  return transposed
swap('test')

['etst', 'tset', 'tets']

In [25]:
# Alphabet used when generating replacement and insertion edits.
letters = string.ascii_lowercase

In [None]:
def replace(word):
  """Return the strings obtained by substituting one character of ``word``.

  For each character position, every entry of ``letters`` is substituted in
  turn (including the character already there, so ``word`` itself appears
  in the result when its characters are in ``letters``).
  """
  variants = []
  for head, tail in split(word):
    if tail:
      for ch in letters:
        variants.append(head + ch + tail[1:])
  return variants
replace('test')

In [None]:
def insert(word):
  """Return the strings obtained by inserting one character into ``word``.

  Every entry of ``letters`` is inserted at every cut position, including
  before the first and after the last character.
  """
  variants = []
  for head, tail in split(word):
    for ch in letters:
      variants.append(head + ch + tail)
  return variants
insert('test')

In [None]:
def level_one_edits(word):
  """Return the set of strings exactly one edit operation away from ``word``.

  The edit operations are the four defined in this notebook: delete, swap,
  replace and insert. Duplicates produced by different operations collapse
  because the result is a set.
  """
  candidates = set()
  for edit in (delete, swap, replace, insert):
    candidates.update(edit(word))
  return candidates
level_one_edits('trash')

In [None]:
def level_two_edits(word):
  """Return the set of strings reachable by applying two single edits to ``word``.

  Each one-edit variant of ``word`` is expanded with ``level_one_edits`` again
  and all results are merged into one set.
  """
  return {e2 for e1 in level_one_edits(word) for e2 in level_one_edits(e1)}
# Exercise level_two_edits itself (this line used to re-run level_one_edits).
# Only the size is shown: every one-edit variant is expanded again, so the full
# set is far too long to be a useful cell output.
len(level_two_edits('trash'))

In [31]:
def isinvocab(words):
  """Return the subset of ``words`` that appear in the notebook-level ``vocab``.

  ``words`` may be any iterable of candidate strings; the result is a set.
  """
  known = set()
  for candidate in words:
    if candidate in vocab:
      known.add(candidate)
  return known

In [37]:
def correct_spell(word,vocab,prob):
  """Suggest corrections for ``word`` together with their probabilities.

  Args:
    word: The word to check.
    vocab: Container of known words; membership in it decides both whether
      ``word`` is already correct and which edits qualify as suggestions.
    prob: Mapping from word to probability.

  Returns:
    ``None`` (after printing a notice) when ``word`` is already in ``vocab``.
    Otherwise a list of ``(candidate, probability)`` tuples in no particular
    order. Candidates are the known words one edit away; failing that, the
    known words two edits away; failing that, ``word`` itself, reported with
    probability ``0.0`` when it is absent from ``prob``.
  """
  if word in vocab:
    print('Word ' + word + ' is correctly spelled already.')
    return

  def known(candidates):
    # Filter against the ``vocab`` argument rather than the notebook global,
    # so the function honours whatever vocabulary the caller passes in.
    return {w for w in candidates if w in vocab}

  # ``or`` short-circuits: two-edit candidates are generated only when no
  # one-edit candidate is a known word.
  suggestions = known(level_one_edits(word)) or known(level_two_edits(word)) or {word}
  # ``.get`` avoids a KeyError when the fallback is the unknown word itself.
  return [(w, prob.get(w, 0.0)) for w in suggestions]
  

In [40]:
# Example: suggestions for a misspelling, using the notebook's vocabulary and word probabilities.
correct_spell('foad',vocab,word_prob)

[('road', 0.00023126879619213237),
 ('load', 7.171125463321934e-06),
 ('ford', 6.2747347804066925e-06),
 ('fold', 2.061698570705056e-05),
 ('food', 6.722930121864313e-05),
 ('fond', 5.467983165782975e-05),
 ('foal', 8.963906829152417e-07),
 ('fad', 8.963906829152417e-07),
 ('foam', 3.585562731660967e-06)]

In [55]:
class Spell_Cheker(object):
  """Edit-distance spell checker driven by word frequencies from a text corpus.

  The class name (including its spelling) is kept as-is because existing
  cells instantiate it under this name.
  """

  def __init__(self, corpus_file_path):
    """Build the vocabulary and word probabilities from a corpus file.

    Args:
      corpus_file_path: Path of the text file to learn word frequencies from.

    Raises:
      OSError: If the corpus file cannot be opened.
    """
    # word -> number of occurrences in the corpus (tokens are lower-cased)
    self.word_count = Counter()
    with open(corpus_file_path,'r') as file:
      for line in file:
        self.word_count.update(re.findall(r'\w+',line.lower()))
    # The derived statistics are computed once, after the whole corpus has been
    # read, so they are also defined (and empty) for an empty corpus.
    # set of distinct words seen in the corpus
    self.vocab = set(self.word_count)
    # total number of tokens in the corpus
    self.total_words = sum(self.word_count.values())
    # word -> relative frequency; empty when the corpus has no tokens
    self.word_probs = {word:count/self.total_words for word,count in self.word_count.items()}

  def _level_one_edits(self,word):
    """Return the set of strings one delete/swap/replace/insert away from ``word``."""
    letters = string.ascii_lowercase
    splits = [(word[:i],word[i:]) for i in range(len(word)+1)]
    deletes = [l+r[1:] for (l,r) in splits if r]
    swaps = [l+r[1]+r[0]+r[2:] for l,r in splits if len(r)>1]
    replaces = [l+c+r[1:] for l,r in splits if r for c in letters]
    inserts = [l+c+r for l,r in splits for c in letters]
    return set(deletes+swaps+replaces+inserts)

  def _level_two_edits(self,word):
    """Lazily yield every string two edits away from ``word`` (may repeat values)."""
    return (e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edits(e1))

  def _isinvocab(self,words):
    """Return the set of entries of ``words`` that are in the corpus vocabulary."""
    return {w for w in words if w in self.vocab}

  def checker(self,word):
    """Return ranked spelling candidates for ``word``.

    Candidates are the vocabulary words one edit away; failing that, those two
    edits away; failing that, ``word`` itself.

    Args:
      word: The word to look up.

    Returns:
      A list of ``(candidate, probability)`` tuples sorted by descending
      probability, ties broken alphabetically. A candidate that is not in the
      corpus (only possible for the fallback) gets probability ``0.0``.
    """
    # ``or`` short-circuits, so the much larger two-edit search only runs when
    # no one-edit candidate is a known word.
    candidates = (self._isinvocab(self._level_one_edits(word))
                  or self._isinvocab(self._level_two_edits(word))
                  or [word])
    scored = [(c,self.word_probs.get(c,0.0)) for c in candidates]
    return sorted(scored, key=lambda tup:(-tup[1],tup[0]))

In [56]:
# Build the class-based checker from the corpus file.
# NOTE(review): hard-coded absolute path that looks like a Colab Drive mount —
# presumably Drive must be mounted first; confirm or make the path configurable.
checker = Spell_Cheker('/content/drive/MyDrive/AI/DL/spell_check/big.txt')