In [2]:
import re
from collections import Counter

In [3]:
# function to tokenise words
def words(document):
    "Convert text to lower case and tokenise the document"
    return re.findall(r'\w+', document.lower())

In [4]:
# create a frequency table of all the words of the document
all_words = Counter(words(open('/mnt/d/6- Books/upgrad/TextMining/big.txt').read()))

In [5]:
# check frequency of a random word, say, 'chair'
all_words['chair']

135

In [6]:
all_words

Counter({'unconciousness': 1,
         'roseola': 1,
         'fetes': 2,
         'combatted': 1,
         'horses': 262,
         'differed': 10,
         'hillside': 2,
         'his': 10034,
         'tariff': 123,
         'fulfilled': 21,
         'thud': 10,
         'snugly': 1,
         'introduce': 23,
         'division': 110,
         'utilise': 1,
         'boxes': 16,
         'dignitaries': 2,
         'sterne': 1,
         'anaphylactic': 1,
         'afloat': 2,
         'consultant': 1,
         '_endarteritis': 1,
         'annoyance': 18,
         'beaming': 19,
         'newly': 39,
         'heights': 23,
         'sowing': 6,
         'cleveland': 42,
         'purchaser': 4,
         'sojourn': 3,
         'chemist': 3,
         'nausea': 4,
         'imagines': 4,
         'enfant': 2,
         'weeden': 2,
         'fails': 20,
         'produces': 23,
         'portmanteaus': 4,
         'paragraphs': 10,
         'arts': 17,
         'yale': 4,
         'bes

In [7]:
# look at top 10 frequent words
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [8]:
def edits_one(word):
    "Create all edits that are one edit away from `word`."
    alphabets    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])                   for i in range(len(word) + 1)]
    deletes    = [left + right[1:]                       for left, right in splits if right]
    inserts    = [left + c + right                       for left, right in splits for c in alphabets]
    replaces   = [left + c + right[1:]                   for left, right in splits if right for c in alphabets]
    transposes = [left + right[1] + right[0] + right[2:] for left, right in splits if len(right)>1]
    return set(deletes + inserts + replaces + transposes)

In [9]:
def edits_two(word):
    "Create all edits that are two edits away from `word`."
    return (e2 for e1 in edits_one(word) for e2 in edits_one(e1))

In [10]:
def known(words):
    "The subset of `words` that appear in the `all_words`."
    return set(word for word in words if word in all_words)

In [11]:
def possible_corrections(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits_one(word)) or known(edits_two(word)) or [word])

In [12]:
def prob(word, N=sum(all_words.values())): 
    "Probability of `word`: Number of appearances of 'word' / total number of tokens"
    return all_words[word] / N

In [13]:
print(len(set(edits_one("monney"))))
print(edits_one("monney"))

336
{'tonney', 'moneey', 'monnes', 'moeney', 'monnepy', 'monneb', 'monneyw', 'movney', 'monneo', 'bmonney', 'monrey', 'moonney', 'emonney', 'molnney', 'moiney', 'monnev', 'monndey', 'mwonney', 'monzney', 'moniey', 'monnky', 'xmonney', 'monncey', 'monneq', 'monneoy', 'monnejy', 'monnee', 'monnwey', 'mbonney', 'mponney', 'monneyg', 'aonney', 'rmonney', 'mvonney', 'mornney', 'montney', 'monniey', 'menney', 'nmonney', 'monpey', 'monuey', 'monneg', 'monnkey', 'monniy', 'monneiy', 'monnhy', 'moknney', 'mobnney', 'meonney', 'monkney', 'monny', 'monjey', 'monnzy', 'mfonney', 'onney', 'mopnney', 'mondney', 'monneyl', 'mcnney', 'mvnney', 'mohnney', 'monnec', 'wonney', 'monner', 'monnety', 'mnoney', 'monneay', 'monnevy', 'monneh', 'monnet', 'mmnney', 'moenney', 'moneney', 'monnex', 'monnea', 'monnyey', 'monnefy', 'monneyd', 'moncey', 'monfney', 'mtonney', 'ionney', 'modnney', 'monneby', 'monneya', 'uonney', 'monyney', 'wmonney', 'moqney', 'monhey', 'mpnney', 'monnuy', 'fonney', 'mronney', 'omonne

In [14]:
print(known(edits_one("monney")))

{'monkey', 'money'}


In [15]:
# Let's look at words that are two edits away
print(len(set(edits_two("monney"))))
print(known(edits_one("monney")))

51013
{'monkey', 'money'}


In [16]:
# Let's look at possible corrections of a word
print(possible_corrections("monney"))

{'monkey', 'money'}


In [17]:
# Let's look at probability of a word
print(prob("money"))
print(prob("monkey"))

0.0002922233626303688
5.378344097491451e-06


In [18]:
def spell_check(word):
    "Print the most probable spelling correction for `word` out of all the `possible_corrections`"
    correct_word = max(possible_corrections(word), key=prob)
    if correct_word != word:
        return "Did you mean " + correct_word + "?"
    else:
        return "Correct spelling."

In [19]:
# test spell check
print(spell_check("Vat"))

Did you mean at?


In [42]:
def rectify(word):
    "return the most probable spelling correction for `word` out of all the `possible_corrections`"
    correct_word = max(possible_corrections(word), key=prob)
    return correct_word

In [43]:
rectify('lov')

'love'

In [38]:
#!pip install pyspellchecker
#!pip install autocorrect
