In [1]:
import re
import pandas as pd
from tqdm import tqdm

In [2]:
fd = open('big.txt','r')
txt = fd.read()
fd.close()

In [3]:
len(txt)

6488666

## Get the unique words

In [4]:
with open('big.txt','r') as fd:
    lines = fd.readlines()
    words = []
    for line in lines:
        words += re.findall('\w+',line.lower())
        
print(len(words))
vocab = list(set(words))
print(len(vocab))

1115585
32198


# Finding the Probability Distribution

In [6]:
words.count('the')

79809

In [7]:
word_probability = {}

for word in tqdm(vocab):
    word_probability[word] = float(words.count(word)/len(words))

100%|████████████████████████████████████████████████████████████████████████████| 32198/32198 [11:22<00:00, 47.18it/s]


# Text Preprocessing

### Splitting

In [9]:
def split(word):  
    parts = []
    for i in range(len(word) + 1):
        parts += [(word[ : i], word[i : ])]
    return parts

split("Abhinav")

[('', 'Abhinav'),
 ('A', 'bhinav'),
 ('Ab', 'hinav'),
 ('Abh', 'inav'),
 ('Abhi', 'nav'),
 ('Abhin', 'av'),
 ('Abhina', 'v'),
 ('Abhinav', '')]

### 3.1) Delete

In [14]:
# loave -> love

def delete(word):
    
    output = []
    for l,r in split(word):
        output.append(l + r[1:])
    return output

delete('heallo')

['eallo', 'hallo', 'hello', 'healo', 'healo', 'heall', 'heallo']

### 3.2) Swap

In [10]:
# lvoe -> love

def swap(word):
        
    output = []    
    for l,r in split(word):
        if (len(r) > 1):
            output.append(l + r[1] + r[0] + r[2:])
    return output
            
swap('lvoe')

['vloe', 'love', 'lveo']

### 3.3) Replace

In [12]:
# 'lave' -> love

def replace(word):
    
    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []    

    for l,r in split(word):
        for char in characters:
            output.append(l + char +  r[1:])
    return output

len(replace('lave'))

130

### 3.4) Insert

In [13]:
# lve - > love

def insert(word):

    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []

    for l,r in split(word):
        for char in characters:
            output.append(l + char + r)

    return output

len(insert('lve'))

104

# Finding the Prediction (Level - 1)

### Combining Possible Words

In [15]:
def edit(word):   
    return list(set(insert(word) + delete(word) + swap(word) + replace(word)))

###  Predicting the Word

In [16]:
def spell_check_edit_1(word, count = 5):
    
    output = []
    suggested_words = edit(word)
    
    for wrd in suggested_words:        
        if wrd in word_probability.keys():
            output.append([wrd, word_probability[wrd]])
            
    return list(pd.DataFrame(output, columns = ['word','prob']).sort_values(by = 'prob', ascending = False).head(count)['word'].values)

#  Finding the Prediction (Level - 2)

### Combining Possible Words

In [17]:
def spell_check_edit_2(word, count = 5):
    
    output = []
    suggested_words = edit(word)       # Level one Edit
    
    for e1 in edit(word):
        suggested_words += edit(e1)    # Second Level Edit 
    
    suggested_words = list(set(suggested_words))
    
    for wrd in suggested_words:
        if wrd in word_probability.keys():
            output.append([wrd, word_probability[wrd]])
    return list(pd.DataFrame(output, columns = ['word','prob']).sort_values(by = 'prob', ascending = False).head(count)['word'].values)
        
spell_check_edit_2('fameli')

['family', 'namely', 'fame', 'camelia', 'camel']