In [1]:
import re
import pandas as pd

In [2]:
def editDistance(word1, word2):
    len1 = len(word1)
    len2 = len(word2)
    distanceMap = [[0] for i in range(len1+1)]
    distanceMap[0] = [i for i in range(len2+1)]
    for i in range(len1+1):
        distanceMap[i][0] = i
    
    for i in range(len1):
        for j in range(len2):
            adder = 0 if(word1[i]==word2[j]) else 1
            dist = min(distanceMap[i+1][j]+1,distanceMap[i][j+1]+1,distanceMap[i][j]+adder)
            distanceMap[i+1].append(dist)
    
    return distanceMap[len1][len2]

In [117]:
def D_L_editDistance(word1, word2):
    len1 = len(word1)
    len2 = len(word2)
    distanceMap = [[0] for i in range(len1+1)]
    distanceMap[0] = [i for i in range(len2+1)]
    for i in range(len1+1):
        distanceMap[i][0] = i
    
    for i in range(1,len1+1):
        for j in range(1,len2+1):
            adder = 0 if(word1[i-1]==word2[j-1]) else 1
            #print(i,j,adder)
            dist = min(distanceMap[i][j-1]+1,distanceMap[i-1][j]+1,distanceMap[i-1][j-1]+adder)
            if(i>1 and j>1 and word1[i-2:i] == word2[j-2:j][::-1]):
                dist = min(distanceMap[i-2][j-2]+1, dist)
            distanceMap[i].append(dist)
    #print(distanceMap)
    return distanceMap[len1][len2]

In [229]:
def D_L_Backtrack(word1, word2):
    len1 = len(word1)
    len2 = len(word2)
    distanceMap = [[0] for i in range(len1+1)]
    distanceMap[0] = [i for i in range(len2+1)]
    for i in range(len1+1):
        distanceMap[i][0] = i
    
    for i in range(1,len1+1):
        for j in range(1,len2+1):
            adder = 0 if(word1[i-1]==word2[j-1]) else 1
            #print(i,j,adder)
            dist = min(distanceMap[i][j-1]+1,distanceMap[i-1][j]+1,distanceMap[i-1][j-1]+adder)
            if(i>1 and j>1 and word1[i-2:i] == word2[j-2:j][::-1]):
                dist = min(distanceMap[i-2][j-2]+1, dist)
            distanceMap[i].append(dist)
            
    i = len1
    j = len2
    operations = []
    while(i>=1 and j>=1):
        if(distanceMap[i][j] == distanceMap[i][j-1]+1):
            #print('insertion of %s after %s' % (word2[j-1], word1[i-1]))
            operations.append(('insertion', word2[j-1], word1[i-1]))
            j=j-1
        elif(distanceMap[i][j] == distanceMap[i-1][j]+1):
            #print('deletion of %s after %s' % (word1[i-1], word1[i-2]))
            operations.append(('deletion', word1[i-1], word1[i-2]))
            i=i-1
        elif(distanceMap[i][j] == distanceMap[i-1][j-1] + (0 if word1[i-1] == word2[j-1] else 1)):
            if(not word1[i-1] == word2[j-1]):    
                #print('substitution of %s with %s' % (word2[j-1], word1[i-1]))
                operations.append(('substitution', word2[j-1], word1[i-1]))
            i=i-1
            j=j-1
        else:
            #print('transposition of %s with %s' % (word2[j-1], word1[i-1]))
            operations.append(('transposition', word2[j-1], word1[i-1]))
            i=i-2
            j=j-2
    word1=word1[:-1]+'@'
    word2=word2[:-1]+'@'
    while(i>0):
        #print('deletion of %s after %s' % (word1[i-1], word1[i-2]))
        operations.append(('deletion', word1[i-1], word1[i-2]))
        i=i-1
    while(j>0):
        #print('insertion of %s after %s' % (word2[j-1], word1[i-1]))
        operations.append(('insertion', word2[j-1], word1[i-1]))
        j=j-1
    return operations

In [321]:
def tokenMe(inputString):
    inputString = inputString.lower()
    outputString = ''
    for ch in inputString.replace('--',' ').replace('\'', ''):
        if(ch.isalpha() or ch == ' ' or ch.isnumeric()):
            outputString+=ch
        else:
            outputString+=' '
    return outputString

In [323]:
corpus = open('corpusMini.txt').read().lower()
corpus2 = tokenMe(corpus)

In [324]:
test_words_correct = []
with open('test-words-correct.txt') as fp:
    line = fp.readline()
    while(line):
        line = tokenMe(line)
        test_words_correct.append(line[:-1])
        line = fp.readline()
test_words_misspelled = []
with open('test-words-misspelled.txt') as fp2:
    line = fp2.readline()
    while(line):
        line = tokenMe(line)
        test_words_misspelled.append(line[:-1])
        line = fp2.readline()

In [329]:
spell_errors = []
with open('spell-errors.txt') as fp:
    line = fp.readline()
    while(line):
        spell_errors.append(line[:-1])
        line = fp.readline()

In [367]:
spell_error_samples = {}
for spl_error in spell_errors:
    spl = spl_error.split(':')
    spell_error_samples[spl[0].lower()] = []
    splErrors = spl[1].split(',')
    for err in splErrors:
        spell_error_samples[spl[0].lower()].append(err.replace(' ', '').lower())

In [368]:
for se in spell_error_samples.items():
    spell_error_samples[se[0]] = []
    for err in se[1]:
        if('*' in err):
            sp = err.split('*')
            spell_error_samples[se[0]].append((sp[0], int(sp[1])))
        else:
            spell_error_samples[se[0]].append((err,1))

In [369]:
word_list=re.split('\s+',corpus2)

In [370]:
numTokens = len(word_list)
print("Number of tokens: %d" %numTokens)

Number of tokens: 3383


In [371]:
df = pd.DataFrame({'word':[],'freq':[],'percentage':[]})

In [372]:
wordFreq = []
for word in set(word_list):
    count = word_list.count(word)
    wordFreq.append([word, count, round(count/numTokens,5)])
wordFreq = sorted(wordFreq, key=lambda tup: tup[2], reverse=True)
df = pd.DataFrame(wordFreq, columns=['word','count','percentage'])

In [373]:
#df.sort_values('percentage', ascending=False)
df.head(5)
#wordFreq

Unnamed: 0,word,count,percentage
0,the,162,0.04789
1,of,110,0.03252
2,and,96,0.02838
3,a,91,0.0269
4,to,87,0.02572


In [388]:
len(letters)

28

In [397]:
def char_position(letter):
    return ord(letter) - 97
letters    = 'abcdefghijklmnopqrstuvwxyz@*'
confusion_matrix_substitution = [[0 for i in range(len(letters))] for j in range(len(letters))]
confusion_matrix_transposition = [[0 for i in range(len(letters))] for j in range(len(letters))]
confusion_matrix_insertion = [[0 for i in range(len(letters))] for j in range(len(letters))]
confusion_matrix_deletion = [[0 for i in range(len(letters))] for j in range(len(letters))]
confusion_matrix = {'deletion': confusion_matrix_deletion,
                   'insertion': confusion_matrix_insertion,
                   'substitution': confusion_matrix_substitution,
                   'transposition': confusion_matrix_transposition}
for (key,spellErrors) in spell_error_samples.items():
    for spellError in spellErrors:
        corrections = D_L_Backtrack(spellError[0], key)
        for correction in corrections:
            x = char_position(correction[1]) if 0<=char_position(correction[1])<=25 else 27
            y = char_position(correction[2]) if correction[2] != '@' and 0<=char_position(correction[2])<=25 else 26 if correction[2] == '@' else 27
            #print(correction[0], x, correction[1], y, correction[2], key, spellError)
            confusion_matrix[correction[0]][x][y]+=spellError[1]

In [404]:
import matplotlib.pyplot as plt

plt.plot(confusion_matrix['deletion'])

[<matplotlib.lines.Line2D at 0x2645bdadc18>,
 <matplotlib.lines.Line2D at 0x2645bdadd30>,
 <matplotlib.lines.Line2D at 0x2645bdadf28>,
 <matplotlib.lines.Line2D at 0x2645bdbb0b8>,
 <matplotlib.lines.Line2D at 0x2645bdbb208>,
 <matplotlib.lines.Line2D at 0x2645bdbb358>,
 <matplotlib.lines.Line2D at 0x2645bdbb4a8>,
 <matplotlib.lines.Line2D at 0x2645bdbb5f8>,
 <matplotlib.lines.Line2D at 0x2645bdbb748>,
 <matplotlib.lines.Line2D at 0x2645bdbb898>,
 <matplotlib.lines.Line2D at 0x2645bdbb9e8>,
 <matplotlib.lines.Line2D at 0x2645bdbbb38>,
 <matplotlib.lines.Line2D at 0x2645bdbbc88>,
 <matplotlib.lines.Line2D at 0x2645bdbbdd8>,
 <matplotlib.lines.Line2D at 0x2645bdbbf28>,
 <matplotlib.lines.Line2D at 0x2645bdc10b8>,
 <matplotlib.lines.Line2D at 0x2645bdc1278>,
 <matplotlib.lines.Line2D at 0x2645bdc13c8>,
 <matplotlib.lines.Line2D at 0x2645bdc1518>,
 <matplotlib.lines.Line2D at 0x2645bdc1668>,
 <matplotlib.lines.Line2D at 0x2645bdc17b8>,
 <matplotlib.lines.Line2D at 0x2645bdc1908>,
 <matplotl

In [394]:
a = 1 if 'x'=='y' else 2 if 'x'=='a' else 3

In [395]:
a

3

In [31]:
prob = [lis[2] for lis in wordFreq if lis[0]=='want']
prob

[0.00029]

In [24]:
testWord = 'whant'

In [25]:
candidates1 = [word for word in set(word_list) if(abs(len(word)-len(testWord))<=1)]
#candidates2 = [word for word in set(word_list) if editDistance(word,testWord) == 1 ] 
candidates2 = [candidate for candidate in candidates1 if(editDistance(candidate, testWord)==1)]
candidates2

['what', 'want']

In [21]:
#candidates2 = sorted(candidates1, key=lambda x: editDistance(testWord, x))

In [34]:
editDistance('oslo', 'snow')

3