In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import pandas as pd
from nltk.corpus import stopwords
import string
import nltk
from collections import Counter
from editdistance import D_L_Backtrack, D_L_editDistance

In [2]:
# Input data locations (paths are relative to the working directory).
corpusFile = 'data/corpus.txt'  # text loaded by readCorpus() to build the vocabulary
# The two test-word files are not referenced in the cells visible here.
testCorrectFile = 'data/test-words-correct.txt'
testMisspelledFile = 'data/test-words-misspelled.txt'
spellErrorsFile = 'data/spell-errors.txt'  # parsed by readSpellErrors()

In [30]:
# Row/column labels of the confusion matrices: the 26 lowercase letters followed
# by '@' (index 26) and '*' (index 27, the wildcard slot).
letters = 'abcdefghijklmnopqrstuvwxyz@*'
# NOTE: `confusion_matrix` is built by createConfusionMatricesFromCorrections in
# a later cell, so on a fresh kernel this cell only works after that cell has run.
# Both axes are derived from `letters` so the labels cannot drift apart.
deletionFrame = pd.DataFrame(confusion_matrix['deletion'], columns=list(letters), index=list(letters))
insertionFrame = pd.DataFrame(confusion_matrix['insertion'], columns=list(letters), index=list(letters))
substitutionFrame = pd.DataFrame(confusion_matrix['substitution'], columns=list(letters), index=list(letters))
transpositionFrame = pd.DataFrame(confusion_matrix['transposition'], columns=list(letters), index=list(letters))

### Read Corpus

In [31]:
def readCorpus(filename):
    """Load a text file as lowercase text with punctuation and line breaks blanked out.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    str
        The lowercased file content in which every ASCII punctuation character,
        newline and tab has been replaced by a single space (one for one, so the
        length of the text is unchanged).
    """
    # The context manager closes the handle even if reading fails.
    with open(filename) as corpusHandle:
        raw_corpus = corpusHandle.read().lower()
    # Build the replacement table once instead of re-concatenating the
    # punctuation string for every character of the corpus.
    blank_out = str.maketrans({char: ' ' for char in string.punctuation + '\n\t'})
    return raw_corpus.translate(blank_out)

### Stopwords Removal

In [32]:
def getTokens(corpus, stopwords):
    """Tokenize `corpus` and drop every token that is a stopword.

    Parameters
    ----------
    corpus : str
        Text to tokenize with NLTK's `word_tokenize`.
    stopwords : iterable of str
        Tokens to exclude. Inside this function the name shadows the
        `nltk.corpus.stopwords` import; it is kept for interface compatibility.

    Returns
    -------
    list of str
        The tokens of `corpus`, in order, without the excluded ones.
    """
    # Filter with the argument that was passed in (not a notebook global) and
    # use a set so each membership test is O(1).
    excluded = set(stopwords)
    return [token for token in word_tokenize(corpus) if token not in excluded]

### Frequency Table

In [33]:
def getFrequencyTable(tokens):
    """Build a word-frequency table sorted from most to least frequent.

    Parameters
    ----------
    tokens : iterable of str
        The tokens to count.

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``count`` and ``percentage``. ``percentage`` is the
        token's share of all tokens as a fraction in [0, 1] (not multiplied by
        100). An empty input yields an empty frame with the same columns.
    """
    counter = Counter(tokens)
    # Derive the denominator from the input itself so the function does not
    # depend on a global being defined before it is called.
    totalTokens = sum(counter.values())
    # most_common() orders by descending count; ties keep first-seen order.
    frequencyItems = [[token, count, count / totalTokens]
                      for token, count in counter.most_common()]
    return pd.DataFrame(frequencyItems, columns=['word', 'count', 'percentage'])

In [34]:
# Build the vocabulary once; functions in later cells read some of these
# module-level names directly.
stopwordList = stopwords.words('english')  # NLTK's English stopword list
corpus = readCorpus(corpusFile)  # lowercased text with punctuation blanked out
tokens = getTokens(corpus, stopwordList)
lenTokens = len(tokens)  # total number of tokens kept
setOfTokens = set(tokens)  # distinct tokens; read by getCandidates and getWordFreq

In [35]:
# Frequency table over the corpus tokens; getWordFreq() looks words up in it.
wordFreqTable = getFrequencyTable(tokens)

In [36]:
# Preview the most frequent tokens (the table is sorted in descending order).
wordFreqTable.head(20)

Unnamed: 0,word,count,percentage
0,said,3464,0.006121
1,one,3371,0.005957
2,may,2551,0.004508
3,pierre,1964,0.003471
4,would,1953,0.003451
5,prince,1935,0.003419
6,could,1700,0.003004
7,man,1652,0.002919
8,time,1529,0.002702
9,natasha,1212,0.002142


In [37]:
def createConfusionMatricesFromCorrections(spell_error_samples):
    """Tally the edit operations behind known misspellings into count matrices.

    Parameters
    ----------
    spell_error_samples : dict
        Maps a correct word to a list of ``(misspelling, count)`` pairs, as
        produced by readSpellErrors.

    Returns
    -------
    dict
        Keys ``'deletion'``, ``'insertion'``, ``'substitution'`` and
        ``'transposition'``; each value is a square list-of-lists of counts with
        one row/column per character of the global ``letters``.

    Each operation returned by ``D_L_Backtrack(misspelling, correct_word)`` is
    read as ``(kind, first_char, second_char)``: ``first_char`` selects the row,
    ``second_char`` the column, and the misspelling's count is added to that cell
    of the matrix for ``kind``.
    """
    size = len(letters)
    at_index = 26        # position of '@' in `letters`
    wildcard_index = 27  # position of '*' in `letters`; used for any other character

    def alphabet_offset(char):
        # 0 for 'a' ... 25 for 'z'; outside that range for everything else.
        return ord(char) - ord('a')

    def blank_matrix():
        return [[0] * size for _ in range(size)]

    confusion_matrix = {}
    for kind in ('deletion', 'insertion', 'substitution', 'transposition'):
        confusion_matrix[kind] = blank_matrix()

    for correct_word, spellErrors in spell_error_samples.items():
        for spellError in spellErrors:
            for operation in D_L_Backtrack(spellError[0], correct_word):
                # Row axis: letters map to 0-25, anything else to the wildcard.
                # NOTE(review): '@' lands in the wildcard row here but has its
                # own column below - confirm the asymmetry is intended.
                row_offset = alphabet_offset(operation[1])
                row = row_offset if 0 <= row_offset <= 25 else wildcard_index

                # Column axis: '@' has a dedicated slot, then letters, then wildcard.
                column_char = operation[2]
                if column_char == '@':
                    column = at_index
                else:
                    column_offset = alphabet_offset(column_char)
                    column = column_offset if 0 <= column_offset <= 25 else wildcard_index

                confusion_matrix[operation[0]][row][column] += spellError[1]

    return confusion_matrix

In [38]:
def readSpellErrors(filename):
    """Parse a spelling-error file into ``{correct_word: [(misspelling, count), ...]}``.

    Each non-blank line has the form ``correct: wrong1, wrong2*3, ...``. A
    ``*N`` suffix gives how many times that misspelling was observed; without it
    the count is 1. Words are lowercased and spaces inside the misspelling list
    are removed. If a correct word occurs on several lines, the last line wins.

    Parameters
    ----------
    filename : str
        Path of the spelling-error file.

    Returns
    -------
    dict
        Correct word -> list of ``(misspelling, count)`` tuples, in file order.

    Raises
    ------
    IndexError
        If a non-blank line contains no ``':'`` separator.
    ValueError
        If the text after ``'*'`` is not an integer.
    """
    spell_error_samples = {}
    with open(filename) as fp:
        for line in fp:
            # Strip only the line terminator: slicing off the last character
            # would eat a real letter when the final line has no newline.
            line = line.rstrip('\n')
            if not line.strip():
                continue  # blank lines carry no sample
            spl = line.split(':')  # key, then the comma-separated misspellings
            samples = []
            for err in spl[1].split(','):
                err = err.replace(' ', '').lower()
                if '*' in err:
                    sp = err.split('*')  # "luing*2" -> ("luing", 2)
                    samples.append((sp[0], int(sp[1])))
                else:
                    samples.append((err, 1))
            spell_error_samples[spl[0].lower()] = samples
    return spell_error_samples

In [39]:
# Parse the known misspellings, then count the edit operations behind them.
spellErrorSamples = readSpellErrors(spellErrorsFile)
# `confusion_matrix` is the dict of four count matrices that the *Frame
# DataFrames are constructed from.
confusion_matrix = createConfusionMatricesFromCorrections(spellErrorSamples)

In [59]:
def getCandidates(testWord):
    """Return the vocabulary words within edit distance 1 of `testWord`.

    The vocabulary is the global ``setOfTokens``. A cheap length comparison
    rules out words whose length differs by more than one character before
    ``D_L_editDistance`` is evaluated. Words at distance 0 are included too.
    """
    targetLength = len(testWord)
    nearbyWords = []
    for word in setOfTokens:
        # A length gap above 1 cannot be closed by a single edit.
        if abs(len(word) - targetLength) > 1:
            continue
        if D_L_editDistance(word, testWord) <= 1:
            nearbyWords.append(word)
    return nearbyWords

In [60]:
# Sanity check: vocabulary words within one edit of a sample word.
getCandidates("disfranchisements")

['disfranchisements', 'disfranchisement']

In [61]:
def getWordFreq(word):
    """Return the relative corpus frequency of `word`, or 0 if it is unknown.

    The value is read from the ``percentage`` column of the global
    ``wordFreqTable``; membership is first checked against the global
    ``setOfTokens`` so unknown words skip the table scan.
    """
    if word not in setOfTokens:
        return 0
    matches = wordFreqTable.loc[wordFreqTable['word'] == word, 'percentage']
    if matches.empty:
        # The vocabulary set and the table disagree; treat the word as unseen.
        return 0
    # Take the scalar explicitly: float() on a one-element Series is deprecated
    # in recent pandas releases.
    return float(matches.iloc[0])

In [62]:
def getProbabilityFromConfusionMatrix(operation):
    """Return the share of one edit within its confusion-matrix row.

    Parameters
    ----------
    operation : sequence
        ``operation[0]`` names the edit type (``'insertion'``, ``'deletion'``,
        ``'substitution'`` or ``'transposition'``), ``operation[1]`` is the row
        label and ``operation[2]`` the column label.

    Returns
    -------
    The cell count divided by the sum of its row, rounded to 7 decimals, or
    ``None`` when the edit type is not one of the four above.
    """
    kind = operation[0]
    if kind == 'insertion':
        frame = insertionFrame
    elif kind == 'deletion':
        frame = deletionFrame
    elif kind == 'substitution':
        frame = substitutionFrame
    elif kind == 'transposition':
        frame = transpositionFrame
    else:
        return None

    row = frame.loc[operation[1]]
    # NOTE(review): a row with no recorded errors makes the denominator zero -
    # confirm how such an edit should be scored.
    return round(row[operation[2]] / sum(row), 7)

In [90]:
def getBestCandidate(misspelledWord):
    """Pick the most likely correction for `misspelledWord`.

    Every vocabulary word within edit distance 1 (see getCandidates) is scored
    as ``P(x|w) * P(w)``: the confusion-matrix probability of the edit times the
    candidate's corpus frequency. The highest score wins; on ties the candidate
    seen last wins. If there are no candidates the input is returned unchanged.

    Parameters
    ----------
    misspelledWord : str
        The word to correct.

    Returns
    -------
    str
        The chosen candidate, or `misspelledWord` itself.
    """
    bestCandidateScore = 0
    bestCandidate = misspelledWord
    for candidate in getCandidates(misspelledWord):
        # Edit operations that turn misspelledWord into candidate.
        operations = D_L_Backtrack(candidate, misspelledWord)
        if len(operations) == 0:
            # getCandidates also returns distance-0 matches, for which there is
            # no edit to score (presumably candidate == misspelledWord). Indexing
            # operations[0] would fail, so return that vocabulary word as is.
            return candidate
        candidateFrequency = getWordFreq(candidate)  # P(w)
        # The first operation is sufficient for words at edit distance 1.
        probabilityFromConfusionMatrix = getProbabilityFromConfusionMatrix(operations[0])  # P(x|w)
        candidateScore = probabilityFromConfusionMatrix * candidateFrequency
        if candidateScore >= bestCandidateScore:
            bestCandidateScore = candidateScore
            bestCandidate = candidate
    return bestCandidate

In [84]:
# Spot check: confusion-matrix probability of the first edit between 'hello' and 'hella'.
getWordFreq('hello')  # not displayed: only the cell's last expression is shown

operation = D_L_Backtrack('hello', 'hella')
probabilityConfusionMatrix = getProbabilityFromConfusionMatrix(operation[0]) # P(x|w)
probabilityConfusionMatrix

0.1992636

In [85]:
# List every candidate correction for 'kel' together with its corpus frequency.
candidates = getCandidates('kel')
for candidate in candidates:
    candidateFrequency = getWordFreq(candidate)
    print(candidate, candidateFrequency)

del 1.7671216415853201e-06
ke 3.5342432831706403e-06
key 4.5945162681218324e-05
el 1.7671216415853201e-06
keg 3.5342432831706403e-06
ken 8.835608207926601e-06


In [91]:
# End-to-end check on a misspelling.
# NOTE(review): the saved output contains "Operation: ..." lines, but
# getBestCandidate as defined in this notebook prints nothing - the output
# appears to come from an earlier version; re-run to refresh it.
getBestCandidate('keyt')

Operation:  [('insertion', 't', 'y')]
key 0.0094013 4.5945162681218324e-05
Operation:  [('substitution', 'y', 'p')]
kept 0.0037488 0.0004435475320379154
Operation:  [('substitution', 'y', 'n')]
kent 0.0253046 8.835608207926601e-06
Operation:  [('substitution', 't', 's')]
keys 0.1637969 2.2972581340609162e-05


'keys'

In [82]:
# getProbabilityFromConfusionMatrix takes a single (type, char, char) operation.
# Wrapping it in a list makes operation[0] a tuple, so no branch matches and the
# call returns None.
getProbabilityFromConfusionMatrix(('substitution', 'k', 'd'))