In [92]:
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import pandas as pd
from nltk.corpus import stopwords
import string
import nltk
from collections import Counter
from editdistance import D_L_Backtrack, D_L_editDistance

In [1]:
# Input data locations, given relative to the notebook's working directory.
corpusFile = 'data/corpus.txt'  # raw text corpus, loaded with readCorpus
# The two test-word files are not read by any cell visible in this part of the notebook.
testCorrectFile = 'data/test-words-correct.txt'
testMisspelledFile = 'data/test-words-misspelled.txt'
spellErrorsFile = 'data/spell-errors.txt'  # "correct: misspelling, ..." samples, parsed by readSpellErrors

### Read Corpus

In [74]:
def readCorpus(filename):
    """Load a text corpus and normalise it for tokenisation.

    The file is read in full, lower-cased, stripped of every character in
    ``string.punctuation`` and has its newlines and tabs replaced by single
    spaces.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read (opened with the platform default
        encoding).

    Returns
    -------
    str
        The cleaned corpus as one string.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # The context manager guarantees the handle is closed even if read() fails.
    with open(filename) as corpus_file:
        raw_corpus = corpus_file.read().lower()
    # A translation table that maps every punctuation character to None
    # deletes those characters in a single pass over the text.
    corpus = raw_corpus.translate(str.maketrans('', '', string.punctuation))
    return re.sub('[\n\t]', ' ', corpus)

### Tokenization and Stopword Removal

In [79]:
def getTokens(corpus, stopwords):
    """Tokenise ``corpus`` and drop every token that is a stopword.

    Parameters
    ----------
    corpus : str
        Text to split into word tokens with ``word_tokenize``.
    stopwords : iterable of str
        Tokens to exclude from the result. A falsy value (``None`` or an
        empty collection) disables filtering. Inside this function the
        parameter shadows the ``nltk.corpus.stopwords`` module import.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Filter with the argument that was passed in, not a notebook-level
    # global, so the function behaves the same on a fresh kernel and with
    # any stopword list. A set makes each membership test O(1).
    excluded = set(stopwords) if stopwords else set()
    return [token for token in word_tokenize(corpus) if token not in excluded]

### Frequency Table

In [83]:
def getFrequencyTable(tokens):
    """Build a word-frequency table for ``tokens``.

    Parameters
    ----------
    tokens : iterable of hashable
        The tokens to count.

    Returns
    -------
    pandas.DataFrame
        One row per distinct token with columns ``word``, ``count`` and
        ``percentage``, ordered by descending ``percentage``. Despite its
        name, ``percentage`` holds the fraction ``count / total`` (a value
        between 0 and 1), not a value scaled to 100. An empty input yields
        an empty frame with the same three columns.
    """
    counter = Counter(tokens)
    # Derive the denominator from the counted tokens themselves so the
    # function does not depend on any global defined in another cell.
    total = sum(counter.values())
    rows = [[token, count, count / total] for token, count in counter.items()]
    rows.sort(key=lambda row: row[2], reverse=True)
    return pd.DataFrame(rows, columns=['word', 'count', 'percentage'])

In [81]:
# Prepare the inputs for the frequency table: NLTK's English stopword list,
# the cleaned corpus text, and the corpus tokens with stopwords removed.
stopwordList = stopwords.words('english')
corpus = readCorpus(corpusFile)
tokens = getTokens(corpus, stopwordList)
# Number of tokens that remain after stopword removal.
lenTokens = len(tokens)

In [84]:
# Count the filtered tokens into a table with columns word / count / percentage.
wordFreqTable = getFrequencyTable(tokens)

In [85]:
# Show the first rows of the table (at most 20).
wordFreqTable.head(20)

Unnamed: 0,word,count,percentage
0,said,3456,0.006163
1,one,3215,0.005733
2,may,2538,0.004526
3,would,1949,0.003476
4,prince,1893,0.003376
5,pierre,1785,0.003183
6,could,1695,0.003023
7,time,1509,0.002691
8,man,1502,0.002678
9,new,1199,0.002138


In [88]:
def createConfusionMatricesFromCorrections(spell_error_samples):
    """Accumulate edit-operation confusion matrices from spelling samples.

    Parameters
    ----------
    spell_error_samples : dict
        Maps a correct word to a list of ``(misspelling, count)`` pairs.

    Returns
    -------
    dict
        Keys ``'deletion'``, ``'insertion'``, ``'substitution'`` and
        ``'transposition'``, each holding a 28x28 list-of-lists of counts.
        Indices 0-25 stand for ``'a'``-``'z'``, index 26 for ``'@'`` and
        index 27 for ``'*'`` (the wildcard used for any other character).

    Notes
    -----
    ``D_L_Backtrack(misspelling, correct_word)`` is expected to yield
    operations that can be indexed as ``(operation_type, char, char)``, where
    ``operation_type`` is one of the four keys above -- TODO confirm against
    the ``editdistance`` module, which is not visible here.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz@*'
    size = len(alphabet)

    def zero_matrix():
        # Build independent rows so that updating one cell never aliases another.
        return [[0 for _ in range(size)] for _ in range(size)]

    confusion_matrix = {
        'deletion': zero_matrix(),
        'insertion': zero_matrix(),
        'substitution': zero_matrix(),
        'transposition': zero_matrix(),
    }

    # Every (correct word, misspelling) pair is backtracked into its edit
    # operations; each operation adds the sample's count to one matrix cell.
    for correct_word, misspellings in spell_error_samples.items():
        for sample in misspellings:
            for operation in D_L_Backtrack(sample[0], correct_word):
                # Row: letters map to 0-25, anything else (including '@')
                # goes to the wildcard slot.
                row = char_position(operation[1])
                if not 0 <= row <= 25:
                    row = 27
                # Column: '@' has its own slot; other non-letters are wildcards.
                if operation[2] == '@':
                    col = 26
                else:
                    col = char_position(operation[2])
                    if not 0 <= col <= 25:
                        col = 27
                confusion_matrix[operation[0]][row][col] += sample[1]

    return confusion_matrix

In [90]:
def readSpellErrors(filename):
    """Parse a spelling-error file into ``{correct_word: [(misspelling, count), ...]}``.

    Each non-blank line has the form ``correct: wrong1, wrong2*3``: the text
    before the first ``:`` is the correct word, and the text between the first
    and second ``:`` is a comma-separated list of misspellings. A misspelling
    may carry a ``*N`` suffix giving how often it was observed; without the
    suffix the count is 1. Keys and misspellings are lower-cased, and spaces
    are removed from the misspellings. If a key occurs on several lines, the
    last line wins.

    Parameters
    ----------
    filename : str or path-like
        Path of the spelling-error file.

    Returns
    -------
    dict
        Maps each lower-cased correct word to its list of
        ``(misspelling, count)`` tuples, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    IndexError
        If a non-blank line contains no ``:`` separator.
    ValueError
        If the text after ``*`` is not an integer.
    """
    spell_error_samples = {}
    with open(filename) as fp:
        for raw_line in fp:
            # Strip only the line terminator: slicing off the last character
            # would eat a real letter when the final line has no newline.
            line = raw_line.rstrip('\n')
            if not line.strip():
                continue  # blank lines carry no sample
            parts = line.split(':')  # key, then the possible misspellings
            samples = []
            for err in parts[1].split(','):  # e.g. [" loking", " luing*2"]
                err = err.replace(' ', '').lower()
                if '*' in err:
                    pieces = err.split('*')  # "luing*2" -> misspelling and count
                    samples.append((pieces[0], int(pieces[1])))
                else:
                    samples.append((err, 1))
            spell_error_samples[parts[0].lower()] = samples
    return spell_error_samples

In [87]:
def char_position(letter):
    """Return the offset of ``letter`` from ``'a'`` (``'a'`` -> 0, ``'z'`` -> 25).

    Characters other than lower-case ASCII letters give values outside 0-25.
    """
    offset = ord(letter) - ord('a')
    return offset

In [93]:
# Parse the spelling-error samples and accumulate the per-operation
# confusion matrices from them.
spell_error_samples = readSpellErrors(spellErrorsFile)
confusion_matrix = createConfusionMatricesFromCorrections(spell_error_samples)