In [1]:
'''
Based on SymSpell:

Originally written in C#:

// SymSpell: 1 million times faster through Symmetric Delete spelling correction algorithm
//
// The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup 
// for a given Damerau-Levenshtein distance. It is six orders of magnitude faster and language independent.
// Opposite to other algorithms only deletes are required, no transposes + replaces + inserts.
// Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term.
// Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters!
//
// Copyright (C) 2015 Wolf Garbe
// Version: 3.0
// Author: Wolf Garbe <wolf.garbe@faroo.com>
// Maintainer: Wolf Garbe <wolf.garbe@faroo.com>
// URL: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
// Description: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
//
// License:
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License, 
// version 3.0 (LGPL-3.0) as published by the Free Software Foundation.
// http://www.opensource.org/licenses/LGPL-3.0
//
// Usage: single word + Enter:  Display spelling suggestions
//        Enter without input:  Terminate the program
'''
pass

In [2]:
import re
import math # GD: needed to calculate logs below
from scipy.stats import poisson # GD: needed to calculate emission probability

In [250]:
max_edit_distance = 3

In [4]:
def get_deletes_list(w):
    '''given a word, derive strings with up to max_edit_distance characters deleted'''
    deletes = []
    queue = [w]
    for d in range(max_edit_distance):
        temp_queue = []
        for word in queue:
            if len(word)>1:
                for c in range(len(word)):  # character index
                    word_minus_c = word[:c] + word[c+1:]
                    if word_minus_c not in deletes:
                        deletes.append(word_minus_c)
                    if word_minus_c not in temp_queue:
                        temp_queue.append(word_minus_c)
        queue = temp_queue
        
    return deletes

In [5]:
# get_deletes_list("tomorrow")

In [6]:
def create_dictionary_entry(w, dictionary, longest_word_length):
    '''add word and its derived deletions to dictionary'''
    # check if word is already in dictionary
    # dictionary entries are in the form: (list of suggested corrections, frequency of word in corpus)

    new_real_word_added = False
    if w in dictionary:
        dictionary[w] = (dictionary[w][0], dictionary[w][1] + 1)  # increment count of word in corpus
    else:
        dictionary[w] = ([], 1)  
        longest_word_length = max(longest_word_length, len(w))
        
    if dictionary[w][1]==1:
        # first appearance of word in corpus
        # n.b. word may already be in dictionary as a derived word (deleting character from a real word)
        # but counter of frequency of word in corpus is not incremented in those cases)
        
        new_real_word_added = True
        deletes = get_deletes_list(w)
        
        for item in deletes:
            if item in dictionary:
                # add (correct) word to delete's suggested correction list if not already there
                if item not in dictionary[item][0]:
                    dictionary[item][0].append(w)
            else:
                dictionary[item] = ([w], 0)  # note frequency of word in corpus is not incremented
        
    return new_real_word_added, longest_word_length

In [176]:
def create_dictionary(fname):
    
    print "Creating dictionary..." 

    dictionary = dict() # GD: moved here to ensure that dictionary is re-initialized.
    longest_word_length = 0 # GD: moved here to ensure that it is re-initialized.
    start_prob = dict()
    transition_prob = dict()
    transition_prob_norm = dict()    
    word_count = 0
    
    with open(fname) as file:    
        
        for line in file:
            
            for sentence in line.split('.'): # GD: added to ensure split at sentence level
                
                words = re.findall('[a-z]+', sentence.lower())  # separate by words by non-alphabetical characters      
                
                for num_word in xrange(len(words)):
                    
                    new_word, longest_word_length = \
                        create_dictionary_entry(words[num_word], dictionary, longest_word_length)
                    
                    if new_word:
                        word_count += 1
                        
                    # GD: added to calculate probabilities for Hidden Markov Model
                    if num_word == 0:

                        # Probability of a word being at the beginning of a sentence
                        if words[num_word] in start_prob:
                            start_prob[words[num_word]] += 1
                        else:
                            start_prob[words[num_word]] = 1
                    else:
                        
                        # Probability of transitionining from one word to another
                        # Key format (word, previous word)
                        if (words[num_word], words[num_word - 1]) in transition_prob:
                            transition_prob[(words[num_word], words[num_word - 1])] += 1
                        else:
                            transition_prob[(words[num_word], words[num_word - 1])] = 1
                            
                        # Used to normalize probabilities
                        if words[num_word] in transition_prob_norm:
                            transition_prob_norm[words[num_word]] += 1
                        else:
                            transition_prob_norm[words[num_word]] = 1
                              
    # GD: added to convert counts to log-probabilities (to avoid underflow)
    # Note: natural logarithm, not base-10
    total_start_words = float(sum(start_prob.values()))
    default_start_prob = math.log(1/total_start_words)
    start_prob.update({k: math.log(v/total_start_words) for k, v in start_prob.items()})
    default_transition_prob = math.log(1./word_count)
    transition_prob.update({k: math.log(v/float(transition_prob_norm[k[0]])) for k, v in transition_prob.items()})
                    
    print "Total unique words in corpus: %i" % word_count
    print "Total items in dictionary (corpus words and deletions): %i" % len(dictionary)
    print "  Edit distance for deletions: %i" % max_edit_distance
    print "  Length of longest word in corpus: %i" % longest_word_length
    print "Total unique words appearing at the start of a sentence: %i" % len(start_prob)
    print "Total unique word transitions: %i" % len(transition_prob)
        
    return dictionary, longest_word_length, start_prob, default_start_prob, transition_prob, default_transition_prob

In [177]:
%%time
dictionary, longest_word_length, start_prob, default_start_prob, transition_prob, default_transition_prob = \
    create_dictionary("testdata/big.txt")
pass

Creating dictionary...
Total unique words in corpus: 29157
Total items in dictionary (corpus words and deletions): 2151998
  Edit distance for deletions: 3
  Length of longest word in corpus: 18
Total unique words appearing at the start of a sentence: 15297
Total unique word transitions: 319665
CPU times: user 33.2 s, sys: 1.07 s, total: 34.2 s
Wall time: 34.6 s


<div class="alert alert-info">
<b>For testing:</b> <p>
Can look up a specific entry in the dictionary below. <br>
shows (possible corrections, and frequency that entry itself is in corpus - 0 if not a real word) <br>
Note: will return key error if there are no corrections (because we are accessing dictionary directly here)
</div>

In [183]:
try:
    print dictionary["essential"]
except KeyError:
    print 'Not in dictionary'

(['essentially', 'essentials'], 92)


In [184]:
try:
    print dictionary["wack"]
except KeyError:
    print 'Not in dictionary'

(['wrack'], 0)


In [185]:
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    Source: http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
    
    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first sequence into the
    second. Although generally used with strings, any sequences of
    comparable objects will work.

    Transpositions are exchanges of *consecutive* characters; all other
    operations are self-explanatory.

    This implementation is O(N*M) time and O(M) space, for N and M the
    lengths of the two sequences.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2

    It works with arbitrary sequences too:
    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    """
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix.
    # However, only the current and two previous rows are needed at once,
    # so we only store those.
    oneago = None
    thisrow = range(1, len(seq2) + 1) + [0]
    for x in xrange(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in xrange(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    return thisrow[len(seq2) - 1]

In [294]:
def get_suggestions(string, dictionary, longest_word_length, silent=False, min_count=0):
    '''return list of suggested corrections for potentially incorrectly spelled word'''
    
    if (len(string) - longest_word_length) > max_edit_distance:
        if not silent:
            print "no items in dictionary within maximum edit distance"
        return []
    
    # suggestions = []
    # s_dictionary = {}
    suggest_dict = {}
    
    queue = [string]
    q_dictionary = {}  # items other than string that we've checked
    
    while len(queue)>0:
        q_item = queue[0]  # pop
        # print "processing '%s'" % q_item
        queue = queue[1:]
        
        # process queue item
        if (q_item in dictionary) and (q_item not in suggest_dict):
            if (dictionary[q_item][1]>=min_count):# GD added to trim list for context checking
            # word is in dictionary, and is a word from the corpus, and not already in suggestion list
            # so add to suggestion dictionary, indexed by the word with value (frequency in corpus, edit distance)
            # note q_items that are not the input string are shorter than input string 
            # since only deletes are added (unless manual dictionary corrections are added)
                assert len(string)>=len(q_item)
                suggest_dict[q_item] = (dictionary[q_item][1], len(string) - len(q_item))

            ## the suggested corrections for q_item as stored in dictionary (whether or not
            ## q_item itself is a valid word or merely a delete) can be valid corrections
            for sc_item in dictionary[q_item][0]:
                if (sc_item not in suggest_dict):
                    # compute edit distance
                    # if len(sc_item)==len(q_item):
                    #    item_dist = len(string) - len(q_item)
                    # suggested items should always be longer (unless manual corrections are added)
                    assert len(sc_item)>len(q_item)
                    # q_items that are not input should be shorter than original string 
                    # (unless manual corrections added)
                    assert len(q_item)<=len(string)
                    if len(q_item)==len(string):
                        assert q_item==string
                        item_dist = len(sc_item) - len(q_item)
                    #elif len(q_item)==len(string):
                        # a suggestion could be longer or shorter than original string (bug in original FAROO?)
                        # if suggestion is from string's suggestion list, sc_item will be longer
                        # if suggestion is from a delete's suggestion list, sc_item may be shorter
                    #   item_dist = abs(len(sc_item) - len(q_item))
                    #else:
                    # check in original code, but probably not necessary because string has already checked
                    assert sc_item!=string

                    # calculate edit distance using, for example, Damerau-Levenshtein distance
                    item_dist = dameraulevenshtein(sc_item, string)

                    if item_dist<=max_edit_distance:
                        assert sc_item in dictionary  # should already be in dictionary if in suggestion list
                        if (dictionary[q_item][1]>=min_count):# GD added to trim list for context checking
                            suggest_dict[sc_item] = (dictionary[sc_item][1], item_dist)

        # now generate deletes (e.g. a substring of string or of a delete) from the queue item
        # as additional items to check -- add to end of queue
        assert len(string)>=len(q_item)
        if (len(string)-len(q_item))<max_edit_distance and len(q_item)>1:
            for c in range(len(q_item)): # character index        
                word_minus_c = q_item[:c] + q_item[c+1:]
                if word_minus_c not in q_dictionary:
                    queue.append(word_minus_c)
                    q_dictionary[word_minus_c] = None  # arbitrary value, just to identify we checked this
             
    # queue is now empty: convert suggestions in dictionary to list for output
    
    if not silent:
        print "number of possible corrections: %i" %len(suggest_dict)
        print "  edit distance for deletions: %i" % max_edit_distance
    
    # output option 1
    # sort results by ascending order of edit distance and descending order of frequency
    #     and return list of suggested corrections only:
    # return sorted(suggest_dict, key = lambda x: (suggest_dict[x][1], -suggest_dict[x][0]))

    # output option 2
    # return list of suggestions with (correction, (frequency in corpus, edit distance)):
    as_list = suggest_dict.items()
    return sorted(as_list, key = lambda (term, (freq, dist)): (dist, -freq))

    '''
    Option 1:
    get_suggestions("file")
    ['file', 'five', 'fire', 'fine', ...]
    
    Option 2:
    get_suggestions("file")
    [('file', (5, 0)),
     ('five', (67, 1)),
     ('fire', (54, 1)),
     ('fine', (17, 1))...]  
    '''

<div class="alert alert-info">
<b>For testing:</b><p>
Type in word to correct below, to test and get whole list of possible suggestions.
</div>

In [295]:
%%time
get_suggestions("mittens", dictionary, longest_word_length)

number of possible corrections: 142
  edit distance for deletions: 3
CPU times: user 16.6 ms, sys: 6.92 ms, total: 23.5 ms
Wall time: 17.9 ms


[('mitten', (0, 1)),
 ('mittes', (0, 1)),
 ('ittens', (0, 1)),
 ('matters', (136, 2)),
 ('bitten', (13, 2)),
 ('kitten', (7, 2)),
 ('listens', (2, 2)),
 ('battens', (1, 2)),
 ('smitten', (1, 2)),
 ('itten', (0, 2)),
 ('mites', (0, 2)),
 ('miten', (0, 2)),
 ('ittes', (0, 2)),
 ('mtten', (0, 2)),
 ('mttes', (0, 2)),
 ('ttens', (0, 2)),
 ('itens', (0, 2)),
 ('miens', (0, 2)),
 ('mittn', (0, 2)),
 ('mitte', (0, 2)),
 ('ittns', (0, 2)),
 ('mitts', (0, 2)),
 ('matter', (365, 3)),
 ('sitting', (269, 3)),
 ('minutes', (146, 3)),
 ('written', (117, 3)),
 ('miles', (110, 3)),
 ('citizens', (109, 3)),
 ('letters', (108, 3)),
 ('listen', (100, 3)),
 ('cities', (77, 3)),
 ('bitter', (47, 3)),
 ('masters', (37, 3)),
 ('intense', (34, 3)),
 ('witness', (33, 3)),
 ('attend', (29, 3)),
 ('mistress', (24, 3)),
 ('fitted', (23, 3)),
 ('mines', (22, 3)),
 ('fitting', (21, 3)),
 ('miners', (19, 3)),
 ('mitenka', (16, 3)),
 ('tens', (16, 3)),
 ('sisters', (16, 3)),
 ('intent', (13, 3)),
 ('mothers', (12, 3)

In [296]:
%%time
#benchmark timing
for i in range(1000):
    get_suggestions("acamodation", dictionary, longest_word_length, silent=True)

CPU times: user 2.19 s, sys: 33.3 ms, total: 2.23 s
Wall time: 2.47 s


In [259]:
%%time
#benchmark timing
for i in range(1000):
    get_suggestions("acomodation", dictionary, longest_word_length, silent=True)

CPU times: user 2.26 s, sys: 14.4 ms, total: 2.27 s
Wall time: 2.29 s


In [260]:
%%time
#benchmark timing
for i in range(1000):
    get_suggestions("hous", dictionary, longest_word_length, silent=True)

CPU times: user 6.21 s, sys: 35.8 ms, total: 6.25 s
Wall time: 6.29 s


In [261]:
# get best word
def best_word(s, dictionary, silent=False):
    try:
        return get_suggestions(s, dictionary, longest_word_length, silent)[0]
    except:
        return None

<div class="alert alert-info">
<b>For testing:</b><p>
Type in word to correct below, to test and get most suggested word.
</div>

In [262]:
best_word("hello", dictionary)

number of possible corrections: 163
  edit distance for deletions: 3


('hello', (1, 0))

In [263]:
def correct_document(fname, dictionary):
    with open(fname) as file:
        doc_word_count = 0
        corrected_word_count = 0
        unknown_word_count = 0
        print "Finding misspelled words in your document..." 
        
        for i, line in enumerate(file):
            doc_words = re.findall('[a-z]+', line.lower())  # separate by words by non-alphabetical characters      
            for doc_word in doc_words:
                doc_word_count += 1
                suggestion = best_word(doc_word, dictionary, silent=True)
                if suggestion is None:
                    print "In line %i, the word < %s > was not found (no suggested correction)" % (i, doc_word)
                    unknown_word_count += 1
                elif suggestion[0]!=doc_word:
                    print "In line %i, %s: suggested correction is < %s >" % (i, doc_word, suggestion[0])
                    corrected_word_count += 1
        
    print "-----"
    print "total words checked: %i" % doc_word_count
    print "total unknown words: %i" % unknown_word_count
    print "total potential errors found: %i" % corrected_word_count

    return

<div class="alert alert-info">
<b>For testing:</b><p>
Provide text file to correct, and give all best word suggestions (word level only) for errors.
</div>

In [264]:
correct_document("testdata/test.txt", dictionary)

Finding misspelled words in your document...
In line 4, za: suggested correction is < a >
In line 6, tee: suggested correction is < see >
-----
total words checked: 27
total unknown words: 0
total potential errors found: 2


In [265]:
# from http://www.columbia.edu/acis/cria/rosenberg/sample/
correct_document("testdata/OCRsample.txt", dictionary)

Finding misspelled words in your document...
In line 3, taiths: suggested correction is < faith >
In line 11, the word < oonipiittee > was not found (no suggested correction)
In line 13, tj: suggested correction is < t >
In line 13, mnnff: suggested correction is < snuff >
In line 13, gjpt: suggested correction is < get >
In line 15, bh: suggested correction is < b >
In line 15, snc: suggested correction is < sac >
In line 15, uth: suggested correction is < th >
In line 15, unuer: suggested correction is < under >
In line 20, mthiitt: suggested correction is < thirty >
In line 21, cas: suggested correction is < as >
In line 22, pythian: suggested correction is < scythian >
In line 26, brainin: suggested correction is < brain >
In line 27, jfl: suggested correction is < f >
In line 28, ji: suggested correction is < i >
In line 28, stice: suggested correction is < stick >
In line 28, blaci: suggested correction is < black >
In line 28, eug: suggested correction is < dug >
In line 28, deb

<div class="alert alert-info">
Serial code for context-based improvement, based on Hidden Markov Model. Integrated with code above - probabilities calculated when generating main dictionary (using same corpus).
</div>

In [459]:
def get_emission_prob(edit_dist, poisson_lambda=0.01):
    
    # Poisson(k, l), where k = edit distance and l=0.01
    # TODO - validate lambda parameter (taken from Verena's code
    
    return math.log(poisson.pmf(edit_dist, poisson_lambda))

In [451]:
def get_start_prob(word, start_prob, default_start_prob):
    if word in start_prob:
        return start_prob[word]
    else:
        return default_start_prob

In [452]:
def get_transition_prob(cur_word, prev_word, transition_prob, default_transition_prob):
    if (cur_word, prev_word) in transition_prob:
        return transition_prob[(cur_word, prev_word)]
    else:
        return default_transition_prob

In [453]:
def get_belief(prev_word, prev_belief):
    if prev_word in prev_belief:
        return prev_belief[prev_word]
    else:
        return math.log(math.exp(min(prev_belief.values()))/2.) # TODO - confirm default value

In [487]:
def viterbi_num(words, dictionary, longest_word_length, start_prob, default_start_prob, \
                transition_prob, default_transition_prob, \
                num_word_suggestions=5000):
    
    V = [{}]
    path = {}
    path_context = []
    path_word = []
    
    # FOR TESTING - DELETE EVENTUALLY
    if type(words) != list:
        words = re.findall('[a-z]+', words.lower())  # separate by words by non-alphabetical characters
        
    # Character level correction
    corrections = get_suggestions(words[0], dictionary, longest_word_length, silent=True, min_count=1)

    if len(corrections) > num_word_suggestions:
        corrections = corrections[0:num_word_suggestions]
    if len(corrections) > 0:
        path_word.append(corrections[0][0])  # string of most frequent word tuple

    # Initialize base cases (t == 0)
    for sug_word in corrections:
        
        # compute the value for all possible starting states
        V[0][sug_word[0]] = math.exp(get_start_prob(sug_word[0], start_prob, default_start_prob) \
                                     + get_emission_prob(sug_word[1][1]))
        
        # remember all the different paths (here its only one state so far)
        path[sug_word[0]] = [sug_word[0]]
 
    # normalize for numerical stability
    path_temp_sum = sum(V[0].values())
    V[0].update({k: math.log(v/path_temp_sum) for k, v in V[0].items()})
    prev_corrections = [i[0] for i in corrections]
    
    if len(words) == 1:
        path_context = [max(V[0], key=lambda i: V[0][i])]
        return path_word, path_context

    # Run Viterbi for t > 0
    for t in range(1, len(words)):

        V.append({})
        new_path = {}
        
        # Character level correction
        corrections = get_suggestions(words[t], dictionary, longest_word_length, silent=True, min_count=1)

        if len(corrections) > num_word_suggestions:
            corrections = corrections[0:num_word_suggestions]
        if len(corrections) > 0:
            path_word.append(corrections[0][0])  # string of most frequent word tuple
 
        for sug_word in corrections:
        
            sug_word_emission_prob = get_emission_prob(sug_word[1][1])
            
            # compute the values coming from all possible previous states, only keep the maximum
            (prob, word) = max((get_belief(prev_word, V[t-1]) \
                            + get_transition_prob(sug_word[0], prev_word, transition_prob, default_transition_prob) \
                            + sug_word_emission_prob, prev_word) for prev_word in prev_corrections)

            # save the maximum value for each state
            V[t][sug_word[0]] = math.exp(prob)
            # remember the path we came from to get this maximum value
            new_path[sug_word[0]] = path[word] + [sug_word[0]]
            
        # normalize for numerical stability
        path_temp_sum = sum(V[t].values())
        V[t].update({k: math.log(v/path_temp_sum) for k, v in V[t].items()})
        prev_corrections = [i[0] for i in corrections]
 
        # Don't need to remember the old paths
        path = new_path
     
    (prob, word) = max((V[t][sug_word[0]], sug_word[0]) for sug_word in corrections)
    path_context = path[word]

    assert len(path_word) == len(path_context)

    return path_word, path_context

In [491]:
%%time
sentence = "ther sa pile of clothsing on the side of thee train treks"
word_check, context_check = viterbi_num(sentence, \
            dictionary, longest_word_length, start_prob, default_start_prob, transition_prob, default_transition_prob)
print 'Original sentence: ', sentence
print 'Word-level check: ', " ".join(word_check)
print 'Context-level check: ', " ".join(context_check)

Original sentence:  ther sa pile of clothsing on the side of thee train treks
Word-level check:  the sa pile of clothing on the side of thee train trees
Context-level check:  the a pile of clothing on the side of the train tres
CPU times: user 10.9 s, sys: 117 ms, total: 11.1 s
Wall time: 11.5 s


In [None]:
def fwd_bkw(words, dictionary, longest_word_length, start_prob, default_start_prob, \
                transition_prob, default_transition_prob, \
                num_word_suggestions=5000):

# (x, states, a_0, a, e, end_st):
    
    # FOR TESTING - DELETE EVENTUALLY
    if type(words) != list:
        words = re.findall('[a-z]+', words.lower())  # separate by words by non-alphabetical characters
        
    path_word = []
        
    L = len(words)
 
    # forward part of the algorithm

    fwd = []
    f_prev = {}
    
    for i, word_i in enumerate(words):

        # Character level correction
        corrections = get_suggestions(word_i, dictionary, longest_word_length, silent=True, min_count=1)

        if len(corrections) > num_word_suggestions:
            corrections = corrections[0:num_word_suggestions]
        if len(corrections) > 0:
            path_word.append(corrections[0][0])  # string of most frequent word tuple
        
        f_curr = {}
        for st in corrections:

            if i == 0:
                # base case for the forward part
                prev_f_sum = get_start_prob(st[0], start_prob, default_start_prob)
            else:
                prev_f_sum = math.log(sum(math.exp(get_belief(k, f_prev) \
                            + get_transition_prob(st[0], k, transition_prob, default_transition_prob)) \
                                 for k in prev_corrections))
 
            f_curr[st] = get_emission_prob(st[1][1]) + prev_f_sum
 
        fwd.append(f_curr)
        f_prev = f_curr

        prev_corrections = [i[0] for i in corrections]
 
    p_fwd = math.log(sum(math.exp(f_curr[k] + get_transition_prob(end_st, k, transition_prob, default_transition_prob)) \
                for k in states))
 
    # backward part of the algorithm

    bkw = []
    b_prev = {}

    for i, x_i_plus in enumerate(reversed(words[1:]+(None,))):
        b_curr = {}
#         for st in states:
#             if i == 0:
#                 # base case for backward part
#                 b_curr[st] = a[st][end_st]
#             else:
#                 b_curr[st] = sum(a[st][l]*e[l][x_i_plus]*b_prev[l] for l in states)
 
#         bkw.insert(0,b_curr)
#         b_prev = b_curr
 
#     p_bkw = sum(a_0[l] * e[l][x[0]] * b_curr[l] for l in states)
 
#     # merging the two parts
#     posterior = []
#     for i in range(L):
#         posterior.append({st: fwd[i][st]*bkw[i][st]/p_fwd for st in states})
 
#     assert p_fwd == p_bkw
#     return fwd, bkw, posterior
    return path_word, []

In [514]:
%%time
sentence = "ther sa pile of clothsing on the side of thee train treks"
word_check, context_check = fwd_bkw(sentence, \
            dictionary, longest_word_length, start_prob, default_start_prob, transition_prob, default_transition_prob)
print 'Original sentence: ', sentence
print 'Word-level check: ', " ".join(word_check)
print 'Context-level check: ', " ".join(context_check)

KeyboardInterrupt: 

In [438]:
def correct_document_context(fname, dictionary, longest_word_length, start_prob, default_start_prob, \
                             transition_prob, default_transition_prob, \
                             context_threshold=5, num_word_suggestions=5000):
    
    doc_word_count = 0
    unknown_word_count = 0
    corrected_word_count = 0
    mismatches = 0
    
    with open(fname) as file:
        
        for line in file:
            
            for sentence in line.split('.'):
                
                words = re.findall('[a-z]+', sentence.lower())  # separate by words by non-alphabetical characters
                doc_word_count += len(words)
                    
                suggestion_w, suggestion_c = viterbi_num(words, dictionary, longest_word_length, \
                                                start_prob, default_start_prob, transition_prob, default_transition_prob)

                for word_num in range(len(suggestion_w)):
                
                    if suggestion_w[word_num] is None:
#                         print "In line %i, the word < %s > was not found (no suggested corrections)" % (i, doc_word)
                        unknown_word_count += 1
                    elif (suggestion_w[word_num]!=words[word_num]) or (suggestion_c[word_num]!=words[word_num]):
#                         print "In line %i, (%s %s) %s: word level correction is < %s >, context correction is < %s >" \
#                                   % (i, word1, word2, doc_word, suggestion_w, suggestion_c)
                        corrected_word_count += 1
                        if suggestion_w[word_num]!=suggestion_c[word_num]:
                            mismatches += 1
        
    print "-----"
    print "total words checked: %i" % doc_word_count
    print "total unknown words: %i" % unknown_word_count
    print "total potential errors found: %i" % corrected_word_count
    print "total mismatches (word level vs. context): %i" % mismatches

<div class="alert alert-info">
<b>For testing:</b><p>
Provide text file to correct, and give all best word suggestions (word level & context level) for all known words.
</div>

In [470]:
%%time
viterbi_num("ther sa pile of clothsing on the side of thee train treks", \
            dictionary, longest_word_length, start_prob, default_start_prob, transition_prob, default_transition_prob)

CPU times: user 10.2 s, sys: 54.8 ms, total: 10.2 s
Wall time: 10.3 s


(['the',
  'sa',
  'pile',
  'of',
  'clothing',
  'on',
  'the',
  'side',
  'of',
  'thee',
  'train',
  'trees'],
 ['the',
  'a',
  'pile',
  'of',
  'clothing',
  'on',
  'the',
  'side',
  'of',
  'the',
  'train',
  'tres'])

In [439]:
#test.txt

#this is a test
#this is a test
#here is a test
#this is ax test
#this is za test

In [440]:
correct_document_context("testdata/test.txt", \
                         dictionary, longest_word_length, start_prob, default_start_prob, \
                         transition_prob, default_transition_prob)

-----
total words checked: 27
total unknown words: 0
total potential errors found: 27
total mismatches (word level vs. context): 27


In [444]:
viterbi_num("is a very", \
            dictionary, longest_word_length, start_prob, default_start_prob, transition_prob, default_transition_prob)

(['is', 'a', 'very'], ['i', 'sta', 'ted'])

In [445]:
viterbi_num("is a test", \
            dictionary, longest_word_length, start_prob, default_start_prob, transition_prob, default_transition_prob)

(['is', 'a', 'test'], ['i', 'sta', 'ted'])

In [446]:
viterbi_num("a test this", \
            dictionary, longest_word_length, start_prob, default_start_prob, transition_prob, default_transition_prob)

(['a', 'test', 'this'], ['a', 'mere', 'dots'])

In [447]:
viterbi_num("a test tube", \
            dictionary, longest_word_length, start_prob, default_start_prob, transition_prob, default_transition_prob)

(['a', 'test', 'tube'], ['and', 'then', 'titi'])

In [38]:
# correct_document_context("/Users/K-Lo/Desktop/OCRsample.txt")

In [None]:
# correct_document_context("/Users/K-Lo/Desktop/usingengsample.txt")

In [None]:
# correct_document_context("/Users/K-Lo/Desktop/usingengsample.txt", 50, 5)