In [1]:
'''
Based on SymSpell:

Originally written in C#:

// SymSpell: 1 million times faster through Symmetric Delete spelling correction algorithm
//
// The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup 
// for a given Damerau-Levenshtein distance. It is six orders of magnitude faster and language independent.
// Opposite to other algorithms only deletes are required, no transposes + replaces + inserts.
// Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term.
// Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters!
//
// Copyright (C) 2015 Wolf Garbe
// Version: 3.0
// Author: Wolf Garbe <wolf.garbe@faroo.com>
// Maintainer: Wolf Garbe <wolf.garbe@faroo.com>
// URL: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
// Description: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
//
// License:
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License, 
// version 3.0 (LGPL-3.0) as published by the Free Software Foundation.
// http://www.opensource.org/licenses/LGPL-3.0
//
// Usage: single word + Enter:  Display spelling suggestions
//        Enter without input:  Terminate the program
'''
pass

In [2]:
import re
import math # GD: needed to calculate logs below
from scipy.stats import poisson # GD: needed to calculate emission probability

In [38]:
max_edit_distance = 3
not_found_str = '<not found>'

#Initialization:

In [4]:
def get_deletes_list(w):
    '''given a word, derive strings with up to max_edit_distance characters deleted'''
    deletes = []
    queue = [w]
    for d in range(max_edit_distance):
        temp_queue = []
        for word in queue:
            if len(word)>1:
                for c in range(len(word)):  # character index
                    word_minus_c = word[:c] + word[c+1:]
                    if word_minus_c not in deletes:
                        deletes.append(word_minus_c)
                    if word_minus_c not in temp_queue:
                        temp_queue.append(word_minus_c)
        queue = temp_queue
        
    return deletes

In [5]:
def create_dictionary_entry(w, dictionary, longest_word_length):
    '''add word and its derived deletions to dictionary'''
    # check if word is already in dictionary
    # dictionary entries are in the form: (list of suggested corrections, frequency of word in corpus)

    new_real_word_added = False
    if w in dictionary:
        dictionary[w] = (dictionary[w][0], dictionary[w][1] + 1)  # increment count of word in corpus
    else:
        dictionary[w] = ([], 1)  
        longest_word_length = max(longest_word_length, len(w))
        
    if dictionary[w][1]==1:
        # first appearance of word in corpus
        # n.b. word may already be in dictionary as a derived word (deleting character from a real word)
        # but counter of frequency of word in corpus is not incremented in those cases)
        
        new_real_word_added = True
        deletes = get_deletes_list(w)
        
        for item in deletes:
            if item in dictionary:
                # add (correct) word to delete's suggested correction list if not already there
                if item not in dictionary[item][0]:
                    dictionary[item][0].append(w)
            else:
                dictionary[item] = ([w], 0)  # note frequency of word in corpus is not incremented
        
    return new_real_word_added, longest_word_length

In [6]:
def create_dictionary(fname):
    
    print "Creating dictionary..." 

    dictionary = dict() # GD: moved here to ensure that dictionary is re-initialized.
    longest_word_length = 0 # GD: moved here to ensure that it is re-initialized.
    start_prob = dict()
    transition_prob = dict()
    word_count = 0
    
    with open(fname) as file:    
        
        for line in file:
            
            for sentence in line.split('.'): # GD: added to ensure split at sentence level
                
                words = re.findall('[a-z]+', sentence.lower())  # separate by words by non-alphabetical characters      
                
                for w, word in enumerate(words):
                    
                    new_word, longest_word_length = \
                        create_dictionary_entry(word, dictionary, longest_word_length)
                    
                    if new_word:
                        word_count += 1
                        
                    # GD: added to calculate probabilities for Hidden Markov Model
                    if w == 0:

                        # Probability of a word being at the beginning of a sentence
                        if word in start_prob:
                            start_prob[word] += 1
                        else:
                            start_prob[word] = 1
                    else:
                        
                        # Probability of transitionining from one word to another
                        # dictionary format: {previous word: {word1 : P(word1|prevous word), word2 : P(word2|prevous word)}}
                        # Check that prior word is present - create if not
                        if words[w - 1] not in transition_prob:
                            transition_prob[words[w - 1]] = dict()
                            
                        # Check that current word is present - create if not
                        if word not in transition_prob[words[w - 1]]:
                            transition_prob[words[w - 1]][word] = 0
                            
                        # Update value
                        transition_prob[words[w - 1]][word] += 1
                              
    # GD: added to convert counts to log-probabilities (to avoid underflow)
    # Note: natural logarithm, not base-10
    
    total_start_words = float(sum(start_prob.values()))
    default_start_prob = math.log(1/total_start_words)
    start_prob.update({k: math.log(v/total_start_words) for k, v in start_prob.items()})
    
    default_transition_prob = math.log(1./word_count)    
    transition_prob.update({k: {k1: math.log(float(v1)/sum(v.values())) for k1, v1 in v.items()} \
                            for k, v in transition_prob.items()})

    print "Total unique words in corpus: %i" % word_count
    print "Total items in dictionary (corpus words and deletions): %i" % len(dictionary)
    print "  Edit distance for deletions: %i" % max_edit_distance
    print "  Length of longest word in corpus: %i" % longest_word_length
    print "Total unique words appearing at the start of a sentence: %i" % len(start_prob)
    print "Total unique word transitions: %i" % len(transition_prob)
        
    return dictionary, longest_word_length, start_prob, default_start_prob, \
            transition_prob, default_transition_prob

In [7]:
%%time
dictionary, longest_word_length, start_prob, default_start_prob, transition_prob, default_transition_prob = \
    create_dictionary("testdata/big.txt")
pass

Creating dictionary...
Total unique words in corpus: 29157
Total items in dictionary (corpus words and deletions): 2151998
  Edit distance for deletions: 3
  Length of longest word in corpus: 18
Total unique words appearing at the start of a sentence: 15297
Total unique word transitions: 27224
CPU times: user 33.7 s, sys: 815 ms, total: 34.5 s
Wall time: 34.7 s


<div class="alert alert-info">
<p><b>For testing:</b></p>
<p>
Can look up a specific entry in the dictionary below. <br>
shows (possible corrections, and frequency that entry itself is in corpus - 0 if not a real word) <br>
Note: will return key error if there are no corrections (because we are accessing dictionary directly here)
</p>
</div>

In [8]:
try:
    print dictionary["essential"]
except KeyError:
    print 'Not in dictionary'

(['essentially', 'essentials'], 92)


In [9]:
try:
    print dictionary["wack"]
except KeyError:
    print 'Not in dictionary'

(['wrack'], 0)


#Word-level correction:

In [10]:
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    Source: http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
    
    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first sequence into the
    second. Although generally used with strings, any sequences of
    comparable objects will work.

    Transpositions are exchanges of *consecutive* characters; all other
    operations are self-explanatory.

    This implementation is O(N*M) time and O(M) space, for N and M the
    lengths of the two sequences.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2

    It works with arbitrary sequences too:
    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    """
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix.
    # However, only the current and two previous rows are needed at once,
    # so we only store those.
    oneago = None
    thisrow = range(1, len(seq2) + 1) + [0]
    for x in xrange(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in xrange(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    return thisrow[len(seq2) - 1]

In [11]:
def get_suggestions(string, dictionary, longest_word_length, silent=False, min_count=0):
    '''return list of suggested corrections for potentially incorrectly spelled word'''
    
    if (len(string) - longest_word_length) > max_edit_distance:
        if not silent:
            print "no items in dictionary within maximum edit distance"
        return []
    
    # suggestions = []
    # s_dictionary = {}
    suggest_dict = {}
    
    queue = [string]
    q_dictionary = {}  # items other than string that we've checked
    
    while len(queue)>0:
        q_item = queue[0]  # pop
        # print "processing '%s'" % q_item
        queue = queue[1:]
        
        # process queue item
        if (q_item in dictionary) and (q_item not in suggest_dict):
            if (dictionary[q_item][1]>=min_count):# GD added to trim list for context checking
            # word is in dictionary, and is a word from the corpus, and not already in suggestion list
            # so add to suggestion dictionary, indexed by the word with value (frequency in corpus, edit distance)
            # note q_items that are not the input string are shorter than input string 
            # since only deletes are added (unless manual dictionary corrections are added)
                assert len(string)>=len(q_item)
                suggest_dict[q_item] = (dictionary[q_item][1], len(string) - len(q_item))

            ## the suggested corrections for q_item as stored in dictionary (whether or not
            ## q_item itself is a valid word or merely a delete) can be valid corrections
            for sc_item in dictionary[q_item][0]:
                if (sc_item not in suggest_dict):
                    # compute edit distance
                    # if len(sc_item)==len(q_item):
                    #    item_dist = len(string) - len(q_item)
                    # suggested items should always be longer (unless manual corrections are added)
                    assert len(sc_item)>len(q_item)
                    # q_items that are not input should be shorter than original string 
                    # (unless manual corrections added)
                    assert len(q_item)<=len(string)
                    if len(q_item)==len(string):
                        assert q_item==string
                        item_dist = len(sc_item) - len(q_item)
                    #elif len(q_item)==len(string):
                        # a suggestion could be longer or shorter than original string (bug in original FAROO?)
                        # if suggestion is from string's suggestion list, sc_item will be longer
                        # if suggestion is from a delete's suggestion list, sc_item may be shorter
                    #   item_dist = abs(len(sc_item) - len(q_item))
                    #else:
                    # check in original code, but probably not necessary because string has already checked
                    assert sc_item!=string

                    # calculate edit distance using, for example, Damerau-Levenshtein distance
                    item_dist = dameraulevenshtein(sc_item, string)

                    if item_dist<=max_edit_distance:
                        assert sc_item in dictionary  # should already be in dictionary if in suggestion list
                        if (dictionary[q_item][1]>=min_count):# GD added to trim list for context checking
                            suggest_dict[sc_item] = (dictionary[sc_item][1], item_dist)

        # now generate deletes (e.g. a substring of string or of a delete) from the queue item
        # as additional items to check -- add to end of queue
        assert len(string)>=len(q_item)
        if (len(string)-len(q_item))<max_edit_distance and len(q_item)>1:
            for c in range(len(q_item)): # character index        
                word_minus_c = q_item[:c] + q_item[c+1:]
                if word_minus_c not in q_dictionary:
                    queue.append(word_minus_c)
                    q_dictionary[word_minus_c] = None  # arbitrary value, just to identify we checked this
             
    # queue is now empty: convert suggestions in dictionary to list for output
    
    if not silent:
        print "number of possible corrections: %i" %len(suggest_dict)
        print "  edit distance for deletions: %i" % max_edit_distance
    
    # output option 1
    # sort results by ascending order of edit distance and descending order of frequency
    #     and return list of suggested corrections only:
    # return sorted(suggest_dict, key = lambda x: (suggest_dict[x][1], -suggest_dict[x][0]))

    # output option 2
    # return list of suggestions with (correction, (frequency in corpus, edit distance)):
    as_list = suggest_dict.items()
    return sorted(as_list, key = lambda (term, (freq, dist)): (dist, -freq))

    '''
    Option 1:
    get_suggestions("file")
    ['file', 'five', 'fire', 'fine', ...]
    
    Option 2:
    get_suggestions("file")
    [('file', (5, 0)),
     ('five', (67, 1)),
     ('fire', (54, 1)),
     ('fine', (17, 1))...]  
    '''

<div class="alert alert-info">
<p><b>For testing:</b></p>
<p>
Type in word to correct below, to test and get whole list of possible suggestions.
</p>
</div>

In [12]:
%%time
get_suggestions("mittens", dictionary, longest_word_length)

number of possible corrections: 142
  edit distance for deletions: 3
CPU times: user 11.4 ms, sys: 966 µs, total: 12.3 ms
Wall time: 11.5 ms


[('mitten', (0, 1)),
 ('mittes', (0, 1)),
 ('ittens', (0, 1)),
 ('matters', (136, 2)),
 ('bitten', (13, 2)),
 ('kitten', (7, 2)),
 ('listens', (2, 2)),
 ('battens', (1, 2)),
 ('smitten', (1, 2)),
 ('itten', (0, 2)),
 ('mites', (0, 2)),
 ('miten', (0, 2)),
 ('ittes', (0, 2)),
 ('mtten', (0, 2)),
 ('mttes', (0, 2)),
 ('ttens', (0, 2)),
 ('itens', (0, 2)),
 ('miens', (0, 2)),
 ('mittn', (0, 2)),
 ('mitte', (0, 2)),
 ('ittns', (0, 2)),
 ('mitts', (0, 2)),
 ('matter', (365, 3)),
 ('sitting', (269, 3)),
 ('minutes', (146, 3)),
 ('written', (117, 3)),
 ('miles', (110, 3)),
 ('citizens', (109, 3)),
 ('letters', (108, 3)),
 ('listen', (100, 3)),
 ('cities', (77, 3)),
 ('bitter', (47, 3)),
 ('masters', (37, 3)),
 ('intense', (34, 3)),
 ('witness', (33, 3)),
 ('attend', (29, 3)),
 ('mistress', (24, 3)),
 ('fitted', (23, 3)),
 ('mines', (22, 3)),
 ('fitting', (21, 3)),
 ('miners', (19, 3)),
 ('mitenka', (16, 3)),
 ('tens', (16, 3)),
 ('sisters', (16, 3)),
 ('intent', (13, 3)),
 ('mothers', (12, 3)

In [13]:
%%time
#benchmark timing
for i in range(1000):
    get_suggestions("acamodation", dictionary, longest_word_length, silent=True)

CPU times: user 1.82 s, sys: 5.75 ms, total: 1.83 s
Wall time: 1.83 s


In [14]:
%%time
#benchmark timing
for i in range(1000):
    get_suggestions("acomodation", dictionary, longest_word_length, silent=True)

CPU times: user 2.07 s, sys: 3.79 ms, total: 2.08 s
Wall time: 2.08 s


In [15]:
%%time
#benchmark timing
for i in range(1000):
    get_suggestions("hous", dictionary, longest_word_length, silent=True)

CPU times: user 39.4 s, sys: 213 ms, total: 39.6 s
Wall time: 39.9 s


In [16]:
# get best word
def best_word(s, dictionary, silent=False):
    try:
        return get_suggestions(s, dictionary, longest_word_length, silent)[0]
    except:
        return None

<div class="alert alert-info">
<p><b>For testing:</b></p>
<p>
Type in word to correct below, to test and get most suggested word.
</p>
</div>

In [17]:
best_word("hello", dictionary)

number of possible corrections: 349
  edit distance for deletions: 3


('hello', (1, 0))

In [18]:
def correct_document(fname, dictionary):
    with open(fname) as file:
        doc_word_count = 0
        corrected_word_count = 0
        unknown_word_count = 0
        print "Finding misspelled words in your document..." 
        
        for i, line in enumerate(file):
            doc_words = re.findall('[a-z]+', line.lower())  # separate by words by non-alphabetical characters      
            for doc_word in doc_words:
                doc_word_count += 1
                suggestion = best_word(doc_word, dictionary, silent=True)
                if suggestion is None:
                    print "In line %i, the word < %s > was not found (no suggested correction)" % (i, doc_word)
                    unknown_word_count += 1
                elif suggestion[0]!=doc_word:
                    print "In line %i, %s: suggested correction is < %s >" % (i, doc_word, suggestion[0])
                    corrected_word_count += 1
        
    print "-----"
    print "total words checked: %i" % doc_word_count
    print "total unknown words: %i" % unknown_word_count
    print "total potential errors found: %i" % corrected_word_count

    return

<div class="alert alert-info">
<p><b>For testing:</b></p>
<p>
Provide text file to correct, and give all best word suggestions (word level only) for errors.
</p>
</div>

In [19]:
correct_document("testdata/test.txt", dictionary)

Finding misspelled words in your document...
-----
total words checked: 27
total unknown words: 0
total potential errors found: 0


In [20]:
# from http://www.columbia.edu/acis/cria/rosenberg/sample/
correct_document("testdata/OCRsample.txt", dictionary)

Finding misspelled words in your document...
In line 3, taiths: suggested correction is < taits >
In line 11, the word < oonipiittee > was not found (no suggested correction)
In line 13, tj: suggested correction is < to >
In line 13, mnnff: suggested correction is < snuff >
In line 13, gjpt: suggested correction is < gpt >
In line 15, unuer: suggested correction is < under >
In line 20, mthiitt: suggested correction is < miitt >
In line 22, pythian: suggested correction is < ythian >
In line 28, debbs: suggested correction is < debts >
In line 29, nericans: suggested correction is < ericans >
In line 33, unorthodox: suggested correction is < orthodox >
In line 33, nenance: suggested correction is < penance >
In line 38, williaij: suggested correction is < william >
In line 40, fcsf: suggested correction is < fcs >
In line 42, unorthodoxy: suggested correction is < orthodox >
In line 42, thpt: suggested correction is < that >
In line 42, the word < senbrnrgs > was not found (no suggeste

#Context-level correction:

**Model setup:**  
Each sentence is modeled as a hidden Markov model. Prior probabilities (for first word in the sentence) and transition probabilities (for all subsequent words) are calculated when generating the main dictionary, using the same corpus. Emission probabilities are generated on the fly by parameterizing a Poisson distribution with the edit distance. The state space of possible corrections is based on the suggested words from the word-level correction.  
All probabilities are stored in log-space to avoid underflow. Pre-defined minimum values are used for words that are not present in the dictionary and/or probability tables.

In [21]:
def get_emission_prob(edit_dist, poisson_lambda=0.01):
    
    # Poisson(k, l), where k = edit distance and l=0.01
    # TODO - validate lambda parameter (taken from Verena's code)
    
    return math.log(poisson.pmf(edit_dist, poisson_lambda))

In [22]:
def get_start_prob(word, start_prob, default_start_prob):
    try:
        return start_prob[word]
    except KeyError:
        return default_start_prob

In [23]:
def get_transition_prob(cur_word, prev_word, transition_prob, default_transition_prob):
    try:
        return transition_prob[prev_word][cur_word]
    except KeyError:
        return default_transition_prob

In [24]:
def get_belief(prev_word, prev_belief):
    try:
        return prev_belief[prev_word]
    except KeyError:
        return math.log(math.exp(min(prev_belief.values()))/2.) # TODO - confirm default value

In [42]:
# Modified from AM207 lecture notes
def viterbi(words, dictionary, longest_word_length, start_prob, default_start_prob, \
                transition_prob, default_transition_prob, \
                num_word_suggestions=5000):
    
    V = [{}]
    path = {}
    path_context = []
    path_word = []
    
    # FOR TESTING - DELETE EVENTUALLY
    if type(words) != list:
        words = re.findall('[a-z]+', words.lower())  # separate by words by non-alphabetical characters
        
    # Character level correction
    corrections = get_suggestions(words[0], dictionary, longest_word_length, \
                                  silent=True, min_count=1)

    # To ensure Viterbi can keep running
    if len(corrections) == 0:
        corrections = [(words[0], (1, 0))]
        path_word.append(not_found_str)
    else:    
        if len(corrections) > num_word_suggestions:
            corrections = corrections[0:num_word_suggestions]
        if len(corrections) > 0:
            path_word.append(corrections[0][0])  # string of most frequent word tuple
        
    # Initialize base cases (t == 0)
    for sug_word in corrections:
        
        # compute the value for all possible starting states
        V[0][sug_word[0]] = math.exp(get_start_prob(sug_word[0], start_prob, default_start_prob) \
                                     + get_emission_prob(sug_word[1][1]))
        
        # remember all the different paths (here its only one state so far)
        path[sug_word[0]] = [sug_word[0]]
 
    # normalize for numerical stability
    path_temp_sum = sum(V[0].values())
    V[0].update({k: math.log(v/path_temp_sum) for k, v in V[0].items()})
    prev_corrections = [i[0] for i in corrections]
    
    if len(words) == 1:
        path_context = [max(V[0], key=lambda i: V[0][i])]
        return path_word, path_context

    # Run Viterbi for t > 0
    for t in range(1, len(words)):

        V.append({})
        new_path = {}
        
        # Character level correction
        corrections = get_suggestions(words[t], dictionary, longest_word_length, \
                        silent=True, min_count=1)
        
        # To ensure Viterbi can keep running
        if len(corrections) == 0:
            corrections = [(words[t], (1, 0))]
            path_word.append(not_found_str)
        else:
            if len(corrections) > num_word_suggestions:
                corrections = corrections[0:num_word_suggestions]
            if len(corrections) > 0:
                path_word.append(corrections[0][0])  # string of most frequent word tuple
 
        for sug_word in corrections:
        
            sug_word_emission_prob = get_emission_prob(sug_word[1][1])
            
            # compute the values coming from all possible previous states, only keep the maximum
            (prob, word) = max((get_belief(prev_word, V[t-1]) \
                            + get_transition_prob(sug_word[0], prev_word, transition_prob, default_transition_prob) \
                            + sug_word_emission_prob, prev_word) for prev_word in prev_corrections)

            # save the maximum value for each state
            V[t][sug_word[0]] = math.exp(prob)
            # remember the path we came from to get this maximum value
            new_path[sug_word[0]] = path[word] + [sug_word[0]]
            
        # normalize for numerical stability
        path_temp_sum = sum(V[t].values())
        V[t].update({k: math.log(v/path_temp_sum) for k, v in V[t].items()})
        prev_corrections = [i[0] for i in corrections]
 
        # Don't need to remember the old paths
        path = new_path
     
    (prob, word) = max((V[t][sug_word[0]], sug_word[0]) for sug_word in corrections)
    path_context = path[word]
    
    assert len(path_word) == len(path_context)

    return path_word, path_context

In [40]:
def correct_document_context(fname, dictionary, longest_word_length, start_prob, default_start_prob, \
                             transition_prob, default_transition_prob, num_word_suggestions=5000):
    
    doc_word_count = 0
    unknown_word_count = 0
    corrected_word_count = 0
    mismatches = 0
    
    with open(fname) as file:
        
        for i, line in enumerate(file):
            
            for sentence in line.split('.'):
                
                words = re.findall('[a-z]+', sentence.lower())  # separate by words by non-alphabetical characters
                doc_word_count += len(words)
                
                if len(words) > 0:
                
                    suggestion_w, suggestion_c = viterbi(words, dictionary, longest_word_length, \
                                                start_prob, default_start_prob, \
                                                transition_prob, default_transition_prob)

                    # Display sentences where errors have been identified
                    if (words != suggestion_w) or (words != suggestion_c):
                        
                        # Check for unknown words
                        unknown_word_count += sum([w==not_found_str for w in suggestion_w])
                        
                        # Most users will expect to see 1-indexing.
                        print '\nErrors found in line %i. \nOriginal sentence: %s' % (i+1, " ".join(words))

                        # Word-checker and context-checker output match
                        if suggestion_w == suggestion_c:
                            print 'Word & context-level correction: %s' % (" ".join(suggestion_w))
                            corrected_word_count += sum([words[j]!=suggestion_w[j] for j in range(len(words))])
                        
                        # Word-checker and context-checker output don't match
                        else:
                            print 'Word-level correction: %s' % (" ".join(suggestion_w))
                            print 'Context-level correction: %s' % (" ".join(suggestion_c))
                            corrected_word_count += \
                                sum([(words[j]!=suggestion_w[j]) or (words[j]!=suggestion_c[j]) for j in range(len(words))])
                            mismatches += sum([suggestion_w[j] != suggestion_c[j] for j in range(len(words))])
  
    print "-----"
    print "total words checked: %i" % doc_word_count
    print "total unknown words: %i" % unknown_word_count
    print "total potential errors found: %i" % corrected_word_count
    print "total mismatches (word-level vs. context-level): %i" % mismatches

<div class="alert alert-info">
<p><b>For testing:</b></p>
<p>
Provide string to correct, and give all best word suggestions (word level & context level).
</p>
</div>

In [27]:
sentence = "ther sa pile of clothsing on the side of thee train treks"
# sentence = "is a very"
# sentence = "is a test"
# sentence = "a test this"
# sentence = "a test tube"

In [28]:
%%time
print 'Viterbi algorithm:'
word_check, context_check = viterbi(sentence, \
            dictionary, longest_word_length, start_prob, default_start_prob, transition_prob, default_transition_prob)
print 'Original sentence: ', sentence
print 'Word-level check: ', " ".join(word_check)
print 'Context-level check: ', " ".join(context_check)

Viterbi algorithm:
Original sentence:  ther sa pile of clothsing on the side of thee train treks
Word-level check:  the sa pile of clothing on the side of thee train trees
Context-level check:  then a pile of clothing of the side of the train tres
CPU times: user 19.4 s, sys: 181 ms, total: 19.6 s
Wall time: 20 s


<div class="alert alert-info">
<p><b>For testing:</b></p>
<p>
Provide text file to correct, and give all best word suggestions (word level & context level).
</p>
</div>

In [32]:
%%time
correct_document_context("testdata/test.txt", \
                         dictionary, longest_word_length, start_prob, default_start_prob, \
                         transition_prob, default_transition_prob)


Errors found in line 4. 
Original sentence: this is ax test
Word-level correction: this is ax test
Context-level correction: this is a test

Errors found in line 5. 
Original sentence: this is za test
Word & context-level correction: this is a test

Errors found in line 6. 
Original sentence: thee is a test
Word-level correction: thee is a test
Context-level correction: there is a test

Errors found in line 7. 
Original sentence: her tee set
Word & context-level correction: her the set
-----
total words checked: 27
total unknown words: 0
total potential errors found: 4
total mismatches (word-level vs. context-level): 2
CPU times: user 1min 7s, sys: 2 s, total: 1min 9s
Wall time: 1min 22s


In [43]:
%%time
correct_document_context("testdata/OCRsample.txt", \
                         dictionary, longest_word_length, start_prob, default_start_prob, \
                         transition_prob, default_transition_prob)


Errors found in line 1. 
Original sentence: ny
Word-level correction: ny
Context-level correction: no

Errors found in line 4. 
Original sentence: di taiths
Word-level correction: di tastes
Context-level correction: i waits

Errors found in line 7. 
Original sentence: cord n
Word-level correction: cord n
Context-level correction: cord in

Errors found in line 12. 
Original sentence: en raised by the oonipiittee to pay the expenses and
Word-level correction: en raised by the <not found> to pay the expenses and
Context-level correction: in raised by the oonipiittee to pay the expense and

Errors found in line 13. 
Original sentence: charges for preparing and filing the printed record and
Word-level correction: charges for preparing and filing the printed record and
Context-level correction: charge for preparing and filling the printed records and

Errors found in line 14. 
Original sentence: brief for julius and t tj f d th t
Word-level correction: brief for julius and t to f d th t
Con

In [44]:
%%time
correct_document_context("testdata/tiny.txt", \
                         dictionary, longest_word_length, start_prob, default_start_prob, \
                         transition_prob, default_transition_prob)

-----
total words checked: 209
total unknown words: 0
total potential errors found: 0
total mismatches (word-level vs. context-level): 0
CPU times: user 4min 43s, sys: 6.96 s, total: 4min 50s
Wall time: 5min 1s
