<div class="alert alert-info">
  <strong>START RUNNING CODE HERE</strong>
</div>

In [1]:
import re
import math
from scipy.stats import poisson
import itertools

In [2]:
import findspark
import os
findspark.init()
import pyspark
sc = pyspark.SparkContext()
sc.setLogLevel('ERROR')

***
# Pre-processing

In [3]:
n_partitions = 6  # number of partitions to be used
MAX_EDIT_DISTANCE = 3

In [4]:
def get_n_deletes_list(w, n):
    '''given a word, derive list of strings with up to n characters deleted'''
    # since this list is generally of the same magnitude as the number of 
    # characters in a word, it may not make sense to parallelize this
    # so we use python to create the list
    deletes = []
    queue = [w]
    for d in range(n):
        temp_queue = []
        for word in queue:
            if len(word)>1:
                for c in range(len(word)):  # character index
                    word_minus_c = word[:c] + word[c+1:]
                    if word_minus_c not in deletes:
                        deletes.append(word_minus_c)
                    if word_minus_c not in temp_queue:
                        temp_queue.append(word_minus_c)
        queue = temp_queue
        
    return deletes

In [5]:
############
#
# load file & initial processing
#
############

In [6]:
fname = "testdata/big.txt"

In [7]:
regex = re.compile('[^a-z ]')

In [8]:
make_all_lower = sc.textFile(fname).map(lambda line: line.lower()).filter(lambda x: x!='')

In [9]:
print make_all_lower
print make_all_lower.getNumPartitions()
print make_all_lower.count()
print make_all_lower.take(5)

PythonRDD[2] at RDD at PythonRDD.scala:43
2
103600
[u'the project gutenberg ebook of the adventures of sherlock holmes', u'by sir arthur conan doyle', u'(#15 in our series by sir arthur conan doyle)', u'copyright laws are changing all over the world. be sure to check the', u'copyright laws for your country before downloading or redistributing']


In [10]:
split_sentence = make_all_lower.flatMap(lambda line: line.split('.')) \
            .map(lambda sentence: regex.sub(' ', sentence)) \
            .map(lambda sentence: sentence.split())

In [11]:
print split_sentence
print split_sentence.getNumPartitions()
print split_sentence.count()
print split_sentence.take(5)

PythonRDD[5] at RDD at PythonRDD.scala:43
2
162272
[[u'the', u'project', u'gutenberg', u'ebook', u'of', u'the', u'adventures', u'of', u'sherlock', u'holmes'], [u'by', u'sir', u'arthur', u'conan', u'doyle'], [u'in', u'our', u'series', u'by', u'sir', u'arthur', u'conan', u'doyle'], [u'copyright', u'laws', u'are', u'changing', u'all', u'over', u'the', u'world'], [u'be', u'sure', u'to', u'check', u'the']]


In [12]:
############
#
# generate start probabilities
#
############

In [13]:
start_words = split_sentence.map(lambda sentence: sentence[0] if len(sentence)>0 else None) \
    .filter(lambda word: word!=None)

In [14]:
print start_words
print start_words.getNumPartitions()
print start_words.count()
print start_words.take(5)

PythonRDD[8] at RDD at PythonRDD.scala:43
2
137073
[u'the', u'by', u'in', u'copyright', u'be']


In [15]:
accum_total_start_words = sc.accumulator(0)
count_start_words_once = start_words.map(lambda word: (word, 1))
count_total_start_words = count_start_words_once.foreach(lambda x: accum_total_start_words.add(1))
total_start_words = float(accum_total_start_words.value)

In [16]:
print count_start_words_once
print count_start_words_once.getNumPartitions()
print count_start_words_once.count()
print count_start_words_once.take(5)

print 'Total start words:', total_start_words

PythonRDD[12] at RDD at PythonRDD.scala:43
2
137073
[(u'the', 1), (u'by', 1), (u'in', 1), (u'copyright', 1), (u'be', 1)]
Total start words: 137073.0


In [17]:
unique_start_words = count_start_words_once.reduceByKey(lambda a, b: a + b, numPartitions = n_partitions)

In [18]:
print unique_start_words
print unique_start_words.getNumPartitions()
print unique_start_words.count()
print unique_start_words.take(5)

PythonRDD[19] at RDD at PythonRDD.scala:43
6
15297
[(u'aided', 3), (u'suicidal', 1), (u'desirable', 4), (u'all', 562), (u'yellow', 4)]


In [19]:
start_prob_calc = unique_start_words.map(lambda (k,v): (k, math.log(v/total_start_words)))
default_start_prob = math.log(1/total_start_words)

In [20]:
print start_prob_calc
print start_prob_calc.getNumPartitions()
print start_prob_calc.count()
print start_prob_calc.take(5)

print 'Default start probability:', default_start_prob

PythonRDD[22] at RDD at PythonRDD.scala:43
6
15297
[(u'aided', -10.729656620945079), (u'suicidal', -11.82826890961319), (u'desirable', -10.441974548493299), (u'all', -5.496767059719498), (u'yellow', -10.441974548493299)]
Default start probability: -11.8282689096


In [21]:
start_prob = start_prob_calc.collectAsMap()

In [22]:
############
#
# generate transition probabilities
#
############

In [23]:
def get_transitions(sentence):
#     result = []
    if len(sentence)<2:
        return None
    else:
        return [((sentence[i], sentence[i+1]), 1) for i in range(len(sentence)-1)]
#         for i in range(len(sentence)-1):
#             result.append(((sentence[i], sentence[i+1]), 1))
#         return result

In [24]:
accum_total_other_words = sc.accumulator(0)
other_words = split_sentence.map(lambda sentence: get_transitions(sentence)).filter(lambda x: x!=None). \
                flatMap(lambda x: x)
count_total_other_words = other_words.foreach(lambda x: accum_total_other_words.add(1))
total_other_words = float(accum_total_other_words.value)

In [25]:
print other_words
print other_words.getNumPartitions()
print other_words.count()
print other_words.take(5)

print 'Total other words', total_other_words

PythonRDD[26] at RDD at PythonRDD.scala:43
2
968212
[((u'the', u'project'), 1), ((u'project', u'gutenberg'), 1), ((u'gutenberg', u'ebook'), 1), ((u'ebook', u'of'), 1), ((u'of', u'the'), 1)]
Total other words 968212.0


In [26]:
unique_other_words = other_words.reduceByKey(lambda a, b: a + b, numPartitions = n_partitions)

In [27]:
print unique_other_words
print unique_other_words.getNumPartitions()
print unique_other_words.count()
print unique_other_words.take(5)

PythonRDD[33] at RDD at PythonRDD.scala:43
6
319665
[((u'so', u'was'), 5), ((u'mischievous', u'pang'), 1), ((u'gave', u'confused'), 1), ((u'getting', u'stouter'), 1), ((u'long', u'frock'), 1)]


In [28]:
other_words_collapsed = unique_other_words.map(lambda x: (x[0][0], (x[0][1], x[1]))).groupByKey().mapValues(dict)

In [29]:
print other_words_collapsed
print other_words_collapsed.getNumPartitions()
print other_words_collapsed.count()
print other_words_collapsed.take(5)

PythonRDD[40] at RDD at PythonRDD.scala:43
6
27224
[(u'bennigsens', {u'and': 1}), (u'aided', {u'the': 3, u'by': 12, u'augustus': 1}), (u'suicidal', {u'and': 2, u'cut': 1, u'or': 1, u'commented': 1}), (u'linsey', {u'woolseys': 1}), (u'unheeded', {u'to': 1, u'upon': 1})]


In [30]:
def map_transition_prob(x):
    vals = x[1]
    total = float(sum(vals.values()))
    probs = {k: math.log(v/total) for k, v in vals.items()}
    return (x[0], probs)

In [31]:
transition_prob_calc = other_words_collapsed.map(lambda x: map_transition_prob(x))
default_transition_prob = math.log(1/total_other_words)

In [32]:
print transition_prob_calc
print transition_prob_calc.getNumPartitions()
print transition_prob_calc.count()
print transition_prob_calc.take(5)

print 'Default transition probability:', default_transition_prob

PythonRDD[43] at RDD at PythonRDD.scala:43
6
27224
[(u'bennigsens', {u'and': 0.0}), (u'aided', {u'the': -1.6739764335716716, u'by': -0.2876820724517809, u'augustus': -2.772588722239781}), (u'suicidal', {u'and': -0.916290731874155, u'cut': -1.6094379124341003, u'or': -1.6094379124341003, u'commented': -1.6094379124341003}), (u'linsey', {u'woolseys': 0.0}), (u'unheeded', {u'to': -0.6931471805599453, u'upon': -0.6931471805599453})]
Default transition probability: -13.7832063505


In [33]:
transition_prob = transition_prob_calc.collectAsMap()

In [34]:
############
#
# generate dictionary
#
############

In [35]:
all_words = make_all_lower.map(lambda line: regex.sub(' ', line)).flatMap(lambda line: line.split())

In [36]:
print all_words
print all_words.getNumPartitions()
print all_words.count()
print all_words.take(5)

PythonRDD[46] at RDD at PythonRDD.scala:43
2
1105285
[u'the', u'project', u'gutenberg', u'ebook', u'of']


In [37]:
count_once = all_words.map(lambda word: (word, 1))

In [38]:
print count_once
print count_once.getNumPartitions()
print count_once.count()
print count_once.take(5)

PythonRDD[49] at RDD at PythonRDD.scala:43
2
1105285
[(u'the', 1), (u'project', 1), (u'gutenberg', 1), (u'ebook', 1), (u'of', 1)]


In [39]:
unique_words_with_count = count_once.reduceByKey(lambda a, b: a + b, numPartitions = n_partitions).cache()

In [40]:
print unique_words_with_count
print unique_words_with_count.getNumPartitions()
print unique_words_with_count.count()
print unique_words_with_count.take(5)

PythonRDD[56] at RDD at PythonRDD.scala:43
6
29157
[(u'aided', 17), (u'bennigsens', 1), (u'suicidal', 5), (u'linsey', 1), (u'worshiped', 1)]


In [41]:
assert MAX_EDIT_DISTANCE>0 

In [42]:
generate_deletes = unique_words_with_count.map(lambda (parent, count): 
                                                   (parent, get_n_deletes_list(parent, MAX_EDIT_DISTANCE)))

In [43]:
print generate_deletes
print generate_deletes.getNumPartitions()
print generate_deletes.count()
print generate_deletes.take(5)

PythonRDD[59] at RDD at PythonRDD.scala:43
6
29157
[(u'aided', [u'ided', u'aded', u'aied', u'aidd', u'aide', u'ded', u'ied', u'idd', u'ide', u'aed', u'add', u'ade', u'aid', u'aie', u'ed', u'dd', u'de', u'id', u'ie', u'ad', u'ae', u'ai']), (u'bennigsens', [u'ennigsens', u'bnnigsens', u'benigsens', u'benngsens', u'bennisens', u'bennigens', u'bennigsns', u'bennigses', u'bennigsen', u'nnigsens', u'enigsens', u'enngsens', u'ennisens', u'ennigens', u'ennigsns', u'ennigses', u'ennigsen', u'bnigsens', u'bnngsens', u'bnnisens', u'bnnigens', u'bnnigsns', u'bnnigses', u'bnnigsen', u'beigsens', u'bengsens', u'benisens', u'benigens', u'benigsns', u'benigses', u'benigsen', u'bennsens', u'benngens', u'benngsns', u'benngses', u'benngsen', u'benniens', u'bennisns', u'bennises', u'bennisen', u'bennigns', u'benniges', u'bennigen', u'bennigss', u'bennigsn', u'bennigse', u'nigsens', u'nngsens', u'nnisens', u'nnigens', u'nnigsns', u'nnigses', u'nnigsen', u'eigsens', u'engsens', u'enisens', u'enigens', u'eni

In [44]:
expand_deletes = generate_deletes.flatMapValues(lambda x: x)

In [45]:
print expand_deletes
print expand_deletes.getNumPartitions()
print expand_deletes.count()
print expand_deletes.take(5)

PythonRDD[62] at RDD at PythonRDD.scala:43
6
2863776
[(u'aided', u'ided'), (u'aided', u'aded'), (u'aided', u'aied'), (u'aided', u'aidd'), (u'aided', u'aide')]


In [46]:
swap = expand_deletes.map(lambda (orig, delete): (delete, ([orig], 0)))

In [47]:
print swap
print swap.getNumPartitions()
print swap.count()
print swap.take(5)

PythonRDD[65] at RDD at PythonRDD.scala:43
6
2863776
[(u'ided', ([u'aided'], 0)), (u'aded', ([u'aided'], 0)), (u'aied', ([u'aided'], 0)), (u'aidd', ([u'aided'], 0)), (u'aide', ([u'aided'], 0))]


In [48]:
corpus = unique_words_with_count.mapValues(lambda count: ([], count))

In [49]:
print corpus
print corpus.getNumPartitions()
print corpus.count()
print corpus.take(5)

PythonRDD[68] at RDD at PythonRDD.scala:43
6
29157
[(u'aided', ([], 17)), (u'bennigsens', ([], 1)), (u'suicidal', ([], 5)), (u'linsey', ([], 1)), (u'worshiped', ([], 1))]


In [50]:
combine = swap.union(corpus)  # combine deletes with main dictionary, eliminate duplicates

In [51]:
print combine
print combine.getNumPartitions()
print combine.count()
print combine.take(5)

UnionRDD[71] at union at NativeMethodAccessorImpl.java:-2
12
2892933
[(u'ided', ([u'aided'], 0)), (u'aded', ([u'aided'], 0)), (u'aied', ([u'aided'], 0)), (u'aidd', ([u'aided'], 0)), (u'aide', ([u'aided'], 0))]


In [52]:
dictionary = combine.reduceByKeyLocally(lambda a, b: (a[0]+b[0], a[1]+b[1]))

In [53]:
longest_word_length = unique_words_with_count.map(lambda (k, v): len(k)).reduce(max)

***
# Sentence-level parallelization

In [473]:
def get_emission_prob(edit_dist, poisson_lambda=0.01):
    '''
    The emission probability, i.e. P(word typed|word intended)
    is approximated by a Poisson(k, l) distribution, where 
    k=edit distance and l=0.01.
    
    The lambda parameter matches the one used in the AM207
    lecture notes. Various parameters between 0 and 1 were tested
    to confirm that 0.01 yields the most accurate results.
    '''
    
    return math.log(poisson.pmf(edit_dist, poisson_lambda))

In [474]:
def get_start_prob(word, start_prob, default_start_prob):
    try:
        return start_prob[word]
    except KeyError:
        return default_start_prob

In [475]:
def get_transition_prob(cur_word, prev_word, transition_prob, default_transition_prob):
    try:
        return transition_prob[prev_word][cur_word]
    except KeyError:
        return default_transition_prob

In [476]:
def get_belief(prev_word, prev_belief):
    try:
        return prev_belief[prev_word]
    except KeyError:
        return math.log(math.exp(min(prev_belief.values()))/2.)

In [477]:
def dameraulevenshtein(seq1, seq2):
    '''
    Calculate the Damerau-Levenshtein distance between sequences.
    Same code as word-level checking.
    '''
    
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1
    # matrix. However, only the current and two previous rows are
    # needed at once, so we only store those.
    
    oneago = None
    thisrow = range(1, len(seq2) + 1) + [0]
    
    for x in xrange(len(seq1)):
        
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = \
            oneago, thisrow, [0] * len(seq2) + [x + 1]
        
        for y in xrange(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
                
    return thisrow[len(seq2) - 1]

In [478]:
def get_suggestions(string, dictionary, 
                    longest_word_length, min_count=1):
    '''
    Return list of suggested corrections for potentially incorrectly
    spelled word.
    Code based on get_suggestions function from word-level checking,
    with the addition of the min_count parameter, which only
    considers words that have occur more than min_count times in the
    (dictionary) corpus.
    '''
    
    if (len(string) - longest_word_length) > MAX_EDIT_DISTANCE:
        return []
    
    suggest_dict = {}
    
    queue = [string]
    q_dictionary = {}  # items other than string that we've checked
    
    while len(queue)>0:
        q_item = queue[0]  # pop
        queue = queue[1:]
        
        # process queue item
        if (q_item in dictionary) and (q_item not in suggest_dict):
            if (dictionary[q_item][1]>=min_count):
            # word is in dictionary, and is a word from the corpus,
            # and not already in suggestion list so add to suggestion
            # dictionary, indexed by the word with value (frequency
            # in corpus, edit distance)
            # note: q_items that are not the input string are shorter
            # than input string since only deletes are added (unless
            # manual dictionary corrections are added)
                assert len(string)>=len(q_item)
                suggest_dict[q_item] = \
                    (dictionary[q_item][1], len(string) - len(q_item))
            
            # the suggested corrections for q_item as stored in
            # dictionary (whether or not q_item itself is a valid
            # word or merely a delete) can be valid corrections
            for sc_item in dictionary[q_item][0]:
                if (sc_item not in suggest_dict):
                    
                    # compute edit distance
                    # suggested items should always be longer (unless
                    # manual corrections are added)
                    assert len(sc_item)>len(q_item)
                    # q_items that are not input should be shorter
                    # than original string 
                    # (unless manual corrections added)
                    assert len(q_item)<=len(string)
                    if len(q_item)==len(string):
                        assert q_item==string
                        item_dist = len(sc_item) - len(q_item)

                    # item in suggestions list should not be the same
                    # as the string itself
                    assert sc_item!=string           
                    # calculate edit distance using Damerau-
                    # Levenshtein distance
                    item_dist = dameraulevenshtein(sc_item, string)
                    
                    if item_dist<=MAX_EDIT_DISTANCE:
                        # should already be in dictionary if in
                        # suggestion list
                        assert sc_item in dictionary  
                        # trim list to contain state space
                        if (dictionary[q_item][1]>=min_count): 
                            suggest_dict[sc_item] = \
                                (dictionary[sc_item][1], item_dist)
        
        # now generate deletes (e.g. a substring of string or of a
        # delete) from the queue item as additional items to check
        # -- add to end of queue
        assert len(string)>=len(q_item)
        if (len(string)-len(q_item))<MAX_EDIT_DISTANCE \
            and len(q_item)>1:
            for c in range(len(q_item)): # character index        
                word_minus_c = q_item[:c] + q_item[c+1:]
                if word_minus_c not in q_dictionary:
                    queue.append(word_minus_c)
                    # arbitrary value to identify we checked this
                    q_dictionary[word_minus_c] = None

    # return list of suggestions:
    # (correction, (frequency in corpus, edit distance)):
    as_list = suggest_dict.items()
    return sorted(as_list, key = \
                  lambda (term, (freq, dist)): (dist, -freq))

    '''
    Output format:
    get_suggestions('file')
    [('file', (5, 0)),
     ('five', (67, 1)),
     ('fire', (54, 1)),
     ('fine', (17, 1))...]  
    '''

In [479]:
def viterbi(words, dictionary, longest_word_length,
            start_prob, default_start_prob, 
            transition_prob, default_transition_prob,
            num_word_suggestions=5000):
    
    V = [{}]
    path = {}
    path_context = []
    
    # character level correction - used to determine state space
    corrections = get_suggestions(
        words[0], dictionary, longest_word_length)

    # to ensure Viterbi can keep running
    # -- use the word itself if no corrections are found
    if len(corrections) == 0:
        corrections = [(words[0], (1, 0))]
    else:    
        if len(corrections) > num_word_suggestions:
            corrections = corrections[0:num_word_suggestions]
        
    # Initialize base cases (t == 0)
    for sug_word in corrections:
        
        # compute the value for all possible starting states
        V[0][sug_word[0]] = math.exp(
            get_start_prob(sug_word[0], start_prob, 
                           default_start_prob)
            + get_emission_prob(sug_word[1][1]))
        
        # remember all the different paths (only one state so far)
        path[sug_word[0]] = [sug_word[0]]
 
    # normalize for numerical stability
    path_temp_sum = sum(V[0].values())
    V[0].update({k: math.log(v/path_temp_sum) 
                 for k, v in V[0].items()})
    
    # keep track of previous state space
    prev_corrections = [i[0] for i in corrections]
    
    if len(words) == 1:
        path_context = [max(V[0], key=lambda i: V[0][i])]
        return path_context

    # run Viterbi for t > 0
    for t in range(1, len(words)):

        V.append({})
        new_path = {}
        
        # character level correction
        corrections = get_suggestions(
            words[t], dictionary, longest_word_length)
        
        # to ensure Viterbi can keep running
        # -- use the word itself if no corrections are found
        if len(corrections) == 0:
            corrections = [(words[t], (1, 0))]
        else:
            if len(corrections) > num_word_suggestions:
                corrections = corrections[0:num_word_suggestions]
 
        for sug_word in corrections:
        
            sug_word_emission_prob = get_emission_prob(sug_word[1][1])
            
            # compute the values coming from all possible previous
            # states, only keep the maximum
            (prob, word) = max(
                (get_belief(prev_word, V[t-1]) 
                + get_transition_prob(sug_word[0], prev_word, 
                    transition_prob, default_transition_prob)
                + sug_word_emission_prob, prev_word) 
                               for prev_word in prev_corrections)

            # save the maximum value for each state
            V[t][sug_word[0]] = math.exp(prob)
            
            # remember the path we came from to get this maximum value
            new_path[sug_word[0]] = path[word] + [sug_word[0]]
            
        # normalize for numerical stability
        path_temp_sum = sum(V[t].values())
        V[t].update({k: math.log(v/path_temp_sum) 
                     for k, v in V[t].items()})
        
        # keep track of previous state space
        prev_corrections = [i[0] for i in corrections]
 
        # don't need to remember the old paths
        path = new_path
     
    (prob, word) = max((V[t][sug_word[0]], sug_word[0]) 
                       for sug_word in corrections)
    path_context = path[word]
    
    return path_context

In [480]:
############
#
# load file & initial processing
#
############

In [481]:
fname = "testdata/test.txt"

In [482]:
# broadcast Python dictionaries to workers
bc_dictionary = sc.broadcast(dictionary)
bc_start_prob = sc.broadcast(start_prob)
bc_transition_prob = sc.broadcast(transition_prob)

In [483]:
make_all_lower = sc.textFile(fname).map(lambda line: line.lower()).filter(lambda x: x!='')

In [484]:
print make_all_lower
print make_all_lower.getNumPartitions()
print make_all_lower.count()
print make_all_lower.take(5)

PythonRDD[562] at RDD at PythonRDD.scala:43
2
7
[u'this is a test', u'this is a test', u'here is54a test', u'this is ax test', u'this is za test']


In [485]:
split_sentence = make_all_lower.flatMap(lambda line: line.split('.')).map(lambda sentence: regex.sub(' ', sentence)) \
            .map(lambda sentence: sentence.split())

In [486]:
print split_sentence
print split_sentence.getNumPartitions()
print split_sentence.count()
print split_sentence.take(5)

PythonRDD[566] at RDD at PythonRDD.scala:43
2
7
[[u'this', u'is', u'a', u'test'], [u'this', u'is', u'a', u'test'], [u'here', u'is', u'a', u'test'], [u'this', u'is', u'ax', u'test'], [u'this', u'is', u'za', u'test']]


In [487]:
# use accumulator to count the number of words checked
accum_total_words = sc.accumulator(0)
split_words = split_sentence.flatMap(lambda x: x).foreach(lambda x: accum_total_words.add(1))
print 'Words checked: ', accum_total_words.value

Words checked:  27


In [488]:
sentence_id = split_sentence.zipWithIndex().map(lambda (k, v): (v, k))

In [489]:
print sentence_id
print sentence_id.getNumPartitions()
print sentence_id.count()
print sentence_id.take(5)

PythonRDD[572] at RDD at PythonRDD.scala:43
2
7
[(0, [u'this', u'is', u'a', u'test']), (1, [u'this', u'is', u'a', u'test']), (2, [u'here', u'is', u'a', u'test']), (3, [u'this', u'is', u'ax', u'test']), (4, [u'this', u'is', u'za', u'test'])]


In [490]:
sentence_correction = sentence_id.map(lambda (k, v): (k, (v, viterbi(
                v, bc_dictionary.value, longest_word_length, bc_start_prob.value, 
                default_start_prob, bc_transition_prob.value, default_transition_prob))))

In [491]:
print sentence_correction
print sentence_correction.getNumPartitions()
print sentence_correction.count()
print sentence_correction.take(5)

PythonRDD[576] at RDD at PythonRDD.scala:43
2
7
[(0, ([u'this', u'is', u'a', u'test'], [u'this', u'is', u'a', u'test'])), (1, ([u'this', u'is', u'a', u'test'], [u'this', u'is', u'a', u'test'])), (2, ([u'here', u'is', u'a', u'test'], [u'here', u'is', u'a', u'test'])), (3, ([u'this', u'is', u'ax', u'test'], [u'this', u'is', u'ax', u'test'])), (4, ([u'this', u'is', u'za', u'test'], [u'this', u'is', u'za', u'test']))]


In [492]:
def get_sentence_mismatches(sentences):
    orig_sentence, sug_sentence = sentences
    mismatches = [(orig_sentence[i], sug_sentence[i]) for i in range(len(orig_sentence)) 
            if orig_sentence[i]!=sug_sentence[i]]
    if len(mismatches)==0:
        return None
    else:
        return mismatches

In [493]:
sentence_mismatch = sentence_correction.map(lambda (k, v): (k, get_sentence_mismatches(v))) \
                .filter(lambda (k,v): v!=None)

In [494]:
print sentence_mismatch
print sentence_mismatch.getNumPartitions()
print sentence_mismatch.count()
print sentence_mismatch.take(5)

PythonRDD[580] at RDD at PythonRDD.scala:43
2
0
[]


In [495]:
def split_mismatches(mismatches):
    sent_id, word_list = mismatches
#     result = []
#     for word in word_list:
#         result.append([sent_id, word[0], word[1]])
#     return result
    return [[sent_id, word[0], word[1]] for word in word_list]

In [496]:
word_mismatch = sentence_mismatch.flatMap(lambda x: split_mismatches(x))

In [497]:
print word_mismatch
print word_mismatch.getNumPartitions()
print word_mismatch.count()
print word_mismatch.take(5)

PythonRDD[584] at RDD at PythonRDD.scala:43
2
0
[]


In [498]:
# use accumulator to count the number of mismatches
accum_total_mismatches = sc.accumulator(0)
count_mismatches = word_mismatch.foreach(lambda x: accum_total_mismatches.add(1))
print 'Potential mismatches: ', accum_total_mismatches.value

Potential mismatches:  0


In [499]:
printlist=True

In [500]:
# ERROR words are words where the word does not match the first tuple's word (top match)
if printlist:
    print '    Words with suggested corrections (line number, word in text, top match):'
    print word_mismatch.map(lambda x: (x[0], str(x[1]) + " --> " + str(x[2]))).collect()

    Words with suggested corrections (line number, word in text, top match):
[]


***
# Word-level parallelization

In [501]:
fname = "testdata/test.txt"

In [502]:
# broadcast Python dictionaries to workers
bc_dictionary = sc.broadcast(dictionary)
bc_start_prob = sc.broadcast(start_prob)
bc_transition_prob = sc.broadcast(transition_prob)

In [503]:
make_all_lower = sc.textFile(fname).map(lambda line: line.lower()).filter(lambda x: x!='')

In [504]:
print make_all_lower
print make_all_lower.getNumPartitions()
print make_all_lower.count()
print make_all_lower.take(5)

PythonRDD[592] at RDD at PythonRDD.scala:43
2
7
[u'this is a test', u'this is a test', u'here is54a test', u'this is ax test', u'this is za test']


In [505]:
split_sentence = make_all_lower.flatMap(lambda line: line.split('.')).map(lambda sentence: regex.sub(' ', sentence)) \
            .map(lambda sentence: sentence.split())

In [506]:
print split_sentence
print split_sentence.getNumPartitions()
print split_sentence.count()
print split_sentence.take(5)

PythonRDD[596] at RDD at PythonRDD.scala:43
2
7
[[u'this', u'is', u'a', u'test'], [u'this', u'is', u'a', u'test'], [u'here', u'is', u'a', u'test'], [u'this', u'is', u'ax', u'test'], [u'this', u'is', u'za', u'test']]


In [507]:
# use accumulator to count the number of words checked
accum_total_words = sc.accumulator(0)
split_words = split_sentence.flatMap(lambda x: x).foreach(lambda x: accum_total_words.add(1))
print 'Words checked: ', accum_total_words.value

Words checked:  27


In [508]:
sentence_id = split_sentence.zipWithIndex().map(lambda (k, v): (v, k))

In [509]:
print sentence_id
print sentence_id.getNumPartitions()
print sentence_id.count()
print sentence_id.take(5)

PythonRDD[602] at RDD at PythonRDD.scala:43
2
7
[(0, [u'this', u'is', u'a', u'test']), (1, [u'this', u'is', u'a', u'test']), (2, [u'here', u'is', u'a', u'test']), (3, [u'this', u'is', u'ax', u'test']), (4, [u'this', u'is', u'za', u'test'])]


In [568]:
def map_sentence_words(sentence, tmp_dict, longest_word_length):
    return [[word, get_suggestions(word, tmp_dict, longest_word_length)] 
            for i, word in enumerate(sentence)]

In [569]:
sentence_words = sentence_id.map(lambda (k,v): (k, 
                                map_sentence_words(v, bc_dictionary.value, longest_word_length)))

In [570]:
print sentence_words
print sentence_words.getNumPartitions()
print sentence_words.count()
print sentence_words.take(5)

PythonRDD[641] at RDD at PythonRDD.scala:43
2
7
[(0, [[u'this', [(u'this', (4, 0)), (u'is', (6, 2))]], [u'is', [(u'is', (6, 0)), (u'this', (4, 2))]], [u'a', [(u'a', (4, 0)), (u'ax', (1, 1)), (u'za', (1, 1))]], [u'test', [(u'test', (6, 0))]]]), (1, [[u'this', [(u'this', (4, 0)), (u'is', (6, 2))]], [u'is', [(u'is', (6, 0)), (u'this', (4, 2))]], [u'a', [(u'a', (4, 0)), (u'ax', (1, 1)), (u'za', (1, 1))]], [u'test', [(u'test', (6, 0))]]]), (2, [[u'here', [(u'here', (1, 0)), (u'her', (1, 1))]], [u'is', [(u'is', (6, 0)), (u'this', (4, 2))]], [u'a', [(u'a', (4, 0)), (u'ax', (1, 1)), (u'za', (1, 1))]], [u'test', [(u'test', (6, 0))]]]), (3, [[u'this', [(u'this', (4, 0)), (u'is', (6, 2))]], [u'is', [(u'is', (6, 0)), (u'this', (4, 2))]], [u'ax', [(u'ax', (1, 0)), (u'a', (4, 1)), (u'za', (1, 2))]], [u'test', [(u'test', (6, 0))]]]), (4, [[u'this', [(u'this', (4, 0)), (u'is', (6, 2))]], [u'is', [(u'is', (6, 0)), (u'this', (4, 2))]], [u'za', [(u'za', (1, 0)), (u'a', (4, 1)), (u'ax', (1, 2))]], [u'test

In [571]:
########################
# testing itertools
########################

In [572]:
# suggestions
# 'this': [('this', (4, 0)), ('is', (6, 2))]				2
# 'is': [('is', (6, 0)), (u'this', (4, 2))]					2
# 'a': [('a', (4, 0)), (u'ax', (1, 1)), (u'za', (1, 1))]	3
# 'test': [('test', (6, 0))]								1
# combinations: 2*2*3*1=12

In [524]:
w_this = [('this', (4, 0)), ('is', (6, 2))]
w_is = [('is', (6, 0)), (u'this', (4, 2))]
w_a = [('a', (4, 0)), (u'ax', (1, 1)), (u'za', (1, 1))]
w_test = [('test', (6, 0))]

In [531]:
list(itertools.product(w_this, w_is, w_a, w_test))

[(('this', (4, 0)), ('is', (6, 0)), ('a', (4, 0)), ('test', (6, 0))),
 (('this', (4, 0)), ('is', (6, 0)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), ('is', (6, 0)), (u'za', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), ('a', (4, 0)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), (u'za', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), ('a', (4, 0)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), (u'za', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), ('a', (4, 0)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), (u'za', (1, 1)), ('test', (6, 0)))]

In [532]:
big_list = [w_this, w_is, w_a, w_test]

In [552]:
list(itertools.product(*big_list))

[(('this', (4, 0)), ('is', (6, 0)), ('a', (4, 0)), ('test', (6, 0))),
 (('this', (4, 0)), ('is', (6, 0)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), ('is', (6, 0)), (u'za', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), ('a', (4, 0)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), (u'za', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), ('a', (4, 0)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), (u'za', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), ('a', (4, 0)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), (u'za', (1, 1)), ('test', (6, 0)))]

In [None]:
########################
# back to live code
########################

In [576]:
def split_suggestions(sentence):
    result = []
    for word in sentence:
        w = word[0]
        sug = [(s[0],s[1][1]) for s in word[1]]
        result.append([(w, s[0], get_emission_prob(s[1])) for s in sug])
    return result

In [577]:
sentence_word_sug = sentence_words.map(lambda (k,v): (k, split_suggestions(v)))

In [578]:
print sentence_word_sug
print sentence_word_sug.getNumPartitions()
print sentence_word_sug.count()
print sentence_word_sug.take(1)

PythonRDD[647] at RDD at PythonRDD.scala:43
2
7
[(0, [[(u'this', u'this', -0.009999999999999946), (u'this', u'is', -9.913487552536127)], [(u'is', u'is', -0.009999999999999946), (u'is', u'this', -9.913487552536127)], [(u'a', u'a', -0.009999999999999946), (u'a', u'ax', -4.615170185988091), (u'a', u'za', -4.615170185988091)], [(u'test', u'test', -0.009999999999999946)]])]


In [585]:
def get_word_combos(sug_lists):
    return list(itertools.product(*sug_lists))

In [586]:
sentence_word_combos = sentence_word_sug.map(lambda (k, v): (k, get_word_combos(v)))

In [587]:
print sentence_word_combos
print sentence_word_combos.getNumPartitions()
print sentence_word_combos.count()
print sentence_word_combos.take(1)

PythonRDD[656] at RDD at PythonRDD.scala:43
2
7
[(0, [((u'this', u'this', -0.009999999999999946), (u'is', u'is', -0.009999999999999946), (u'a', u'a', -0.009999999999999946), (u'test', u'test', -0.009999999999999946)), ((u'this', u'this', -0.009999999999999946), (u'is', u'is', -0.009999999999999946), (u'a', u'ax', -4.615170185988091), (u'test', u'test', -0.009999999999999946)), ((u'this', u'this', -0.009999999999999946), (u'is', u'is', -0.009999999999999946), (u'a', u'za', -4.615170185988091), (u'test', u'test', -0.009999999999999946)), ((u'this', u'this', -0.009999999999999946), (u'is', u'this', -9.913487552536127), (u'a', u'a', -0.009999999999999946), (u'test', u'test', -0.009999999999999946)), ((u'this', u'this', -0.009999999999999946), (u'is', u'this', -9.913487552536127), (u'a', u'ax', -4.615170185988091), (u'test', u'test', -0.009999999999999946)), ((u'this', u'this', -0.009999999999999946), (u'is', u'this', -9.913487552536127), (u'a', u'za', -4.615170185988091), (u'test', u'test'

In [594]:
def split_combos(combos):
    sent_id, combo_list = combos
    return [[sent_id, c] for c in combo_list]

In [595]:
sentence_word_combos_split = sentence_word_combos.flatMap(lambda x: split_combos(x))

In [596]:
print sentence_word_combos_split
print sentence_word_combos_split.getNumPartitions()
print sentence_word_combos_split.count()
print sentence_word_combos_split.take(5)

PythonRDD[665] at RDD at PythonRDD.scala:43
2
76
[[0, ((u'this', u'this', -0.009999999999999946), (u'is', u'is', -0.009999999999999946), (u'a', u'a', -0.009999999999999946), (u'test', u'test', -0.009999999999999946))], [0, ((u'this', u'this', -0.009999999999999946), (u'is', u'is', -0.009999999999999946), (u'a', u'ax', -4.615170185988091), (u'test', u'test', -0.009999999999999946))], [0, ((u'this', u'this', -0.009999999999999946), (u'is', u'is', -0.009999999999999946), (u'a', u'za', -4.615170185988091), (u'test', u'test', -0.009999999999999946))], [0, ((u'this', u'this', -0.009999999999999946), (u'is', u'this', -9.913487552536127), (u'a', u'a', -0.009999999999999946), (u'test', u'test', -0.009999999999999946))], [0, ((u'this', u'this', -0.009999999999999946), (u'is', u'this', -9.913487552536127), (u'a', u'ax', -4.615170185988091), (u'test', u'test', -0.009999999999999946))]]


In [638]:
def get_combo_prob(combo, tmp_sp, d_sp, tmp_tp, d_tp):
    
    # first word in sentence
    # emission prob * start prob
    orig_path = [combo[0][0]]
    sug_path = [combo[0][1]]
    prob = combo[0][2] + get_start_prob(combo[0][1], tmp_sp, d_sp)
    
    # subsequent words
    for i, w in enumerate(combo[1:]):
        orig_path.append(w[0])
        sug_path.append(w[1])
        prob += w[2] + get_transition_prob(w[1], combo[i-1][1], tmp_tp, d_tp)
    
    return orig_path, sug_path, prob

In [639]:
sentence_word_combos_prob = sentence_word_combos_split.map(lambda (k,v): (k, 
                                get_combo_prob(v, bc_start_prob.value, default_start_prob, 
                                               bc_transition_prob.value, default_transition_prob)))

In [654]:
print sentence_word_combos_prob
print sentence_word_combos_prob.getNumPartitions()
print sentence_word_combos_prob.count()
print sentence_word_combos_prob.take(12)

PythonRDD[703] at RDD at PythonRDD.scala:43
2
76
[(0, ([u'this', u'is', u'a', u'test'], [u'this', u'is', u'a', u'test'], -9.586812608597395)), (0, ([u'this', u'is', u'a', u'test'], [u'this', u'is', u'ax', u'test'], -14.191982794585487)), (0, ([u'this', u'is', u'a', u'test'], [u'this', u'is', u'za', u'test'], -14.191982794585487)), (0, ([u'this', u'is', u'a', u'test'], [u'this', u'this', u'a', u'test'], -19.490300161133522)), (0, ([u'this', u'is', u'a', u'test'], [u'this', u'this', u'ax', u'test'], -24.095470347121616)), (0, ([u'this', u'is', u'a', u'test'], [u'this', u'this', u'za', u'test'], -24.095470347121616)), (0, ([u'this', u'is', u'a', u'test'], [u'is', u'is', u'a', u'test'], -18.286327356807586)), (0, ([u'this', u'is', u'a', u'test'], [u'is', u'is', u'ax', u'test'], -24.277791903915567)), (0, ([u'this', u'is', u'a', u'test'], [u'is', u'is', u'za', u'test'], -24.277791903915567)), (0, ([u'this', u'is', u'a', u'test'], [u'is', u'this', u'a', u'test'], -28.189814909343713)), (0, (

In [674]:
sentence_max_prob = sentence_word_combos_prob.reduceByKey(lambda a,b: a if a[2] > b[2] else b, 
                                                          numPartitions = n_partitions)

In [675]:
print sentence_max_prob
print sentence_max_prob.getNumPartitions()
print sentence_max_prob.count()
print sentence_max_prob.take(5)

PythonRDD[794] at RDD at PythonRDD.scala:43
6
7
[(0, ([u'this', u'is', u'a', u'test'], [u'this', u'is', u'a', u'test'], -9.586812608597395)), (6, ([u'her', u'tee', u'set'], [u'her', u'tee', u'set'], -7.967374696163295)), (1, ([u'this', u'is', u'a', u'test'], [u'this', u'is', u'a', u'test'], -9.586812608597395)), (2, ([u'here', u'is', u'a', u'test'], [u'here', u'is', u'a', u'test'], -10.973106969717286)), (3, ([u'this', u'is', u'ax', u'test'], [u'this', u'is', u'ax', u'test'], -9.586812608597395))]


In [686]:
sentence_mismatch = sentence_max_prob.map(lambda (k,v): (k, (v[0], v[1]))) \
     .map(lambda (k, v): (k, get_sentence_mismatches(v))) \
     .filter(lambda (k,v): v!=None)

In [687]:
print sentence_mismatch
print sentence_mismatch.getNumPartitions()
print sentence_mismatch.count()
print sentence_mismatch.take(5)

PythonRDD[823] at RDD at PythonRDD.scala:43
6
7
[(0, None), (6, None), (1, None), (2, None), (3, None)]
