In [1]:
import re
import math
from scipy.stats import poisson
import itertools

In [2]:
import findspark
import os
findspark.init()
import pyspark
sc = pyspark.SparkContext()
sc.setLogLevel('ERROR')

***
# Pre-processing

In [3]:
n_partitions = 6  # number of partitions to be used
MAX_EDIT_DISTANCE = 3

In [4]:
def get_deletes_list(w):
    '''given a word, derive list of strings with up to n characters deleted'''
    # since this list is generally of the same magnitude as the number of 
    # characters in a word, it may not make sense to parallelize this
    # so we use python to create the list
    deletes = []
    queue = [w]
    for d in range(MAX_EDIT_DISTANCE):
        temp_queue = []
        for word in queue:
            if len(word)>1:
                for c in range(len(word)):  # character index
                    word_minus_c = word[:c] + word[c+1:]
                    if word_minus_c not in deletes:
                        deletes.append(word_minus_c)
                    if word_minus_c not in temp_queue:
                        temp_queue.append(word_minus_c)
        queue = temp_queue
        
    return deletes

In [5]:
############
#
# load file & initial processing
#
############

In [9]:
fname = "testdata/yelp1review.txt"

In [10]:
regex = re.compile('[^a-z ]')

In [28]:
make_all_lower = sc.textFile(fname).map(lambda line: line.lower()).filter(lambda x: x!='')
make_all_lower.take(5)

[u'this place was delicious!!  my parents saw a recommendation to visit this place from rick sebak\'s "25 things i like about pittsburgh" and he\'s usually pretty accurate.  his recommendations were to try the reuben, fish sandwich and open-faced steak sandwich.  we went early afternoon for a late lunch today (a saturday) and were seated right away.  the staff is extremely friendly.  my mom & i each had the fish sandwich, while my dad & brother had a reuben sandwich.  the fish was very good, but the reuben was to die for!  both dishes were massive, and could very easily be shared between two people.  on top of being extremely large portions, it was incredibly affordable.  the giant fish sandwich was $8 and the giant reuben was $7.50.  our drinks were always filled and we were checked on several times during the meal.  we will definitely be back!!!  oh and a bit of advice ahead of time - they take cash only.  so come prepared, but i\'m pretty sure i saw an atm there as well.  and i do b

In [30]:
sample = u'this place was delicious!!  my parents saw a recommendation to visit this place from rick sebak\'s "25 things i like about pittsburgh" and he\'s usually pretty accurate.  his recommendations were to try the reuben, fish sandwich and open-faced steak sandwich.  we went early afternoon for a late lunch today (a saturday) and were seated right away.  the staff is extremely friendly.  my mom & i each had the fish sandwich, while my dad & brother had a reuben sandwich.  the fish was very good, but the reuben was to die for!  both dishes were massive, and could very easily be shared between two people.  on top of being extremely large portions, it was incredibly affordable.  the giant fish sandwich was $8 and the giant reuben was $7.50.  our drinks were always filled and we were checked on several times during the meal.  we will definitely be back!!!  oh and a bit of advice ahead of time - they take cash only.  so come prepared, but i\'m pretty sure i saw an atm there as well.  and i do believe they are closed on sundays & mondays.'

In [35]:
sample.replace('?','.').replace('!','.').split('.')

[u'this place was delicious',
 u'',
 u'  my parents saw a recommendation to visit this place from rick sebak\'s "25 things i like about pittsburgh" and he\'s usually pretty accurate',
 u'  his recommendations were to try the reuben, fish sandwich and open-faced steak sandwich',
 u'  we went early afternoon for a late lunch today (a saturday) and were seated right away',
 u'  the staff is extremely friendly',
 u'  my mom & i each had the fish sandwich, while my dad & brother had a reuben sandwich',
 u'  the fish was very good, but the reuben was to die for',
 u'  both dishes were massive, and could very easily be shared between two people',
 u'  on top of being extremely large portions, it was incredibly affordable',
 u'  the giant fish sandwich was $8 and the giant reuben was $7',
 u'50',
 u'  our drinks were always filled and we were checked on several times during the meal',
 u'  we will definitely be back',
 u'',
 u'',
 u'  oh and a bit of advice ahead of time - they take cash only'

In [39]:
split_sentence = make_all_lower.flatMap(lambda line: line.replace('?','.').replace('!','.').split('.')) \
             .map(lambda sentence: regex.sub(' ', sentence)) \
             .map(lambda sentence: sentence.split()) \
             .filter(lambda x: x!=[])
split_sentence.take(5)

[[u'this', u'place', u'was', u'delicious'],
 [u'my',
  u'parents',
  u'saw',
  u'a',
  u'recommendation',
  u'to',
  u'visit',
  u'this',
  u'place',
  u'from',
  u'rick',
  u'sebak',
  u's',
  u'things',
  u'i',
  u'like',
  u'about',
  u'pittsburgh',
  u'and',
  u'he',
  u's',
  u'usually',
  u'pretty',
  u'accurate'],
 [u'his',
  u'recommendations',
  u'were',
  u'to',
  u'try',
  u'the',
  u'reuben',
  u'fish',
  u'sandwich',
  u'and',
  u'open',
  u'faced',
  u'steak',
  u'sandwich'],
 [u'we',
  u'went',
  u'early',
  u'afternoon',
  u'for',
  u'a',
  u'late',
  u'lunch',
  u'today',
  u'a',
  u'saturday',
  u'and',
  u'were',
  u'seated',
  u'right',
  u'away'],
 [u'the', u'staff', u'is', u'extremely', u'friendly']]

In [10]:
############
#
# generate start probabilities
#
############

In [11]:
start_words = split_sentence.map(lambda sentence: sentence[0] if len(sentence)>0 else None) \
    .filter(lambda word: word!=None)
start_words.take(5)

[u'the', u'by', u'in', u'copyright', u'be']

In [12]:
count_start_words_once = start_words.map(lambda word: (word, 1))
count_start_words_once.take(5)

[(u'the', 1), (u'by', 1), (u'in', 1), (u'copyright', 1), (u'be', 1)]

In [13]:
accum_total_start_words = sc.accumulator(0)
count_start_words_once.foreach(lambda x: accum_total_start_words.add(1))
total_start_words = float(accum_total_start_words.value)
total_start_words

137073.0

In [14]:
unique_start_words = count_start_words_once.reduceByKey(lambda a, b: a + b, numPartitions = n_partitions)
unique_start_words.take(5)

[(u'aided', 3),
 (u'suicidal', 1),
 (u'desirable', 4),
 (u'all', 562),
 (u'yellow', 4)]

In [15]:
start_prob_calc = unique_start_words.map(lambda (k,v): (k, math.log(v/total_start_words)))
start_prob_calc.take(5)

[(u'aided', -10.729656620945079),
 (u'suicidal', -11.82826890961319),
 (u'desirable', -10.441974548493299),
 (u'all', -5.496767059719498),
 (u'yellow', -10.441974548493299)]

In [16]:
default_start_prob = math.log(1/total_start_words)
default_start_prob

-11.82826890961319

In [17]:
start_prob = start_prob_calc.collectAsMap()

In [18]:
############
#
# generate transition probabilities
#
############

In [19]:
def get_transitions(sentence):
    if len(sentence)<2:
        return None
    else:
        return [((sentence[i], sentence[i+1]), 1) for i in range(len(sentence)-1)]

In [20]:
other_words = split_sentence.map(lambda sentence: get_transitions(sentence)).filter(lambda x: x!=None). \
                flatMap(lambda x: x)
other_words.take(5)

[((u'the', u'project'), 1),
 ((u'project', u'gutenberg'), 1),
 ((u'gutenberg', u'ebook'), 1),
 ((u'ebook', u'of'), 1),
 ((u'of', u'the'), 1)]

In [21]:
accum_total_other_words = sc.accumulator(0)
count_total_other_words = other_words.foreach(lambda x: accum_total_other_words.add(1))
total_other_words = float(accum_total_other_words.value)
total_other_words

968212.0

In [22]:
unique_other_words = other_words.reduceByKey(lambda a, b: a + b)
unique_other_words.take(5)

[((u'little', u'note'), 2),
 ((u'spirits', u'he'), 1),
 ((u'mischievous', u'pang'), 1),
 ((u'gave', u'confused'), 1),
 ((u'the', u'basle'), 1)]

In [23]:
other_words_collapsed = unique_other_words.map(lambda x: (x[0][0], (x[0][1], x[1]))).groupByKey().mapValues(dict)
other_words_collapsed.take(5)

[(u'sophism', {u'of': 1}),
 (u'systematic',
  {u'efforts': 1,
   u'examination': 2,
   u'exercises': 1,
   u'forms': 1,
   u'its': 1,
   u'supervision': 1,
   u'survey': 1,
   u'use': 1,
   u'virtually': 1}),
 (u'nunnery', {u'and': 1, u'as': 1}),
 (u'inadequacy', {u'of': 2}),
 (u'yellow',
  {u'and': 3,
   u'backed': 2,
   u'band': 1,
   u'blotches': 1,
   u'chintz': 1,
   u'colour': 3,
   u'creamy': 1,
   u'dress': 1,
   u'elastic': 1,
   u'envelope': 1,
   u'expressed': 1,
   u'face': 1,
   u'faced': 1,
   u'faces': 1,
   u'floor': 1,
   u'foci': 1,
   u'foliage': 1,
   u'gloves': 1,
   u'granules': 1,
   u'heat': 1,
   u'icteric': 1,
   u'instead': 1,
   u'legs': 1,
   u'light': 5,
   u'line': 1,
   u'metal': 1,
   u'necrosed': 1,
   u'or': 4,
   u'particles': 1,
   u'pasty': 1,
   u'patches': 2,
   u'pigment': 1,
   u'pultaceous': 1,
   u'purulent': 2,
   u'pus': 4,
   u'ring': 1,
   u'rye': 1,
   u'scab': 1,
   u'scabs': 1,
   u'serum': 1,
   u'shakos': 1,
   u'spot': 1,
   u'stubb

In [24]:
def map_transition_prob(vals):
    total = float(sum(vals.values()))
    return {k: math.log(v/total) for k, v in vals.items()}

In [25]:
transition_prob_calc = other_words_collapsed.mapValues(lambda v: map_transition_prob(v))
transition_prob_calc.take(5)

[(u'sophism', {u'of': 0.0}),
 (u'systematic',
  {u'efforts': -2.3025850929940455,
   u'examination': -1.6094379124341003,
   u'exercises': -2.3025850929940455,
   u'forms': -2.3025850929940455,
   u'its': -2.3025850929940455,
   u'supervision': -2.3025850929940455,
   u'survey': -2.3025850929940455,
   u'use': -2.3025850929940455,
   u'virtually': -2.3025850929940455}),
 (u'nunnery', {u'and': -0.6931471805599453, u'as': -0.6931471805599453}),
 (u'inadequacy', {u'of': 0.0}),
 (u'yellow',
  {u'and': -3.0757749812275272,
   u'backed': -3.481240089335692,
   u'band': -4.174387269895637,
   u'blotches': -4.174387269895637,
   u'chintz': -4.174387269895637,
   u'colour': -3.0757749812275272,
   u'creamy': -4.174387269895637,
   u'dress': -4.174387269895637,
   u'elastic': -4.174387269895637,
   u'envelope': -4.174387269895637,
   u'expressed': -4.174387269895637,
   u'face': -4.174387269895637,
   u'faced': -4.174387269895637,
   u'faces': -4.174387269895637,
   u'floor': -4.174387269895637,

In [26]:
default_transition_prob = math.log(1/total_other_words)

In [27]:
transition_prob = transition_prob_calc.collectAsMap()

In [28]:
############
#
# generate dictionary
#
############

In [29]:
all_words = make_all_lower.map(lambda line: regex.sub(' ', line)).flatMap(lambda line: line.split())
all_words.take(5)

[u'the', u'project', u'gutenberg', u'ebook', u'of']

In [30]:
count_once = all_words.map(lambda word: (word, 1))
count_once.take(5)

[(u'the', 1), (u'project', 1), (u'gutenberg', 1), (u'ebook', 1), (u'of', 1)]

In [31]:
unique_words_with_count = count_once.reduceByKey(lambda a, b: a + b, numPartitions = n_partitions).cache()
unique_words_with_count.take(5)

[(u'aided', 17),
 (u'bennigsens', 1),
 (u'suicidal', 5),
 (u'linsey', 1),
 (u'worshiped', 1)]

In [32]:
generate_deletes = unique_words_with_count.map(lambda (parent, count): 
                                                   (parent, get_deletes_list(parent)))
generate_deletes.take(5)

[(u'aided',
  [u'ided',
   u'aded',
   u'aied',
   u'aidd',
   u'aide',
   u'ded',
   u'ied',
   u'idd',
   u'ide',
   u'aed',
   u'add',
   u'ade',
   u'aid',
   u'aie',
   u'ed',
   u'dd',
   u'de',
   u'id',
   u'ie',
   u'ad',
   u'ae',
   u'ai']),
 (u'bennigsens',
  [u'ennigsens',
   u'bnnigsens',
   u'benigsens',
   u'benngsens',
   u'bennisens',
   u'bennigens',
   u'bennigsns',
   u'bennigses',
   u'bennigsen',
   u'nnigsens',
   u'enigsens',
   u'enngsens',
   u'ennisens',
   u'ennigens',
   u'ennigsns',
   u'ennigses',
   u'ennigsen',
   u'bnigsens',
   u'bnngsens',
   u'bnnisens',
   u'bnnigens',
   u'bnnigsns',
   u'bnnigses',
   u'bnnigsen',
   u'beigsens',
   u'bengsens',
   u'benisens',
   u'benigens',
   u'benigsns',
   u'benigses',
   u'benigsen',
   u'bennsens',
   u'benngens',
   u'benngsns',
   u'benngses',
   u'benngsen',
   u'benniens',
   u'bennisns',
   u'bennises',
   u'bennisen',
   u'bennigns',
   u'benniges',
   u'bennigen',
   u'bennigss',
   u'bennigsn',
 

In [33]:
expand_deletes = generate_deletes.flatMapValues(lambda x: x)
expand_deletes.take(5)

[(u'aided', u'ided'),
 (u'aided', u'aded'),
 (u'aided', u'aied'),
 (u'aided', u'aidd'),
 (u'aided', u'aide')]

In [34]:
swap = expand_deletes.map(lambda (orig, delete): (delete, ([orig], 0)))
swap.take(5)

[(u'ided', ([u'aided'], 0)),
 (u'aded', ([u'aided'], 0)),
 (u'aied', ([u'aided'], 0)),
 (u'aidd', ([u'aided'], 0)),
 (u'aide', ([u'aided'], 0))]

In [35]:
corpus = unique_words_with_count.mapValues(lambda count: ([], count))
corpus.take(5)

[(u'aided', ([], 17)),
 (u'bennigsens', ([], 1)),
 (u'suicidal', ([], 5)),
 (u'linsey', ([], 1)),
 (u'worshiped', ([], 1))]

In [36]:
combine = swap.union(corpus)  # combine deletes with main dictionary, eliminate duplicates
combine.take(5)

[(u'ided', ([u'aided'], 0)),
 (u'aded', ([u'aided'], 0)),
 (u'aied', ([u'aided'], 0)),
 (u'aidd', ([u'aided'], 0)),
 (u'aide', ([u'aided'], 0))]

In [37]:
dictionary = combine.reduceByKeyLocally(lambda a, b: (a[0]+b[0], a[1]+b[1]))

***
# Sentence-level parallelization

In [38]:
def get_emission_prob(edit_dist, poisson_lambda=0.01):
    '''
    The emission probability, i.e. P(word typed|word intended)
    is approximated by a Poisson(k, l) distribution, where 
    k=edit distance and l=0.01.
    
    The lambda parameter matches the one used in the AM207
    lecture notes. Various parameters between 0 and 1 were tested
    to confirm that 0.01 yields the most accurate results.
    '''
    
    return math.log(poisson.pmf(edit_dist, poisson_lambda))

In [39]:
def get_start_prob(word, start_prob, default_start_prob):
    try:
        return start_prob[word]
    except KeyError:
        return default_start_prob

In [40]:
def get_transition_prob(cur_word, prev_word, transition_prob, default_transition_prob):
    try:
        return transition_prob[prev_word][cur_word]
    except KeyError:
        return default_transition_prob

In [41]:
def get_belief(prev_word, prev_belief):
    try:
        return prev_belief[prev_word]
    except KeyError:
        return math.log(math.exp(min(prev_belief.values()))/2.)

In [42]:
def dameraulevenshtein(seq1, seq2):
    '''
    Calculate the Damerau-Levenshtein distance between sequences.
    Same code as word-level checking.
    '''
    
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1
    # matrix. However, only the current and two previous rows are
    # needed at once, so we only store those.
    
    oneago = None
    thisrow = range(1, len(seq2) + 1) + [0]
    
    for x in xrange(len(seq1)):
        
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = \
            oneago, thisrow, [0] * len(seq2) + [x + 1]
        
        for y in xrange(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
                
    return thisrow[len(seq2) - 1]

In [43]:
def get_suggestions(string, dictionary, longest_word_length=20, 
                    min_count=100, max_sug=10):
    '''
    Return list of suggested corrections for potentially incorrectly
    spelled word.
    Code based on get_suggestions function from word-level checking,
    with the addition of the min_count parameter, which only
    considers words that have occur more than min_count times in the
    (dictionary) corpus.
    '''
    
    if (len(string) - longest_word_length) > MAX_EDIT_DISTANCE:
        # to ensure Viterbi can keep running -- use the word itself
        return [(string, 0)]
    
    suggest_dict = {}
    
    queue = [string]
    q_dictionary = {}  # items other than string that we've checked
    
    while len(queue)>0:
        q_item = queue[0]  # pop
        queue = queue[1:]
        
        # process queue item
        if (q_item in dictionary) and (q_item not in suggest_dict):
            if (dictionary[q_item][1]>0):
            # word is in dictionary, and is a word from the corpus,
            # and not already in suggestion list so add to suggestion
            # dictionary, indexed by the word with value (frequency
            # in corpus, edit distance)
            # note: q_items that are not the input string are shorter
            # than input string since only deletes are added (unless
            # manual dictionary corrections are added)
                assert len(string)>=len(q_item)
                suggest_dict[q_item] = \
                    (dictionary[q_item][1], len(string) - len(q_item))
            
            # the suggested corrections for q_item as stored in
            # dictionary (whether or not q_item itself is a valid
            # word or merely a delete) can be valid corrections
            for sc_item in dictionary[q_item][0]:
                if (sc_item not in suggest_dict):
                    
                    # compute edit distance
                    # suggested items should always be longer (unless
                    # manual corrections are added)
                    assert len(sc_item)>len(q_item)
                    # q_items that are not input should be shorter
                    # than original string 
                    # (unless manual corrections added)
                    assert len(q_item)<=len(string)
                    if len(q_item)==len(string):
                        assert q_item==string
                        item_dist = len(sc_item) - len(q_item)

                    # item in suggestions list should not be the same
                    # as the string itself
                    assert sc_item!=string           
                    # calculate edit distance using Damerau-
                    # Levenshtein distance
                    item_dist = dameraulevenshtein(sc_item, string)
                    
                    if item_dist<=MAX_EDIT_DISTANCE:
                        # should already be in dictionary if in
                        # suggestion list
                        assert sc_item in dictionary  
                        # trim list to contain state space
                        if (dictionary[q_item][1]>0): 
                            suggest_dict[sc_item] = \
                                (dictionary[sc_item][1], item_dist)
        
        # now generate deletes (e.g. a substring of string or of a
        # delete) from the queue item as additional items to check
        # -- add to end of queue
        assert len(string)>=len(q_item)
        if (len(string)-len(q_item))<MAX_EDIT_DISTANCE \
            and len(q_item)>1:
            for c in range(len(q_item)): # character index        
                word_minus_c = q_item[:c] + q_item[c+1:]
                if word_minus_c not in q_dictionary:
                    queue.append(word_minus_c)
                    # arbitrary value to identify we checked this
                    q_dictionary[word_minus_c] = None

    # return list of suggestions: (correction, edit distance)
    
    # only include words that have appeared a minimum number of times
    # make sure that we do not lose the original word
    as_list = [i for i in suggest_dict.items() 
               if (i[1][0]>min_count or i[0]==string)]
    
    # only include the most likely suggestions (based on frequency
    # and edit distance from original word)
    trunc_as_list = sorted(as_list, 
            key = lambda (term, (freq, dist)): (dist, -freq))[:max_sug]
    
    if len(trunc_as_list)==0:
        # to ensure Viterbi can keep running
        # -- use the word itself if no corrections are found
        return [(string, 0)]
        
    else:
        # drop the word frequency - not needed beyond this point
        return [(i[0], i[1][1]) for i in trunc_as_list]

    '''
    Output format:
    get_suggestions('file', dictionary)
    [('file', 0), ('five', 1), ('fire', 1), ('fine', 1), ('will', 2),
    ('time', 2), ('face', 2), ('like', 2), ('life', 2), ('while', 2)]
    '''

In [44]:
def viterbi(words, dictionary, start_prob, default_start_prob, 
            transition_prob, default_transition_prob):
    
    V = [{}]
    path = {}
    path_context = []
    
    # character level correction - used to determine state space
    corrections = get_suggestions(words[0], dictionary)
        
    # Initialize base cases (t == 0)
    for sug_word in corrections:
        
        # compute the value for all possible starting states
        V[0][sug_word[0]] = math.exp(
            get_start_prob(sug_word[0], start_prob, 
                           default_start_prob)
            + get_emission_prob(sug_word[1]))
        
        # remember all the different paths (only one state so far)
        path[sug_word[0]] = [sug_word[0]]
 
    # normalize for numerical stability
    path_temp_sum = sum(V[0].values())
    V[0].update({k: math.log(v/path_temp_sum) 
                 for k, v in V[0].items()})
    
    # keep track of previous state space
    prev_corrections = [i[0] for i in corrections]
    
    if len(words) == 1:
        path_context = [max(V[0], key=lambda i: V[0][i])]
        return path_context

    # run Viterbi for t > 0
    for t in range(1, len(words)):

        V.append({})
        new_path = {}
        
        # character level correction
        corrections = get_suggestions(words[t], dictionary)
 
        for sug_word in corrections:
        
            sug_word_emission_prob = get_emission_prob(sug_word[1])
            
            # compute the values coming from all possible previous
            # states, only keep the maximum
            (prob, word) = max(
                (get_belief(prev_word, V[t-1]) 
                + get_transition_prob(sug_word[0], prev_word, 
                    transition_prob, default_transition_prob)
                + sug_word_emission_prob, prev_word) 
                               for prev_word in prev_corrections)

            # save the maximum value for each state
            V[t][sug_word[0]] = math.exp(prob)
            
            # remember the path we came from to get this maximum value
            new_path[sug_word[0]] = path[word] + [sug_word[0]]
            
        # normalize for numerical stability
        path_temp_sum = sum(V[t].values())
        V[t].update({k: math.log(v/path_temp_sum) 
                     for k, v in V[t].items()})
        
        # keep track of previous state space
        prev_corrections = [i[0] for i in corrections]
 
        # don't need to remember the old paths
        path = new_path
     
    (prob, word) = max((V[t][sug_word[0]], sug_word[0]) 
                       for sug_word in corrections)
    path_context = path[word]
    
    return path_context

In [45]:
############
#
# load file & initial processing
#
############

In [46]:
fname = "testdata/test.txt"

In [47]:
# broadcast Python dictionaries to workers
bc_dictionary = sc.broadcast(dictionary)
bc_start_prob = sc.broadcast(start_prob)
bc_transition_prob = sc.broadcast(transition_prob)

In [49]:
make_all_lower = sc.textFile(fname).map(lambda line: line.lower()).filter(lambda x: x!='')
make_all_lower.take(5)

[u'this is a test',
 u'this is a test',
 u'here is54a test',
 u'this is ax test',
 u'this is za test']

In [50]:
split_sentence = make_all_lower.flatMap(lambda line: line.split('.')).map(lambda sentence: regex.sub(' ', sentence)) \
            .map(lambda sentence: sentence.split())
split_sentence.take(5)

[[u'this', u'is', u'a', u'test'],
 [u'this', u'is', u'a', u'test'],
 [u'here', u'is', u'a', u'test'],
 [u'this', u'is', u'ax', u'test'],
 [u'this', u'is', u'za', u'test']]

In [51]:
# use accumulator to count the number of words checked
accum_total_words = sc.accumulator(0)
split_words = split_sentence.flatMap(lambda x: x).foreach(lambda x: accum_total_words.add(1))
print 'Words checked: ', accum_total_words.value

Words checked:  27


In [52]:
sentence_id = split_sentence.zipWithIndex().map(lambda (k, v): (v, k))
sentence_id.take(5)

[(0, [u'this', u'is', u'a', u'test']),
 (1, [u'this', u'is', u'a', u'test']),
 (2, [u'here', u'is', u'a', u'test']),
 (3, [u'this', u'is', u'ax', u'test']),
 (4, [u'this', u'is', u'za', u'test'])]

In [53]:
sentence_correction = sentence_id.map(lambda (k, v): (k, (v, viterbi(
                v, bc_dictionary.value, bc_start_prob.value, 
                default_start_prob, bc_transition_prob.value, default_transition_prob))))
sentence_correction.take(5)

[(0, ([u'this', u'is', u'a', u'test'], [u'this', u'is', u'a', u'test'])),
 (1, ([u'this', u'is', u'a', u'test'], [u'this', u'is', u'a', u'test'])),
 (2, ([u'here', u'is', u'a', u'test'], [u'here', u'is', u'a', u'test'])),
 (3, ([u'this', u'is', u'ax', u'test'], [u'this', u'is', u'a', u'test'])),
 (4, ([u'this', u'is', u'za', u'test'], [u'this', u'is', u'a', u'test']))]

In [54]:
def get_count_mismatches(sentences):
    orig_sentence, sug_sentence = sentences
    count_mismatches = len([(orig_sentence[i], sug_sentence[i]) for i in range(len(orig_sentence))
            if orig_sentence[i]!=sug_sentence[i]])
    return count_mismatches, orig_sentence, sug_sentence

In [55]:
sentence_errors = sentence_correction.mapValues(lambda v: (get_count_mismatches(v))). \
            filter(lambda (k, v): v[0]>0).cache()
sentence_errors.take(5)

[(3, (1, [u'this', u'is', u'ax', u'test'], [u'this', u'is', u'a', u'test'])),
 (4, (1, [u'this', u'is', u'za', u'test'], [u'this', u'is', u'a', u'test'])),
 (5, (1, [u'thee', u'is', u'a', u'test'], [u'there', u'is', u'a', u'test'])),
 (6, (1, [u'her', u'tee', u'set'], [u'her', u'to', u'set']))]

In [56]:
sentence_errors_list = sentence_errors.collect()
print sentence_errors_list

[(3, (1, [u'this', u'is', u'ax', u'test'], [u'this', u'is', u'a', u'test'])), (4, (1, [u'this', u'is', u'za', u'test'], [u'this', u'is', u'a', u'test'])), (5, (1, [u'thee', u'is', u'a', u'test'], [u'there', u'is', u'a', u'test'])), (6, (1, [u'her', u'tee', u'set'], [u'her', u'to', u'set']))]


In [57]:
num_errors = sum([s[1][0] for s in sentence_errors_list])
print num_errors

4


In [58]:
for sentence in sentence_errors_list:
    print 'Line %i: %s --> %s' % (sentence[0], ' '.join(sentence[1][1]), ' '.join(sentence[1][2]))

Line 3: this is ax test --> this is a test
Line 4: this is za test --> this is a test
Line 5: thee is a test --> there is a test
Line 6: her tee set --> her to set


***
# Word-level parallelization

In [48]:
fname = "testdata/test.txt"

# broadcast Python dictionaries to workers
bc_dictionary = sc.broadcast(dictionary)
bc_start_prob = sc.broadcast(start_prob)
bc_transition_prob = sc.broadcast(transition_prob)

# convert all text to lowercase and drop empty lines
make_all_lower = sc.textFile(fname) \
    .map(lambda line: line.lower()) \
    .filter(lambda x: x!='')

regex = re.compile('[^a-z ]')

# split into sentences -> remove special characters -> convert into list of words
split_sentence = make_all_lower.flatMap(lambda line: line.split('.')) \
        .map(lambda sentence: regex.sub(' ', sentence)) \
        .map(lambda sentence: sentence.split()).cache()

# use accumulator to count the number of words checked
accum_total_words = sc.accumulator(0)
split_words = split_sentence.flatMap(lambda x: x).foreach(lambda x: accum_total_words.add(1))

# assign each sentence a unique id
sentence_id = split_sentence.zipWithIndex().map(lambda (k, v): (v, k)).partitionBy(n_partitions).cache()
sentence_id.take(5)

[(0, [u'this', u'is', u'a', u'test']),
 (6, [u'her', u'tee', u'set']),
 (1, [u'this', u'is', u'a', u'test']),
 (2, [u'here', u'is', u'a', u'test']),
 (3, [u'this', u'is', u'ax', u'test'])]

In [49]:
def map_sentence_words(sentence, tmp_dict):
    return [[word, get_suggestions(word, tmp_dict)] 
            for i, word in enumerate(sentence)]

In [50]:
sentence_words = sentence_id.mapValues(lambda v: map_sentence_words(v, bc_dictionary.value))
sentence_words.take(5)

[(0,
  [[u'this',
    [(u'this', 0),
     (u'his', 1),
     (u'thus', 1),
     (u'thin', 1),
     (u'the', 2),
     (u'that', 2),
     (u'is', 2),
     (u'him', 2),
     (u'they', 2),
     (u'their', 2)]],
   [u'is',
    [(u'is', 0),
     (u'in', 1),
     (u'it', 1),
     (u'his', 1),
     (u'as', 1),
     (u'i', 1),
     (u's', 1),
     (u'if', 1),
     (u'its', 1),
     (u'us', 1)]],
   [u'a',
    [(u'a', 0),
     (u'as', 1),
     (u'at', 1),
     (u'an', 1),
     (u'am', 1),
     (u'ah', 1),
     (u'and', 2),
     (u'was', 2),
     (u'had', 2),
     (u'all', 2)]],
   [u'test',
    [(u'test', 0),
     (u'west', 1),
     (u'best', 1),
     (u'rest', 1),
     (u'that', 2),
     (u'these', 2),
     (u'went', 2),
     (u'must', 2),
     (u'most', 2),
     (u'left', 2)]]]),
 (6,
  [[u'her',
    [(u'her', 0),
     (u'he', 1),
     (u'here', 1),
     (u'hear', 1),
     (u'the', 2),
     (u'his', 2),
     (u'had', 2),
     (u'for', 2),
     (u'be', 2),
     (u'or', 2)]],
   [u'tee',
    [(u'

In [571]:
########################
# testing itertools
########################

In [572]:
# suggestions
# 'this': [('this', (4, 0)), ('is', (6, 2))]				2
# 'is': [('is', (6, 0)), (u'this', (4, 2))]					2
# 'a': [('a', (4, 0)), (u'ax', (1, 1)), (u'za', (1, 1))]	3
# 'test': [('test', (6, 0))]								1
# combinations: 2*2*3*1=12

In [524]:
w_this = [('this', (4, 0)), ('is', (6, 2))]
w_is = [('is', (6, 0)), (u'this', (4, 2))]
w_a = [('a', (4, 0)), (u'ax', (1, 1)), (u'za', (1, 1))]
w_test = [('test', (6, 0))]

In [531]:
list(itertools.product(w_this, w_is, w_a, w_test))

[(('this', (4, 0)), ('is', (6, 0)), ('a', (4, 0)), ('test', (6, 0))),
 (('this', (4, 0)), ('is', (6, 0)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), ('is', (6, 0)), (u'za', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), ('a', (4, 0)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), (u'za', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), ('a', (4, 0)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), (u'za', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), ('a', (4, 0)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), (u'za', (1, 1)), ('test', (6, 0)))]

In [532]:
big_list = [w_this, w_is, w_a, w_test]

In [552]:
list(itertools.product(*big_list))

[(('this', (4, 0)), ('is', (6, 0)), ('a', (4, 0)), ('test', (6, 0))),
 (('this', (4, 0)), ('is', (6, 0)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), ('is', (6, 0)), (u'za', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), ('a', (4, 0)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('this', (4, 0)), (u'this', (4, 2)), (u'za', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), ('a', (4, 0)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), ('is', (6, 0)), (u'za', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), ('a', (4, 0)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), (u'ax', (1, 1)), ('test', (6, 0))),
 (('is', (6, 2)), (u'this', (4, 2)), (u'za', (1, 1)), ('test', (6, 0)))]

In [51]:
########################
# back to live code
########################

In [52]:
def split_suggestions(sentence):
    result = []
    for word in sentence:
        result.append([(word[0], s[0], get_emission_prob(s[1])) for s in word[1]])
    return result

In [54]:
sentence_word_sug = sentence_words.mapValues(lambda v: split_suggestions(v))
sentence_word_sug.take(1)

[(0,
  [[(u'this', u'this', -0.009999999999999946),
    (u'this', u'his', -4.615170185988091),
    (u'this', u'thus', -4.615170185988091),
    (u'this', u'thin', -4.615170185988091),
    (u'this', u'the', -9.913487552536127),
    (u'this', u'that', -9.913487552536127),
    (u'this', u'is', -9.913487552536127),
    (u'this', u'him', -9.913487552536127),
    (u'this', u'they', -9.913487552536127),
    (u'this', u'their', -9.913487552536127)],
   [(u'is', u'is', -0.009999999999999946),
    (u'is', u'in', -4.615170185988091),
    (u'is', u'it', -4.615170185988091),
    (u'is', u'his', -4.615170185988091),
    (u'is', u'as', -4.615170185988091),
    (u'is', u'i', -4.615170185988091),
    (u'is', u's', -4.615170185988091),
    (u'is', u'if', -4.615170185988091),
    (u'is', u'its', -4.615170185988091),
    (u'is', u'us', -4.615170185988091)],
   [(u'a', u'a', -0.009999999999999946),
    (u'a', u'as', -4.615170185988091),
    (u'a', u'at', -4.615170185988091),
    (u'a', u'an', -4.61517018598

In [55]:
def get_word_combos(sug_lists):
    return list(itertools.product(*sug_lists))

In [56]:
sentence_word_combos = sentence_word_sug.map(lambda (k, v): (k, get_word_combos(v)))
sentence_word_combos.take(1)

[(0,
  [((u'this', u'this', -0.009999999999999946),
    (u'is', u'is', -0.009999999999999946),
    (u'a', u'a', -0.009999999999999946),
    (u'test', u'test', -0.009999999999999946)),
   ((u'this', u'this', -0.009999999999999946),
    (u'is', u'is', -0.009999999999999946),
    (u'a', u'a', -0.009999999999999946),
    (u'test', u'west', -4.615170185988091)),
   ((u'this', u'this', -0.009999999999999946),
    (u'is', u'is', -0.009999999999999946),
    (u'a', u'a', -0.009999999999999946),
    (u'test', u'best', -4.615170185988091)),
   ((u'this', u'this', -0.009999999999999946),
    (u'is', u'is', -0.009999999999999946),
    (u'a', u'a', -0.009999999999999946),
    (u'test', u'rest', -4.615170185988091)),
   ((u'this', u'this', -0.009999999999999946),
    (u'is', u'is', -0.009999999999999946),
    (u'a', u'a', -0.009999999999999946),
    (u'test', u'that', -9.913487552536127)),
   ((u'this', u'this', -0.009999999999999946),
    (u'is', u'is', -0.009999999999999946),
    (u'a', u'a', -0.00

In [57]:
def split_combos(combos):
    sent_id, combo_list = combos
    return [[sent_id, c] for c in combo_list]

In [59]:
sentence_word_combos_split = sentence_word_combos.flatMapValues(lambda x: x) #.flatMap(lambda x: split_combos(x))
sentence_word_combos_split.take(1)

[(0,
  ((u'this', u'this', -0.009999999999999946),
   (u'is', u'is', -0.009999999999999946),
   (u'a', u'a', -0.009999999999999946),
   (u'test', u'test', -0.009999999999999946)))]

In [60]:
def get_combo_prob(combo, tmp_sp, d_sp, tmp_tp, d_tp):
    
    # first word in sentence
    # emission prob * start prob
    orig_path = [combo[0][0]]
    sug_path = [combo[0][1]]
    prob = combo[0][2] + get_start_prob(combo[0][1], tmp_sp, d_sp)
    
    # subsequent words
    for i, w in enumerate(combo[1:]):
        orig_path.append(w[0])
        sug_path.append(w[1])
        prob += w[2] + get_transition_prob(w[1], combo[i-1][1], tmp_tp, d_tp)
    
    return orig_path, sug_path, prob

In [61]:
sentence_word_combos_prob = sentence_word_combos_split.map(lambda (k,v): (k, 
                                get_combo_prob(v, bc_start_prob.value, default_start_prob, 
                                               bc_transition_prob.value, default_transition_prob)))
sentence_word_combos_prob.take(5)

[(0,
  ([u'this', u'is', u'a', u'test'],
   [u'this', u'is', u'a', u'test'],
   -28.247429349355308)),
 (0,
  ([u'this', u'is', u'a', u'test'],
   [u'this', u'is', u'a', u'west'],
   -43.99674855627251)),
 (0,
  ([u'this', u'is', u'a', u'test'],
   [u'this', u'is', u'a', u'best'],
   -36.1873518175974)),
 (0,
  ([u'this', u'is', u'a', u'test'],
   [u'this', u'is', u'a', u'rest'],
   -29.11198532084017)),
 (0,
  ([u'this', u'is', u'a', u'test'],
   [u'this', u'is', u'a', u'that'],
   -29.412339583754793))]

In [62]:
sentence_max_prob = sentence_word_combos_prob.reduceByKey(lambda a,b: a if a[2] > b[2] else b, 
                                                          numPartitions = n_partitions)
sentence_max_prob.take(5)

[(0,
  ([u'this', u'is', u'a', u'test'],
   [u'this', u'is', u'a', u'test'],
   -28.247429349355308)),
 (6,
  ([u'her', u'tee', u'set'], [u'her', u'the', u'set'], -20.972192187442285)),
 (1,
  ([u'this', u'is', u'a', u'test'],
   [u'this', u'is', u'a', u'test'],
   -28.247429349355308)),
 (2,
  ([u'here', u'is', u'a', u'test'],
   [u'here', u'is', u'a', u'test'],
   -27.812090091881217)),
 (3,
  ([u'this', u'is', u'ax', u'test'],
   [u'this', u'is', u'as', u'test'],
   -31.348522138567127))]

In [161]:
def get_count_mismatches_prob(sentences):
    orig_sentence, sug_sentence, prob = sentences
    count_mismatches = len([(orig_sentence[i], sug_sentence[i]) for i in range(len(orig_sentence))
            if orig_sentence[i]!=sug_sentence[i]])
    return count_mismatches, orig_sentence, sug_sentence

In [162]:
sentence_errors = sentence_max_prob.mapValues(lambda v: (get_count_mismatches_prob(v))) \
        .filter(lambda (k, v): v[0]>0).cache()

In [163]:
print sentence_errors
print sentence_errors.getNumPartitions()
print sentence_errors.count()
print sentence_errors.take(5)

PythonRDD[167] at RDD at PythonRDD.scala:43
6
4
[(6, (1, [u'her', u'tee', u'set'], [u'her', u'the', u'set'])), (3, (1, [u'this', u'is', u'ax', u'test'], [u'this', u'is', u'as', u'test'])), (4, (1, [u'this', u'is', u'za', u'test'], [u'this', u'is', u'a', u'test'])), (5, (1, [u'thee', u'is', u'a', u'test'], [u'then', u'is', u'a', u'test']))]


In [164]:
sentence_errors_list = sentence_errors.collect()

In [165]:
num_errors = sum([s[1][0] for s in sentence_errors_list])

In [166]:
for sentence in sentence_errors_list:
    print 'Line %i: %s --> %s' % (sentence[0], ' '.join(sentence[1][1]), ' '.join(sentence[1][2]))

Line 6: her tee set --> her the set
Line 3: this is ax test --> this is as test
Line 4: this is za test --> this is a test
Line 5: thee is a test --> then is a test
