In [None]:
def correct_document_context_parallel_words(fname, dictionary, longest_word_length,
                             start_prob, default_start_prob,
                             transition_prob, default_transition_prob,
                             num_word_suggestions=5000, printlist=True):
    """Spell-check a text file sentence by sentence using a Spark pipeline.

    The file is lower-cased, split into sentences on '.', stripped of every
    character other than a-z and space, and tokenised on whitespace.  Each
    sentence gets an index (via ``zipWithIndex``) that is used as the key
    for the remaining per-sentence stages.  For every sentence the helper
    functions build candidate word combinations, score each combination,
    and the highest-scoring one is compared against the original words to
    produce the list of mismatches.

    Relies on names defined outside this function: the SparkContext ``sc``,
    the ``re`` module, the partition count ``n_partitions`` and the helpers
    ``map_sentence_words``, ``split_suggestions``, ``get_word_combos``,
    ``split_combos``, ``get_combo_prob``, ``get_sentence_mismatches`` and
    ``split_mismatches``.

    Args:
        fname: path handed to ``sc.textFile``.
        dictionary: Python dictionary broadcast to the workers and passed
            to ``map_sentence_words``.
        longest_word_length: passed through to ``map_sentence_words``.
        start_prob: Python dictionary broadcast to the workers and passed
            to ``get_combo_prob``.
        default_start_prob: passed through to ``get_combo_prob``
            (presumably the fallback for entries missing from
            ``start_prob`` -- confirm against the helper).
        transition_prob: Python dictionary broadcast to the workers and
            passed to ``get_combo_prob``.
        default_transition_prob: passed through to ``get_combo_prob``
            (presumably the fallback for entries missing from
            ``transition_prob`` -- confirm against the helper).
        num_word_suggestions: not referenced in this part of the function.
        printlist: when true, collect the mismatches to the driver and
            print them.
    """

    ############
    #
    # load file & initial processing
    #
    ############

    # broadcast Python dictionaries to workers
    bc_dictionary = sc.broadcast(dictionary)
    bc_start_prob = sc.broadcast(start_prob)
    bc_transition_prob = sc.broadcast(transition_prob)

    make_all_lower = sc.textFile(fname).map(lambda line: line.lower()).filter(lambda x: x != '')

    regex = re.compile('[^a-z ]')

    # One list of words per sentence; cached because it feeds both the
    # word-count action and the main pipeline below.
    split_sentence = make_all_lower.flatMap(lambda line: line.split('.')) \
            .map(lambda sentence: regex.sub(' ', sentence)) \
            .map(lambda sentence: sentence.split()).cache()

    # use accumulator to count the number of words checked
    accum_total_words = sc.accumulator(0)
    # foreach() is an action that returns None, so its result is not bound
    # to a name; it is run purely for the accumulator side effect.
    split_sentence.flatMap(lambda x: x).foreach(lambda x: accum_total_words.add(1))

    # Key every sentence by its index: (words, idx) -> (idx, words).
    # Indexing instead of tuple-parameter unpacking keeps the lambda valid
    # on Python 3 as well as Python 2.
    sentence_id = split_sentence.zipWithIndex() \
            .map(lambda pair: (pair[1], pair[0])) \
            .partitionBy(n_partitions).cache()

    sentence_words = sentence_id.mapValues(lambda v:
                                map_sentence_words(v, bc_dictionary.value, longest_word_length))

    sentence_word_sug = sentence_words.mapValues(lambda v: split_suggestions(v))

    sentence_word_combos = sentence_word_sug.mapValues(lambda v: get_word_combos(v))

    # flatMap does not keep the partitioner of its parent RDD, so the
    # result is partitioned by key again before the keyed stages below.
    sentence_word_combos_split = sentence_word_combos.flatMap(lambda x: split_combos(x)) \
            .partitionBy(n_partitions).cache()

    sentence_word_combos_prob = sentence_word_combos_split.mapValues(lambda v:
                                get_combo_prob(v, bc_start_prob.value, default_start_prob,
                                               bc_transition_prob.value, default_transition_prob))

    # Keep, per sentence, the combination whose third element is largest
    # (on a tie the right-hand operand wins).
    sentence_max_prob = sentence_word_combos_prob.reduceByKey(lambda a, b: a if a[2] > b[2] else b)

    # Drop the third element, compute mismatches, and discard sentences
    # for which the helper returned None.
    sentence_mismatch = sentence_max_prob.mapValues(lambda v: (v[0], v[1])) \
         .mapValues(lambda v: get_sentence_mismatches(v)) \
         .filter(lambda kv: kv[1] is not None)

    word_mismatch = sentence_mismatch.flatMap(lambda x: split_mismatches(x)).cache() ## or just 'collect' right away & change print statement below

#     # use accumulator to count the number of mismatches
#     accum_total_mismatches = sc.accumulator(0)
#     count_mismatches = word_mismatch.foreach(lambda x: accum_total_mismatches.add(1))

    if printlist:
        # Single-argument print(...) produces the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('    Words with suggested corrections (line number, word in text, top match):')
        print(word_mismatch.map(lambda x: (x[0], str(x[1]) + " --> " + str(x[2]))).collect())