In [1]:
import findspark
import os
# Tell findspark where the local Spark 1.5.0 installation lives; the next cell imports
# `pyspark` and relies on this call having run first.
# NOTE(review): the path is absolute and machine-specific -- edit it (or make it
# configurable) before running this notebook anywhere else.
# NOTE(review): `os` is not used in any cell visible here.
findspark.init('/Users/K-Lo/spark-1.5.0')

In [2]:
import pyspark

# Spark settings for this notebook: 'local' master, application name 'pyspark',
# and 2g of executor memory.
conf = pyspark.SparkConf()
conf.setMaster('local')
conf.setAppName('pyspark')
conf.set("spark.executor.memory", "2g")

# Module-level SparkContext; parallel_create_dictionary() reads the corpus through it.
sc = pyspark.SparkContext(conf=conf)

In [3]:
import re

In [4]:
'''
Based on SymSpell:

Originally written in C#:

// SymSpell: 1 million times faster through Symmetric Delete spelling correction algorithm
//
// The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup 
// for a given Damerau-Levenshtein distance. It is six orders of magnitude faster and language independent.
// Opposite to other algorithms only deletes are required, no transposes + replaces + inserts.
// Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term.
// Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters!
//
// Copyright (C) 2015 Wolf Garbe
// Version: 3.0
// Author: Wolf Garbe <wolf.garbe@faroo.com>
// Maintainer: Wolf Garbe <wolf.garbe@faroo.com>
// URL: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
// Description: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
//
// License:
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License, 
// version 3.0 (LGPL-3.0) as published by the Free Software Foundation.
// http://www.opensource.org/licenses/LGPL-3.0
//
// Usage: single word + Enter:  Display spelling suggestions
//        Enter without input:  Terminate the program
'''
pass

In [5]:
# Passed as `numPartitions` to every reduceByKey call in parallel_create_dictionary();
# the copartitioned() asserts in that function compare the resulting partitioners.
n_partitions = 6  # number of partitions to be used

In [6]:
# Maximum number of characters deleted from a corpus word when building dictionary entries.
# parallel_create_dictionary() asserts this is > 0, generates the 1-deletes, and then runs
# (max_edit_distance - 1) further delete rounds.
max_edit_distance = 3

In [7]:
# we generate and count all words for the corpus,
# then add deletes to the dictionary
# this is a slightly different approach from the Faroo algorithm
# that may be more appropriate for parallel processing

In [8]:
# general helper functions
def get_deletes_list(word):
    '''given a word, return every string obtained by deleting exactly one character

       the result has one entry per character position, in position order, so repeated
       letters produce duplicate entries (e.g. "aa" -> ["a", "a"]); callers remove
       duplicates later to minimize shuffling

       words of length 0 or 1 yield an empty list (the empty string is never produced)

       deeper deletes are obtained by callers applying this function again to its results'''
    if len(word) <= 1:
        return []

    one_deletes = []
    for position in range(len(word)):
        # drop the character at `position`, keep everything on either side of it
        one_deletes.append(word[:position] + word[position + 1:])
    return one_deletes

In [9]:
def copartitioned(RDD1, RDD2):
    '''check if two RDDs are copartitioned

       returns True only when RDD1 has a partitioner and it compares equal to RDD2's

       an RDD whose `partitioner` is None has no known partitioning, so two such RDDs
       are NOT treated as copartitioned (a plain `==` would say True for None == None
       and let the "efficient join" asserts pass when a join would in fact shuffle)'''
    first_partitioner = RDD1.partitioner
    if first_partitioner is None:
        return False
    return first_partitioner == RDD2.partitioner

In [10]:
def combine_joined_lists(tup):
    '''merge the two sides of a full-outer-join value into one list of unique elements

       `tup` is a pair (a, b) where each of a, b is a list or None (but not both None);
       a None side is ignored, otherwise the two lists are concatenated

       duplicates are removed by passing through a set, so the order of the returned
       list is not guaranteed'''
    left, right = tup[0], tup[1]

    if right is None:
        merged = left
    elif left is None:
        merged = right
    else:
        merged = left + right

    return list(set(merged))

In [11]:
def parallel_create_dictionary(fname):
    '''Build the spelling dictionary for the corpus file `fname` and return it as a cached pair RDD.

    Steps:
      1. Read `fname` through the module-level SparkContext `sc`, lower-case each line,
         replace every character other than a-z and space with a space, and split on
         whitespace to obtain words.
      2. Count the occurrences of each unique word.
      3. Generate the 1-deletes of every unique word, then repeat the delete step
         (max_edit_distance - 1) more times on the strings produced by the previous
         round. Each delete string carries the list of corpus words it was derived from.
         Strings of length <= 1 produce no further deletes (see get_deletes_list).
      4. Full-outer-join the word counts with the deletes.

    Returns an RDD of (term, (suggestions, count)) where
      - suggestions is the list of unique corpus words from which `term` was derived by
        deletes ([] if `term` is a corpus word that is not a delete of any corpus word)
      - count is the number of times `term` occurs in the corpus (0 if `term` exists
        only as a delete)

    Depends on module-level names: sc, re, n_partitions, max_edit_distance,
    get_deletes_list, copartitioned, combine_joined_lists.

    Side effects: prints progress messages and corpus/dictionary statistics; computing
    those statistics calls reduce()/count() on the RDDs.

    Raises AssertionError if max_edit_distance is not > 0, or if any of the
    copartitioned() checks made before the joins fails.

    NOTE: the print statements and the tuple-parameter lambdas (e.g. `lambda (k, v): ...`)
    are Python 2 syntax; this function does not parse under Python 3.
    '''

    print "Creating dictionary..." 
    
    ############
    #
    # process corpus
    #
    ############
    
    print ">>> processing corpus words..."
    
    # http://stackoverflow.com/questions/22520932/python-remove-all-non-alphabet-chars-from-string
    # matches any single character that is not a lowercase letter or a space
    regex = re.compile('[^a-z ]')

    # convert file into one long sequence of words
    make_all_lower = sc.textFile(fname).map(lambda line: line.lower())
    replace_nonalphs = make_all_lower.map(lambda line: regex.sub(' ', line))
    all_words = replace_nonalphs.flatMap(lambda line: line.split())

    # create core corpus dictionary (i.e. only words appearing in file, no "deletes") and cache it
    # output RDD of unique_words_with_count: [(word1, count1), (word2, count2), (word3, count3)...]
    count_once = all_words.map(lambda word: (word, 1))
    unique_words_with_count = count_once.reduceByKey(lambda a, b: a + b, numPartitions = n_partitions).cache()
    
    # output stats on core corpus
    # (sum of all per-word counts, then number of distinct words)
    print "total words processed: %i" % unique_words_with_count.map(lambda (k, v): v).reduce(lambda a, b: a + b)
    print "total unique words in corpus: %i" % unique_words_with_count.count()
    
    ############
    #
    # generate deletes list
    #
    ############
    
    # generate list of n-deletes from words in a corpus of the form: [(word1, count1), (word2, count2), ...]
    # we will handle possible duplicates after map/reduce:
    #     our thinking is the resulting suggestions lists for each delete will be much smaller than the
    #     list of potential deletes, and it is more efficient to reduce first, then remove duplicates 
    #     from these smaller lists (at each worker node), rather than calling `distinct()` on  
    #     flattened `expand_deletes` which would require a large shuffle

    ##
    ## generate 1-deletes
    ##
     
    assert max_edit_distance>0  
    print ">>> processing deletions from corpus..."
    
    # the key (the corpus word) is left unchanged by this map, hence preservesPartitioning=True;
    # the count is dropped and replaced by the word's list of 1-deletes
    generate_deletes = unique_words_with_count.map(lambda (parent, count): (parent, get_deletes_list(parent)), 
                                                      preservesPartitioning=True)
    # one (corpus word, delete) pair per delete
    expand_deletes = generate_deletes.flatMapValues(lambda x: x)
    
    # swap and combine, resulting RDD after processing 1-deletes has elements:
    # [(delete1, [correct1, correct2...]), (delete2, [correct1, correct2...])...]
    swap = expand_deletes.map(lambda (orig, delete): (delete, [orig]))
    combine = swap.reduceByKey(lambda a, b: a + b, numPartitions = n_partitions)

    # cache "master" deletes RDD, list of (deletes, [unique suggestions]), for use in loop
    deletes = combine.mapValues(lambda sl: list(set(sl))).cache()
    
    ##
    ## generate 2+ deletes
    ##
    
    d_remaining = max_edit_distance - 1  # decreasing counter
    # `queue` holds only the deletes produced by the most recent round (the input to the
    # next round); `deletes` accumulates the deletes of all rounds
    queue = deletes

    while d_remaining>0:

        # generate further deletes
        #'expand_new_deletes' will be of the form [(parent "delete", [new child "deletes"]), ...]
        # n.b. this will filter out elements with no new child deletes
        gen_new_deletes = queue.map(lambda (x, y): (x, get_deletes_list(x)), preservesPartitioning=True)
        expand_new_deletes = gen_new_deletes.flatMapValues(lambda x: x)  

        # associate each new child delete with same corpus word suggestions that applied for parent delete
        # update queue with [(new child delete, [corpus suggestions]) ...] and cache for next iteration
        
        assert copartitioned(queue, expand_new_deletes)   # check partitioning for efficient join
        # joined elements have the form (parent delete, (child delete, parent's suggestion list))
        get_sugglist_from_parent = expand_new_deletes.join(queue)
        # re-key by the child delete, keeping the suggestion list inherited from the parent
        new_deletes = get_sugglist_from_parent.map(lambda (p, (c, sl)): (c, sl))
        combine_new = new_deletes.reduceByKey(lambda a, b: a + b, numPartitions = n_partitions)
        queue = combine_new.mapValues(lambda sl: list(set(sl))).cache()

        # update "master" deletes list with new deletes, and cache for next iteration
        
        assert copartitioned(deletes, queue)    # check partitioning for efficient join
        # full outer join so that deletes present on only one side are kept;
        # combine_joined_lists merges the two suggestion lists and drops duplicates
        join_delete_lists = deletes.fullOuterJoin(queue)
        deletes = join_delete_lists.mapValues(lambda y: combine_joined_lists(y)).cache()

        d_remaining -= 1
        
    ############
    #
    # merge deletes with unique corpus words to construct main dictionary
    #
    ############

    # dictionary entries are in the form: (list of suggested corrections, frequency of word in corpus)
    # note frequency of word in corpus is not incremented for deletes
    deletes_for_dict = deletes.mapValues(lambda sl: (sl, 0)) 
    unique_words_for_dict = unique_words_with_count.mapValues(lambda count: ([], count))

    assert copartitioned(unique_words_for_dict, deletes_for_dict)  # check partitioning for efficient join
    join_deletes = unique_words_for_dict.fullOuterJoin(deletes_for_dict)
    '''
    entries now in form of (word, ( ([], count), ([suggestions], 0) )) for words in both corpus/deletes
                           (word, ( ([], count), None               )) for (real) words in corpus only
                           (word, ( None       , ([suggestions], 0) )) for (fake) words in deletes only
    '''

    # if entry has deletes and is a real word, take suggestion list from deletes and count from corpus
    dictionary_RDD = join_deletes.mapValues(lambda (xtup, ytup): 
                                                xtup if ytup is None
                                                else ytup if xtup is None
                                                else (ytup[0], xtup[1])).cache()

    print "total items in dictionary (corpus words and deletions): %i" % dictionary_RDD.count()
    print "  edit distance for deletions: %i" % max_edit_distance
    # length of the longest corpus word (deletes are always shorter than their source word)
    longest_word_length = unique_words_with_count.map(lambda (k, v): len(k)).reduce(max)
    print "  length of longest word in corpus: %i" % longest_word_length
        
    return dictionary_RDD

In [12]:
%%time
# Build the dictionary RDD for the corpus file and time the whole cell
# (`%%time` is a cell magic and must stay on the first line of the cell).
# NOTE(review): `create_dictionary` is not defined in any cell visible here; the
# commented-out call below looks like a leftover from an earlier variant -- confirm and remove.
# NOTE(review): the corpus path is absolute and machine-specific.
#create_dictionary("/Users/K-Lo/Desktop/big.txt").collect()
a = parallel_create_dictionary("/Users/K-Lo/Desktop/big.txt")

Creating dictionary...
>>> processing corpus words...
total words processed: 1105285
total unique words in corpus: 29157
>>> processing deletions from corpus...
total items in dictionary (corpus words and deletions): 2151998
  edit distance for deletions: 3
  length of longest word in corpus: 18
CPU times: user 108 ms, sys: 26.9 ms, total: 135 ms
Wall time: 4min 23s
