In [18]:
from collections import Counter, defaultdict
import re

In [12]:
# From naive_bayes chapter.
def tokenize(message):
    """Split a message into its distinct lowercase words.

    The message is lowercased first; a word is then any maximal run of
    ASCII letters, digits or apostrophes. Every other character acts as
    a separator.

    Args:
        message: the text to split (a string).

    Returns:
        A set containing each word that occurs at least once.
    """
    # Building a set collapses repeated words into a single entry.
    return set(re.findall("[a-z0-9']+", message.lower()))

In [13]:
def word_count_old(documents):
    """Word count not using MapReduce.

    Args:
        documents: iterable of strings, one string per document.

    Returns:
        A Counter mapping each word to the number of documents it appears
        in (tokenize yields each word at most once per document).
    """
    return Counter(word
                   for document in documents
                   # Tokenize the current document, not the whole collection;
                   # passing the collection itself fails inside tokenize.
                   for word in tokenize(document))

In [14]:
def wc_mapper(document):
    """Map step: emit a (word, 1) pair for every distinct word in document.

    Args:
        document: the text of a single document.

    Yields:
        Tuples (word, 1), one per word returned by tokenize.
    """
    yield from ((token, 1) for token in tokenize(document))

In [17]:
def wc_reducer(word, counts):
    """Reduce step: collapse all counts collected for one word into a total.

    Args:
        word: the key the counts were grouped under.
        counts: iterable of numeric counts emitted for that word.

    Yields:
        A single (word, total) tuple.
    """
    total = sum(counts)
    yield word, total

In [22]:
def word_count(documents):
    """Count the words in the input documents using MapReduce.

    Args:
        documents: iterable of document strings.

    Returns:
        A list of the outputs produced by wc_reducer, one reducer call per
        distinct word, in the order the words were first emitted.
    """
    # Shuffle phase: group every value the mapper emits under its key.
    grouped = defaultdict(list)
    for doc in documents:
        for key, value in wc_mapper(doc):
            grouped[key].append(value)

    # Reduce phase: run the reducer once per key and gather all it yields.
    results = []
    for key, values in grouped.items():
        results.extend(wc_reducer(key, values))
    return results

In [24]:
# Three tiny sample documents to exercise the MapReduce word count.
documents = ["data science", "big data", "science fiction"]
# Displayed as the cell result: a list of (word, count) pairs. tokenize returns
# a set, so the order of words coming from a single document is not guaranteed.
word_count(documents)

[('data', 2), ('science', 2), ('big', 1), ('fiction', 1)]