In [1]:
 !pip install syllables #comment or uncomment based on if installed in environment



Note the use of the cell magic `%%file` (an alias for `%%writefile`). It writes the contents of the cell out to a file, which is what we need in order to run the script with mrjob:

In [2]:
%%file word_count.py
from mrjob.job import MRJob
import re

class MRWordFrequencyCount(MRJob):
    """Single-step job that totals characters, words, and lines in the input."""

    def mapper(self, _, line):
        """Emit the three per-line tallies.

        The incoming key is ignored; ``line`` is one line of input text.
        Yields ("chars", <length of line>), ("words", <number of
        whitespace-separated tokens>) and ("lines", 1), in that order.
        """
        per_line = (
            ("chars", len(line)),
            ("words", len(line.split())),
            ("lines", 1),
        )
        yield from per_line

    def reducer(self, key, values):
        """Add up every tally emitted by the mapper for ``key``."""
        total = sum(values)
        yield key, total
# Launch the job only when this file is executed as a script
# (e.g. `python word_count.py <input file>`), not when it is imported.
if __name__ == "__main__":
    MRWordFrequencyCount.run()

Overwriting word_count.py


Then we use the bang (!) shell escape to run the Python script on a text file.

In [3]:
!python word_count.py data/gutenberg/short.t1.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/word_count.jovyan.20210503.231749.743881
Running step 1 of 1...
job output is in /tmp/word_count.jovyan.20210503.231749.743881/output
Streaming final output from /tmp/word_count.jovyan.20210503.231749.743881/output...
"chars"	10653
"words"	1822
"lines"	200
Removing temp directory /tmp/word_count.jovyan.20210503.231749.743881...


In [4]:
### Now let's look at a slightly more complicated example:

In [5]:
%%file most_used_word.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

WORD_RE = re.compile(r"[\w']+") # matches runs of word characters and apostrophes; used with findall() below to extract the words of a line


class MRMostUsedWord(MRJob):
    """Two-step job that reports the most frequent non-stopword in the input.

    Step 1 counts the occurrences of every lower-cased word that is not in
    STOPWORDS; step 2 picks the (count, word) pair with the highest count.
    """

    # Lower-cased words that are excluded from the count.
    STOPWORDS = {
        'i', 'we', 'ourselves', 'hers', 'between', 'yourself', 'but', 'again',
        'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with',
        'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such',
        'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or',
        'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until',
        'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor',
        'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should',
        'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had',
        'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same',
        'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves',
        'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did',
        'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where',
        'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom',
        't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing',
        'it', 'how', 'further', 'was', 'here', 'than',
    }

    def steps(self):
        """Describe the pipeline: a counting step followed by a max-picking step."""
        counting_step = MRStep(
            mapper=self.mapper_get_words,
            combiner=self.combiner_count_words,
            reducer=self.reducer_count_words,
        )
        selection_step = MRStep(reducer=self.reducer_find_max_word)
        return [counting_step, selection_step]

    def mapper_get_words(self, _, line):
        """Emit (lower-cased word, 1) for every non-stopword token in ``line``."""
        for token in WORD_RE.findall(line):
            lowered = token.lower()
            if lowered in self.STOPWORDS:
                continue
            yield (lowered, 1)

    def combiner_count_words(self, word, counts):
        """Pre-aggregate the counts seen so far for ``word``."""
        partial_total = sum(counts)
        yield (word, partial_total)

    def reducer_count_words(self, word, counts):
        """Emit (total, word) under the single key None.

        Sharing one key routes every pair to the same reducer in the next
        step; putting the total first lets that reducer simply use max().
        """
        total = sum(counts)
        yield None, (total, word)

    def reducer_find_max_word(self, _, word_count_pairs):
        """Emit the pair with the largest count; the incoming key is just None.

        Each item is (count, word), so yielding one produces key=count,
        value=word.
        """
        most_used = max(word_count_pairs)
        yield most_used



if __name__ == '__main__':
    # Time the whole job and print the elapsed seconds after the job output.
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() is wall-clock time and can jump if the
    # system clock is adjusted while the job is running.
    import time
    start = time.perf_counter()
    MRMostUsedWord.run()
    end = time.perf_counter()
    print(end - start)

Overwriting most_used_word.py


In [6]:
!python most_used_word.py data/gutenberg/t8.shakespeare.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/most_used_word.jovyan.20210503.231750.929059
Running step 1 of 2...
Running step 2 of 2...
job output is in /tmp/most_used_word.jovyan.20210503.231750.929059/output
Streaming final output from /tmp/most_used_word.jovyan.20210503.231750.929059/output...
5479	"thou"
Removing temp directory /tmp/most_used_word.jovyan.20210503.231750.929059...
9.276134252548218


Now let's write an mrjob script that finds the 10 words that have the most syllables in the t5.churchill.txt file.

In [7]:
%%file top_ten_count.py
from mrjob.job import MRJob
from mrjob.step import MRStep
from collections import defaultdict
import itertools
import re

WORD_RE = re.compile(r"[\w']+") # matches runs of word characters and apostrophes; used with findall() below to extract the words of a line

class MRTopTenWords(MRJob):
    """Two-step job that lists the words with the greatest character lengths.

    Step 1 maps every word to its length and keeps one (length, word) pair per
    distinct lower-cased word. Step 2 groups the words by length and emits the
    TOP_N largest lengths, each with the list of words of that length.

    NOTE(review): the accompanying exercise text asks for the words with the
    most *syllables*, but this job ranks by character count (``len(word)``).
    Confirm which measure is intended.
    """

    # Number of distinct lengths (largest first) reported by the final step.
    TOP_N = 10

    def steps(self):
        """Describe the pipeline: measure words, then rank them by length."""
        return [
            MRStep(mapper=self.mapper,
                   combiner=self.combiner_max_length,
                   reducer=self.reducer),
            MRStep(reducer=self.secondreducer),
        ]

    ### input: self, in_key (ignored), in_value (one line of text)
    def mapper(self, _, line):
        """Emit (lower-cased word, length of the word) for each token in ``line``."""
        for word in WORD_RE.findall(line):
            yield (word.lower(), len(word))

    def combiner_max_length(self, word, chars):
        """Collapse the lengths seen so far for ``word`` into their maximum.

        The reducer only needs the maximum, so pre-aggregating here cuts the
        number of records passed on without changing the result.
        """
        yield (word, max(chars))

    ### input: self, in_key from mapper, in_values from mapper/combiner
    def reducer(self, word, chars):
        """Emit (length, word) under the single key None.

        Sharing one key sends every pair to the same reducer in the next
        step, which is what allows a global ranking.
        """
        yield None, (max(chars), word)

    def secondreducer(self, _, word_count_pairs):
        """Emit (length, [words]) for the TOP_N largest distinct lengths.

        Pairs are sorted in descending order, so lengths come out largest
        first and the words inside each list are in descending order too.
        """
        ranked = sorted(word_count_pairs, reverse=True)
        # Equal lengths are adjacent after sorting, so groupby yields one group
        # per distinct length; islice stops after TOP_N groups, so the lists
        # for shorter lengths are never built.
        by_length = itertools.groupby(ranked, key=lambda pair: pair[0])
        for count, group in itertools.islice(by_length, self.TOP_N):
            yield count, [word for _, word in group]
        
        
# Launch the job only when this file is executed as a script
# (e.g. `python top_ten_count.py <input file>`), not when it is imported.
if __name__ == '__main__':
    MRTopTenWords.run()

Overwriting top_ten_count.py


In [8]:
!python top_ten_count.py data/gutenberg/t5.churchill.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/top_ten_count.jovyan.20210503.231801.053313
Running step 1 of 2...
Running step 2 of 2...
job output is in /tmp/top_ten_count.jovyan.20210503.231801.053313/output
Streaming final output from /tmp/top_ten_count.jovyan.20210503.231801.053313/output...
19	["straightforwardness"]
18	["overcapitalization", "misinterpretations", "kirkcudbrightshire", "disproportionately", "conversationalists", "characteristically"]
17	["uncomprehendingly", "semiconsciousness", "recrystallization", "misunderstandings", "misrepresentation", "misinterpretation", "materialistically", "incommunicability", "disinterestedness", "conventionalities", "controversialists", "conscientiousness", "communicativeness", "commissionerships", "characterizations"]
16	["unreasonableness", "unostentatiously", "unenforceability", "undiscriminating", "unconventionally", "unconstitutional", "uncompromisingly", "un