In [1]:
# Create the working folder for this example, relative to the kernel's current
# directory; -p makes the command a no-op when the folder already exists.
! mkdir -p mrjob-sortvalue

In [2]:
import os
# Move the kernel into the example folder so that the file written by the
# %%writefile cell and the shell commands in later cells resolve there.
# NOTE(review): this is an absolute path while the folder was created with a
# relative mkdir -- presumably the notebook server starts in /media/notebooks;
# confirm before running elsewhere.
os.chdir("/media/notebooks/mrjob-sortvalue")

In [3]:
# Sanity check: print the directory that shell commands now run from.
! pwd

/media/notebooks/mrjob-sortvalue


In [19]:
%%writefile mrjob-sort.py
# Copyright 2011 Yelp
# Copyright 2013 David Marin
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""For any word that appears in a document, compute stats about which
words come next (including percentage).

This is meant as a simple demonstration of why SORT_VALUES is useful.
"""
from mrjob.job import MRJob
from mrjob.step import MRStep
import re


# Token pattern: a maximal run of word characters and apostrophes.
# Used by the mapper to split each input line into words.
WORD_RE = re.compile(r"[\w']+")


class MRNextWordStats(MRJob):
    """Two-step job over adjacent word pairs.

    Step 1 counts every (prev_word, word) pair plus a per-prev_word total and
    re-keys both kinds of record on prev_word.  Step 2 either echoes those
    records unchanged (to show the order in which they arrive) or turns them
    into (total, count, percent) statistics, depending on SHOW_SORTED_VALUES.
    """

    # Ask mrjob to sort the values of each key, not only the keys.  The
    # 'E: total' record must reach the final reducer before any 'X: stats'
    # record of the same prev_word.
    SORT_VALUES = True

    # Selects the reducer of the second step:
    #   True  -> `reducer`: echo the sorted (prev_word, value) records as-is.
    #            This is the default and matches how the job was wired before.
    #   False -> `r_compute_stats`: emit (total, count, percent) per pair.
    SHOW_SORTED_VALUES = True

    def steps(self):
        """Return the two MRSteps of the job.

        The final reducer is chosen by SHOW_SORTED_VALUES instead of by
        commenting code in and out.
        """
        if self.SHOW_SORTED_VALUES:
            final_reducer = self.reducer
        else:
            final_reducer = self.r_compute_stats

        return [MRStep(mapper=self.m_find_words,
                       combiner=self.c_combine_counts,
                       reducer=self.r_sum_counts),
                MRStep(reducer=final_reducer)]

    def m_find_words(self, _, line):
        """Tokenize a line and look for pairs of adjacent words.

        Yield (prev_word, word), 1 and (prev_word, '*'), 1 for each pair.
        Words are lower-cased; pairs never span two input lines because
        prev_word starts as None on every call.
        """
        prev_word = None

        for word in WORD_RE.findall(line):
            word = word.lower()

            if prev_word is not None:
                # The '*' record feeds the total for prev_word; the other
                # record feeds the count of this specific following word.
                yield (prev_word, '*'), 1
                yield (prev_word, word), 1

            prev_word = word

    def c_combine_counts(self, key, counts):
        """Sum up the partial counts of a key before they reach the reducer."""
        yield key, sum(counts)

    def reducer(self, key, values):
        """Pass-through reducer: re-emit every value under its key, unchanged.

        Used as the final step when SHOW_SORTED_VALUES is True, so the output
        shows the order in which values arrive.
        """
        for value in values:
            yield key, value

    def r_sum_counts(self, key, counts):
        """Compute the number of times each pair of words appears, and the
        number of times the first word in a pair appears, and send it to
        a reducer that keys on the first word in the pair.
        """
        count = sum(counts)

        prev_word, word = key

        if word == '*':
            # The total must arrive at the final reducer first, so it is
            # tagged 'E: total', which sorts before 'X: stats'.
            yield prev_word, ('E: total', count)
        else:
            yield prev_word, ('X: stats', (word, count))

    def r_compute_stats(self, prev_word, value):
        """For each pair of words, compute how many times it appears,
        how many times the first word appears in a pair, and the percentage
        of time the second word follows the first.

        `value` is the iterable of tagged records produced by r_sum_counts
        for `prev_word`.  This relies on values appearing in sorted order; we
        need the total number of times the first word appears before we can
        compute the percentage for each second word.

        Yields ((prev_word, word), (total, count, percent)).

        Raises ValueError if a stats record is seen before the total, which
        means the values did not arrive sorted.
        """
        total = None

        for value_type, data in value:
            if value_type == 'E: total':
                total = data
                continue

            assert value_type == 'X: stats'

            if total is None:
                # Without this check the division below would fail with an
                # opaque "float / NoneType" TypeError.
                raise ValueError(
                    "no 'E: total' record seen before stats for %r; "
                    "values must arrive sorted (SORT_VALUES)" % (prev_word,))

            word, count = data
            # 'E' sorts before 'X', so total is set by the time we get here.
            percent = 100.0 * count / total
            yield (prev_word, word), (total, count, percent)


if __name__ == '__main__':
    # Script entry point: hand control to mrjob's runner for this job class.
    MRNextWordStats.run()


Overwriting mrjob-sort.py


In [20]:
# Run the job on the Hadoop cluster (-r hadoop) over every file matching
# hdfs:///tmp/mrjoin/*.  No output directory is given; the job's records
# appear in the cell output below, after the runner's log lines.
! python mrjob-sort.py  hdfs:///tmp/mrjoin/*  -r hadoop 

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /usr/lib/hadoop/bin...
Found hadoop binary: /usr/lib/hadoop/bin/hadoop
Using Hadoop version 2.6.0
Looking for Hadoop streaming jar in /usr/lib/hadoop...
Looking for Hadoop streaming jar in /usr/lib/hadoop-mapreduce...
Found Hadoop streaming jar: /usr/lib/hadoop-mapreduce/hadoop-streaming.jar
Creating temp directory /tmp/mrjob-sort.root.20190521.104603.346869
Copying local files to hdfs:///user/root/tmp/mrjob/mrjob-sort.root.20190521.104603.346869/files/...
Running step 1 of 2...
  packageJobJar: [] [/usr/lib/hadoop-mapreduce/hadoop-streaming-2.6.0-cdh5.9.0.jar] /tmp/streamjob6317930329524989181.jar tmpDir=null
  Connecting to ResourceManager at yarnmaster/172.18.0.2:8032
  Connecting to ResourceManager at yarnmaster/172.18.0.2:8032
  Total input paths to process : 2
  number of splits:3
  Submitting tokens for job: job_1558431322651_0025
  Submitted application appli

"uruguay"	["E: total", 1]
"uruguay"	["X: stats", ["uy", 1]]
"uzbekistan"	["E: total", 1]
"uzbekistan"	["X: stats", ["uz", 1]]
"vanuatu"	["E: total", 1]
"vanuatu"	["X: stats", ["vu", 1]]
"vatican"	["E: total", 1]
"vatican"	["X: stats", ["city", 1]]
"venezuela"	["E: total", 1]
"venezuela"	["X: stats", ["bolivarian", 1]]
"verda"	["E: total", 1]
"verda"	["X: stats", ["belgrave", 1]]
"verde"	["E: total", 1]
"verde"	["X: stats", ["cv", 1]]
"viet"	["E: total", 1]
"viet"	["X: stats", ["nam", 1]]
"vincent"	["E: total", 1]
"vincent"	["X: stats", ["and", 1]]
"vincenzo"	["E: total", 1]
"vincenzo"	["X: stats", ["samples", 1]]
"virgin"	["E: total", 2]
"virgin"	["X: stats", ["islands", 2]]
"wallis"	["E: total", 1]
"wallis"	["X: stats", ["and", 1]]
"wess"	["E: total", 1]
"wess"	["X: stats", ["regular", 1]]
"western"	["E: total", 1]
"western"	["X: stats", ["sahara", 1]]
"wines"	["E: total", 1]
"wines"	["X: stats", ["regular", 1]]
"wingate"	["E: total", 1]
"wingate"	["X: stats", ["regular", 1]]
"yemen"	

In [6]:
# Same job and input, with --output-dir pointing the final output at
# hdfs:///tmp/carpeta/sort1.
# NOTE(review): the output recorded under this cell has the
# [total, count, percent] shape of r_compute_stats, so it appears to come from
# an earlier run (see the lower execution count) made before the final step
# was switched to the pass-through reducer -- re-run to refresh it.
# NOTE(review): Hadoop normally refuses to write into an existing output
# directory, so a re-run probably needs a fresh path or a cleanup first; confirm.
! python mrjob-sort.py  hdfs:///tmp/mrjoin/*  --output-dir hdfs:///tmp/carpeta/sort1  -r hadoop 

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /usr/lib/hadoop/bin...
Found hadoop binary: /usr/lib/hadoop/bin/hadoop
Using Hadoop version 2.6.0
Looking for Hadoop streaming jar in /usr/lib/hadoop...
Looking for Hadoop streaming jar in /usr/lib/hadoop-mapreduce...
Found Hadoop streaming jar: /usr/lib/hadoop-mapreduce/hadoop-streaming.jar
Creating temp directory /tmp/mrjob-sort.root.20190521.094102.518951
Copying local files to hdfs:///user/root/tmp/mrjob/mrjob-sort.root.20190521.094102.518951/files/...
Running step 1 of 2...
  packageJobJar: [] [/usr/lib/hadoop-mapreduce/hadoop-streaming-2.6.0-cdh5.9.0.jar] /tmp/streamjob1978238678561105229.jar tmpDir=null
  Connecting to ResourceManager at yarnmaster/172.18.0.2:8032
  Connecting to ResourceManager at yarnmaster/172.18.0.2:8032
  Total input paths to process : 2
  number of splits:3
  Submitting tokens for job: job_1558431322651_0001
  Submitted application appli

["united", "republic"]	[5, 1, 20.0]
["united", "states"]	[5, 2, 40.0]
["urban", "malo"]	[1, 1, 100.0]
["uruguay", "uy"]	[1, 1, 100.0]
["uzbekistan", "uz"]	[1, 1, 100.0]
["vanuatu", "vu"]	[1, 1, 100.0]
["vatican", "city"]	[1, 1, 100.0]
["venezuela", "bolivarian"]	[1, 1, 100.0]
["verda", "belgrave"]	[1, 1, 100.0]
["verde", "cv"]	[1, 1, 100.0]
["viet", "nam"]	[1, 1, 100.0]
["vincent", "and"]	[1, 1, 100.0]
["vincenzo", "samples"]	[1, 1, 100.0]
["virgin", "islands"]	[2, 2, 100.0]
["wallis", "and"]	[1, 1, 100.0]
["wess", "regular"]	[1, 1, 100.0]
["western", "sahara"]	[1, 1, 100.0]
["wines", "regular"]	[1, 1, 100.0]
["wingate", "regular"]	[1, 1, 100.0]
["yemen", "ye"]	[1, 1, 100.0]
["yuette", "steinman"]	[1, 1, 100.0]
["yugoslav", "republic"]	[1, 1, 100.0]
["zambia", "zm"]	[1, 1, 100.0]
["zealand", "nz"]	[1, 1, 100.0]
["zimbabwe", "zw"]	[1, 1, 100.0]
["zoraida", "muise"]	[1, 1, 100.0]
["zufelt", "regular"]	[1, 1, 100.0]
Removing HDFS temp directory hdfs:///user/root/tmp/mrjob/mrjob-sort.root.