# mrjob: Contador de palabras

En este ejemplo trabajaremos con un fichero de texto cargado previamente en HDFS. Haremos primero una ejecución en local y luego en Hadoop.

In [1]:
! mkdir -p mrjob/wordcount

In [2]:
import os
# Work from the exercise directory: the script and result files written by the
# following cells use relative paths and therefore land here.
os.chdir("/media/notebooks/mrjob/wordcount")

In [3]:
# Confirm the current working directory.
! pwd

/media/notebooks/mrjob/wordcount


In [4]:
%%writefile mrjob-ejercicio.py
from mrjob.job import MRJob 

import re 

# Regular expression used to pick out words: one or more word characters
# (\w) or apostrophes in a row.

WORD_RE = re.compile(r"[\w']+") 

class MRWordFreqCount(MRJob):
    """Word-frequency job: counts occurrences of each lower-cased word."""

    def mapper(self, _, line):
        """Emit a ``(word, 1)`` pair for every WORD_RE match in ``line``.

        The input key is ignored; each matched word is lower-cased before
        being emitted.
        """
        for match in WORD_RE.finditer(line):
            yield match.group(0).lower(), 1

    def combiner(self, word, counts):
        """Add up the counts emitted for ``word`` within the same map task.

        Yields a single ``(word, partial_sum)`` pair.
        """
        partial = sum(counts)
        yield word, partial

    def reducer(self, word, counts):
        """Add up every count received for ``word``.

        Yields a single ``(word, total)`` pair.
        """
        total = sum(counts)
        yield word, total

# Launch the job only when this file is executed as a script.
if __name__ == "__main__":
    MRWordFreqCount.run()

Overwriting mrjob-ejercicio.py


In [5]:
# Run the job locally (no runner is given, so mrjob uses its inline runner)
# on the local copy of the text, redirecting the results streamed to stdout
# into the file `ouputlocal`.
! python mrjob-ejercicio.py /media/notebooks/marktwain.txt  > ouputlocal

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/mrjob-ejercicio.root.20190726.103809.511889
Running step 1 of 1...
job output is in /tmp/mrjob-ejercicio.root.20190726.103809.511889/output
Streaming final output from /tmp/mrjob-ejercicio.root.20190726.103809.511889/output...
Removing temp directory /tmp/mrjob-ejercicio.root.20190726.103809.511889...


In [6]:
# Inspect the last lines of the local result file: one "word"<TAB>count pair per line.
! tail ouputlocal

"unpatriotism"	1
"unpaved"	2
"unpeopled"	7
"unperfected"	3
"unpestered"	1
"unphilosophical"	1
"unpictured"	1
"unpicturesque"	3
"unpinned"	2
"unpiratical"	1


In [7]:
! hdfs dfs -rm /tmp/carpeta/mrjob-wordcount-output/*
! hdfs dfs -rmdir /tmp/carpeta/mrjob-wordcount-output

rm: `/tmp/carpeta/mrjob-wordcount-output/*': No such file or directory
rmdir: `/tmp/carpeta/mrjob-wordcount-output': No such file or directory


In [8]:
# Run the same job on the Hadoop cluster (`-r hadoop`), reading the input
# from HDFS. `--python-bin` names the Python interpreter mrjob should use, and
# `--output-dir` is the directory where the final result is written (the cells
# further down list and read it from HDFS).
! python mrjob-ejercicio.py hdfs:///tmp/carpeta/marktwain.txt -r hadoop --python-bin /opt/anaconda/bin/python3.7 \
--output-dir /tmp/carpeta/mrjob-wordcount-output 

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /usr/lib/hadoop/bin...
Found hadoop binary: /usr/lib/hadoop/bin/hadoop
Using Hadoop version 2.6.0
Looking for Hadoop streaming jar in /usr/lib/hadoop...
Looking for Hadoop streaming jar in /usr/lib/hadoop-mapreduce...
Found Hadoop streaming jar: /usr/lib/hadoop-mapreduce/hadoop-streaming.jar
Creating temp directory /tmp/mrjob-ejercicio.root.20190726.103929.243950
uploading working dir files to hdfs:///user/root/tmp/mrjob/mrjob-ejercicio.root.20190726.103929.243950/files/wd...
Copying other local files to hdfs:///user/root/tmp/mrjob/mrjob-ejercicio.root.20190726.103929.243950/files/
Running step 1 of 1...
  packageJobJar: [] [/usr/lib/hadoop-mapreduce/hadoop-streaming-2.6.0-cdh5.15.1.jar] /tmp/streamjob6536794343884990512.jar tmpDir=null
  Connecting to ResourceManager at yarnmaster/172.28.0.4:8032
  Connecting to ResourceManager at yarnmaster/172.28.0.4:8032
  Total 

In [11]:
# List the job's output directory in HDFS to check which files were produced.
! hdfs dfs -ls /tmp/carpeta/mrjob-wordcount-output

Found 2 items
-rw-r--r--   3 root supergroup          0 2019-07-26 10:40 /tmp/carpeta/mrjob-wordcount-output/_SUCCESS
-rw-r--r--   3 root supergroup     664929 2019-07-26 10:40 /tmp/carpeta/mrjob-wordcount-output/part-00000


In [12]:
# Show the end of the result file stored in HDFS. The first line printed may
# start in the middle of a record.
! hdfs dfs -tail /tmp/carpeta/mrjob-wordcount-output/part-00000


n"	2
"zat"	2
"ze"	24
"zeal"	36
"zealand"	59
"zealand's"	1
"zealander"	1
"zealous"	10
"zealously"	4
"zeb"	3
"zebra"	2
"zebras"	2
"zebulon"	1
"zedekiah"	1
"zedoary"	1
"zehn"	1
"zei"	1
"zeilerus"	1
"zeit"	3
"zeitung"	3
"zeitung's"	2
"zeitvertreib"	1
"zenana"	8
"zeniff"	1
"zenith"	17
"zenobia"	2
"zenophon"	1
"zephyr"	7
"zephyrs"	3
"zere"	1
"zermatt"	37
"zero"	6
"zest"	11
"zeus"	2
"zeuxis"	1
"zhentlemans"	1
"ziani"	1
"ziehe"	1
"ziemlich"	1
"zig"	2
"zigzag"	8
"zigzagging"	1
"zimmermanns"	1
"zimmern"	1
"zinc"	5
"zion"	3
"zip"	1
"zis"	9
"zither"	4
"zo"	4
"zoe"	4
"zoe's"	1
"zogbaum"	1
"zola"	1
"zone"	5
"zones"	5
"zonoras"	5
"zoo"	3
"zoological"	10
"zoology"	1
"zorah"	1
"zorn"	1
"zoroaster"	2
"zoroastrian"	1
"zoroastrians"	3
"zouch"	1
"zu"	21
"zufallig"	1
"zug"	8
"zuge"	1
"zugs"	1
"zulu"	9
"zulus"	8
"zum"	2
"zunge"	1
"zurich"	2
"zuruck"	2
"zuruckkommen"	1
"zusammen"	1
"zusammengetroffen"	2
"zutphen"	1
"zuweilen"	2
