# mrjob: Contador de palabras

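En este ejemplo contaremos las palabras de un fichero de texto con mrjob. Haremos primero una ejecución en local, leyendo el fichero del sistema de ficheros local, y luego una ejecución en Hadoop, leyendo el fichero cargado previamente en HDFS.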

In [1]:
# Create the working directory for this exercise; -p creates missing parents and
# does not fail when the directory already exists.
! mkdir -p /media/notebook/notebooks_tema2/mrjob/wordcount

In [2]:
import os
# Switch the kernel's working directory to the exercise folder so that files
# written with relative paths by later cells end up there.
os.chdir("/media/notebook/notebooks_tema2/mrjob/wordcount")

In [3]:
# Print the current working directory to confirm the chdir above took effect.
! pwd

/media/notebook/notebooks_tema2/mrjob/wordcount


In [4]:
%%writefile mrjob-ejercicio.py
from mrjob.job import MRJob 

import re 

# Precompiled regular expression that picks out the words of a line:
# runs of word characters and apostrophes.

WORD_RE = re.compile(r"[\w']+")

class MRWordFreqCount(MRJob):
    """MapReduce job that counts how often each word occurs in the input.

    Words are the matches of ``WORD_RE`` and are lower-cased before counting.
    """

    def mapper(self, _, line):
        """Emit a ``(word, 1)`` pair for every word found in ``line``."""
        lowered_words = (match.lower() for match in WORD_RE.findall(line))
        for token in lowered_words:
            yield token, 1

    def combiner(self, word, counts):
        """Pre-aggregate the ``(word, 1)`` pairs emitted within the same map."""
        partial_count = sum(counts)
        yield word, partial_count

    def reducer(self, word, counts):
        """Add up every count received for ``word`` and emit the total."""
        total_count = sum(counts)
        yield word, total_count


if __name__ == '__main__':
    MRWordFreqCount.run()

Overwriting mrjob-ejercicio.py


In [5]:
# Run the job locally on the local copy of the text file and redirect the
# results (stdout) to the file "outputlocal" in the working directory.
! python mrjob-ejercicio.py /media/notebook/datos/marktwain.txt  > outputlocal

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/mrjob-ejercicio.root.20191103.181729.125226
Running step 1 of 1...
job output is in /tmp/mrjob-ejercicio.root.20191103.181729.125226/output
Streaming final output from /tmp/mrjob-ejercicio.root.20191103.181729.125226/output...
Removing temp directory /tmp/mrjob-ejercicio.root.20191103.181729.125226...


In [6]:
# Show the word counts produced by the local run.
! cat outputlocal

"texto"	5
"todo"	1
"toma"	1
"un"	2
"zapater"	1
"es"	1
"escribiendo"	1
"esto"	1
"estoy"	1
"hola"	2
"mas"	4
"nuevo"	1
"palabras"	1
"para"	1
"saludo"	1
"tenemos"	1
"test"	1
"adri\u00e1n"	1
"bien"	1
"contar"	1
"cordial"	1
"cuantas"	1
"de"	1


In [7]:
# Remove any output left by a previous run so the Hadoop run starts from a clean
# output directory. -r deletes the directory together with everything in it;
# -f suppresses the error when the directory does not exist yet (e.g. first run),
# which makes this cell safe to re-run.
! hdfs dfs -rm -r -f /tmp/carpeta/mrjob-wordcount-output

Deleted /tmp/carpeta/mrjob-wordcount-output/_SUCCESS
Deleted /tmp/carpeta/mrjob-wordcount-output/part-00000


In [8]:
# Run the same job on the Hadoop cluster (-r hadoop), reading the input file from
# HDFS, using the Python interpreter given by --python-bin and writing the
# results to the directory given by --output-dir.
! python mrjob-ejercicio.py hdfs:///tmp/carpeta/mr-job-wordcount-input/marktwain.txt -r hadoop --python-bin \
/opt/anaconda/bin/python3.7 --output-dir /tmp/carpeta/mrjob-wordcount-output 

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /usr/lib/hadoop/bin...
Found hadoop binary: /usr/lib/hadoop/bin/hadoop
Using Hadoop version 2.6.0
Looking for Hadoop streaming jar in /usr/lib/hadoop...
Looking for Hadoop streaming jar in /usr/lib/hadoop-mapreduce...
Found Hadoop streaming jar: /usr/lib/hadoop-mapreduce/hadoop-streaming.jar
Creating temp directory /tmp/mrjob-ejercicio.root.20191103.181737.130894
uploading working dir files to hdfs:///user/root/tmp/mrjob/mrjob-ejercicio.root.20191103.181737.130894/files/wd...
Copying other local files to hdfs:///user/root/tmp/mrjob/mrjob-ejercicio.root.20191103.181737.130894/files/
Running step 1 of 1...
  packageJobJar: [] [/usr/lib/hadoop-mapreduce/hadoop-streaming-2.6.0-cdh5.15.1.jar] /tmp/streamjob8902058658768382756.jar tmpDir=null
  Connecting to ResourceManager at yarnmaster/172.19.0.3:8032
  Connecting to ResourceManager at yarnmaster/172.19.0.3:8032
  Total 

In [9]:
# List the contents of the job's output directory in HDFS.
! hdfs dfs -ls /tmp/carpeta/mrjob-wordcount-output

Found 2 items
-rw-r--r--   3 root supergroup          0 2019-11-03 18:18 /tmp/carpeta/mrjob-wordcount-output/_SUCCESS
-rw-r--r--   3 root supergroup        237 2019-11-03 18:18 /tmp/carpeta/mrjob-wordcount-output/part-00000


In [10]:
# Print the end of the result file stored in HDFS to inspect the word counts.
! hdfs dfs -tail /tmp/carpeta/mrjob-wordcount-output/part-00000


"adri\u00e1n"	1
"bien"	1
"contar"	1
"cordial"	1
"cuantas"	1
"de"	1
"es"	1
"escribiendo"	1
"esto"	1
"estoy"	1
"hola"	2
"mas"	4
"nuevo"	1
"palabras"	1
"para"	1
"saludo"	1
"tenemos"	1
"test"	1
"texto"	5
"todo"	1
"toma"	1
"un"	2
"zapater"	1
