# Ejemplo inicial: Contador de palabras

## En este ejemplo trabajaremos con un fichero de texto cargado previamente en HDFS

In [1]:
# Create the working directory for this example, relative to the current
# directory (-p: no error if it already exists)
! mkdir -p ejemplo-inicial/wordcount

In [22]:
import os
# Switch the kernel's working directory so that the relative paths used by the
# following cells (mapper.py, reducer.py, ../../marktwain.txt) resolve from here.
# NOTE(review): absolute path -- presumably the notebook root is /media/notebooks; confirm
os.chdir("/media/notebooks/ejemplo-inicial/wordcount")

In [23]:
# Print the current directory to confirm that the chdir took effect
! pwd

/media/notebooks/ejemplo-inicial/wordcount


In [24]:
%%writefile mapper.py
#!/usr/bin/env python
# Word-count mapper for Hadoop Streaming.
# Reads text from standard input (STDIN) and writes one "<word><TAB>1" record
# per word to standard output (STDOUT). That output is the input of the reduce
# step, i.e. of reducer.py.

import sys

for raw_line in sys.stdin:
  # trim leading/trailing whitespace, then break the line into words
  tokens = raw_line.strip().split()
  for token in tokens:
    # tab-delimited key/value pair: each word counts as 1 occurrence
    print ("%s\t%d" % (token, 1))

Overwriting mapper.py


In [25]:
%%writefile reducer.py
#!/usr/bin/env python
"""Word-count reducer for Hadoop Streaming.

Reads "<word><TAB><count>" records from STDIN, as emitted by mapper.py, and
writes one "<word><TAB><total>" record per distinct word to STDOUT.

The input must arrive grouped by word. Hadoop sorts the map output by key
(here, the word) before handing it to the reducer; when testing locally,
pipe the mapper output through `sort` first.
"""

import sys


def reduce_counts(lines):
  """Yield (word, total) for each run of consecutive records sharing a word.

  lines: iterable of "<word><TAB><count>" strings, grouped by word.

  Records without a tab, or whose count is not an integer, are discarded.
  An empty input yields nothing.
  """
  current_word = None
  current_count = 0

  for line in lines:
    # drop leading/trailing whitespace, including the trailing newline
    line = line.strip()

    # parse the record produced by mapper.py; split only on the first tab
    word, sep, count = line.partition('\t')
    if not sep:
      # no tab at all: not a key/value record, discard it
      continue

    # convert the count from string to int
    try:
      count = int(count)
    except ValueError:
      # the count is not a number: discard the record
      continue

    # this comparison only works because the records arrive sorted by word
    if word == current_word:
      current_count += count
    else:
      if current_word is not None:
        yield current_word, current_count
      current_word = word
      current_count = count

  # flush the last word; test the accumulator itself (not the last parsed
  # record) so that empty input emits nothing and a discarded final record
  # cannot hide the last valid total
  if current_word is not None:
    yield current_word, current_count


def main():
  """Reduce the records on STDIN and write the totals to STDOUT."""
  for word, total in reduce_counts(sys.stdin):
    print (word + "\t" + str(total))


if __name__ == "__main__":
  main()

Overwriting reducer.py


In [26]:
# List the working directory to check that mapper.py and reducer.py were written
! ls -lh

total 22M
-rwxrwxrwx 1 root root  520 Jul 10 16:38 mapper.py
-rwxrwxrwx 1 root root  927 Jul 10 16:38 reducer.py
-rwxrwxrwx 1 root root 1.9M Jul 10 14:59 salidawordcount
-rw-r--r-- 1 root root  21M Jul 10 14:56 salidawordcountmap


Primero probamos solamente la función map: en local mediante una tubería y, después, en Hadoop sin indicar ningún reducer.

In [27]:
# Run the mapper locally: feed it the text through a pipe and save its
# output to the local file salidawordcountmap
! cat ../../marktwain.txt | python mapper.py  > salidawordcountmap

In [28]:
# Show the last lines of the local mapper output: one "<word><TAB>1" record per word
! tail salidawordcountmap

subscribe	1
to	1
our	1
email	1
newsletter	1
to	1
hear	1
about	1
new	1
eBooks.	1


In [29]:
# Remove the HDFS output of a previous run: the files first, then the
# now-empty directory.
# NOTE(review): presumably required because the job refuses to write into an
# existing output directory -- confirm
! hdfs dfs -rm /tmp/salida-wordcountmap/*
! hdfs dfs -rmdir /tmp/salida-wordcountmap

Deleted /tmp/salida-wordcountmap/_SUCCESS
Deleted /tmp/salida-wordcountmap/part-00000


In [30]:
# Run only the mapper on the cluster through Hadoop Streaming (no -reducer is given).
# -files lists mapper.py so it is available to the job; -input and -output are HDFS paths.
! hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
-files mapper.py -mapper mapper.py \
-input /tmp/carpeta/marktwain.txt -output /tmp/salida-wordcountmap

packageJobJar: [] [/usr/lib/hadoop-mapreduce/hadoop-streaming-2.6.0-cdh5.9.0.jar] /tmp/streamjob2283734107267297335.jar tmpDir=null
19/07/10 16:39:03 INFO client.RMProxy: Connecting to ResourceManager at yarnmaster/172.18.0.2:8032
19/07/10 16:39:03 INFO client.RMProxy: Connecting to ResourceManager at yarnmaster/172.18.0.2:8032
19/07/10 16:39:03 INFO mapred.FileInputFormat: Total input paths to process : 1
19/07/10 16:39:03 INFO mapreduce.JobSubmitter: number of splits:2
19/07/10 16:39:04 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1562757543411_0018
19/07/10 16:39:04 INFO impl.YarnClientImpl: Submitted application application_1562757543411_0018
19/07/10 16:39:04 INFO mapreduce.Job: The url to track the job: http://yarnmaster:8088/proxy/application_1562757543411_0018/
19/07/10 16:39:04 INFO mapreduce.Job: Running job: job_1562757543411_0018
19/07/10 16:39:09 INFO mapreduce.Job: Job job_1562757543411_0018 running in uber mode : false
19/07/10 16:39:09 INFO mapreduce.Job:

In [34]:
# Inspect the end of the map-only job output stored in HDFS
! hdfs dfs -tail /tmp/salida-wordcountmap/part-00000

zigzag	1
zigzag	1
zigzag	1
zigzag;	1
zigzagging	1
zinc	1
zinc	1
zinc-plated	1
zinc-plated,	1
zis	1
zis	1
zis	1
zis	1
zis	1
zither	1
zo	1
zo	1
zo	1
zo	1
zone	1
zone	1
zone	1
zone	1
zone!	1
zones	1
zones	1
zones	1
zones	1
zoological	1
zoological	1
zoological	1
zoological	1
zoological	1
zoology,	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zu	1
zuruck,	1
zuruck.	1
zuruckkommen	1
zusammen	1
zusammengetroffen	1
zusammengetroffen,	1
zuweilen	1
zwar	1
zwei	1
zwei	1
zwei	1
zwolf	1
zylobalsamum	1
zzip!--let	1
{10}	1
{10}	1
{13}	1
{1}	1
{1}	1
{1}	1
{1}	1
{1}	1
{1}	1
{1}	1
{1}	1
{1}	1
{1}	1
{1}	1
{1}	1
{1}	1
{1}	1
{2}	1
{2}--Sir	1
{3}	1
{3}	1
{4}	1
{4}	1
{5}	1
{5}	1
{6}	1
{6}	1
{7}	1
{7}	1
{8}	1
{8}	1
{9}	1
{9}	1
{footnote	1
{footnote	1
{footnote	1
{footnote	1
{footnote	1
{footnote	1
|	1
|	1
|	1
|	1
|	1
|	1
|	1
|	1
|	1
|	1
|	1
|	

In [35]:
# Simulate the whole job locally: map, sort by key (standing in for the sort
# Hadoop applies to the map output), then reduce; save the result to salidawordcount
! cat ../../marktwain.txt | python mapper.py | sort | python reducer.py > salidawordcount

In [36]:
# Show the last lines of the local result: one "<word><TAB><total>" record per word
! tail salidawordcount

Zwischen	1
zwolf	1
Zylo,	1
zylobalsamum	1
"Zylobalsamum."	1
Zylobalsamum--"	1
Zylobalsamum	1
Zylo--what	1
zzip!--let	1
--zzz--zzz--	1


In [37]:
# Remove the HDFS output of a previous run: the files first, then the
# now-empty directory.
# NOTE(review): presumably required because the job refuses to write into an
# existing output directory -- confirm
! hdfs dfs -rm /tmp/salida-wordcount/*
! hdfs dfs -rmdir /tmp/salida-wordcount

Deleted /tmp/salida-wordcount/_SUCCESS
Deleted /tmp/salida-wordcount/part-00000


In [39]:
# Run the complete word count on the cluster through Hadoop Streaming.
# -files lists both scripts so they are available to the job; -mapper and
# -reducer name the script for each phase; -input and -output are HDFS paths.
! hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
-files mapper.py,reducer.py -mapper mapper.py -reducer reducer.py \
-input /tmp/carpeta/marktwain.txt -output /tmp/salida-wordcount

packageJobJar: [] [/usr/lib/hadoop-mapreduce/hadoop-streaming-2.6.0-cdh5.9.0.jar] /tmp/streamjob667315733110171432.jar tmpDir=null
19/07/10 16:42:00 INFO client.RMProxy: Connecting to ResourceManager at yarnmaster/172.18.0.2:8032
19/07/10 16:42:00 INFO client.RMProxy: Connecting to ResourceManager at yarnmaster/172.18.0.2:8032
19/07/10 16:42:00 INFO mapred.FileInputFormat: Total input paths to process : 1
19/07/10 16:42:00 INFO mapreduce.JobSubmitter: number of splits:2
19/07/10 16:42:01 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1562757543411_0019
19/07/10 16:42:01 INFO impl.YarnClientImpl: Submitted application application_1562757543411_0019
19/07/10 16:42:01 INFO mapreduce.Job: The url to track the job: http://yarnmaster:8088/proxy/application_1562757543411_0019/
19/07/10 16:42:01 INFO mapreduce.Job: Running job: job_1562757543411_0019
19/07/10 16:42:06 INFO mapreduce.Job: Job job_1562757543411_0019 running in uber mode : false
19/07/10 16:42:06 INFO mapreduce.Job: 

In [40]:
# List the files the job wrote to its HDFS output directory
! hdfs dfs -ls /tmp/salida-wordcount/*

-rw-r--r--   3 root supergroup          0 2019-07-10 16:42 /tmp/salida-wordcount/_SUCCESS
-rw-r--r--   3 root supergroup    1936539 2019-07-10 16:42 /tmp/salida-wordcount/part-00000


In [41]:
# Inspect the end of the word-count result stored in HDFS
! hdfs dfs -tail /tmp/salida-wordcount/part-00000

outh--said	1
youth--the	1
youth.	15
youth;	5
youth?	1
youthful	21
youthful,	1
youthful;	1
youthfullest	1
youths	18
youths,	6
yow	1
yr's	1
yrs,	1
yrs.	1
ys	1
yt	31
yt,	1
ytt	2
yuther	11
z,	1
z9o.5.	1
zanier,	1
zareba	1
zartlichsten	2
zat	1
ze	21
zeal	20
zeal,	6
zeal--	1
zeal--in	1
zeal.	5
zealous	9
zealous--are	1
zealously	4
zebras	1
zebras--all	1
zedoary,	1
zehn	1
zenana	4
zenana,	1
zenana.	1
zenith	9
zenith,	3
zenith-scouring	1
zenith.	4
zephyr	2
zephyr,	1
zephyrs	3
zere	1
zero	2
zero,	2
zero.	1
zero;	1
zest	8
zest.	1
zest;	2
zhentlemans	1
ziehe	1
ziemlich	1
zig-zag	2
zigzag	7
zigzag;	1
zigzagging	1
zinc	2
zinc-plated	1
zinc-plated,	1
zis	5
zither	1
zo	4
zone	4
zone!	1
zones	4
zoological	5
zoology,	1
zu	20
zuruck,	1
zuruck.	1
zuruckkommen	1
zusammen	1
zusammengetroffen	1
zusammengetroffen,	1
zuweilen	1
zwar	1
zwei	3
zwolf	1
zylobalsamum	1
zzip!--let	1
{10}	2
{13}	1
{1}	14
{2}	1
{2}--Sir	1
{3}	