# Introducción a Spark

## 1. Crear RDDs

### 1.1. Crear un RDD base


##### Comenzamos generando nuestro primer RDD utilizando una lista de Python y el método "sc.parallelize"


In [6]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [7]:
# Plain Python list used to seed the first RDD; note the repeated words.
wordsList = ['cat', 'elephant', 'rat', 'rat', 'cat']
# Distribute the list as an RDD; the second argument (4) is the number of
# partitions (numSlices) requested from Spark.
wordsRDD = sc.parallelize(wordsList, 4)

print type(wordsRDD)

<class 'pyspark.rdd.RDD'>


### 1.2. RDD map

##### Vamos a aplicar una función a todas las palabras que componen el RDD. Por ejemplo, una función que convierta las palabras al plural añadiéndoles una "s" final.

In [8]:
def makePlural(word):
    """Return ``word`` with a trailing 's' appended.

    Args:
        word: The singular word to pluralize.

    Returns:
        The concatenation of ``word`` and 's'.
    """
    suffix = 's'
    return word + suffix

print makePlural('cat')

cats


In [17]:
pluralLambdaRDD = wordsRDD.map(lambda x: x + 's')
print pluralLambdaRDD.collect()

['cats', 'elephants', 'rats', 'rats', 'cats']


##### Hagamos una función que revierta el map anterior con la función makeSingular

In [18]:
def makeSingular(word):
    """Return ``word`` without its last character.

    No check is made that the removed character is actually an 's'; the
    final character is dropped unconditionally.

    Args:
        word: The word to shorten.

    Returns:
        ``word`` minus its final character ('' for an empty or
        one-character input).
    """
    keep = len(word) - 1
    return word[:keep]

pluralLambdaRDD.map(makeSingular).collect()

['cat', 'elephant', 'rat', 'rat', 'cat']

##### Podríamos también utilizar la función makePlural que habíamos escrito antes

In [19]:
wordsRDD.map(makePlural).collect()

['cats', 'elephants', 'rats', 'rats', 'cats']

In [20]:
pluralLengths = (pluralLambdaRDD.map(lambda x: len(x)).collect())
print pluralLengths

[4, 9, 4, 4, 4]


##### Número de letras "a" que llevan las cadenas del RDD

In [16]:
def num_aes(word):
    """Count how many lowercase 'a' characters ``word`` contains.

    Args:
        word: The string to inspect.

    Returns:
        The number of characters in ``word`` equal to "a".
    """
    return sum(1 for letra in word if letra == "a")

wordsRDD.map(num_aes).collect()

[1, 1, 1, 1, 1]

##### Podemos encadenar varios map

In [21]:
wordsRDD.map(lambda x: len(x)).map(lambda x: x + 1).collect()

[4, 9, 4, 4, 4]

## 2. Trabajar con Pair RDDs

##### Construimos un pair RDD

In [24]:
wordPairs = wordsRDD.map(lambda x: (x,1))
print wordPairs.collect()

[('cat', 1), ('elephant', 1), ('rat', 1), ('rat', 1), ('cat', 1)]


In [25]:
wordPairs = wordsRDD.map(lambda x: (x, len(x)))
print wordPairs.collect()

[('cat', 3), ('elephant', 8), ('rat', 3), ('rat', 3), ('cat', 3)]


In [37]:
wordPairs = wordsRDD.map(lambda x: (x,1))
print wordPairs.collect()

[('cat', 1), ('elephant', 1), ('rat', 1), ('rat', 1), ('cat', 1)]


### 2.1 Claves y valores

In [38]:
wordPairs.keys().collect()

['cat', 'elephant', 'rat', 'rat', 'cat']

In [39]:
wordPairs.values().collect()

[1, 1, 1, 1, 1]

### 2.2 GroupByKey

In [40]:
wordsGrouped = wordPairs.groupByKey()

In [41]:
wordsGrouped.collect()

[('rat', <pyspark.resultiterable.ResultIterable at 0x7fd72bd87b90>),
 ('elephant', <pyspark.resultiterable.ResultIterable at 0x7fd72bd9cad0>),
 ('cat', <pyspark.resultiterable.ResultIterable at 0x7fd72bd9c790>)]

In [42]:
for key, value in wordsGrouped.collect():
    print '{0}: {1}'.format(key, list(value))

rat: [1, 1]
elephant: [1]
cat: [1, 1]


### 2.3 ReduceByKey

In [61]:
wordCounts = wordPairs.reduceByKey(lambda x,y:x+y)
print wordCounts.collect()

[('rat', 2), ('elephant', 1), ('cat', 2)]


In [62]:
wordCounts.collect()

[('rat', 2), ('elephant', 1), ('cat', 2)]

### 2.4 Ejemplo con un fichero

##### Creamos una función para eliminar los signos de puntuación

In [63]:
import re

def removePunctuation(text):
    """Lower-case ``text`` and delete everything except a-z, 0-9 and spaces.

    Surrounding whitespace is stripped *before* the filtering, so a space
    that sat next to removed leading or trailing punctuation is kept.

    Args:
        text: The raw line of text to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # Inside a character class '|' is a literal, not alternation, so the
    # previous pattern '[^a-z| |0-9]' let pipe characters through.
    return re.sub(r'[^a-z 0-9]', '', text.strip().lower())

In [64]:
import os.path

# Location of the input file: data/shakespeare.txt.
# (os.path.join with a single argument returns it unchanged, so the
# literals are assigned directly.)
baseDir = 'data'
inputPath = 'shakespeare.txt'
fileName = os.path.join(baseDir, inputPath)

# Read the file as an RDD of lines (8 is passed as the partition hint) and
# normalize every line with removePunctuation.
shakespeareRDD = (sc
                  .textFile(fileName, 8)
                  .map(removePunctuation))
# Preview the first 15 cleaned lines, each prefixed by its index.
# Tuple-parameter lambdas (``lambda (l, num): ...``) are Python-2-only, so the
# pair is indexed explicitly; print(...) with one argument behaves the same on
# Python 2 and 3.
print('\n'.join(shakespeareRDD
                .zipWithIndex()  # to (line, lineNum)
                .map(lambda pair: '{0}: {1}'.format(pair[1], pair[0]))  # to 'lineNum: line'
                .take(15)))

0: the project gutenberg ebook of the complete works of william shakespeare by
1: william shakespeare
2: 
3: this ebook is for the use of anyone anywhere at no cost and with
4: almost no restrictions whatsoever  you may copy it give it away or
5: reuse it under the terms of the project gutenberg license included
6: with this ebook or online at wwwgutenbergorg
7: 
8:  this is a copyrighted project gutenberg ebook details below 
9:      please follow the copyright guidelines in this file     
10: 
11: title the complete works of william shakespeare
12: 
13: author william shakespeare
14: 


In [65]:
shakespeareWordsRDD = shakespeareRDD.flatMap(lambda x: x.split(' '))
shakespeareWordCount = shakespeareWordsRDD.count()
print shakespeareWordsRDD.top(5)
print shakespeareWordCount


[u'zwaggerd', u'zounds', u'zounds', u'zounds', u'zounds']
950384


In [66]:
shakeWordsRDD = shakespeareWordsRDD.filter(lambda x: x != '')
shakeWordCount = shakeWordsRDD.count()
print shakeWordCount

903705


In [70]:
from operator import add

# Classic word count: pair every word with 1, sum the ones per word, then take
# the 15 pairs with the largest count (negated count as the ascending sort key).
# Pairs are indexed/unpacked explicitly because tuple-parameter lambdas
# (``lambda (key, val): ...``) are Python-2-only syntax.
top15WordsAndCounts = (shakeWordsRDD
                       .map(lambda word: (word, 1))
                       .reduceByKey(add)
                       .takeOrdered(15, key=lambda pair: -pair[1]))
# One 'word: count' line per result.
print('\n'.join('{0}: {1}'.format(w, c) for w, c in top15WordsAndCounts))

the: 27825
and: 26791
i: 20681
to: 19261
of: 18289
a: 14667
you: 13716
my: 12481
that: 11135
in: 11027
is: 9621
not: 8745
for: 8261
with: 8046
me: 7769


In [73]:
conjunto1RDD = sc.parallelize([1, 2, 3])
conjunto2RDD = sc.parallelize([2, 3, 4, 5])

In [76]:
conjunto1RDD.intersection(conjunto2RDD).collect()

[2, 3]

In [77]:
conjunto1RDD.union(conjunto2RDD).collect()

[1, 2, 3, 2, 3, 4, 5]

In [78]:
conjunto1RDD.union(conjunto2RDD).distinct().collect()

[1, 2, 3, 4, 5]

## 3. Operaciones entre pair RDDs

##### Vamos a leer dos ficheros de datos, uno con datos de clientes y otro con datos de tarjetas asociadas a esos clientes

In [81]:
import os.path

baseDir = os.path.join('data')

inputClientesPath = os.path.join('clientes.txt')
inputTarjetasPath = os.path.join('tarjetas.txt')

fileClientesName = os.path.join(baseDir, inputClientesPath)
fileTarjetasName = os.path.join(baseDir, inputTarjetasPath)

clientesRDD = (sc.textFile(fileClientesName, 8))

clientesRDD.collect()

[u'80000000,Antonio Lopez Ramirez,Calle Cantalapiedra 8',
 u'70000000,Francisco Arias Sanchez,Avenida de America 12',
 u'50000000,Norberto Marias Quintana,Calle Uganda 88',
 u'10000000,Julio Cortazar Carter,Calle Bruselas 14',
 u'20000000,Arturo Belano Ya\xf1ez,Travesia de Calvo Sotelo 13']

In [82]:
tarjetasRDD = (sc.textFile(fileTarjetasName, 8))

tarjetasRDD.collect()

[u'70000000,Francisco Arias Sanchez,Visa,1200120012001200',
 u'50000000,Norberto Marias Quintana,Visa,1001100110011001',
 u'10000000,Julio Cortazar Carter,Master Card,2003200320032003',
 u'20000000,Arturo Belano Ya\xf1ez,American Express,9008900890089008',
 u'20000000,Arturo Belano Ya\xf1ez,American Express,1111222233334444']

##### Estos ficheros contienen líneas con los datos separados por ",". Vamos a crear unas clases para guardar sus datos

In [86]:
class Cliente(object):
    """Customer record parsed from one comma-separated text line.

    Attributes (all kept as the raw strings produced by the split):
        dni: First field of the line.
        nombre: Second field of the line.
        direccion: Third field of the line.
    """

    def __init__(self, linea):
        """Split ``linea`` on ',' into the three record fields.

        Args:
            linea: Text of the form 'dni,nombre,direccion'.

        Raises:
            ValueError: If ``linea`` does not split into exactly three fields.
        """
        campos = linea.split(',')
        self.dni, self.nombre, self.direccion = campos

    def __repr__(self):
        """Return a readable representation instead of the default object address."""
        return 'Cliente(dni={0!r}, nombre={1!r}, direccion={2!r})'.format(
            self.dni, self.nombre, self.direccion)

class Tarjeta(object):
    """Card record parsed from one comma-separated text line.

    Attributes (all kept as the raw strings produced by the split):
        dni: First field of the line.
        nombre: Second field of the line.
        tipo_tarjeta: Third field of the line.
        num_tarjeta: Fourth field of the line.
    """

    def __init__(self, linea):
        """Split ``linea`` on ',' into the four record fields.

        Args:
            linea: Text of the form 'dni,nombre,tipo_tarjeta,num_tarjeta'.

        Raises:
            ValueError: If ``linea`` does not split into exactly four fields.
        """
        campos = linea.split(',')
        self.dni, self.nombre, self.tipo_tarjeta, self.num_tarjeta = campos

    def __repr__(self):
        """Return a readable representation instead of the default object address."""
        return ('Tarjeta(dni={0!r}, nombre={1!r}, tipo_tarjeta={2!r}, '
                'num_tarjeta={3!r})').format(
                    self.dni, self.nombre, self.tipo_tarjeta, self.num_tarjeta)

##### Convertimos las líneas de texto en objetos Cliente y Tarjeta

In [88]:
from clases import Cliente, Tarjeta

objClientesRDD = clientesRDD.map(lambda x: Cliente(x))

objClientesRDD.collect()

[<clases.Cliente at 0x7fd72bd9c850>,
 <clases.Cliente at 0x7fd711fcde10>,
 <clases.Cliente at 0x7fd711fcde50>,
 <clases.Cliente at 0x7fd70d1f6110>,
 <clases.Cliente at 0x7fd70d1f6150>]

In [89]:
objTarjetasRDD = tarjetasRDD.map(lambda x: Tarjeta(x))

objTarjetasRDD.collect()

[<clases.Tarjeta at 0x7fd733551390>,
 <clases.Tarjeta at 0x7fd711fcdf10>,
 <clases.Tarjeta at 0x7fd711fcdfd0>,
 <clases.Tarjeta at 0x7fd7335ac250>,
 <clases.Tarjeta at 0x7fd72bd9c350>]

##### ¿Cómo podemos combinar los datos de los dos RDDs?

In [91]:
objClientesRDD.join(objTarjetasRDD).collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 73.0 failed 1 times, most recent failure: Lost task 0.0 in stage 73.0 (TID 428, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 167, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/spark/python/pyspark/rdd.py", line 1914, in <lambda>
    map_values_fn = lambda kv: (kv[0], f(kv[1]))
TypeError: 'Cliente' object does not support indexing

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:390)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
	at org.apache.spark.scheduler.Task.run(Task.scala:86)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1873)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1886)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1899)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1913)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:912)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:911)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:453)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor81.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 167, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/spark/python/pyspark/rdd.py", line 1914, in <lambda>
    map_values_fn = lambda kv: (kv[0], f(kv[1]))
TypeError: 'Cliente' object does not support indexing

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:390)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
	at org.apache.spark.scheduler.Task.run(Task.scala:86)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


##### Necesitamos pair RDDs

In [95]:
objClientesRDD.map(lambda x: (x.dni, x)).join(objTarjetasRDD.map(lambda x: (x.dni, x))).collect()

[(u'10000000',
  (<clases.Cliente at 0x7fd72bdae810>, <clases.Tarjeta at 0x7fd733527650>)),
 (u'70000000',
  (<clases.Cliente at 0x7fd733527e90>, <clases.Tarjeta at 0x7fd733527850>)),
 (u'20000000',
  (<clases.Cliente at 0x7fd733527290>, <clases.Tarjeta at 0x7fd733527a90>)),
 (u'20000000',
  (<clases.Cliente at 0x7fd733527b90>, <clases.Tarjeta at 0x7fd733527990>)),
 (u'50000000',
  (<clases.Cliente at 0x7fd733527dd0>, <clases.Tarjeta at 0x7fd733527b10>))]

In [98]:
# Key both RDDs by dni so each becomes a pair RDD of (dni, object), then join
# them on that key; every result has the form (dni, (Cliente, Tarjeta)).
joinRDD = objClientesRDD.map(lambda x: (x.dni, x)).join(objTarjetasRDD.map(lambda x: (x.dni, x)))

In [101]:
joinRDD.first()[1][0]

<clases.Cliente at 0x7fd719c09bd0>

In [103]:
# Each element x is (dni, (cliente, tarjeta)): x[1][0] is the Cliente and
# x[1][1] the Tarjeta. Build one 'nombre - tipo_tarjeta - num_tarjeta' string
# per joined pair.
joinRDD.map(lambda x: x[1][0].nombre + ' - ' + x[1][1].tipo_tarjeta + ' - ' + x[1][1].num_tarjeta).collect()

[u'Julio Cortazar Carter - Master Card - 2003200320032003',
 u'Francisco Arias Sanchez - Visa - 1200120012001200',
 u'Arturo Belano Ya\xf1ez - American Express - 9008900890089008',
 u'Arturo Belano Ya\xf1ez - American Express - 1111222233334444',
 u'Norberto Marias Quintana - Visa - 1001100110011001']