In [2]:
!pip install pyspark



In [3]:
from pyspark import SparkConf, SparkContext

# getOrCreate() reuses a context that is already running, so executing this
# cell a second time no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local[1]").setAppName("ejemplo_rdd")
)

# A. Transformaciones

### Map

In [4]:
# Map: add 10 to every element of the RDD
a = sc.parallelize([1, 4, 6, 3])
b = a.map(lambda valor: valor + 10)
b.collect()

[11, 14, 16, 13]

In [9]:
rdd = sc.parallelize(['este es un texto', 'este es otro texto de prueba', 'prueba 1'])

# Length of each string in characters (spaces are counted too, not only letters).
# The built-in len is passed directly; wrapping it in a lambda adds nothing.
rdd.map(len).collect()

[16, 28, 8]

### Filter

In [5]:
# Filter: keep only the elements greater than or equal to 6
a = sc.parallelize([1, 4, 6, 3, 10, 15, 16])
b = a.filter(lambda valor: valor >= 6)
b.collect()

[6, 10, 15, 16]

In [10]:
# RDD of three strings used as input for the filter example below
rdd = sc.parallelize(['este es un texto', 'este es otro texto de prueba', 'prueba 1'])

# Predicate: keep the strings that have fewer than 20 characters
def funcion(x):
    """Return True when ``x`` has fewer than 20 characters, False otherwise.

    A filter predicate is evaluated for truthiness, so it must return a
    boolean. Returning ``x`` itself would discard the empty string (it is
    falsy) even though its length is below 20.
    """
    return len(x) < 20

# Keep only the strings accepted by funcion and bring them back to the driver
rdd.filter(funcion).collect()

['este es un texto', 'prueba 1']

### Map y FlatMap

In [11]:
# map vs flatMap: with map every element yields one list,
# so the collected result is a list of lists
a = sc.parallelize([1, 4, 6, 3])
b = a.map(lambda valor: [valor, 10 * valor])
b.collect()

[[1, 10], [4, 40], [6, 60], [3, 30]]

In [12]:
# flatMap: the per-element lists are flattened into one sequence
c = a.flatMap(lambda valor: [valor, 10 * valor])
c.collect()

[1, 10, 4, 40, 6, 60, 3, 30]

In [13]:
# Example 2: map keeps one list of words per input string
rdd1 = sc.parallelize(["hola mundo", "hoy es jueves"])
rdd2 = rdd1.map(lambda frase: frase.split(" "))
rdd2.collect()

[['hola', 'mundo'], ['hoy', 'es', 'jueves']]

In [14]:
# flatMap: all the words end up in a single flat list
rdd3 = rdd1.flatMap(lambda frase: frase.split(" "))
rdd3.collect()

['hola', 'mundo', 'hoy', 'es', 'jueves']

### GroupByKey

In [15]:
# Pair RDD of (key, value) tuples; keys 'a' and 'b' appear twice
rdd = sc.parallelize([('a',7), ('b',4), ('a',1), ('b',6), ('c',3)])
rdd.collect()

[('a', 7), ('b', 4), ('a', 1), ('b', 6), ('c', 3)]

In [16]:
# Group the values by key; each group is returned as a ResultIterable,
# so collect() shows object reprs rather than the grouped values
rdd2 = rdd.groupByKey()
rdd2.collect()

[('a', <pyspark.resultiterable.ResultIterable at 0x7f6a6a8e7310>),
 ('b', <pyspark.resultiterable.ResultIterable at 0x7f6a6a8e7950>),
 ('c', <pyspark.resultiterable.ResultIterable at 0x7f6a6a7daa50>)]

In [17]:
# Convert every group into a plain list so its contents are displayed
rdd2.mapValues(list).collect()

[('a', [7, 1]), ('b', [4, 6]), ('c', [3])]

### Cogroup

In [18]:
rdd = sc.parallelize([('a',7), ('b',4), ('a',1), ('b',6), ('c',3)])
rdd2 = sc.parallelize([('a',5), ('b',9)])

# cogroup: for every key, a pair of iterables
# (values of that key in rdd, values of that key in rdd2)
rdd3 = rdd.cogroup(rdd2)
rdd3.collect()

[('b',
  (<pyspark.resultiterable.ResultIterable at 0x7f6a6a7e5c50>,
   <pyspark.resultiterable.ResultIterable at 0x7f6a6a7e51d0>)),
 ('c',
  (<pyspark.resultiterable.ResultIterable at 0x7f6a6a7e5650>,
   <pyspark.resultiterable.ResultIterable at 0x7f6a6a7e5fd0>)),
 ('a',
  (<pyspark.resultiterable.ResultIterable at 0x7f6a6a7e5790>,
   <pyspark.resultiterable.ResultIterable at 0x7f6a6a7e5a50>))]

In [19]:
# Materialise both groups of every key by iterating them with list(),
# instead of reading the internal ``.data`` attribute of ResultIterable.
rdd3.mapValues(lambda grupos: (list(grupos[0]), list(grupos[1]))).collect()

[('b', ([4, 6], [9])), ('c', ([3], [])), ('a', ([7, 1], [5]))]

In [20]:
# Same conversion done on the driver in plain Python (not efficient for large data).
# The groups are iterated with list() rather than through the internal ``.data`` attribute.
[(clave, [list(grupo) for grupo in grupos]) for clave, grupos in rdd3.collect()]

[('b', [[4, 6], [9]]), ('c', [[3], []]), ('a', [[7, 1], [5]])]

### Join

In [21]:
rdd = sc.parallelize([('a',7), ('b',4), ('a',1), ('b',6), ('c',3)])
rdd2 = sc.parallelize([('a',5), ('b',9), ('a',100)])

# join: one (key, (left, right)) tuple per combination of values sharing a key;
# key 'c' has no match in rdd2, so it is absent from the result
rdd3 = rdd.join(rdd2)
rdd3.collect()

[('b', (4, 9)),
 ('b', (6, 9)),
 ('a', (7, 5)),
 ('a', (7, 100)),
 ('a', (1, 5)),
 ('a', (1, 100))]

### Operadores de Conjuntos

In [22]:
rdd1 = sc.parallelize([1, 3, 5])
rdd2 = sc.parallelize([20, 40, 60, 5])

# union concatenates both RDDs and keeps duplicates (5 appears twice)
rdd1.union(rdd2).collect()

[1, 3, 5, 20, 40, 60, 5]

In [23]:
# Elements present in both rdd1 and rdd2
rdd1.intersection(rdd2).collect()

[5]

In [24]:
# Elements of rdd1 that do not appear in rdd2
rdd1.subtract(rdd2).collect()

[1, 3]

In [25]:
# Every (x, y) pair with x taken from rdd1 and y taken from rdd2
rdd1.cartesian(rdd2).collect()

[(1, 20),
 (1, 40),
 (1, 60),
 (1, 5),
 (3, 20),
 (3, 40),
 (3, 60),
 (3, 5),
 (5, 20),
 (5, 40),
 (5, 60),
 (5, 5)]

### ReduceByKey

In [26]:
rdd = sc.parallelize([('a',7), ('b',4), ('a',1), ('a',100), ('b',6), ('c',3)])

# Add up the values that share the same key
rdd.reduceByKey(lambda acumulado, valor: acumulado + valor).collect()

[('a', 108), ('b', 10), ('c', 3)]

### SortByKey

In [27]:
rdd = sc.parallelize([('a',7), ('b',4), ('a',1), ('a',100), ('b',6), ('c',3)])

# Sort the pairs by key in ascending order; the values play no part in the ordering
rdd.sortByKey().collect()

[('a', 7), ('a', 1), ('a', 100), ('b', 4), ('b', 6), ('c', 3)]

In [28]:
# Same sort with the keys in descending order
rdd.sortByKey(ascending=False).collect()

[('c', 3), ('b', 4), ('b', 6), ('a', 7), ('a', 1), ('a', 100)]

# B. Acciones

In [30]:
# Create an RDD from a Python list
lista = [40, 20, 50, 10, 70, 30]
rdd = sc.parallelize(lista)

# Collect: bring every element back to the driver as a list
rdd.collect()

[40, 20, 50, 10, 70, 30]

In [31]:
# Take: return the first 3 elements of the RDD
rdd.take(3)

[40, 20, 50]

In [32]:
# Top: return the 3 largest values, in descending order
rdd.top(3)

[70, 50, 40]

In [33]:
# Reduce: fold all the elements into a single value (here, their sum)
rdd.reduce(lambda acumulado, valor: acumulado + valor)

220

In [34]:
# foreach (in Colab the printed output does not show up in the cell)
def f(x):
    """Print ``x`` to standard output."""
    print(x)
# Apply f to every element of the RDD for its side effect only
rdd.foreach(f)

In [35]:
# Example: MapReduce word count, step 1 - split every line into words
rdd = sc.parallelize(['Hola este es un texto', 'este es un párrafo', 'es'])

rdd.flatMap(lambda linea: linea.split(" ")).collect()

['Hola', 'este', 'es', 'un', 'texto', 'este', 'es', 'un', 'párrafo', 'es']

In [36]:
# Step 2 - emit a (word, 1) pair for every word
(
    rdd.flatMap(lambda linea: linea.split(" "))
    .map(lambda palabra: (palabra, 1))
    .collect()
)

[('Hola', 1),
 ('este', 1),
 ('es', 1),
 ('un', 1),
 ('texto', 1),
 ('este', 1),
 ('es', 1),
 ('un', 1),
 ('párrafo', 1),
 ('es', 1)]

In [37]:
# Step 3 - add up the 1s of each word to obtain its number of occurrences
(
    rdd.flatMap(lambda linea: linea.split(" "))
    .map(lambda palabra: (palabra, 1))
    .reduceByKey(lambda acumulado, uno: acumulado + uno)
    .collect()
)

[('Hola', 1), ('este', 2), ('es', 3), ('un', 2), ('texto', 1), ('párrafo', 1)]