In [139]:
# Build one RDD per input file from the local file system (paths are relative to the
# notebook's working directory).
fruits_path, yellow_things_path = 'data/fruits.txt', 'data/yellowthings.txt'
fruits = sc.textFile(fruits_path)
yellowThings = sc.textFile(yellow_things_path)

In [140]:
# collect() brings every element of an RDD back to the driver as a Python list
for rdd_to_show in (fruits, yellowThings):
    print(rdd_to_show.collect())

['apple', 'banana', 'canary melon', 'grap', 'lemon', 'orange', 'pineapple', 'strawberry']
['banana', 'bee', 'butter', 'canary melon', 'gold', 'lemon', 'pineapple', 'sunflower']


In [141]:
# Ask the RDD how many partitions it is split into
num_fruit_partitions = fruits.getNumPartitions()
print(num_fruit_partitions)

2


In [142]:
# glom() groups the elements of each partition into a list, so the collected
# result shows which element lives in which partition
for partitioned_rdd in (fruits, yellowThings):
    print(partitioned_rdd.glom().collect())

[['apple', 'banana', 'canary melon', 'grap', 'lemon'], ['orange', 'pineapple', 'strawberry']]
[['banana', 'bee', 'butter', 'canary melon', 'gold'], ['lemon', 'pineapple', 'sunflower']]


----------

## RDD operations

In [143]:
# map: produce a new RDD in which every fruit name is spelled backwards
fruitsReversed = fruits.map(lambda name: ''.join(reversed(name)))
print(fruitsReversed.collect())

['elppa', 'ananab', 'nolem yranac', 'parg', 'nomel', 'egnaro', 'elppaenip', 'yrrebwarts']


In [144]:
# filter: keep only the fruit names that are at most 5 characters long
shortFruits = fruits.filter(lambda name: len(name) < 6)
print(shortFruits.collect())

['apple', 'grap', 'lemon']


In [145]:
# flatMap: each fruit name is expanded into its individual characters, and the
# per-name results are flattened into one RDD
characters = fruits.flatMap(lambda name: list(name))
print(characters.collect())

['a', 'p', 'p', 'l', 'e', 'b', 'a', 'n', 'a', 'n', 'a', 'c', 'a', 'n', 'a', 'r', 'y', ' ', 'm', 'e', 'l', 'o', 'n', 'g', 'r', 'a', 'p', 'l', 'e', 'm', 'o', 'n', 'o', 'r', 'a', 'n', 'g', 'e', 'p', 'i', 'n', 'e', 'a', 'p', 'p', 'l', 'e', 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y']


In [146]:
# union: concatenation of both RDDs (duplicates are kept); the second printed
# line is the partition count of the combined RDD
fruitsAndYellowThings = fruits.union(yellowThings)
print(fruitsAndYellowThings.collect(), fruitsAndYellowThings.getNumPartitions(), sep='\n')

['apple', 'banana', 'canary melon', 'grap', 'lemon', 'orange', 'pineapple', 'strawberry', 'banana', 'bee', 'butter', 'canary melon', 'gold', 'lemon', 'pineapple', 'sunflower']
4


In [147]:
# intersection: elements that occur in both fruits and yellowThings
yellowFruits = fruits.intersection(yellowThings)
yellow_fruit_names = yellowFruits.collect()
print(yellow_fruit_names)

['pineapple', 'canary melon', 'lemon', 'banana']


In [148]:
# subtract: fruits that do not appear in yellowThings
subtracted_fruit = fruits.subtract(yellowThings)
non_yellow_names = subtracted_fruit.collect()
print(non_yellow_names)

['orange', 'apple', 'grap', 'strawberry']


In [149]:
# distinct: drop the duplicates that union() left in the combined RDD
distinctFruitsAndYellowThings = fruitsAndYellowThings.distinct()
unique_names = distinctFruitsAndYellowThings.collect()
print(unique_names)

['orange', 'pineapple', 'canary melon', 'lemon', 'bee', 'banana', 'butter', 'gold', 'sunflower', 'apple', 'grap', 'strawberry']


In [150]:
# cartesian: every (fruit, yellow thing) pair that can be formed from the two RDDs
cartesian_fruits = fruits.cartesian(yellowThings)
all_pairs = cartesian_fruits.collect()
print(all_pairs)

[('apple', 'banana'), ('apple', 'bee'), ('apple', 'butter'), ('apple', 'canary melon'), ('apple', 'gold'), ('banana', 'banana'), ('banana', 'bee'), ('banana', 'butter'), ('banana', 'canary melon'), ('banana', 'gold'), ('canary melon', 'banana'), ('canary melon', 'bee'), ('canary melon', 'butter'), ('canary melon', 'canary melon'), ('canary melon', 'gold'), ('grap', 'banana'), ('grap', 'bee'), ('grap', 'butter'), ('grap', 'canary melon'), ('grap', 'gold'), ('lemon', 'banana'), ('lemon', 'bee'), ('lemon', 'butter'), ('lemon', 'canary melon'), ('lemon', 'gold'), ('apple', 'lemon'), ('apple', 'pineapple'), ('apple', 'sunflower'), ('banana', 'lemon'), ('banana', 'pineapple'), ('banana', 'sunflower'), ('canary melon', 'lemon'), ('canary melon', 'pineapple'), ('canary melon', 'sunflower'), ('grap', 'lemon'), ('grap', 'pineapple'), ('grap', 'sunflower'), ('lemon', 'lemon'), ('lemon', 'pineapple'), ('lemon', 'sunflower'), ('orange', 'banana'), ('orange', 'bee'), ('orange', 'butter'), ('orange',

In [151]:
# zip: pair the i-th element of fruits with the i-th element of yellowThings
print(fruits.zip(yellowThings).collect())

# zip only works with an RDD that has the same number of partitions.
# NOTE(review): newyellowThings is not defined in the visible part of this notebook;
# the commented-out call below is presumably meant to illustrate that failure.
#print(fruits.zip(newyellowThings).collect())

# zipWithIndex: attach consecutive indices 0, 1, 2, ... in element order
print(fruits.zipWithIndex().collect())

# zipWithUniqueId: items in the kth partition get ids k, n+k, 2*n+k, ..., where n is the number of partitions.
# This is more efficient since each partition is processed independently
print(fruits.zipWithUniqueId().glom().collect())

[('apple', 'banana'), ('banana', 'bee'), ('canary melon', 'butter'), ('grap', 'canary melon'), ('lemon', 'gold'), ('orange', 'lemon'), ('pineapple', 'pineapple'), ('strawberry', 'sunflower')]
[('apple', 0), ('banana', 1), ('canary melon', 2), ('grap', 3), ('lemon', 4), ('orange', 5), ('pineapple', 6), ('strawberry', 7)]
[[('apple', 0), ('banana', 2), ('canary melon', 4), ('grap', 6), ('lemon', 8)], [('orange', 1), ('pineapple', 3), ('strawberry', 5)]]


### RDD actions

In [152]:
# count: action that returns the number of elements in the RDD
numFruits = fruits.count()
print(numFruits)

8


In [153]:
# take(3): action that returns the first 3 elements as a Python list
first3Fruits = fruits.take(3)
print(first3Fruits)

['apple', 'banana', 'canary melon']


In [154]:
# Tip: don't use count() when you don't need the exact number of rows;
# use take() or isEmpty() instead
print(fruits.isEmpty())

False


In [155]:
# Total number of characters over all fruit names, computed in two equivalent ways:
# the built-in sum() action and an explicit reduce() with addition
fruit_name_lengths = fruits.map(len)
print(fruit_name_lengths.sum())
print(fruit_name_lengths.reduce(lambda total, size: total + size))

57
57


In [156]:
# The function given to reduce() must be commutative and associative; 2*x+y is not,
# so the result is nondeterministic and depends on the partitioning
rdd = sc.parallelize([1, 2, 1, 3, 4, 5, 2], numSlices=4)
print(rdd.glom().collect())
print(rdd.reduce(lambda left, right: 2 * left + right))

[[1], [2, 1], [3, 4], [5, 2]]
60


In [157]:
# reduce: turn every fruit name into its set of characters, then merge all sets
letter_sets = fruits.map(set)
letterSet = letter_sets.reduce(lambda left, right: left | right)
print(letterSet)

{'o', 'r', 'a', 'i', 'p', 'g', 'c', ' ', 'l', 'y', 'e', 'w', 'n', 'b', 'm', 't', 's'}


In [158]:
# treeReduce
# Partial results are merged on a small set of executors before anything is sent
# to the driver, which dramatically reduces the load the driver has to deal with.

letter_sets = fruits.map(set)
letterSet = letter_sets.treeReduce(lambda left, right: left | right)
print(letterSet)

{'o', 'r', 'a', 'i', 'p', 'g', 'c', ' ', 'l', 'y', 'e', 'w', 'n', 'b', 'm', 't', 's'}


In [159]:
# fold: like reduce, but starts from an explicit zero value (here the empty set)
letter_sets = fruits.map(set)
letterSet = letter_sets.fold(set(), lambda left, right: left | right)
print(letterSet)

{'o', 'r', 'a', 'i', 'p', 'g', 'c', ' ', 'l', 'y', 'e', 'w', 'n', 'b', 'm', 't', 's'}


In [160]:
# reduce() is not allowed on an empty RDD, but fold() is fine because it has a
# zero value to fall back on.
r = sc.parallelize([])
#r.reduce(lambda x, y: x+y)
r.fold(0, lambda total, item: total + item)

0

In [161]:
# aggregate /  treeAggregate can return a different result type than the type of the RDD

def f(x, y):
    """Insert element y into the set x in place and return that same set."""
    x |= {y}
    return x

# aggregate(zero, seqOp, combOp): start from an empty set, let f add each character
# to the running set, and merge the per-partition sets with a union
all_letters = fruits.flatMap(lambda name: list(name))
letterSet = all_letters.aggregate(set(), f, lambda left, right: left | right)
print(letterSet)

# f updates the running set in place, so it avoids object allocation.
# This is the most efficient way for solving this problem.

{'o', 'r', 'a', 'i', 'p', 'g', 'c', ' ', 'l', 'y', 'e', 'w', 'n', 'b', 'm', 't', 's'}


In [162]:
# foreach is an action (it runs immediately, for its side effects);
# map is a transformation (it is lazy and yields a new RDD).
def announce_fruit(fruit):
    """Print one line for the given fruit name."""
    print('I have a', fruit)

fruits.foreach(announce_fruit)

I have a apple
I have a banana
I have a canary melon
I have a grap
I have a lemon
I have a orange
I have a pineapple
I have a strawberry


### Closure

In [163]:
# Accumulator updated from inside an action (foreach): 0 + 1 + ... + 9 = 45
accum = sc.accumulator(0)
rdd = sc.parallelize(range(10))

def g(x):
    """Add element x to the shared accumulator; called only for this side effect."""
    accum.add(x)

# g returns nothing and foreach is run for its side effects, so `a` carries no data
a = rdd.foreach(g)

print(accum.value)

45


In [164]:
rdd = sc.parallelize(range(10))
accum = sc.accumulator(-1)

def g(x):
    # Side effect: add x to the accumulator; the mapped value is x squared.
    global accum
    accum += x
    return x * x

# map() is a lazy transformation: g has not run yet, so the accumulator still
# holds its initial value (-1).
a = rdd.map(g)
print(accum.value)
# print(a.reduce(lambda x, y: x+y))
a.cache()
# First action on `a`: g runs once per element, so the accumulator becomes -1 + 45 = 44.
tmp = a.count()
print(accum.value)
print(rdd.reduce(lambda x, y: x+y))

# Second action on `a`: the recorded output shows the accumulator unchanged (44),
# i.e. g was not executed again -- presumably because `a` is now served from the cache.
tmp = a.count()
print(accum.value)
print(rdd.reduce(lambda x, y: x+y))


-1
44
45
44
45


In [165]:
# Same experiment at scale: 100 partitions of one million numbers each
n = 100
rdd = sc.parallelize(range(1000000*n) , n)
accum = sc.accumulator(0)

def g(x):
    # Side effect: add x to the accumulator; the element itself is passed through.
    global accum
    accum += x
    return x

a = rdd.map(g)
# count() is the action that actually runs g over every element
tmp = a.count()
print(accum.value)

# correct answer: 4999999950000000
# Spark will only update the accumulator from the successful task, and the failed tasks are completely ignored.
# So the accumulator is computed correctly even if some tasks fail.
# NOTE(review): Spark's documentation states the exactly-once guarantee only for accumulator
# updates made inside actions; here the update happens inside map() -- confirm before relying on it.



4999999950000000




In [166]:
# Accumulator updated in a transformation that is followed by a shuffle
n = 100
rdd = sc.parallelize(range(100000*n) , n)
accum = sc.accumulator(0)

def g(x):
    # Side effect: add x to the accumulator; the element itself is passed through.
    global accum
    accum += x
    return x

a = rdd.map(g)
b = a.repartition(n+1) # this causes a shuffle (alternative: reduceByKey(sum))
tmp = b.count()
print(accum.value)

# Because shuffle output is stored locally, if a node goes down, that shuffle output is gone.
# Spark goes back to the stage that generated the shuffle output, looks at which tasks need
# to be rerun, and re-executes them on one of the nodes that is still alive.
# This results in the accumulator being an over-count.

# Summary: It's OK to use accumulators in an action (e.g., foreach) but not in a transformation.
# Or better: avoid using them at all.



49999995000000




### Closure and Persistence

In [167]:
# From the official spark examples.
from random import random
from operator import add

partitions = 1
n = 100000 * partitions

def f(_: int) -> int:
    """Draw one random point in the square [-1, 1) x [-1, 1).

    The argument is ignored. Returns 1 when the point lies inside the unit
    circle (x^2 + y^2 <= 1) and 0 otherwise.
    """
    px, py = random() * 2 - 1, random() * 2 - 1
    if px ** 2 + py ** 2 <= 1:
        return 1
    return 0

# Monte Carlo estimate: the fraction of sampled points that fall inside the unit
# circle approximates pi / 4
samples = sc.parallelize(range(1, n + 1), partitions)
count = samples.map(f).reduce(add)
pi_estimate = 4.0 * count / n
print("Pi is roughly %f" % pi_estimate)

Pi is roughly 3.139840


In [168]:
from random import random

# Draw one random number per element and show the values partition by partition
a = sc.parallelize(range(10), 2)
random_by_partition = a.map(lambda _: random()).glom().collect()
print(random_by_partition)

[[0.08197107068419918, 0.5106449226745052, 0.1123248153207631, 0.21871356601511205, 0.29039239441003184], [0.08197107068419918, 0.5106449226745052, 0.1123248153207631, 0.21871356601511205, 0.29039239441003184]]


In [169]:
# The driver's next random number equals the first value recorded for both partitions
# above, which suggests the workers started from the same generator state as the driver.
random()

0.08197107068419918

### mapPartitions and mapPartitionsWithIndex

In [170]:
# mapPartitions

def f(pa):
    """Return a list holding every element of the partition iterator pa, reversed."""
    return [x[::-1] for x in pa]
        
# mapPartitions calls f once per partition, handing it an iterator over that partition
fruitsReversed = fruits.mapPartitions(f)
reversed_names = fruitsReversed.collect()
print(reversed_names)

['elppa', 'ananab', 'nolem yranac', 'parg', 'nomel', 'egnaro', 'elppaenip', 'yrrebwarts']


In [171]:
# A generator is more efficient: results are produced one at a time instead of
# being collected in an intermediate list first

def f(pa):
    """Lazily yield every element of the partition iterator pa, reversed."""
    yield from (x[::-1] for x in pa)

fruitsReversed = fruits.mapPartitions(f)
reversed_names = fruitsReversed.collect()
print(reversed_names)

# mapPartitions makes it possible to do heavy initialisation (for example opening a
# database connection) once per partition instead of once per element of the RDD.

['elppa', 'ananab', 'nolem yranac', 'parg', 'nomel', 'egnaro', 'elppaenip', 'yrrebwarts']


In [172]:
# Can also do some transformation on the partition level

def f(pa):
    """Return the elements of the partition iterator pa as a list in ascending order."""
    ordered = list(pa)
    ordered.sort()
    return ordered

# Show the partitions before and after sorting each partition on its own
for partitioned_rdd in (fruits, fruits.mapPartitions(f)):
    print(partitioned_rdd.glom().collect())

[['apple', 'banana', 'canary melon', 'grap', 'lemon'], ['orange', 'pineapple', 'strawberry']]
[['apple', 'banana', 'canary melon', 'grap', 'lemon'], ['orange', 'pineapple', 'strawberry']]


In [173]:
# mapPartitionsWithIndex

def f(i, pa):
    """Yield each element of partition pa prefixed with the partition index i."""
    prefix = str(i)
    for item in pa:
        yield prefix + item

# mapPartitionsWithIndex passes the partition index along with the partition iterator
fruitsReversed = fruits.mapPartitionsWithIndex(f)
tagged_by_partition = fruitsReversed.glom().collect()
print(tagged_by_partition)

# mapPartitions and mapPartitionsWithIndex will be useful for advanced algorithm design (will see later)

[['0apple', '0banana', '0canary melon', '0grap', '0lemon'], ['1orange', '1pineapple', '1strawberry']]


In [174]:
# Check that random numbers are different
from random import random, seed
from time import time

# Base seed taken from the clock; each partition offsets it by its own index
s = time()

def f(index, it):
    """Seed the generator with s + partition index, then yield one random number per element of it."""
    seed(s + index)
    yield from (random() for _ in it)

# With a per-partition seed the two partitions now produce different numbers
seeded_numbers = sc.parallelize(range(10), 2).mapPartitionsWithIndex(f)
print(seeded_numbers.glom().collect())

[[0.4915180891510291, 0.6501074950025437, 0.7425690415610158, 0.8094631097993955, 0.30904201290698985], [0.7167792617493542, 0.3987359268395958, 0.027887404178219444, 0.9969306184063441, 0.2764869188716471]]


In [175]:
# Correct version for computing Pi
from random import random, seed
from time import time

partitions = 100
n = 100000 * partitions

# Base seed taken from the clock; each partition offsets it by its own index
s = time()

def f(index, it):
    """Seed the generator per partition, then sample one point per element of it.

    Yields 1 when the sampled point of [-1, 1) x [-1, 1) lies inside the unit
    circle and 0 otherwise.
    """
    seed(s + index)
    for _ in it:
        px = random() * 2 - 1
        py = random() * 2 - 1
        yield int(px ** 2 + py ** 2 <= 1)

# Sum the per-point hit indicators over all partitions
samples = sc.parallelize(range(1, n + 1), partitions)
count = samples.mapPartitionsWithIndex(f).sum()

pi_estimate = 4.0 * count / n
print("Pi is roughly", pi_estimate)



Pi is roughly 3.141356




### Linear-time selection

In [176]:
# Selection by full sort -- inefficient: it needs a shuffle (sortBy) plus a scan (lookup)
data = [34, 67, 21, 56, 47, 89, 12, 44, 74, 43, 26]
A = sc.parallelize(data, 2)
k = 4

# Sort, number the elements by rank, make the rank the key, then look up rank k
ranked = A.sortBy(lambda value: value).zipWithIndex()
by_rank = ranked.map(lambda pair: (pair[1], pair[0]))
print(by_rank.lookup(k))

[43]


In [177]:
# Quickselect on an RDD: print the element of rank k (0-based, in ascending order)
# without sorting the whole data set.
data = [34, 67, 21, 56, 47, 89, 12, 44, 74, 43, 26]
A = sc.parallelize(data,2)
k = 4

# Reject a rank that cannot exist; otherwise the loop would only stop once
# first() fails on an exhausted RDD.
if not 0 <= k < len(data):
    raise ValueError("k must satisfy 0 <= k < %d, got %r" % (len(data), k))

while True:
    x = A.first()  # pivot: any element of the current candidate set
    # The pivot is bound as a default argument so that each lambda keeps the value
    # it was created with. The global x is reassigned on the next pass while these
    # lazily evaluated filters may still be part of A's lineage.
    A1 = A.filter(lambda z, pivot=x: z < pivot)
    A2 = A.filter(lambda z, pivot=x: z > pivot)
    # One pass over A counts the elements below the pivot (mid) and the elements
    # equal to it (ties); A is non-empty here because first() succeeded.
    mid, ties = A.map(lambda z, pivot=x: (int(z < pivot), int(z == pivot))) \
                 .reduce(lambda p, q: (p[0] + q[0], p[1] + q[1]))
    if k < mid:
        A = A1
    elif k < mid + ties:
        # Ranks mid .. mid+ties-1 are all occupied by copies of the pivot
        print(x)
        break
    else:
        A = A2
        k = k - mid - ties
    # Keep the shrunken candidate set in memory for the next pass
    A.cache()

43


### Key-Value Pairs

In [178]:
# groupByKey: key every fruit by the length of its name and gather the names per key;
# each group comes back as an iterable, so it is walked explicitly to see its members
groupFruitsByLength = fruits.map(lambda name: (len(name), name)).groupByKey()
print(groupFruitsByLength.take(10))
first_pair = groupFruitsByLength.take(1)[0]
for member in first_pair[1]:
    print(member)

[(6, <pyspark.resultiterable.ResultIterable object at 0x7f96dc477220>), (12, <pyspark.resultiterable.ResultIterable object at 0x7f96dc477160>), (4, <pyspark.resultiterable.ResultIterable object at 0x7f96dc477070>), (10, <pyspark.resultiterable.ResultIterable object at 0x7f96dc477130>), (5, <pyspark.resultiterable.ResultIterable object at 0x7f96dc477af0>), (9, <pyspark.resultiterable.ResultIterable object at 0x7f96dc477b20>)]
banana
orange


In [179]:
# reduceByKey: this is more efficient than groupByKey.
# It computes local sums for each key inside every partition and combines those
# local sums into larger sums after shuffling.

length_ones = fruits.map(lambda name: (len(name), 1))
numFruitsByLength = length_ones.reduceByKey(lambda total, one: total + one)
print(numFruitsByLength.take(10))

# aggregateByKey and foldByKey are also available,
# but there is no treeAggregateByKey.

[(6, 2), (12, 1), (4, 1), (10, 1), (5, 2), (9, 1)]


In [180]:
from operator import add

# Word count: split every line on whitespace, emit (word, 1) and add up the ones per word
lines = sc.textFile('data/course.txt')
words = lines.flatMap(lambda line: line.split())
word_ones = words.map(lambda word: (word, 1))
counts = word_ones.reduceByKey(add)
print(counts.take(20))

[('Course', 2), ('Information', 1), ('systems,', 1), ('cloud', 1), ('parallel', 1), ('as', 1), ('in', 2), ('mining', 1), ('massive', 1), ('amount', 1), ('of', 3), ('even', 1), ('servers', 1), ('centers.', 1), ('both', 1), ('hands-on', 1), ('this', 1), ('new', 1), ('Lecture', 1), ('videos', 1)]


In [181]:
# First 20 entries when sorted by key (the word)
alphabetical = counts.sortByKey()
print(alphabetical.take(20))
# First 20 entries when sorted by count, largest first
by_frequency = counts.sortBy(lambda pair: pair[1], ascending=False)
print(by_frequency.take(20))
# The complete result as a plain dict on the driver
print(counts.collectAsMap())

[('Big', 1), ('Course', 2), ('Description', 1), ('Information', 1), ('Lecture', 1), ('This', 1), ('across', 1), ('amount', 1), ('and', 3), ('as', 1), ('both', 1), ('centers.', 1), ('cloud', 1), ('commodity', 1), ('computing', 1), ('course', 1), ('data', 4), ('emerge', 1), ('enabling', 1), ('even', 1)]
[('data', 4), ('of', 3), ('and', 3), ('Course', 2), ('in', 2), ('the', 2), ('Information', 1), ('systems,', 1), ('cloud', 1), ('parallel', 1), ('as', 1), ('mining', 1), ('massive', 1), ('amount', 1), ('even', 1), ('servers', 1), ('centers.', 1), ('both', 1), ('hands-on', 1), ('this', 1)]
{'Course': 2, 'Information': 1, 'systems,': 1, 'cloud': 1, 'parallel': 1, 'as': 1, 'in': 2, 'mining': 1, 'massive': 1, 'amount': 1, 'of': 3, 'even': 1, 'servers': 1, 'centers.': 1, 'both': 1, 'hands-on': 1, 'this': 1, 'new': 1, 'Lecture': 1, 'videos': 1, 'Description': 1, 'Big': 1, 'data': 4, 'including': 1, 'computing': 1, 'and': 3, 'processing': 1, 'frameworks,': 1, 'emerge': 1, 'enabling': 1, 'technolo

In [182]:
# lookup(key): return the list of values stored under the given key
counts.lookup('data')
# This scans the whole RDD, unless there is a partitioner (to be discussed later)

[4]

### Join vs. Broadcast Variables

In [183]:
# Simple join example: both RDDs are keyed by product id, and join() emits one
# (id, (product name, transaction)) record per matching combination

product_rows = [(1, "Apple"), (1, 'apple'), (2, "Orange"), (3, "TV"), (5, "Computer")]
products = sc.parallelize(product_rows)
#trans = sc.parallelize([(1, 134, "OK"), (3, 34, "OK"), (5, 162, "Error"), (1, 135, "OK"), (2, 53, "OK"), (1, 45, "OK")])
transaction_rows = [(1, (134, "OK")), (3, (34, "OK")), (5, (162, "Error")), (1, (135, "OK")), (2, (53, "OK")), (1, (45, "OK"))]
trans = sc.parallelize(transaction_rows)

joined = products.join(trans)
print(joined.take(20))

[(1, ('Apple', (134, 'OK'))), (1, ('Apple', (135, 'OK'))), (1, ('Apple', (45, 'OK'))), (1, ('apple', (134, 'OK'))), (1, ('apple', (135, 'OK'))), (1, ('apple', (45, 'OK'))), (2, ('Orange', (53, 'OK'))), (3, ('TV', (34, 'OK'))), (5, ('Computer', (162, 'Error')))]


In [184]:
# Broadcast alternative to the join: the small product table is an ordinary dict that
# is broadcast once, and every transaction looks its product name up locally
products = {1: "Apple", 2: "Orange", 3: "TV", 4: "PC", 5: "Computer"}
transaction_rows = [(1, (134, "OK")), (3, (34, "OK")), (5, (162, "Error")), (1, (135, "OK")), (2, (53, "OK")), (1, (45, "OK"))]
trans = sc.parallelize(transaction_rows)

broadcasted_products = sc.broadcast(products)

def attach_product_name(record):
    """Turn (product_id, details) into (product_id, product name, details)."""
    product_id, details = record
    return (product_id, broadcasted_products.value[product_id], details)

results = trans.map(attach_product_name)
#  results = trans.map(lambda x: (x[0], products[x[0]], x[1]))
print(results.take(20))


[(1, 'Apple', (134, 'OK')), (3, 'TV', (34, 'OK')), (5, 'Computer', (162, 'Error')), (1, 'Apple', (135, 'OK')), (2, 'Orange', (53, 'OK')), (1, 'Apple', (45, 'OK'))]


In [185]:
# Compare with cogroup: for every key it returns one group of values per input RDD,
# including keys that are missing on one side (their group is empty)

product_rows = [(1, "Apple"), (1, 'apple'),  (2, "Orange"), (3, "TV"), (4, "PC"), (5, "Computer")]
products = sc.parallelize(product_rows)
transaction_rows = [(1, (134, "OK")), (3, (34, "OK")), (5, (162, "Error")), (1, (135, "OK")), (2, (53, "OK")), (1, (45, "OK"))]
trans = sc.parallelize(transaction_rows)

for key, groups in products.cogroup(trans).collect():
    print(key, tuple(list(group) for group in groups))

1 (['Apple', 'apple'], [(134, 'OK'), (135, 'OK'), (45, 'OK')])
2 (['Orange'], [(53, 'OK')])
3 (['TV'], [(34, 'OK')])
4 (['PC'], [])
5 (['Computer'], [(162, 'Error')])
