In [2]:
# Examples

In [3]:
# Evaluate the SparkContext handle so that the notebook displays it.
# NOTE(review): `sc` is not created in any visible cell; presumably it is
# injected by the PySpark kernel/shell. Confirm before a fresh "Run All".
sc

In [8]:
# top(n) action: returns the n largest elements as a local list, in descending order

inputList = [5, 5, 6, 1, 3, 2]
inputRDD = sc.parallelize(inputList)

# Ask for the 2, 3 and 4 largest values (each call is a separate action)
top2, top3, top4 = inputRDD.top(2), inputRDD.top(3), inputRDD.top(4)

# Each label is printed on its own line, followed by the matching list
print("Top 2:", top2, sep="\n")
print("Top 3:", top3, sep="\n")
print("Top 4:", top4, sep="\n")

Top 2:
[6, 5]
Top 3:
[6, 5, 5]
Top 4:
[6, 5, 5, 3]


In [19]:
# top(n, key) action: rank the elements by a custom key instead of their natural order

inputListNames = ['Giovanni', 'Paolo', 'Francesco', 'Arcangelo', 'Luca']
inputNamesRDD = sc.parallelize(inputListNames)

# The three longest names: the built-in len serves as the ranking key
topThreeNamesDesc = inputNamesRDD.top(3, key=len)
print(topThreeNamesDesc)

['Francesco', 'Arcangelo', 'Giovanni']


In [17]:
# takeOrdered(n) action: returns the n smallest elements as a local list,
# in ascending order
inputList = [1, 2, 5, 12, 55]
inputRDD = sc.parallelize(inputList)

# Retrieve the first 3 elements in ascending order.
# The variable is named for what it holds (the smallest values, not the "top"
# ones), so it no longer overwrites the result of the top(3) example.
firstThreeAsc = inputRDD.takeOrdered(3)
print(firstThreeAsc)

[1, 2, 5]


In [20]:
# takeOrdered(n, key) also accepts a custom ordering:
# here the three shortest names, ranked by length via the built-in len
topThreeNamesAsc = inputNamesRDD.takeOrdered(3, key=len)
print(topThreeNamesAsc)

['Luca', 'Paolo', 'Giovanni']


In [21]:
# takeSample action: draws a random sample of the RDD as a local list

# Arguments: withReplacement (a bool, True or False), num (the sample size)
# and an optional seed. No seed is passed here, so the names that are picked
# can differ from one run to the next.
randomNames = inputNamesRDD.takeSample(withReplacement=False, num=3)
print(randomNames)

['Paolo', 'Giovanni', 'Arcangelo']


In [22]:
# reduce action, example 1

# Collapses the RDD into a single local Python object by repeatedly
# applying a user-provided two-argument function

inputList = list(range(1, 10))
inputRDD = sc.parallelize(inputList)

# The function must be both associative AND commutative, and it must return
# the same data type it receives.
# Spark applies it pairwise across the elements: no explicit loop is needed.
sumValues = inputRDD.reduce(lambda left, right: left + right)
print(sumValues)

45


In [24]:
# Reduce action Example 2

# We define a custom function
def computeMax(x,y):
    """Return the larger of the two arguments.

    When the arguments compare equal (x >= y holds), x itself is returned.
    """
    return x if x >= y else y

inputList = [43, 51, 22, 13, 78, 99, 1, 42]
inputRDD = sc.parallelize(inputList)

# computeMax qualifies as a reduce function: taking the maximum is both
# associative and commutative.
# Spark applies it pairwise across the elements: no explicit loop is needed.
maxVal = inputRDD.reduce(computeMax)
print(maxVal)

99


In [31]:
# fold action
# Like reduce, but it also takes a "zero value" used as the starting point

# Example: string concatenation, which is associative but not commutative
inputList = ['This ', 'is ', 'a ', 'test ']
inputRDD = sc.parallelize(inputList)

# The zero value must be the neutral element of the function:
# for concatenation that is the empty string (for a sum it would be 0)
phrase = inputRDD.fold(zeroValue='', op=lambda soFar, piece: soFar + piece)
print(phrase)

# Building a string through repeated + is not very efficient.
# NOTE(review): the Spark docs warn that, for non-commutative functions, the
# result of fold may differ from a sequential fold over a local collection.
# Confirm that the ordering is dependable before relying on this pattern.

This is a test 


In [41]:
# aggregate action

# Unlike reduce and fold, the result type may differ from the element type.
# It takes three arguments: a zero value and two user-provided functions:
#   - the first one folds a single element into an accumulator (different types)
#   - the second one merges two accumulators (same type)

# Goal: obtain both the sum of the elements and how many elements there are
inputListAggr = [1, 2, 3, 5, 5, 6]
inRDD = sc.parallelize(inputListAggr)

# Starting accumulator, laid out as (running sum, number of elements)
zeroVal = (0, 0)

# Folding the value 2 into the accumulator (1, 1) gives (1 + 2, 1 + 1) = (3, 2).
# The merge function then combines the accumulators built on the partitions.
sumCount = inRDD.aggregate(
    zeroVal,
    lambda acc, value: (acc[0] + value, acc[1] + 1),
    lambda accA, accB: (accA[0] + accB[0], accA[1] + accB[1]),
)

print(sumCount)

# Average = sum / count
myAvg = sumCount[0] / sumCount[1]
print('Avergae: ', myAvg)

(22, 6)
Avergae:  3.6666666666666665


In [46]:
# The same (sum, count) pair can be obtained with map followed by reduce.
# Note that the data sent over the network is the same as with aggregate,
# so the efficiency ends up being about the same.

# Pair every value with a count of 1
mapRDD = inRDD.map(lambda value: (value, 1))

# Add the pairs component-wise. Despite its name, reduceRDD is a plain local
# tuple: reduce is an action, so it does not return an RDD.
reduceRDD = mapRDD.reduce(lambda a, b: (a[0] + b[0], a[1] + b[1]))
print(reduceRDD)

(22, 6)


In [48]:
# Collect the (value, 1) pairs into a local list to inspect what map produced
mapRDD.collect()

[(1, 1), (2, 1), (3, 1), (5, 1), (5, 1), (6, 1)]

In [1]:
# countByKey action

# Returns a local Python dictionary that maps each key to the number of
# elements associated with it.

# If the number of distinct keys is large, the result of the action
# cannot be stored in a local variable of the driver.

# Data is sent over the network to compute the final result.

inputList = [
    ('Forrest Gump', 4),
    ('Star Trek', 5),
    ('Forrest Gump', 3),
]

In [2]:
# Build an RDD of (key, value) pairs from inputList (defined in the previous
# cell) and count how many pairs share each key; the result is displayed as
# the cell output.
inputRDD = sc.parallelize(inputList)
inputRDD.countByKey()

defaultdict(int, {'Forrest Gump': 2, 'Star Trek': 1})

In [3]:
# collectAsMap action

# Returns a local dictionary holding the same pairs as the input pair RDD.
# PAY ATTENTION: the entire RDD is transferred over the network.

# A dictionary cannot contain duplicate keys. Watch out: the system
# will not raise an error if the RDD contains some!

inputList = [
    ('User1', 'Paolo'),
    ('User2', 'Giovanni'),
    ('User3', 'Francesco'),
]
inputRDD = sc.parallelize(inputList)

# Despite the RDD suffix in its name, the result is a plain local dictionary
retrievePairsRDD = inputRDD.collectAsMap()

In [5]:
# Show the local dictionary returned by collectAsMap in the previous cell
print(retrievePairsRDD)

{'User1': 'Paolo', 'User2': 'Giovanni', 'User3': 'Francesco'}


In [6]:
# Pitfall with duplicate keys: no error is raised, but pairs are silently lost

# 'User2' appears twice in this list
wrongInputList = [
    ('User1', 'Paolo'),
    ('User2', 'Giovanni'),
    ('User2', 'Francesco'),
]
inputRDD = sc.parallelize(wrongInputList)

# Only one value per key can survive in the resulting dictionary
retrievePairsRDD = inputRDD.collectAsMap()
print(retrievePairsRDD)

{'User1': 'Paolo', 'User2': 'Francesco'}


In [9]:
# lookup action

# Returns a local Python list with the values of all the pairs
# of the input RDD whose key equals the key passed as parameter

inputList = [
    ('Forrest Gump', 4),
    ('Star Trek', 5),
    ('Forrest Gump', 3),
]
inputRDD = sc.parallelize(inputList)
inputRDD.lookup('Forrest Gump')

[4, 3]