# ReduceByKey
### Similarly to the reduce action, it takes an RDD of (key, val) tuples and reduces the input RDD to a single tuple for each key, with val computed via a specified function f. Unlike reduce, it is a transformation: its result is stored in a brand new RDD.


In [3]:
sc  # presumably the SparkContext created by the notebook kernel
# Sample (name, age) pairs; 'Paolo' appears twice on purpose
nameAge = [
    ('Paolo', 40),
    ('Giorgio', 22),
    ('Paolo', 35),
]

In [6]:
# Turn the local list of (name, age) pairs into an RDD
nameAgeRDD = sc.parallelize(nameAge)

# For each name keep only the lowest age: the built-in min applied to two
# ages is exactly the pairwise reduction function needed here
youngestPariRDD = nameAgeRDD.reduceByKey(min)
youngestPariRDD.collect()

[('Giorgio', 22), ('Paolo', 35)]

# FoldByKey
### Similar to reduceByKey, but it requires the function to be associative and needs a neutral "zero" value that is used as the initial value.

In [9]:
# Build, for each name, the concatenation of all the messages associated with it
nameMessage = [
    ('Paolo', 'Sono'),
    ('Giorgio', 'Sono Giorgio'),
    ('Paolo', ' Paolo'),
]
nameMessaggeRDD = sc.parallelize(nameMessage)

# The empty string is the neutral element of string concatenation
zero = ''
nameMexRDD = nameMessaggeRDD.foldByKey(zero, lambda left, right: left + right)
nameMexRDD.collect()

[('Giorgio', 'Sono Giorgio'), ('Paolo', 'Sono Paolo')]

# CombineByKey
### It is a generalization of foldByKey because the data types of the values of the input and the returned RDD of pairs can be different.

In [12]:
# Goal: write an HDFS file where each line contains name, age, avgAge
nameAge = [
    ('Paolo', 40),
    ('Giorgio', 22),
    ('Paolo', 35),
]
nameAgeRDD = sc.parallelize(nameAge)

# For each name (key) compute the pair (sum of the ages, number of input pairs)
sumNameRDD = nameAgeRDD.combineByKey(
    # first age seen for a key -> initial (sum, count)
    lambda age: (age, 1),
    # fold one more age into an existing (sum, count)
    lambda acc, age: (acc[0] + age, acc[1] + 1),
    # merge two partial (sum, count) results of the same key
    lambda accA, accB: (accA[0] + accB[0], accA[1] + accB[1]),
)
sumNameRDD.collect()

[('Giorgio', (22, 1)), ('Paolo', (75, 2))]

In [37]:
# Compute the average age per name and (optionally) save it on HDFS.
# sumNameRDD holds (name, (sumOfAges, count)) pairs. Only the value part has
# to change, so mapValues is used instead of map: the key is left untouched
# and the partitioning of sumNameRDD is retained.
avgPerNameRDD = sumNameRDD.mapValues(lambda sumCount: sumCount[0] / sumCount[1])

outputPath = 'res_combineByKey_example/'
# Writing to storage is intentionally disabled; uncomment the next line to
# store the result under outputPath.
# avgPerNameRDD.saveAsTextFile(outputPath)
avgPerNameRDD.collect()

[('Giorgio', 22.0), ('Paolo', 37.5)]

# GroupByKey
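### groupByKey sends all the values of each key over the network without any local pre-aggregation. Use it only if the aggregation you want to compute is not associative nor commutative; otherwise it is not the best solution (prefer reduceByKey, foldByKey or combineByKey).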

In [15]:
# Goal: create an HDFS file where each line has a unique name and the list of
# all the ages associated with it
nameAge = [
    ('Paolo', 40),
    ('Giorgio', 22),
    ('Paolo', 35),
    ('Bruno', 16),
]

# Distribute the pairs over 2 slices, then gather the ages sharing the same name
nameAgeRDD = sc.parallelize(nameAge, 2)
nameAgePairsRDD = nameAgeRDD.groupByKey()

# Each value is an iterable of ages, not a plain list
nameAgePairsRDD.collect()

[('Giorgio', <pyspark.resultiterable.ResultIterable at 0x7f2347490790>),
 ('Bruno', <pyspark.resultiterable.ResultIterable at 0x7f2347490a50>),
 ('Paolo', <pyspark.resultiterable.ResultIterable at 0x7f23474907d0>)]

In [17]:
# Materialize each group of ages as a Python list, then write the
# (name, ages) pairs as text
(nameAgePairsRDD
    .mapValues(list)
    .saveAsTextFile('res_groupByKey_example/'))

In [18]:
# Note: mapValues is used to transform the values of each pair;
# here it transforms each iterable of ages into a Python list

# MapValues
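### Applies a function f to the value of each pair of an input RDD of key/val pairs and returns a new RDD with the same number of key/val pairs. It can be more efficient than the map transformation when only the values change: the keys are untouched, so the original partitioning is retained and it doesn't send data on the network.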

In [20]:
# Add one year to the age stored in every (name, age) pair
nameAge = [
    ('Paolo', 40),
    ('Giorgio', 22),
    ('Paolo', 35),
]
nameAgeRDD = sc.parallelize(nameAge)

plusOneRDD = nameAgeRDD.mapValues(lambda years: years + 1)
plusOneRDD.collect()  # the key part is left unchanged

[('Paolo', 41), ('Giorgio', 23), ('Paolo', 36)]

# FlatMapValues
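### Similar to MapValues, but the function f returns a list of values for each input pair; the returned RDD contains one key/value pair (with the original key) for each element of that list.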

In [27]:
# Split every sentence into its words, keeping the sentence id as the key
sentences = [
    ('Sentence 1', 'Sentence Test Number 1'),
    ('Sentence 2', 'Sentence Test Number 2'),
    ('Sentence 3', 'Sentence Test Number 3'),
]
sentencesRDD = sc.parallelize(sentences)

# One (sentence id, word) pair is produced for each space-separated token
sentIdWord = sentencesRDD.flatMapValues(lambda text: text.split(' '))
sentIdWord.collect()

[('Sentence 1', 'Sentence'),
 ('Sentence 1', 'Test'),
 ('Sentence 1', 'Number'),
 ('Sentence 1', '1'),
 ('Sentence 2', 'Sentence'),
 ('Sentence 2', 'Test'),
 ('Sentence 2', 'Number'),
 ('Sentence 2', '2'),
 ('Sentence 3', 'Sentence'),
 ('Sentence 3', 'Test'),
 ('Sentence 3', 'Number'),
 ('Sentence 3', '3')]

# Keys transformation
### Used to return an RDD where you only have the key part. Note: duplicates are not removed!

In [30]:
nameAge = [
    ('Paolo', 40),
    ('Giorgio', 22),
    ('Paolo', 35),
    ('Bruno', 16),
]
nameAgeRDD = sc.parallelize(nameAge)

# Key part only: repeated names are kept
nameAgeKeysRDD = nameAgeRDD.keys()
print(nameAgeKeysRDD.collect())

# Same keys with the duplicates removed explicitly through distinct()
nameAgeKeysDistinctRDD = nameAgeRDD.keys().distinct()
print(nameAgeKeysDistinctRDD.collect())

['Paolo', 'Giorgio', 'Paolo', 'Bruno']
['Bruno', 'Giorgio', 'Paolo']


# Values transformation
### Used to return an RDD where you only have the value part. Note: duplicates are not removed!

In [32]:
nameAge = [
    ('Paolo', 40),
    ('Giorgio', 22),
    ('Paolo', 35),
    ('Bruno', 16),
]
nameAgeRDD = sc.parallelize(nameAge)

# Value part only: one age per input pair
nameAgesValuesRDD = nameAgeRDD.values()
nameAgesValuesRDD.collect()

[40, 22, 35, 16]

# Values() and Keys() DO NOT SEND DATA ON THE NETWORK!

# SortByKey transformation
### Sorts according ONLY to the keys

In [35]:
nameAge = [
    ('Paolo', 40),
    ('Giorgio', 22),
    ('Paolo', 35),
    ('Bruno', 16),
]
nameAgeRDD = sc.parallelize(nameAge)

# ascending=True sorts A->Z, ascending=False sorts Z->A (descending)
nameAgesSortedByKeysRDD = nameAgeRDD.sortByKey(ascending=False)
nameAgesSortedByKeysRDD.collect()

[('Paolo', 40), ('Paolo', 35), ('Giorgio', 22), ('Bruno', 16)]