In [1]:
def analyzePartitions(myRDD):
    """Print some statistics about the partitioning of the input RDD.

    The report, written to standard output, consists of:
    - the partition function of the RDD's partitioner (or a notice that
      the RDD has no partitioner),
    - the number of partitions,
    - the content of every partition, one partition per line.

    Note that the content of all the partitions is collected into a local
    Python list, so this helper is meant for small, example-sized RDDs.
    """
    currentPartitioner = myRDD.partitioner
    if currentPartitioner is not None:
        print("Partioner: "+str(currentPartitioner.partitionFunc))
    else:
        print("Partioner: No partitioner")

    numPartitions = myRDD.getNumPartitions()
    print("Num. partitions: "+ str(numPartitions))

    # glom() turns each partition into a list; collect() brings those lists
    # into a local Python list (one inner list per partition)
    contentByPartition = myRDD.glom().collect()

    print("Content of the partitions")
    for content in contentByPartition:
        print(str(content))

In [2]:
# Examples

In [3]:
# Define the initial RDD from a local list of "postID,post text" strings
# Request 4 partitions (second argument of parallelize)
myRDD = sc.parallelize(
    [
        "1,Post text1 ..",
        "2,Post text2 ..",
        "3,Post text3 ..",
        "1,Post text4 ..",
        "1,Post text5 ..",
        "2,Post text6 ..",
        "4,Post text7 ..",
        "5,Post text8 ..",
        "3,Post text9 ..",
    ],
    4
)

In [4]:
# Print some statistics
analyzePartitions(myRDD)

Partioner: No partitioner
Num. partitions: 4
Content of the partitions
['1,Post text1 ..', '2,Post text2 ..']
['3,Post text3 ..', '1,Post text4 ..']
['1,Post text5 ..', '2,Post text6 ..']
['4,Post text7 ..', '5,Post text8 ..', '3,Post text9 ..']


In [5]:
#
# Repartition
#
# Change the number of partitions to 12 (i.e., repartition the content of the input RDD and return
# a new RDD with the same content but different partitions: 12 partitions)
myRepartitionedRDD = myRDD.repartition(12)

In [6]:
# Print some statistics
analyzePartitions(myRepartitionedRDD)

Partioner: No partitioner
Num. partitions: 12
Content of the partitions
[]
['1,Post text1 ..', '2,Post text2 ..']
['4,Post text7 ..', '5,Post text8 ..', '3,Post text9 ..']
[]
['1,Post text5 ..', '2,Post text6 ..']
['3,Post text3 ..', '1,Post text4 ..']
[]
[]
[]
[]
[]
[]


In [7]:
# Print some statistics about the initial RDD myRDD
analyzePartitions(myRDD)

Partioner: No partitioner
Num. partitions: 4
Content of the partitions
['1,Post text1 ..', '2,Post text2 ..']
['3,Post text3 ..', '1,Post text4 ..']
['1,Post text5 ..', '2,Post text6 ..']
['4,Post text7 ..', '5,Post text8 ..', '3,Post text9 ..']


In [8]:
#
# Repartition
#
# Change the number of partitions to 2 (i.e., repartition the content of the input RDD and return
# a new RDD with the same content but different partitions: 2 partitions)
myRepartitionedRDD2 = myRDD.repartition(2)

In [9]:
# Print some statistics
analyzePartitions(myRepartitionedRDD2)

Partioner: No partitioner
Num. partitions: 2
Content of the partitions
['1,Post text1 ..', '2,Post text2 ..', '1,Post text5 ..', '2,Post text6 ..', '4,Post text7 ..', '5,Post text8 ..', '3,Post text9 ..']
['3,Post text3 ..', '1,Post text4 ..']


In [10]:
# Print some statistics about the initial RDD
analyzePartitions(myRDD)

Partioner: No partitioner
Num. partitions: 4
Content of the partitions
['1,Post text1 ..', '2,Post text2 ..']
['3,Post text3 ..', '1,Post text4 ..']
['1,Post text5 ..', '2,Post text6 ..']
['4,Post text7 ..', '5,Post text8 ..', '3,Post text9 ..']


In [11]:
#
# coalesce
#
# Coalesce is better than repartition when you are decreasing the number of partitions (it minimizes shuffles)
# Change the number of partitions to 2 (i.e., merge the partitions of the input RDD and return
# a new RDD with the same content but only 2 partitions)
myRepartitionedRDD3 = myRDD.coalesce(2)

In [12]:
# Print some statistics
analyzePartitions(myRepartitionedRDD3)

Partioner: No partitioner
Num. partitions: 2
Content of the partitions
['1,Post text1 ..', '2,Post text2 ..', '3,Post text3 ..', '1,Post text4 ..']
['1,Post text5 ..', '2,Post text6 ..', '4,Post text7 ..', '5,Post text8 ..', '3,Post text9 ..']


In [13]:
# Print some statistics about the initial RDD myRDD
analyzePartitions(myRDD)

Partioner: No partitioner
Num. partitions: 4
Content of the partitions
['1,Post text1 ..', '2,Post text2 ..']
['3,Post text3 ..', '1,Post text4 ..']
['1,Post text5 ..', '2,Post text6 ..']
['4,Post text7 ..', '5,Post text8 ..', '3,Post text9 ..']


In [14]:
# Apply a map transformation to return an RDD of pairs
# - key: postID (the integer before the first comma)
# - value: content of the post (everything after the first comma)
#
# Split only on the first comma (maxsplit=1): the text of a post may itself
# contain commas, and a plain split(',') would silently truncate the value.
myPairRDD = myRDD.map(lambda post: (int(post.split(',', 1)[0]), post.split(',', 1)[1]) )

In [15]:
# Print some statistics
# Note that the number of partitions is the same and also the "content" of the partitions is similar
analyzePartitions(myPairRDD)

Partioner: No partitioner
Num. partitions: 4
Content of the partitions
[(1, 'Post text1 ..'), (2, 'Post text2 ..')]
[(3, 'Post text3 ..'), (1, 'Post text4 ..')]
[(1, 'Post text5 ..'), (2, 'Post text6 ..')]
[(4, 'Post text7 ..'), (5, 'Post text8 ..'), (3, 'Post text9 ..')]


In [16]:
# Return a new RDD of pairs with only 2 partitions by using coalesce
myPairRDD2 = myPairRDD.coalesce(2)

In [17]:
# Print some statistics
analyzePartitions(myPairRDD2)

Partioner: No partitioner
Num. partitions: 2
Content of the partitions
[(1, 'Post text1 ..'), (2, 'Post text2 ..'), (3, 'Post text3 ..'), (1, 'Post text4 ..')]
[(1, 'Post text5 ..'), (2, 'Post text6 ..'), (4, 'Post text7 ..'), (5, 'Post text8 ..'), (3, 'Post text9 ..')]


In [18]:
#
# partitionBy()
#
# Return a new rdd of pairs with only 2 partitions. 
# Use partitionBy and the hash partition function to put all the pairs associated with the same key 
# in the same partition.
# Each partition can contain pairs associated with different keys
myPairRDD3 = myPairRDD.partitionBy(2)

In [19]:
# Print some statistics
# Note that all the pairs associated with the same key are stored in the same partition
# Given an input pair, spark associates that pair to partition number hash(key)%num.partitions
analyzePartitions(myPairRDD3)

Partioner: <function portable_hash at 0x7f54ac3447b8>
Num. partitions: 2
Content of the partitions
[(2, 'Post text2 ..'), (2, 'Post text6 ..'), (4, 'Post text7 ..')]
[(1, 'Post text1 ..'), (3, 'Post text3 ..'), (1, 'Post text4 ..'), (1, 'Post text5 ..'), (5, 'Post text8 ..'), (3, 'Post text9 ..')]


In [20]:
#
# partitionBy()
#
# A custom partition function to put all the pairs associated with keys <3 in
# one partition and those associated with keys >=3 in another.
def myPartitionFunction(key):
    """Return the partition index for a key: 0 when key < 3, 1 otherwise."""
    return 0 if key < 3 else 1

In [21]:
# Return a new rdd of pairs with only 2 partitions. 
# Use partitionBy and the custom partition function to put all the pairs associated with keys <3 in
# one partition and those associated with keys >=3 in the other.
myPairRDD4 = myPairRDD.partitionBy(2, myPartitionFunction)

In [22]:
# Print some statistics
# Note that all the pairs associated with keys <3 are stored in one partition and
# the pairs associated with keys >=3 are stored in the other partition
# Given an input pair, spark associates that pair to partition number myPartitionFunction(key)%num.partitions
analyzePartitions(myPairRDD4)

Partioner: <function myPartitionFunction at 0x7f54951510d0>
Num. partitions: 2
Content of the partitions
[(1, 'Post text1 ..'), (2, 'Post text2 ..'), (1, 'Post text4 ..'), (1, 'Post text5 ..'), (2, 'Post text6 ..')]
[(3, 'Post text3 ..'), (4, 'Post text7 ..'), (5, 'Post text8 ..'), (3, 'Post text9 ..')]


In [23]:
# What happens if I set the number of partitions to 4 and I use myPartitionFunction to partition pairs?
myPairRDD5 = myPairRDD.partitionBy(4, myPartitionFunction)

In [24]:
# Print some statistics
# Note that only two partitions contain some pairs
# because myPartitionFunction(key)%num.partitions can assume only the following two values: 0, 1
analyzePartitions(myPairRDD5)

Partioner: <function myPartitionFunction at 0x7f54951510d0>
Num. partitions: 4
Content of the partitions
[(1, 'Post text1 ..'), (2, 'Post text2 ..'), (1, 'Post text4 ..'), (1, 'Post text5 ..'), (2, 'Post text6 ..')]
[(3, 'Post text3 ..'), (4, 'Post text7 ..'), (5, 'Post text8 ..'), (3, 'Post text9 ..')]
[]
[]


In [25]:
# Examples to analyze the impact of transformations on the partitions and partitioner of the returned RDD

In [26]:
# Print some statistics about the input RDD of pairs myPairRDD
analyzePartitions(myPairRDD)

Partioner: No partitioner
Num. partitions: 4
Content of the partitions
[(1, 'Post text1 ..'), (2, 'Post text2 ..')]
[(3, 'Post text3 ..'), (1, 'Post text4 ..')]
[(1, 'Post text5 ..'), (2, 'Post text6 ..')]
[(4, 'Post text7 ..'), (5, 'Post text8 ..'), (3, 'Post text9 ..')]


In [27]:
# filter the content of myPairRDD
filteredPairRDD = myPairRDD.filter(lambda pair: pair[0]!=5 and pair[0]!=3)

In [28]:
# Print some statistics
# Same number of partitions and same partitioner (no partitioner in this case)
analyzePartitions(filteredPairRDD)

Partioner: No partitioner
Num. partitions: 4
Content of the partitions
[(1, 'Post text1 ..'), (2, 'Post text2 ..')]
[(1, 'Post text4 ..')]
[(1, 'Post text5 ..'), (2, 'Post text6 ..')]
[(4, 'Post text7 ..')]


In [29]:
# Organize pairs by using Python's built-in hash function as partition function. Keep the same number of partitions as the input pair RDD
myPairRDD6 = myPairRDD.partitionBy(myPairRDD.getNumPartitions(), hash)

In [30]:
# Print some statistics
analyzePartitions(myPairRDD6)

Partioner: <built-in function hash>
Num. partitions: 4
Content of the partitions
[(4, 'Post text7 ..')]
[(1, 'Post text1 ..'), (1, 'Post text4 ..'), (1, 'Post text5 ..'), (5, 'Post text8 ..')]
[(2, 'Post text2 ..'), (2, 'Post text6 ..')]
[(3, 'Post text3 ..'), (3, 'Post text9 ..')]


In [31]:
# filter the content of myPairRDD6
filteredPairRDD6 = myPairRDD6.filter(lambda pair: pair[0]!=5 and pair[0]!=3)

In [32]:
# Print some statistics
# Same number of partitions and same partitioner (hash function)
analyzePartitions(filteredPairRDD6)

Partioner: <built-in function hash>
Num. partitions: 4
Content of the partitions
[(4, 'Post text7 ..')]
[(1, 'Post text1 ..'), (1, 'Post text4 ..'), (1, 'Post text5 ..')]
[(2, 'Post text2 ..'), (2, 'Post text6 ..')]
[]


In [33]:
# Apply a map transformation to change the value part of the input pairs of myPairRDD6
# - key = input key
# - value = len(input value)
mapPairRDD6 = myPairRDD6.map(lambda pair: (pair[0], len(pair[1])) )

In [34]:
# Print some statistics
# Same number of partitions but note that there is no partitioner.
# No shuffle is performed, because data are not sent on the network to apply map, but the partitioner is lost.
# Pay attention that map causes the new returned RDD of pairs to forget the parent's partitioning information.
# The lambda function we used in the map transformation is not changing the key part but the system does not 
# know it
analyzePartitions(mapPairRDD6)

Partioner: No partitioner
Num. partitions: 4
Content of the partitions
[(4, 13)]
[(1, 13), (1, 13), (1, 13), (5, 13)]
[(2, 13), (2, 13)]
[(3, 13), (3, 13)]


In [35]:
# You can specify that you are not changing the key part by using the preservesPartitioning parameter
mapPairRDD7 = myPairRDD6.map(lambda pair: (pair[0], len(pair[1])), preservesPartitioning=True)

In [36]:
# Print some statistics
# Same number of partitions and also same partitioner in this case.
# Pay attention that the system trusts you. If you set preservesPartitioning to True but your map
# function changes the key part of the pairs the next operations could be wrong.
analyzePartitions(mapPairRDD7)

Partioner: <built-in function hash>
Num. partitions: 4
Content of the partitions
[(4, 13)]
[(1, 13), (1, 13), (1, 13), (5, 13)]
[(2, 13), (2, 13)]
[(3, 13), (3, 13)]


In [37]:
# Print some statistics about the input RDD of pairs myPairRDD6
analyzePartitions(myPairRDD6)

Partioner: <built-in function hash>
Num. partitions: 4
Content of the partitions
[(4, 'Post text7 ..')]
[(1, 'Post text1 ..'), (1, 'Post text4 ..'), (1, 'Post text5 ..'), (5, 'Post text8 ..')]
[(2, 'Post text2 ..'), (2, 'Post text6 ..')]
[(3, 'Post text3 ..'), (3, 'Post text9 ..')]


In [38]:
# Apply a mapValues transformation to change the value part of the input pairs of myPairRDD6
# - key = input key
# - value = len(input value)
mapPairRDD8 = myPairRDD6.mapValues(lambda value: len(value) )

In [39]:
# Print some statistics
# Same number of partitions and also same partitioner in this case because mapValues does not change the
# key part of the input pairs.
analyzePartitions(mapPairRDD8)

Partioner: <built-in function hash>
Num. partitions: 4
Content of the partitions
[(4, 13)]
[(1, 13), (1, 13), (1, 13), (5, 13)]
[(2, 13), (2, 13)]
[(3, 13), (3, 13)]


In [40]:
# Print some statistics about the input RDD of pairs mapPairRDD6
analyzePartitions(mapPairRDD6)

Partioner: No partitioner
Num. partitions: 4
Content of the partitions
[(4, 13)]
[(1, 13), (1, 13), (1, 13), (5, 13)]
[(2, 13), (2, 13)]
[(3, 13), (3, 13)]


In [41]:
# Apply reduceByKey to sum the values for each key
keySumRDD = mapPairRDD6.reduceByKey(lambda v1,v2: v1+v2)

In [42]:
# Print some statistics
# Note that the returned RDD is partitioned. 
# The system applies the hash partition function during the shuffle step that is executed before the 
# computation of the final result for each key
analyzePartitions(keySumRDD)

Partioner: <function portable_hash at 0x7f54ac3447b8>
Num. partitions: 4
Content of the partitions
[(4, 13)]
[(1, 39), (5, 13)]
[(2, 26)]
[(3, 26)]
