### Use multi-threading to submit jobs in parallel

In [1]:
import threading
import random

# Workload size: ten partitions, each contributing 500000 samples.
partitions = 10
n = partitions * 500000

# Use different seeds in different threads and different partitions.
# mapPartitionsWithIndex calls its function with only (index, it), so the
# per-thread seed offset is bound through a closure rather than passed as an
# extra argument.
def _make_hit_sampler(seed_offset):
    """Build a mapPartitionsWithIndex callback for the Monte Carlo estimate.

    Args:
        seed_offset: Integer added to the partition index to form the seed of
            that partition's random stream.

    Returns:
        A generator function ``sample(index, it)`` that yields one value per
        element of ``it``: 1 if a point drawn uniformly from
        [-1, 1) x [-1, 1) satisfies x**2 + y**2 < 1, otherwise 0.
    """
    def sample(index, it):
        # A private Random instance produces the same sequence that
        # random.seed(index + seed_offset) would, but does not touch the
        # process-wide generator, so interleaved generators (or any other
        # user of `random`) cannot disturb each other's streams.
        rng = random.Random(index + seed_offset)
        for _ in it:
            x = rng.random() * 2 - 1
            y = rng.random() * 2 - 1
            yield 1 if x ** 2 + y ** 2 < 1 else 0
    return sample

# One sampler per worker thread; the offsets are the original seed constants.
f1 = _make_hit_sampler(987231)
f2 = _make_hit_sampler(987232)
f3 = _make_hit_sampler(987233)
f4 = _make_hit_sampler(987234)
f5 = _make_hit_sampler(987245)  # kept as written (not 987235) so results stay reproducible

f = [f1, f2, f3, f4, f5]
    
# Body of one thread: submits a single Spark job and reports its estimate.
def dojob(i):
    """Run the Monte Carlo job for worker ``i`` and print its estimate of Pi.

    Distributes the integers 1..n over ``partitions`` partitions, maps every
    element to a 0/1 hit with the sampler ``f[i]``, adds the hits up and
    prints 4 * hits / n.
    """
    samples = sc.parallelize(range(1, n + 1), partitions)
    hits = samples.mapPartitionsWithIndex(f[i])
    inside = hits.reduce(lambda left, right: left + right)
    print("Worker", i, "reports: Pi is roughly", 4.0 * inside / n)

# Build one thread per worker id, then start them all so the jobs overlap.
threads = [threading.Thread(target=dojob, args=(worker,)) for worker in range(5)]
for t in threads:
    t.start()

# Block until every worker thread has finished.
for t in threads:
    t.join()

print("Finished")

                                                                                

Worker 3 reports: Pi is roughly 3.142096
Worker 1 reports: Pi is roughly 3.1422232


[Stage 2:>   (0 + 0) / 10][Stage 3:==> (6 + 4) / 10][Stage 4:>   (0 + 2) / 10][Stage 2:>                 (0 + 0) / 10][Stage 4:>                 (0 + 6) / 10]                                                                                

Worker 2 reports: Pi is roughly 3.1424288




Worker 4 reports: Pi is roughly 3.1419832
Worker 0 reports: Pi is roughly 3.1426512
Finished




### Example: Finding all primes

In [2]:
n = 500000
# Candidates 2 .. n-1 in 8 partitions; cached because the RDD is used twice below.
allnumbers = sc.parallelize(range(2, n), 8).cache()
# Every proper multiple (2x, 3x, ...) below n of every candidate x is composite.
composite = (
    allnumbers
    .flatMap(lambda base: range(2 * base, n, base))
    .repartition(8)
)
# Whatever is never produced as a multiple is prime.
prime = allnumbers.subtract(composite)
print(prime.take(10))



[17, 97, 113, 193, 241, 257, 337, 353, 401, 433]


                                                                                

In [3]:
# Find the number of elements in each partition
def partitionsize(it):
    """Yield a single integer: the number of elements produced by ``it``.

    Intended as a mapPartitions callback, so that collecting the result gives
    one count per partition.

    Args:
        it: Iterator over the elements of one partition.

    Yields:
        The element count (0 for an empty iterator).
    """
    # Count while streaming instead of building a list of the whole partition.
    yield sum(1 for _ in it)

# Number of elements per partition for each of the three RDDs.
print(allnumbers.mapPartitions(partitionsize).collect())
print(composite.mapPartitions(partitionsize).collect())
print(prime.mapPartitions(partitionsize).collect())

# Fetch the first four partitions of `prime` once and reuse the result:
# `prime` is not cached, so every separate glom().take(4) call would launch
# the same Spark work again just to read a different slice.
prime_head = prime.glom().take(4)
print(prime_head[1][0:4])
print(prime_head[2][0:4])
print(prime_head[3][0:4])

# First 40 elements of the first partition of `composite`.
print(composite.glom().take(1)[0][0:40])

[62499, 62500, 62500, 62500, 62499, 62500, 62500, 62500]
[704805, 704790, 704800, 704800, 704800, 704799, 704800, 704816]


                                                                                

[0, 5169, 1, 5219, 0, 5206, 0, 5189, 0, 5165, 0, 5199, 0, 5191, 0, 5199]


                                                                                

[17, 97, 113, 193]


                                                                                

[2]


                                                                                

[3, 19, 67, 83]
[44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 364, 366, 368, 370, 372, 374, 376, 378, 380, 382, 524, 526, 528, 530, 532, 534, 536, 538, 540, 542]


In [4]:
# repartition vs coalesce

# Baseline: the integers 0..29 spread over 6 partitions.
rdd1 = sc.parallelize(range(30), 6)

# repartition can increase or decrease the level of parallelism in this RDD.
# Internally, this uses a chunk-based shuffle to redistribute data.
rdd2 = rdd1.repartition(6)

# If you are decreasing the number of partitions in this RDD, consider using
# coalesce, which can avoid performing a shuffle.
# coalesce merges adjacent partitions, so it cannot fix skew issues!
rdd3 = rdd1.coalesce(3)

# Print the partition layouts in order: original, repartitioned, coalesced.
for layout in (rdd1.glom(), rdd2.glom(), rdd3.glom()):
    print(layout.collect())

[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19], [20, 21, 22, 23, 24], [25, 26, 27, 28, 29]]
[[], [0, 1, 2, 3, 4, 25, 26, 27, 28, 29], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24], [], [10, 11, 12, 13, 14], [5, 6, 7, 8, 9]]
[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]


### Data Partitioning

In [5]:
# Walk-through of when an RDD carries a partitioner. `rdd` is rebound several
# times, so the statements must run in exactly this order.
data = [8, 8, 1, 96, 240, 400, 1, 800, 4, 12]
# (key, value) pairs with value == key, split into 4 partitions.
rdd = sc.parallelize(zip(data, data),4)
print(rdd.partitioner)  # None in the recorded output: no partitioner after parallelize
rdd = rdd.map(lambda t: (t[0], t[1]+1))
print(rdd.partitioner)  # still None after map
print(rdd.glom().collect())

# reduceByKey merges the values of equal keys; afterwards the RDD reports a
# partitioner whose partitionFunc is portable_hash (see recorded output).
rdd = rdd.reduceByKey(lambda x,y: x+y)
print(rdd.glom().collect())
print(rdd.partitioner)
print(rdd.partitioner.partitionFunc)

# map on the hash-partitioned RDD: the elements stay where they are, but the
# result no longer reports a partitioner (None in the recorded output).
rdd1 = rdd.map(lambda x: (x[0], x[1]+1))
print(rdd1.glom().collect())
print(rdd1.partitioner)

# mapValues only touches the values; the portable_hash partitioner is kept.
rdd2 = rdd.mapValues(lambda x: x+1)
print(rdd2.partitioner.partitionFunc)

# sortByKey installs a range partitioner (keys end up in ascending ranges per
# partition); a following mapValues keeps that partitioner too.
rdd = rdd.sortByKey()
print(rdd.glom().collect())
print(rdd.partitioner.partitionFunc)
rdd3 = rdd.mapValues(lambda x: x+1)
print(rdd3.partitioner.partitionFunc)

None
None
[[(8, 9), (8, 9)], [(1, 2), (96, 97)], [(240, 241), (400, 401)], [(1, 2), (800, 801), (4, 5), (12, 13)]]
[[(8, 18), (96, 97), (240, 241), (400, 401), (800, 801), (4, 5), (12, 13)], [(1, 4)], [], []]
<pyspark.rdd.Partitioner object at 0x7fecb15b05b0>
<function portable_hash at 0x7fece43443a0>
[[(8, 19), (96, 98), (240, 242), (400, 402), (800, 802), (4, 6), (12, 14)], [(1, 5)], [], []]
None
<function portable_hash at 0x7fece43443a0>
[[(1, 4), (4, 5), (8, 18)], [(12, 13), (96, 97)], [(240, 241), (400, 401)], [(800, 801)]]
<function RDD.sortByKey.<locals>.rangePartitioner at 0x7fecb0543af0>
<function RDD.sortByKey.<locals>.rangePartitioner at 0x7fecb0543af0>


In [6]:
# Same (key, value) data as in the previous cell, again in 4 partitions.
data = [8, 8, 1, 96, 240, 400, 1, 800, 4, 12]
rdd = sc.parallelize(zip(data, data),4)
print(rdd.partitioner)  # None: a freshly parallelized RDD has no partitioner

# repartition does a random repartitioning, resulting in no partitioner.
rdd1 = rdd.repartition(4)
print(rdd1.glom().collect())
print(rdd1.partitioner)

# partitionBy partitions data by hashing the key.
# This can only be applied to (key, value) pairs.
# In the recorded output equal keys end up in the same partition and the
# partitioner's partitionFunc is portable_hash.
rdd2 = rdd.partitionBy(4)
print(rdd2.glom().collect())
print(rdd2.partitioner)
print(rdd2.partitioner.partitionFunc)

None
[[(240, 240), (400, 400), (1, 1), (800, 800), (4, 4), (12, 12)], [(1, 1), (96, 96)], [], [(8, 8), (8, 8)]]
None
[[(8, 8), (8, 8), (96, 96), (240, 240), (400, 400), (800, 800), (4, 4), (12, 12)], [(1, 1), (1, 1)], [], []]
<pyspark.rdd.Partitioner object at 0x7fecb15b0550>
<function portable_hash at 0x7fece43443a0>


In [7]:
# Partition using a custom partition function

def partitionsize(it):
    """Yield a single integer: the number of elements produced by ``it``.

    Same helper as defined in an earlier cell; used with mapPartitions to
    obtain one element count per partition.
    """
    # Count while streaming instead of building a list of the whole partition.
    yield sum(1 for _ in it)
    
n = 40000

def f(x):
    """Custom partition function: return the key's remainder modulo 17.

    NOTE(review): this produces 17 distinct values (0..16) while the cell
    below requests 16 partitions; presumably the value 16 wraps around to
    partition 0, which would match the 295-vs-147 sizes in the recorded
    output -- confirm against the partitionBy documentation.
    """
    remainder = x % 17
    return remainder

# Keys: every multiple of 16 below n, each occurring twice.
data1 = list(range(0, n, 16)) * 2
# Values: the multiples of 8 below n (same length as data1).
data2 = range(0, n, 8)

# Raw pairs in 16 partitions.
rdd1 = sc.parallelize(zip(data1, data2), 16)
print(rdd1.mapPartitions(partitionsize).collect())

# Default (hash) partitioning after summing the values per key.
rdd2 = rdd1.reduceByKey(lambda left, right: left + right)
print(rdd2.mapPartitions(partitionsize).collect())

# Redistribute the reduced pairs with the custom partition function f.
rdd3 = rdd2.partitionBy(16, f)
print(rdd3.mapPartitions(partitionsize).collect())

# Alternatively, hand f to reduceByKey so its shuffle uses f directly.
rdd4 = rdd1.reduceByKey(lambda left, right: left + right, partitionFunc=f)
print(rdd4.mapPartitions(partitionsize).collect())
print(rdd4.partitioner.partitionFunc)

[312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 320]
[2500, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[295, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147]
[295, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147]
<function f at 0x7fecb0569160>


In [8]:
# Join two RDDs that are not co-partitioned.
# The resulting RDD has twice the partition number (16 in the recorded output
# for two 8-partition inputs).

a = sc.parallelize(zip(range(10000), range(10000)), 8)
b = sc.parallelize(zip(range(10000), range(10000)), 8)
c = a.join(b)
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
print(c.glom().take(2)[1][0:4])  # first four pairs of the second partition

# After a shuffling operation, the resulting RDD is hash partitioned.
print(a.partitioner)  # None before the shuffle
a = a.reduceByKey(lambda x,y: x+y)
print(a.partitioner.partitionFunc)
b = b.reduceByKey(lambda x,y: x+y)
print(b.partitioner.partitionFunc)

# Join two co-partitioned RDDs: no shuffle is needed and the partition number
# stays the same (8 in the recorded output).
c = a.join(b)
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
print(c.glom().first()[0:4])

# coalesce/repartition removes the partitioner.
b = b.coalesce(8)
print(b.partitioner)  # None, although the partition count is unchanged
c = a.join(b)  # This join still requires a shuffle
print(c.getNumPartitions())  # back to 16 in the recorded output
print(c.partitioner.partitionFunc)
print(c.glom().first()[0:4])


16
<function portable_hash at 0x7fece43443a0>
[(1, (1, 1)), (17, (17, 17)), (33, (33, 33)), (49, (49, 49))]
None
<function portable_hash at 0x7fece43443a0>
<function portable_hash at 0x7fece43443a0>
8
<function portable_hash at 0x7fece43443a0>
[(0, (0, 0)), (8, (8, 8)), (16, (16, 16)), (24, (24, 24))]
None
16
<function portable_hash at 0x7fece43443a0>
[(0, (0, 0)), (16, (16, 16)), (32, (32, 32)), (48, (48, 48))]


In [9]:
# Create two RDDs with different numbers of partitions.
a = sc.parallelize(zip(range(10000), range(10000)), 4)
b = sc.parallelize(zip(range(10000), range(10000)), 8)

# They are not co-partitioned because they have different numbers of partitions
# (reduceByKey is called without an explicit partition count here).
a = a.reduceByKey(lambda x,y: x+y)
b = b.reduceByKey(lambda x,y: x+y)

# The join result has 4 + 8 = 12 partitions in the recorded output.
c = a.join(b)
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
print(c.glom().first()[0:4])

# To avoid a third shuffle, use the same partition number in the first two shuffles:
a = sc.parallelize(zip(range(10000), range(10000)), 4)
b = sc.parallelize(zip(range(10000), range(10000)), 8)

# Ask for 8 partitions explicitly so that `a` matches `b` after the shuffle.
a = a.reduceByKey(lambda x,y: x+y, 8)
b = b.reduceByKey(lambda x,y: x+y)

# Now the join result keeps 8 partitions (see recorded output).
c = a.join(b)
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
print(c.glom().first()[0:4])

12
<function portable_hash at 0x7fece43443a0>
[(0, (0, 0)), (12, (12, 12)), (24, (24, 24)), (36, (36, 36))]
8
<function portable_hash at 0x7fece43443a0>
[(0, (0, 0)), (8, (8, 8)), (16, (16, 16)), (24, (24, 24))]


In [10]:
# lookup(key): return the list of values in the RDD stored under `key`.
# This operation is done efficiently if the RDD has a known partitioner, by
# only searching the partition that the key maps to.

l = range(1000)
rdd = sc.parallelize(zip(l, l), 10)
print(rdd.lookup(42))  # slow: this RDD has no partitioner
# Deliberately not named `sorted`: that would shadow Python's built-in
# sorted() for every later cell of the notebook.
sorted_rdd = rdd.sortByKey() # induces a range partitioner
print(sorted_rdd.lookup(42))  # fast
print(sorted_rdd.lookup(1024)) # fast; key is absent, so the result is []

[42]
[42]
[]


### Partitioning in DataFrames

In [11]:
# Hash partitioner in SparkSQL

# Turn off coalescing of shuffle partitions so the join output keeps one
# partition per shuffle partition (200 in the recorded output).
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)

import pyspark.sql.functions

data1 = [1, 2, 1, 1, 2, 3, 4, 4, 5, 2, 1]
data2 = [2, 1, 1, 3, 4, 4, 5, 2, 1, 5, 3]

df1 = spark.createDataFrame(zip(data1, data2), ['a', 'b'])
df2 = spark.createDataFrame(zip(data1, data2), ['a', 'c'])

# Join on 'a'; in the recorded output all rows with the same 'a' share a partition.
df1 = df1.join(df2, 'a')
print(df1.rdd.getNumPartitions())
print(df1.rdd.glom().collect())
print(df1.rdd.partitioner)  # This doesn't work for dataframes, as the RDD underlying a dataframe is virtual

# SparkSQL uses MurmurHash to make generating adversarial data more difficult.
# Calling SparkSQL's hash function, plus the partition id derived from it.
# The id is the *non-negative* modulus of the hash: the hash can be negative
# and `%` keeps the sign of the dividend, so `hash % 200` could show a
# negative number that is not a partition index. pmod() always lands in 0..199.
df1.select(
    '*',
    pyspark.sql.functions.hash(df1['a']),
    pyspark.sql.functions.expr('pmod(hash(a), 200)'),
).show()

# Calling Python's hash function
print(hash(1))

200


                                                                                

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [Row(a=5, b=1, c=1)], [Row(a=1, b=2, c=2), Row(a=1, b=2, c=1), Row(a=1, b=2, c=3), Row(a=1, b=2, c=3), Row(a=1, b=1, c=2), Row(a=1, b=1, c=1), Row(a=1, b=1, c=3), Row(a=1, b=1, c=3), Row(a=1, b=3, c=2), Row(a=1, b=3, c=1), Row(a=1, b=3, c=3), Row(a=1, b=3, c=3), Row(a=1, b=3, c=2), Row(a=1, b=3, c=1), Row(a=1, b=3, c=3), Row(a=1, b=3, c=3)], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [Row(a=3, b=4, c=4)], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [Row(a=2, b=1, c=1), Row(a=2, b=1, c=4), Row(a=2, b=1, c=5), Row(a=2, b=4, c=1), Row(a=2, b=4, c=4), Row(a=2, b=4, c=5), Row(a=2, b=5

In [12]:
# Disable coalescing of shuffle partitions, then show how many partitions a
# DataFrame has before and after a shuffle (groupBy).
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)  # Default in Spark 2.x 
print(spark.conf.get('spark.sql.shuffle.partitions'))  # number of partitions in a shuffle, default is 200

#spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", True)  # Default in Spark 3.x 
# When this is set to True, Spark will coalesce contiguous shuffle partitions according to the target size
# (specified by spark.sql.adaptive.advisoryPartitionSizeInBytes, default is 64 MB), to avoid too many small tasks.
# You may want to change this if your dataframe is not too large, to get better parallelism.

data1 = [1, 2, 1, 1, 2, 3, 4, 4, 5, 2, 1]
data2 = [2, 1, 1, 3, 4, 4, 5, 2, 1, 5, 3]

# Before any shuffle: 6 partitions in the recorded run.
df1 = spark.createDataFrame(zip(data1, data2), ['a', 'b'])
print(df1.rdd.getNumPartitions())
print(df1.rdd.glom().collect())

# groupBy shuffles the data; with coalescing disabled the result reports
# 200 partitions (the spark.sql.shuffle.partitions value printed above).
df2 = df1.groupBy('a').count()
df2.show()
print(df2.rdd.getNumPartitions())

200
6
[[Row(a=1, b=2)], [Row(a=2, b=1), Row(a=1, b=1)], [Row(a=1, b=3), Row(a=2, b=4)], [Row(a=3, b=4), Row(a=4, b=5)], [Row(a=4, b=2), Row(a=5, b=1)], [Row(a=2, b=5), Row(a=1, b=3)]]
+---+-----+
|  a|count|
+---+-----+
|  5|    1|
|  1|    4|
|  3|    1|
|  2|    3|
|  4|    2|
+---+-----+

200


In [13]:
# Partition counts of a larger DataFrame before and after a groupBy shuffle.
n = 1000000
# NOTE(review): `partitions` is only referenced by the commented-out
# spark.sql.shuffle.partitions line below, so it currently has no effect here.
partitions = 40
df = spark.range(n)  # More efficient than using parallelize(range()) 
df = df.select(df[0].alias('a'), df[0].alias('b')).cache()
# The returned rows are discarded; presumably this only triggers an action on
# the cached DataFrame -- TODO confirm intent.
df.take(3)

spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False) # default is True

spark.conf.set("spark.sql.adaptive.coalescePartitions.parallelismFirst", False)  # default is True in Spark 3.2
# When true, Spark ignores the target size specified by spark.sql.adaptive.advisoryPartitionSizeInBytes 
# (default 64MB) when coalescing contiguous shuffle partitions, and only respects the minimum partition
# size specified by spark.sql.adaptive.coalescePartitions.minPartitionSize (default 1MB), to maximize the 
# parallelism. This is to avoid performance regression when enabling adaptive query execution. 


#spark.conf.set('spark.sql.shuffle.partitions', partitions)  # number of partitions in a shuffle, default is 200

# NOTE(review): nothing in this cell uses a name from this wildcard import.
# It binds the public names of pyspark.sql.functions into the notebook
# namespace; that module provides `hash` (an earlier cell calls it as
# pyspark.sql.functions.hash), so re-running that cell's `print(hash(1))`
# afterwards would presumably no longer reach Python's builtin. Consider a
# qualified import instead -- left in place in case later cells rely on it.
from pyspark.sql.functions import *

print(df.rdd.getNumPartitions())  # 6 in the recorded run

# Group on the first column; the shuffled result reports 200 partitions.
df1 = df.groupBy(df[0]).count()
print(df1.rdd.getNumPartitions())



6




200


In [14]:
# Join hints: ask the planner for a specific join strategy.

# Two 1000-row DataFrames sharing the key column 'a'.
a = spark.createDataFrame(zip(range(1000), range(1000)), ['a', 'a1'])
b = spark.createDataFrame(zip(range(1000), range(1000)), ['a', 'b1'])

# Alternatives to try (enable exactly one): no hint, broadcast, shuffle_hash, merge.
#c = a.join(b, 'a')
c = a.join(b.hint('broadcast'), 'a')
#c = a.join(b.hint('shuffle_hash'), 'a')
# c = a.join(b.hint('merge'), 'a')
c.show()

# With the broadcast hint the recorded physical plan contains a
# BroadcastHashJoin that builds from the right side (b).
c.explain()

+---+---+---+
|  a| a1| b1|
+---+---+---+
|  0|  0|  0|
|  1|  1|  1|
|  2|  2|  2|
|  3|  3|  3|
|  4|  4|  4|
|  5|  5|  5|
|  6|  6|  6|
|  7|  7|  7|
|  8|  8|  8|
|  9|  9|  9|
| 10| 10| 10|
| 11| 11| 11|
| 12| 12| 12|
| 13| 13| 13|
| 14| 14| 14|
| 15| 15| 15|
| 16| 16| 16|
| 17| 17| 17|
| 18| 18| 18|
| 19| 19| 19|
+---+---+---+
only showing top 20 rows

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [a#214L, a1#215L, b1#219L]
   +- BroadcastHashJoin [a#214L], [a#218L], Inner, BuildRight, false
      :- Filter isnotnull(a#214L)
      :  +- Scan ExistingRDD[a#214L,a1#215L]
      +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=421]
         +- Filter isnotnull(a#218L)
            +- Scan ExistingRDD[a#218L,b1#219L]




                                                                                