<h2> Imports & Configuration </h2>

In [1]:
# Make the notebook echo the value of every top-level expression in a cell,
# not only the final one.
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [2]:
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [3]:
# Local Spark session backed by 4 worker threads (reuses an existing session if one is active).
spark = (
    SparkSession.builder
    .master("local[4]")
    .getOrCreate()
)

23/06/06 14:41:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Adaptive query execution is switched off -- presumably so it does not alter the
# shuffle partitions inspected below (NOTE(review): confirm intent).
spark.conf.set("spark.sql.adaptive.enabled", "false")

# Use 3 shuffle partitions, then echo the setting back as the cell output.
spark.conf.set("spark.sql.shuffle.partitions", "3")
spark.conf.get("spark.sql.shuffle.partitions")

'3'

<h2> Simulating Skewed Join </h2>

In [24]:
# Uniform side of the join: the integers 0..999999, one row each, in a single
# int column (Spark names it "value").
df_uniform = spark.createDataFrame(list(range(1000000)), schema=IntegerType())
df_uniform.show(n=5, truncate=False)

+-----+
|value|
+-----+
|0    |
|1    |
|2    |
|3    |
|4    |
+-----+
only showing top 5 rows



23/06/06 20:05:34 WARN TaskSetManager: Stage 55 contains a task of very large size (1590 KiB). The maximum recommended task size is 1000 KiB.


In [23]:
# Row count per physical partition of the uniform frame, ordered by partition id.
(
    df_uniform
    .withColumn("partition", F.spark_partition_id())
    .groupBy("partition")
    .count()
    .orderBy("partition")
    .show()
)

23/06/06 20:04:29 WARN TaskSetManager: Stage 53 contains a task of very large size (1590 KiB). The maximum recommended task size is 1000 KiB.


+---------+------+
|partition| count|
+---------+------+
|        0|749568|
|        1|749568|
|        2|749568|
|        3|751296|
+---------+------+



In [25]:
# Skewed side of the join: key 0 holds 999,990 of the 1,000,000 rows, keys 1 and 2
# hold 5 rows each. Every per-key frame is squeezed into a single partition before
# the three are stacked with union.
df0 = spark.createDataFrame([0] * 999990, schema=IntegerType()).repartition(numPartitions=1)
df1 = spark.createDataFrame([1] * 5, schema=IntegerType()).repartition(numPartitions=1)
df2 = spark.createDataFrame([2] * 5, schema=IntegerType()).repartition(numPartitions=1)

df_skew = df0.union(df1)
df_skew = df_skew.union(df2)
df_skew.show(n=5, truncate=False)

+-----+
|value|
+-----+
|0    |
|0    |
|0    |
|0    |
|0    |
+-----+
only showing top 5 rows



In [22]:
# Row count per physical partition of the skewed frame, ordered by partition id.
(
    df_skew
    .withColumn("partition", F.spark_partition_id())
    .groupBy("partition")
    .count()
    .orderBy("partition")
    .show()
)

+---------+------+
|partition| count|
+---------+------+
|        0|999990|
|        1|     5|
|        2|     5|
+---------+------+



In [9]:
# Row count per physical partition of the uniform frame.
# Sorted by partition id so the table is stable between runs and reads the same
# way as the other partition histograms in this notebook.
(
    df_uniform
    .withColumn("partition", F.spark_partition_id())
    .groupBy("partition")
    .count()
    .orderBy("partition")
    .show()
)

23/06/06 14:41:47 WARN TaskSetManager: Stage 14 contains a task of very large size (1590 KiB). The maximum recommended task size is 1000 KiB.


+---------+------+
|partition| count|
+---------+------+
|        0|249856|
|        2|249856|
|        3|250432|
|        1|249856|
+---------+------+



In [10]:
# Baseline: plain inner join on the raw key, without any salting.
df_joined_c1 = df_skew.join(df_uniform, on="value", how="inner")

In [11]:
# Row count per physical partition of the unsalted join result.
# Sorted by partition id so the table is stable between runs and reads the same
# way as the other partition histograms in this notebook.
(
    df_joined_c1
    .withColumn("partition", F.spark_partition_id())
    .groupBy("partition")
    .count()
    .orderBy("partition")
    .show()
)

23/06/06 14:41:48 WARN TaskSetManager: Stage 21 contains a task of very large size (1590 KiB). The maximum recommended task size is 1000 KiB.
[Stage 20:>                                                         (0 + 4) / 4]

+---------+------+
|partition| count|
+---------+------+
|        0|999995|
|        1|     5|
+---------+------+



                                                                                

<h2> Simulating Uniform Distribution Through Salting </h2>

In [12]:
# Number of salt buckets each join key is spread over.
# It equals the "spark.sql.shuffle.partitions" value configured above. If it is ever
# read from the Spark conf instead, wrap it in int(): conf.get() returns a string.
SALT_NUMBER = 3

In [13]:
# Tag every row of the skewed side with a pseudo-random salt bucket in
# [0, SALT_NUMBER): rand() is in [0, 1), so the int cast yields 0 .. SALT_NUMBER - 1.
# The seed is pinned so the salts (and the counts shown further down) do not change
# on every run; Spark still ties the generated sequence to the partition layout.
# withColumn replaces an existing "salt" column, so re-running this cell is safe.
df_skew = df_skew.withColumn(
    "salt",
    (F.rand(seed=42) * SALT_NUMBER).cast("int"),
)

In [20]:
# Peek at the first 20 salted rows of the skewed side, without truncating cell values.
df_skew.show(n=20, truncate=False)

+-----+----+
|value|salt|
+-----+----+
|0    |0   |
|0    |0   |
|0    |2   |
|0    |2   |
|0    |1   |
|0    |0   |
|0    |1   |
|0    |2   |
|0    |1   |
|0    |1   |
|0    |1   |
|0    |1   |
|0    |0   |
|0    |2   |
|0    |2   |
|0    |1   |
|0    |0   |
|0    |1   |
|0    |1   |
|0    |0   |
+-----+----+
only showing top 20 rows



In [15]:
# Replicate every row of the uniform side once per salt bucket, so that whichever
# salt a skewed-side row received there is a matching (value, salt) row to join with.
#
# This cell rebinds `df_uniform`, so a plain re-run would explode the already
# exploded frame again and multiply the row count by SALT_NUMBER each time.
# To stay idempotent, first collapse a previously salted frame back to one row per
# original row: exploding [0, 1, ..., k-1] emits exactly one salt == 0 row per input row.
if "salt" in df_uniform.columns:
    df_uniform = df_uniform.where(F.col("salt") == 0).drop("salt", "salt_values")

df_uniform = (
    df_uniform
    # Constant array [0, 1, ..., SALT_NUMBER - 1] attached to every row ...
    .withColumn("salt_values", F.array([F.lit(i) for i in range(SALT_NUMBER)]))
    # ... then one output row per array element.
    .withColumn("salt", F.explode(F.col("salt_values")))
)

In [21]:
# Peek at the first 20 rows of the salted uniform side, without truncating cell values.
df_uniform.show(n=20, truncate=False)

+-----+-----------+----+
|value|salt_values|salt|
+-----+-----------+----+
|0    |[0, 1, 2]  |0   |
|0    |[0, 1, 2]  |1   |
|0    |[0, 1, 2]  |2   |
|1    |[0, 1, 2]  |0   |
|1    |[0, 1, 2]  |1   |
|1    |[0, 1, 2]  |2   |
|2    |[0, 1, 2]  |0   |
|2    |[0, 1, 2]  |1   |
|2    |[0, 1, 2]  |2   |
|3    |[0, 1, 2]  |0   |
|3    |[0, 1, 2]  |1   |
|3    |[0, 1, 2]  |2   |
|4    |[0, 1, 2]  |0   |
|4    |[0, 1, 2]  |1   |
|4    |[0, 1, 2]  |2   |
|5    |[0, 1, 2]  |0   |
|5    |[0, 1, 2]  |1   |
|5    |[0, 1, 2]  |2   |
|6    |[0, 1, 2]  |0   |
|6    |[0, 1, 2]  |1   |
+-----+-----------+----+
only showing top 20 rows



23/06/06 16:43:20 WARN TaskSetManager: Stage 47 contains a task of very large size (1590 KiB). The maximum recommended task size is 1000 KiB.


In [17]:
# Salted join: match on the original key together with the salt bucket.
df_joined = df_skew.join(df_uniform, on=["value", "salt"], how="inner")

In [18]:
# Row count per (join key, physical partition) of the salted join result,
# ordered by key and then by partition id.
(
    df_joined
    .withColumn("partition", F.spark_partition_id())
    .groupBy("value", "partition")
    .count()
    .orderBy("value", "partition")
    .show()
)

23/06/06 14:41:50 WARN TaskSetManager: Stage 35 contains a task of very large size (1590 KiB). The maximum recommended task size is 1000 KiB.
[Stage 37:>                                                         (0 + 3) / 3]

+-----+---------+------+
|value|partition| count|
+-----+---------+------+
|    0|        0|333483|
|    0|        1|332851|
|    0|        2|333656|
|    1|        1|     5|
|    2|        0|     2|
|    2|        2|     3|
+-----+---------+------+



                                                                                