In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
# Environment variables for the local Spark install and the Python
# executables PySpark should use; applied in a single update call.
os.environ.update(
    {
        'SPARK_HOME': r'C:\spark\spark-3.5.4-bin-hadoop3',
        'PYSPARK_DRIVER_PYTHON': 'jupyter',
        'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
        'PYSPARK_PYTHON': 'python',
    }
)

In [3]:
# Session-level settings, applied to the builder in the order listed.
# NOTE(review): the master is local[*]; the executor, dynamic-allocation and
# shuffle-service settings are presumably not effective in local mode -- confirm.
spark_settings = {
    "spark.executor.memory": "16g",
    "spark.driver.memory": "16g",
    "spark.executor.cores": "4",
    "spark.sql.shuffle.partitions": "80",
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.minExecutors": "2",
    "spark.dynamicAllocation.initialExecutors": "24",
    "spark.dynamicAllocation.maxExecutors": "50",
    "spark.shuffle.service.enabled": "true",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

session_builder = SparkSession.builder.appName("PySpark Zero to Hero").master("local[*]")
for conf_key, conf_value in spark_settings.items():
    session_builder = session_builder.config(conf_key, conf_value)

# Create the session, or reuse one that already exists in this process.
spark = session_builder.getOrCreate()

In [5]:
# Turn off the optimizer features that would otherwise reshape the physical
# plan, so the plan printed later shows the raw shuffle/join strategy.

# Disable Adaptive Query Execution.
spark.conf.set('spark.sql.adaptive.enabled', False)
# Disable AQE's post-shuffle partition coalescing. The key lives under
# `spark.sql.adaptive.` and is spelled `coalescePartitions`; a misspelled key
# is accepted silently and has no effect.
spark.conf.set('spark.sql.adaptive.coalescePartitions.enabled', False)
# -1 disables automatic broadcast joins.
spark.conf.set('spark.sql.autoBroadcastJoinThreshold', -1)

In [6]:
# Display the default parallelism reported by the SparkContext
# (it matches the `splits=` value on the Range nodes in the plan below).
spark.sparkContext.defaultParallelism

24

In [7]:
# Two single-column (`id`) ranges with different steps; they share the
# values 6, 10, ..., 198, which is what the later join keeps.
df_1 = spark.range(start=4, end=200, step=2)
df_2 = spark.range(start=2, end=200, step=4)

In [8]:
# Redistribute each range into a fixed number of partitions
# (5 and 7 respectively) before joining.
df_3 = df_1.repartition(numPartitions=5)
df_4 = df_2.repartition(numPartitions=7)

In [9]:
# Inner equi-join of the two repartitioned frames on the shared `id` column.
df_joined = df_3.join(other=df_4, on="id", how="inner")

In [10]:
# Global aggregate: add up every joined id into a single-row frame
# whose only column is named `sum`.
id_total = F.sum(F.col("id")).alias("sum")
df_sum = df_joined.select(id_total)

In [11]:
# Print the physical plan only ("simple" is the default explain mode).
df_sum.explain(mode="simple")

== Physical Plan ==
*(6) HashAggregate(keys=[], functions=[sum(id#0L)])
+- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=59]
   +- *(5) HashAggregate(keys=[], functions=[partial_sum(id#0L)])
      +- *(5) Project [id#0L]
         +- *(5) SortMergeJoin [id#0L], [id#2L], Inner
            :- *(2) Sort [id#0L ASC NULLS FIRST], false, 0
            :  +- Exchange hashpartitioning(id#0L, 80), ENSURE_REQUIREMENTS, [plan_id=43]
            :     +- Exchange RoundRobinPartitioning(5), REPARTITION_BY_NUM, [plan_id=42]
            :        +- *(1) Range (4, 200, step=2, splits=24)
            +- *(4) Sort [id#2L ASC NULLS FIRST], false, 0
               +- Exchange hashpartitioning(id#2L, 80), ENSURE_REQUIREMENTS, [plan_id=50]
                  +- Exchange RoundRobinPartitioning(7), REPARTITION_BY_NUM, [plan_id=49]
                     +- *(3) Range (2, 200, step=4, splits=24)




In [13]:
# `union` matches columns by position, not by name: the ids of df_4 are
# appended underneath the single `sum` row and inherit that column name.
df_union = df_sum.union(other=df_4)

In [15]:
# Render the first 20 rows (the defaults of `show`, spelled out explicitly).
df_union.show(n=20, truncate=True)

+----+
| sum|
+----+
|4998|
|  10|
|  26|
| 122|
| 154|
| 186|
| 198|
|  30|
| 118|
| 190|
|  22|
|  38|
|  46|
|  62|
|  66|
| 178|
| 194|
|   6|
|  18|
|  34|
+----+
only showing top 20 rows

