In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session explicitly so this notebook also runs on a fresh
# kernel where `spark` is not pre-injected. getOrCreate() returns the
# already-running session when one exists, so managed environments are
# unaffected.
spark = SparkSession.builder.getOrCreate()

# Two tiny sample tables keyed by "id". Ids 1 and 2 appear on both sides;
# id 3 exists only in data1 and id 4 only in data2, so an inner join on "id"
# keeps exactly two rows.
data1 = [(1, "A"), (2, "B"), (3, "C")]
data2 = [(1, "X"), (2, "Y"), (4, "Z")]

df1 = spark.createDataFrame(data1, ["id", "val1"])
df2 = spark.createDataFrame(data2, ["id", "val2"])


In [0]:
from pyspark.sql.functions import broadcast

# Broadcast join: wrap the right-hand side in broadcast() to mark it as
# small enough to be used in a broadcast join, then inner-join on "id".
df_broadcast = df1.join(
    broadcast(df2),
    on="id",
    how="inner",
)
df_broadcast.show()


+---+----+----+
| id|val1|val2|
+---+----+----+
|  1|   A|   X|
|  2|   B|   Y|
+---+----+----+



In [0]:
# Shuffle hash join: a plain df1.join(df2, ...) does not reliably produce this
# strategy, because Spark prefers sort-merge join by default
# (spark.sql.join.preferSortMergeJoin). Request it explicitly with the
# "shuffle_hash" join hint (Spark 3.0+) on the right-hand side so this cell
# actually demonstrates the strategy its name refers to.
df_shuffle = df1.join(df2.hint("shuffle_hash"), "id", "inner")
df_shuffle.show()


+---+----+----+
| id|val1|val2|
+---+----+----+
|  1|   A|   X|
|  2|   B|   Y|
+---+----+----+



In [0]:
# Disable broadcast joins for the session by setting the auto-broadcast
# threshold to -1. This is a session-level setting and stays in effect for
# every cell executed afterwards.
spark.conf.set(
    "spark.sql.autoBroadcastJoinThreshold",
    -1,
)


In [0]:
# Sort-merge join: with broadcasting disabled (threshold = -1), an ordinary
# join is left for Spark to plan without the broadcast option.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# Plain inner join on "id"; no hint is given.
df_sort_merge = df1.join(other=df2, on="id", how="inner")
df_sort_merge.show()


+---+----+----+
| id|val1|val2|
+---+----+----+
|  1|   A|   X|
|  2|   B|   Y|
+---+----+----+



In [0]:
# Cross join: pair every row of df1 with every row of df2 (no join key),
# so both "id" columns appear in the result.
df_cross = df1.crossJoin(other=df2)
df_cross.show()


+---+----+---+----+
| id|val1| id|val2|
+---+----+---+----+
|  1|   A|  1|   X|
|  1|   A|  2|   Y|
|  1|   A|  4|   Z|
|  2|   B|  1|   X|
|  2|   B|  2|   Y|
|  2|   B|  4|   Z|
|  3|   C|  1|   X|
|  3|   C|  2|   Y|
|  3|   C|  4|   Z|
+---+----+---+----+



In [0]:
# Print the physical plan of the broadcast join in the "formatted" layout:
# an operator tree followed by per-operator details.
df_broadcast.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (8)
+- Project (7)
   +- BroadcastHashJoin Inner BuildRight (6)
      :- Filter (2)
      :  +- Scan ExistingRDD (1)
      +- Exchange (5)
         +- Filter (4)
            +- Scan ExistingRDD (3)


(1) Scan ExistingRDD
Output [2]: [id#2L, val1#3]
Arguments: [id#2L, val1#3], MapPartitionsRDD[4] at applySchemaToPythonRDD at NativeMethodAccessorImpl.java:0, ExistingRDD, UnknownPartitioning(0)

(2) Filter
Input [2]: [id#2L, val1#3]
Condition : isnotnull(id#2L)

(3) Scan ExistingRDD
Output [2]: [id#6L, val2#7]
Arguments: [id#6L, val2#7], MapPartitionsRDD[9] at applySchemaToPythonRDD at NativeMethodAccessorImpl.java:0, ExistingRDD, UnknownPartitioning(0)

(4) Filter
Input [2]: [id#6L, val2#7]
Condition : isnotnull(id#6L)

(5) Exchange
Input [2]: [id#6L, val2#7]
Arguments: SinglePartition, EXECUTOR_BROADCAST, [plan_id=605]

(6) BroadcastHashJoin
Left keys [1]: [id#2L]
Right keys [1]: [id#6L]
Join type: Inner
Join condition: None

(7) Project
Output [3]: