In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session: local master with 3 worker
# threads, registered under the application name "Basic Joins".
spark: SparkSession = (
    SparkSession.builder.appName("Basic Joins")  # type: ignore
    .master("local[3]")
    .getOrCreate()
)

# Last expression of the cell: displays the running Spark version.
spark.version

25/04/08 17:46:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


'3.5.5'

In [2]:
# Sample order lines as (order_id, prod_id, unit_price, qty) tuples.
ordersList = [
    ("01", "02", 350, 1),
    ("01", "04", 580, 1),
    ("01", "07", 320, 2),
    ("02", "03", 450, 1),
    ("02", "06", 220, 1),
    ("03", "01", 195, 1),
    ("04", "09", 270, 3),
    ("04", "08", 410, 2),
    ("05", "02", 350, 1),
]

# Column names are supplied up front; column types are still inferred
# from the tuples.
orderDF = spark.createDataFrame(
    ordersList,
    schema=["order_id", "prod_id", "unit_price", "qty"],
)

orderDF.show()

                                                                                

+--------+-------+----------+---+
|order_id|prod_id|unit_price|qty|
+--------+-------+----------+---+
|      01|     02|       350|  1|
|      01|     04|       580|  1|
|      01|     07|       320|  2|
|      02|     03|       450|  1|
|      02|     06|       220|  1|
|      03|     01|       195|  1|
|      04|     09|       270|  3|
|      04|     08|       410|  2|
|      05|     02|       350|  1|
+--------+-------+----------+---+



In [3]:
# Sample product catalogue as (prod_id, prod_name, list_price, qty) tuples.
productList = [
    ("01", "Scroll Mouse", 250, 20),
    ("02", "Optical Mouse", 350, 20),
    ("03", "Wireless Mouse", 450, 50),
    ("04", "Wireless Keyboard", 580, 50),
    ("05", "Standard Keyboard", 360, 10),
    ("06", "16 GB Flash Storage", 240, 100),
    ("07", "32 GB Flash Storage", 320, 50),
    ("08", "64 GB Flash Storage", 430, 25),
]

# Column names are supplied up front; column types are still inferred
# from the tuples.
productDF = spark.createDataFrame(
    productList,
    schema=["prod_id", "prod_name", "list_price", "qty"],
)

productDF.show()

+-------+-------------------+----------+---+
|prod_id|          prod_name|list_price|qty|
+-------+-------------------+----------+---+
|     01|       Scroll Mouse|       250| 20|
|     02|      Optical Mouse|       350| 20|
|     03|     Wireless Mouse|       450| 50|
|     04|  Wireless Keyboard|       580| 50|
|     05|  Standard Keyboard|       360| 10|
|     06|16 GB Flash Storage|       240|100|
|     07|32 GB Flash Storage|       320| 50|
|     08|64 GB Flash Storage|       430| 25|
+-------+-------------------+----------+---+



In [4]:
# Join condition shared by the join cells in this notebook: match orders to
# products on prod_id.
join_expr = orderDF.prod_id == productDF.prod_id

# Full outer join: unmatched rows from either side are kept, padded with NULLs.
# Accepted spellings for `how`: "full_outer", "outer", "full".
# Note: the result carries both sides' prod_id and qty columns.
orderDF.join(productDF, join_expr, "full_outer").show()

+--------+-------+----------+----+-------+-------------------+----------+----+
|order_id|prod_id|unit_price| qty|prod_id|          prod_name|list_price| qty|
+--------+-------+----------+----+-------+-------------------+----------+----+
|      03|     01|       195|   1|     01|       Scroll Mouse|       250|  20|
|      01|     02|       350|   1|     02|      Optical Mouse|       350|  20|
|      05|     02|       350|   1|     02|      Optical Mouse|       350|  20|
|      02|     03|       450|   1|     03|     Wireless Mouse|       450|  50|
|      01|     04|       580|   1|     04|  Wireless Keyboard|       580|  50|
|    NULL|   NULL|      NULL|NULL|     05|  Standard Keyboard|       360|  10|
|      02|     06|       220|   1|     06|16 GB Flash Storage|       240| 100|
|      01|     07|       320|   2|     07|32 GB Flash Storage|       320|  50|
|      04|     08|       410|   2|     08|64 GB Flash Storage|       430|  25|
|      04|     09|       270|   3|   NULL|          

# Left Join

In [5]:
# Left (outer) join: every order row is kept; product columns are NULL when
# no product matches. PySpark `how` parameter: "left" or "left_outer".
# The product-side "qty" is renamed so that "qty" refers only to the order's.
productRenamedDF = productDF.withColumnRenamed("qty", "reorder_qty")

(
    orderDF.join(productRenamedDF, on=join_expr, how="left")
    .drop(productRenamedDF.prod_id)  # keep only the order-side prod_id
    .select(
        ["order_id", "prod_id", "prod_name", "unit_price", "list_price", "qty"]
    )
    .orderBy("order_id")
    .show()
)

+--------+-------+-------------------+----------+----------+---+
|order_id|prod_id|          prod_name|unit_price|list_price|qty|
+--------+-------+-------------------+----------+----------+---+
|      01|     07|32 GB Flash Storage|       320|       320|  2|
|      01|     02|      Optical Mouse|       350|       350|  1|
|      01|     04|  Wireless Keyboard|       580|       580|  1|
|      02|     03|     Wireless Mouse|       450|       450|  1|
|      02|     06|16 GB Flash Storage|       220|       240|  1|
|      03|     01|       Scroll Mouse|       195|       250|  1|
|      04|     09|               NULL|       270|      NULL|  3|
|      04|     08|64 GB Flash Storage|       410|       430|  2|
|      05|     02|      Optical Mouse|       350|       350|  1|
+--------+-------+-------------------+----------+----------+---+



**TASK**: if `prod_name` is unknown, show `prod_id` instead; if `list_price` is unknown, show `unit_price` instead.

In [6]:
from pyspark.sql.functions import expr

# Left join with fallbacks for orders whose product is unknown:
# prod_name falls back to prod_id, and list_price falls back to unit_price.
(
    orderDF.join(productRenamedDF, on=join_expr, how="left_outer")
    .drop(productRenamedDF.prod_id)  # keep only the order-side prod_id
    .select(
        "order_id",
        "prod_id",
        # COALESCE yields its first non-NULL argument.
        expr("COALESCE(prod_name, prod_id)").alias("prod_name"),
        "unit_price",
        expr("COALESCE(list_price, unit_price)").alias("list_price"),
        "qty",
    )
    .sort("order_id")
    .show()
)

+--------+-------+-------------------+----------+----------+---+
|order_id|prod_id|          prod_name|unit_price|list_price|qty|
+--------+-------+-------------------+----------+----------+---+
|      01|     07|32 GB Flash Storage|       320|       320|  2|
|      01|     02|      Optical Mouse|       350|       350|  1|
|      01|     04|  Wireless Keyboard|       580|       580|  1|
|      02|     03|     Wireless Mouse|       450|       450|  1|
|      02|     06|16 GB Flash Storage|       220|       240|  1|
|      03|     01|       Scroll Mouse|       195|       250|  1|
|      04|     09|                 09|       270|       270|  3|
|      04|     08|64 GB Flash Storage|       410|       430|  2|
|      05|     02|      Optical Mouse|       350|       350|  1|
+--------+-------+-------------------+----------+----------+---+

