In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build (or reuse) the session used by every cell below: a local master
# with two worker threads and 2g configured for driver and executor memory.
spark = (
    SparkSession.builder
    .appName("Python Spark Join")
    .master("local[2]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

In [None]:
# Sample weight readings, one dict per measurement.
vital_fields = ('UserID', 'VitalID', 'Date', 'Weight')
vital = [
    dict(zip(vital_fields, values))
    for values in (
        (100, 1, '2020-01-01', 75),
        (100, 2, '2020-01-02', 78),
        (101, 3, '2020-01-01', 90),
        (101, 4, '2020-01-02', 95),
    )
]

# Sample alerts. VitalID is None for alerts that are not tied to a reading.
alert_fields = ('AlertID', 'VitalID', 'AlertType', 'Date', 'UserID')
alert = [
    dict(zip(alert_fields, values))
    for values in (
        (1, 4, 'WeightIncrease', '2020-01-01', 101),
        (2, None, 'MissingVital', '2020-01-04', 100),
        (3, None, 'MissingVital', '2020-01-05', 101),
    )
]

In [3]:
# Hand both in-memory record lists to the SparkContext as RDDs.
rdd_vital, rdd_alert = map(spark.sparkContext.parallelize, (vital, alert))

In [4]:
# Convert each RDD of dicts to a DataFrame; no schema is passed, so the
# column types come from the data (see the printSchema output below).
df_vital, df_alert = rdd_vital.toDF(), rdd_alert.toDF()

In [5]:
# Show the column names and types of the vital DataFrame.
df_vital.printSchema()

root
 |-- Date: string (nullable = true)
 |-- UserID: long (nullable = true)
 |-- VitalID: long (nullable = true)
 |-- Weight: long (nullable = true)



In [6]:
# Show the column names and types of the alert DataFrame.
df_alert.printSchema()

root
 |-- AlertID: long (nullable = true)
 |-- AlertType: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- UserID: long (nullable = true)
 |-- VitalID: long (nullable = true)



### JOIN by DataFrame

In [7]:
# INNER JOIN
# Keep only the rows where a vital and an alert share the same VitalID.
join_expr = df_vital["VitalID"] == df_alert["VitalID"]
df_vital.join(other=df_alert, on=join_expr, how="inner").show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [8]:
# LEFT JOIN
# Every vital row is kept; the alert columns are NULL where no alert matches.
join_expr = df_vital["VitalID"] == df_alert["VitalID"]
df_vital.join(other=df_alert, on=join_expr, how="left").show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-01|   100|      1|    75|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-02|   100|      2|    78|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-01|   101|      3|    90|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [9]:
# RIGHT JOIN
# Every alert row is kept; the vital columns are NULL where no vital matches.
join_expr = df_vital["VitalID"] == df_alert["VitalID"]
df_vital.join(other=df_alert, on=join_expr, how="right").show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
|      NULL|  NULL|   NULL|  NULL|      2|  MissingVital|2020-01-04|   100|   NULL|
|      NULL|  NULL|   NULL|  NULL|      3|  MissingVital|2020-01-05|   101|   NULL|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [10]:
# FULL OUTER JOIN
# Rows from both sides are kept; the side without a match is filled with NULL.
join_expr = df_vital["VitalID"] == df_alert["VitalID"]
df_vital.join(other=df_alert, on=join_expr, how="full").show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|      NULL|  NULL|   NULL|  NULL|      2|  MissingVital|2020-01-04|   100|   NULL|
|      NULL|  NULL|   NULL|  NULL|      3|  MissingVital|2020-01-05|   101|   NULL|
|2020-01-01|   100|      1|    75|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-02|   100|      2|    78|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-01|   101|      3|    90|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [11]:
# CROSS JOIN
# No join condition: each vital row is paired with each alert row.
df_vital.join(other=df_alert, on=None, how="cross").show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-01|   100|      1|    75|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-02|   100|      2|    78|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-01|   100|      1|    75|      2|  MissingVital|2020-01-04|   100|   NULL|
|2020-01-01|   100|      1|    75|      3|  MissingVital|2020-01-05|   101|   NULL|
|2020-01-02|   100|      2|    78|      2|  MissingVital|2020-01-04|   100|   NULL|
|2020-01-02|   100|      2|    78|      3|  MissingVital|2020-01-05|   101|   NULL|
|2020-01-01|   101|      3|    90|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-01|   101|      3|    90|      2|  MissingVital|2020-01-04|   100| 

In [None]:
# SELF JOIN
# Alias the two copies of df_vital so the condition explicitly compares the
# left copy's VitalID with the right copy's. Writing
# `df_vital.VitalID == df_vital.VitalID` compares a column with itself and
# only yields the intended equi-join through Spark's automatic self-join
# disambiguation.
vital_left = df_vital.alias("v1")
vital_right = df_vital.alias("v2")
join_expr = col("v1.VitalID") == col("v2.VitalID")
vital_left.join(vital_right, join_expr, "left").show()

+----------+------+-------+------+----------+------+-------+------+
|      Date|UserID|VitalID|Weight|      Date|UserID|VitalID|Weight|
+----------+------+-------+------+----------+------+-------+------+
|2020-01-01|   100|      1|    75|2020-01-01|   100|      1|    75|
|2020-01-02|   100|      2|    78|2020-01-02|   100|      2|    78|
|2020-01-01|   101|      3|    90|2020-01-01|   101|      3|    90|
|2020-01-02|   101|      4|    95|2020-01-02|   101|      4|    95|
+----------+------+-------+------+----------+------+-------+------+



### JOIN by SQL

In [13]:
# Register both DataFrames as temp views so the SQL cells can query them by name.
for view_name, frame in (("Vital", df_vital), ("Alert", df_alert)):
    frame.createOrReplaceTempView(view_name)

In [14]:
# INNER JOIN
# Only the Vital rows whose VitalID also appears in Alert.
inner_join_query = """SELECT * FROM Vital v
JOIN Alert a ON v.vitalID = a.vitalID;"""
df_inner_join = spark.sql(inner_join_query)
df_inner_join.show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [15]:
# LEFT JOIN
# All Vital rows; the Alert columns are NULL where nothing matches.
left_join_query = """SELECT * FROM Vital v
LEFT JOIN Alert a ON v.vitalID = a.vitalID;"""
df_left_join = spark.sql(left_join_query)
df_left_join.show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-01|   100|      1|    75|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-02|   100|      2|    78|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-01|   101|      3|    90|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [16]:
# RIGHT JOIN
# All Alert rows; the Vital columns are NULL where nothing matches.
right_join_query = """SELECT * FROM Vital v
RIGHT JOIN Alert a ON v.vitalID = a.vitalID;"""
df_right_join = spark.sql(right_join_query)
df_right_join.show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
|      NULL|  NULL|   NULL|  NULL|      2|  MissingVital|2020-01-04|   100|   NULL|
|      NULL|  NULL|   NULL|  NULL|      3|  MissingVital|2020-01-05|   101|   NULL|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [17]:
# FULL OUTER JOIN
# Rows from both views; the unmatched side is filled with NULL.
outer_join_query = """SELECT * FROM Vital v
FULL JOIN Alert a ON v.vitalID = a.vitalID;"""
df_outer_join = spark.sql(outer_join_query)
df_outer_join.show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|      NULL|  NULL|   NULL|  NULL|      2|  MissingVital|2020-01-04|   100|   NULL|
|      NULL|  NULL|   NULL|  NULL|      3|  MissingVital|2020-01-05|   101|   NULL|
|2020-01-01|   100|      1|    75|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-02|   100|      2|    78|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-01|   101|      3|    90|   NULL|          NULL|      NULL|  NULL|   NULL|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [18]:
# CROSS JOIN
# Each Vital row paired with each Alert row; there is no join condition.
cross_join_query = """SELECT * FROM Vital v
CROSS JOIN Alert a"""
df_cross_join = spark.sql(cross_join_query)
df_cross_join.show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-01|   100|      1|    75|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-02|   100|      2|    78|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-01|   100|      1|    75|      2|  MissingVital|2020-01-04|   100|   NULL|
|2020-01-01|   100|      1|    75|      3|  MissingVital|2020-01-05|   101|   NULL|
|2020-01-02|   100|      2|    78|      2|  MissingVital|2020-01-04|   100|   NULL|
|2020-01-02|   100|      2|    78|      3|  MissingVital|2020-01-05|   101|   NULL|
|2020-01-01|   101|      3|    90|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-01|   101|      3|    90|      2|  MissingVital|2020-01-04|   100| 

In [22]:
# SELF JOIN
# NOTE(review): the query has no ON clause, so the output below pairs each
# Vital row with every Vital row, unlike the DataFrame self join, which
# matches on VitalID. Confirm whether `ON v1.VitalID = v2.VitalID` was intended.
self_join_query = """SELECT * FROM Vital v1
JOIN Vital v2"""
df_self_join = spark.sql(self_join_query)
df_self_join.show()

+----------+------+-------+------+----------+------+-------+------+
|      Date|UserID|VitalID|Weight|      Date|UserID|VitalID|Weight|
+----------+------+-------+------+----------+------+-------+------+
|2020-01-01|   100|      1|    75|2020-01-01|   100|      1|    75|
|2020-01-01|   100|      1|    75|2020-01-02|   100|      2|    78|
|2020-01-02|   100|      2|    78|2020-01-01|   100|      1|    75|
|2020-01-02|   100|      2|    78|2020-01-02|   100|      2|    78|
|2020-01-01|   100|      1|    75|2020-01-01|   101|      3|    90|
|2020-01-01|   100|      1|    75|2020-01-02|   101|      4|    95|
|2020-01-02|   100|      2|    78|2020-01-01|   101|      3|    90|
|2020-01-02|   100|      2|    78|2020-01-02|   101|      4|    95|
|2020-01-01|   101|      3|    90|2020-01-01|   100|      1|    75|
|2020-01-01|   101|      3|    90|2020-01-02|   100|      2|    78|
|2020-01-02|   101|      4|    95|2020-01-01|   100|      1|    75|
|2020-01-02|   101|      4|    95|2020-01-02|   