# Chapter 8 - Joins

In [2]:
# Sample DataFrames used by the join examples in this notebook.

# People; `graduate_program` is compared against graduateProgram ids and
# `spark_status` is an array that is matched against sparkStatus ids.
personRows = [
    (0, "Bill", 0, [100]),
    (1, "Matei", 1, [500, 250, 100]),
    (2, "Michael", 1, [250, 100]),
]
person = spark.createDataFrame(personRows).toDF(
    "id", "name", "graduate_program", "spark_status"
)

# Graduate programs, keyed by `id`.
graduateProgramRows = [
    (0, "Masters", "School of Information"),
    (2, "Masters", "EECS"),
    (1, "Ph.D", "EECS"),
]
graduateProgram = spark.createDataFrame(graduateProgramRows).toDF(
    "id", "degree", "department"
)

# Spark community statuses, keyed by `id`.
sparkStatusRows = [
    (500, "Vice President"),
    (250, "PMC Member"),
    (100, "Contributor"),
]
sparkStatus = spark.createDataFrame(sparkStatusRows).toDF("id", "status")

In [3]:
# Inner join: "inner" is the default join type.
# Other accepted join types include:
#   "outer", "left_outer", "right_outer", "left_semi", "left_anti", "cross"
joinType = "inner"

# Match each person's graduate_program against the program's id.
joinExpression = person["graduate_program"] == graduateProgram["id"]

person.join(other=graduateProgram, on=joinExpression, how=joinType).show()

+---+-------+----------------+---------------+---+-------+--------------------+
| id|   name|graduate_program|   spark_status| id| degree|          department|
+---+-------+----------------+---------------+---+-------+--------------------+
|  0|   Bill|               0|          [100]|  0|Masters|School of Informa...|
|  1|  Matei|               1|[500, 250, 100]|  1|   Ph.D|                EECS|
|  2|Michael|               1|     [250, 100]|  1|   Ph.D|                EECS|
+---+-------+----------------+---------------+---+-------+--------------------+



## Joins on Complex Types

In [5]:
from pyspark.sql.functions import expr

In [6]:
# Join each person to every status whose id appears in their spark_status
# array. person's "id" is renamed to "personId" first, so the bare `id` in
# the SQL expression is not ambiguous between the two DataFrames.
(
    person
    .withColumnRenamed("id", "personId")
    .join(sparkStatus, on=expr("array_contains(spark_status, id)"))
    .show()
)

+--------+-------+----------------+---------------+---+--------------+
|personId|   name|graduate_program|   spark_status| id|        status|
+--------+-------+----------------+---------------+---+--------------+
|       0|   Bill|               0|          [100]|100|   Contributor|
|       1|  Matei|               1|[500, 250, 100]|500|Vice President|
|       1|  Matei|               1|[500, 250, 100]|250|    PMC Member|
|       1|  Matei|               1|[500, 250, 100]|100|   Contributor|
|       2|Michael|               1|     [250, 100]|250|    PMC Member|
|       2|Michael|               1|     [250, 100]|100|   Contributor|
+--------+-------+----------------+---------------+---+--------------+



## Handling Duplicate Column Names
### Approach 1: Different join expression
Changing the join expression from a Boolean expression to a column name (a string, or a sequence of strings) makes Spark automatically keep only one copy of the join column in the result.

In [6]:
# Rename the program key so both DataFrames share the column name
# "graduate_program", which sets up the duplicate-column situation.
gradProgramDupe = graduateProgram.withColumnRenamed("id", "graduate_program")

# Joining on the column *name* (a string) instead of a Boolean expression
# yields a single graduate_program column in the result.
person.join(other=gradProgramDupe, on="graduate_program").show()

+----------------+---+-------+---------------+-------+--------------------+
|graduate_program| id|   name|   spark_status| degree|          department|
+----------------+---+-------+---------------+-------+--------------------+
|               0|  0|   Bill|          [100]|Masters|School of Informa...|
|               1|  1|  Matei|[500, 250, 100]|   Ph.D|                EECS|
|               1|  2|Michael|     [250, 100]|   Ph.D|                EECS|
+----------------+---+-------+---------------+-------+--------------------+



### Approach 2: Drop the column after the join

In [10]:
# Join on a Boolean expression (both graduate_program columns survive the
# join), then drop person's copy by referencing it through its source
# DataFrame so the reference is unambiguous.
(
    person
    .join(
        gradProgramDupe,
        on=person["graduate_program"] == gradProgramDupe["graduate_program"],
    )
    .drop(person["graduate_program"])
    .show()
)

+---+-------+---------------+----------------+-------+--------------------+
| id|   name|   spark_status|graduate_program| degree|          department|
+---+-------+---------------+----------------+-------+--------------------+
|  0|   Bill|          [100]|               0|Masters|School of Informa...|
|  1|  Matei|[500, 250, 100]|               1|   Ph.D|                EECS|
|  2|Michael|     [250, 100]|               1|   Ph.D|                EECS|
+---+-------+---------------+----------------+-------+--------------------+

