# Joins

Creating different DataFrames to join

In [53]:
# People to join against the lookup tables below. Each row is
# (id, name, graduate_program, spark_status), where spark_status is a list
# of integer status ids.
person = (
    spark.createDataFrame(
        [
            (0, "Bill Chambers", 0, [100]),
            (1, "Matei Zaharia", 1, [500, 250, 100]),
            (2, "Michael Armbrust", 1, [250, 100]),
            (3, "Waklaw Zimpel", 3, [100, 50]),
        ]
    )
    .toDF("id", "name", "graduate_program", "spark_status")
)

In [54]:
# Graduate programs, one row per program: (id, degree, department, school).
graduate_program = (
    spark.createDataFrame(
        [
            (0, "Masters", "School of Information", "UC Berkeley"),
            (2, "Masters", "EECS", "UC Berkeley"),
            (1, "Ph.D.", "EECS", "UC Berkeley"),
        ]
    )
    .toDF("id", "degree", "department", "school")
)

In [55]:
# Lookup table mapping a numeric status id to its human-readable label.
spark_status = (
    spark.createDataFrame(
        [
            (500, "Vice President"),
            (250, "PMC Member"),
            (100, "Contributor"),
        ]
    )
    .toDF("status_id", "status")
)

## Inner joins
Keep only the rows whose keys exist in both the left and right datasets

In [56]:
# Match each person to the graduate program whose id they reference.
join_expression = person["graduate_program"] == graduate_program["id"]

# 'inner' is the default join type, so passing it explicitly is equivalent
# to person.join(graduate_program, join_expression).
person.join(graduate_program, on=join_expression, how="inner").show()

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+



## Outer joins 
Keep rows with keys in either the left or right dataset; where one side has no matching row, its columns are filled with `null`

In [57]:
# Same key comparison as the inner join; only the join type changes.
join_expression = person["graduate_program"] == graduate_program["id"]

# Full outer join: unmatched rows from both sides are kept.
person.join(graduate_program, on=join_expression, how="outer").show()



+----+----------------+----------------+---------------+----+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status|  id| degree|          department|     school|
+----+----------------+----------------+---------------+----+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|   0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|   1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|               1|     [250, 100]|   1|  Ph.D.|                EECS|UC Berkeley|
|   3|   Waklaw Zimpel|               3|      [100, 50]|null|   null|                null|       null|
|null|            null|            null|           null|   2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+----+-------+--------------------+-----------+



## Left/right outer joins 
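Keep every row from the left (or right) dataset, plus the matching rows from the other dataset; where there is no match, the other dataset's columns are filled with `null`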

In [58]:
# Left outer join: every person is kept, with or without a matching program.
join_expression = person["graduate_program"] == graduate_program["id"]

person.join(graduate_program, on=join_expression, how="left").show()

+---+----------------+----------------+---------------+----+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status|  id| degree|          department|     school|
+---+----------------+----------------+---------------+----+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|   0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|   1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|   1|  Ph.D.|                EECS|UC Berkeley|
|  3|   Waklaw Zimpel|               3|      [100, 50]|null|   null|                null|       null|
+---+----------------+----------------+---------------+----+-------+--------------------+-----------+



In [59]:
# Right outer join: every graduate program is kept, with or without a
# matching person.
join_expression = person["graduate_program"] == graduate_program["id"]

person.join(graduate_program, on=join_expression, how="right").show()



+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|            null|           null|  2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+



## Left semi joins 
Keep the rows in the left, and only the left, dataset where the key appears in the right dataset (a type of filter)

In [60]:
# Left semi join: graduate_program is used only to filter person rows.
join_expression = person["graduate_program"] == graduate_program["id"]

person.join(graduate_program, on=join_expression, how="left_semi").show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program|   spark_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+



## Left anti joins 
Keep the rows in the left, and only the left, dataset where the key does not appear in the right dataset (the opposite of a left semi join)

In [61]:
# Left anti join: keep only the person rows with no matching program.
join_expression = person["graduate_program"] == graduate_program["id"]

person.join(graduate_program, on=join_expression, how="left_anti").show()

+---+-------------+----------------+------------+
| id|         name|graduate_program|spark_status|
+---+-------------+----------------+------------+
|  3|Waklaw Zimpel|               3|   [100, 50]|
+---+-------------+----------------+------------+



## Cross (or Cartesian) joins 
Match every row in the left dataset with every row in the right dataset

In [62]:
# Pair every person row with every graduate_program row.
# crossJoin() requests the Cartesian product explicitly. A join() with no
# condition expresses the same thing implicitly, but Spark rejects implicit
# Cartesian products when spark.sql.crossJoin.enabled is false.
cross_join = person.crossJoin(graduate_program)
print('Resulting number of rows:', cross_join.count())
cross_join.show()


Resulting number of rows: 12
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  0|Masters|School of Informa...|UC Berkeley|
|  0|   Bill Chambers|               0|          [100]|  2|Masters|                EECS|UC Berkeley|
|  0|   Bill Chambers|               0|          [100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  2|Masters|                EECS|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  0|Mas

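## Joins on complex types
Any expression that returns a Boolean can be used as a join condition; here each person is matched with every status whose `status_id` appears in their `spark_status` array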

In [63]:
from pyspark.sql.functions import expr

# Join condition written as a SQL expression: true when the person's
# spark_status array contains the status table's status_id.
status_match = expr('array_contains(spark_status, status_id)')

person.join(spark_status, on=status_match).show()

+---+----------------+----------------+---------------+---------+--------------+
| id|            name|graduate_program|   spark_status|status_id|        status|
+---+----------------+----------------+---------------+---------+--------------+
|  1|   Matei Zaharia|               1|[500, 250, 100]|      500|Vice President|
|  0|   Bill Chambers|               0|          [100]|      100|   Contributor|
|  1|   Matei Zaharia|               1|[500, 250, 100]|      250|    PMC Member|
|  1|   Matei Zaharia|               1|[500, 250, 100]|      100|   Contributor|
|  2|Michael Armbrust|               1|     [250, 100]|      250|    PMC Member|
|  2|Michael Armbrust|               1|     [250, 100]|      100|   Contributor|
|  3|   Waklaw Zimpel|               3|      [100, 50]|      100|   Contributor|
+---+----------------+----------------+---------------+---------+--------------+

