In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Joins").getOrCreate()

23/06/10 14:19:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Creating Dataframes

In [9]:
person = spark\
    .createDataFrame([
                    (0, "Bill Chambers", 0, [100]),
                    (1, "Matei Zaharia", 1, [500, 250, 100]),
                    (3, "matte Silwal", 1, [500, 2550, 100]),
                    (4, "Devid Shrestha", 1, [5040, 2550, 1040]),
                    (5, "Matei Panday", 1, [500, 2500, 1500]),
                    (6, "Michael Armbrust", 1, [250, 1000])])\
    .toDF("id", "name", "graduate_program", "spark_status")

graduateProgram = spark\
    .createDataFrame([
                    (0, "PHD", "School of Information", "UC Berkeley"),
                    (2, "Masters", "KEC", "UC Berkeley"),
                    (3, "Bachelors", "EECS", "UC Berkeley"),
                    (4, "Masters", "EEC", "UC Berkeley w"),
                    (1, "Ph.D.", "EECS", "UC Berkeley")])\
    .toDF("id", "degree", "department", "school")

sparkStatus = spark\
    .createDataFrame([
                    (500, "Vice President"),
                    (1000, "PMC Member"),
                    (2550, "PMC Chief"),
                    (1040, "PMC Leader"),
                    (100, "Contributor")])\
    .toDF("id", "status")

# Inner join

In [4]:
joinExpression = person["graduate_program"] == graduateProgram['id']
person.join(graduateProgram, joinExpression).show()



+---+----------------+----------------+------------------+---+------+--------------------+-----------+
| id|            name|graduate_program|      spark_status| id|degree|          department|     school|
+---+----------------+----------------+------------------+---+------+--------------------+-----------+
|  0|   Bill Chambers|               0|             [100]|  0|   PHD|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|   [500, 250, 100]|  1| Ph.D.|                EECS|UC Berkeley|
|  3|    matte Silwal|               1|  [500, 2550, 100]|  1| Ph.D.|                EECS|UC Berkeley|
|  4|  Devid Shrestha|               1|[5040, 2550, 1040]|  1| Ph.D.|                EECS|UC Berkeley|
|  5|    Matei Panday|               1| [500, 2500, 1500]|  1| Ph.D.|                EECS|UC Berkeley|
|  6|Michael Armbrust|               1|       [250, 1000]|  1| Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+------------------+---+------+----

                                                                                

In [5]:
wrongJoinExpression = person["name"] == graduateProgram["school"]
person.join(graduateProgram, wrongJoinExpression).show()



+---+----+----------------+------------+---+------+----------+------+
| id|name|graduate_program|spark_status| id|degree|department|school|
+---+----+----------------+------------+---+------+----------+------+
+---+----+----------------+------------+---+------+----------+------+



                                                                                

In [8]:
joinExpression = person["graduate_program"] == graduateProgram['id']
joinType = "inner"
person.join(graduateProgram, joinExpression, joinType).show()



+---+----------------+----------------+------------------+---+------+--------------------+-----------+
| id|            name|graduate_program|      spark_status| id|degree|          department|     school|
+---+----------------+----------------+------------------+---+------+--------------------+-----------+
|  0|   Bill Chambers|               0|             [100]|  0|   PHD|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|   [500, 250, 100]|  1| Ph.D.|                EECS|UC Berkeley|
|  3|    matte Silwal|               1|  [500, 2550, 100]|  1| Ph.D.|                EECS|UC Berkeley|
|  4|  Devid Shrestha|               1|[5040, 2550, 1040]|  1| Ph.D.|                EECS|UC Berkeley|
|  5|    Matei Panday|               1| [500, 2500, 1500]|  1| Ph.D.|                EECS|UC Berkeley|
|  6|Michael Armbrust|               1|       [250, 1000]|  1| Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+------------------+---+------+----

                                                                                

# Outer Join

In [10]:
joinExpression = person["graduate_program"] == graduateProgram['id']
joinType = "outer"
person.join(graduateProgram, joinExpression, joinType).show()



+----+----------------+----------------+------------------+---+---------+--------------------+-------------+
|  id|            name|graduate_program|      spark_status| id|   degree|          department|       school|
+----+----------------+----------------+------------------+---+---------+--------------------+-------------+
|   0|   Bill Chambers|               0|             [100]|  0|      PHD|School of Informa...|  UC Berkeley|
|   1|   Matei Zaharia|               1|   [500, 250, 100]|  1|    Ph.D.|                EECS|  UC Berkeley|
|   3|    matte Silwal|               1|  [500, 2550, 100]|  1|    Ph.D.|                EECS|  UC Berkeley|
|   4|  Devid Shrestha|               1|[5040, 2550, 1040]|  1|    Ph.D.|                EECS|  UC Berkeley|
|   5|    Matei Panday|               1| [500, 2500, 1500]|  1|    Ph.D.|                EECS|  UC Berkeley|
|   6|Michael Armbrust|               1|       [250, 1000]|  1|    Ph.D.|                EECS|  UC Berkeley|
|null|            n

                                                                                

# Left Outer join

In [11]:
joinExpression = person["graduate_program"] == graduateProgram['id']
joinType = "left_outer"
graduateProgram.join(person, joinExpression, joinType).show()

                                                                                

+---+---------+--------------------+-------------+----+----------------+----------------+------------------+
| id|   degree|          department|       school|  id|            name|graduate_program|      spark_status|
+---+---------+--------------------+-------------+----+----------------+----------------+------------------+
|  0|      PHD|School of Informa...|  UC Berkeley|   0|   Bill Chambers|               0|             [100]|
|  2|  Masters|                 KEC|  UC Berkeley|null|            null|            null|              null|
|  3|Bachelors|                EECS|  UC Berkeley|null|            null|            null|              null|
|  4|  Masters|                 EEC|UC Berkeley w|null|            null|            null|              null|
|  1|    Ph.D.|                EECS|  UC Berkeley|   6|Michael Armbrust|               1|       [250, 1000]|
|  1|    Ph.D.|                EECS|  UC Berkeley|   5|    Matei Panday|               1| [500, 2500, 1500]|
|  1|    Ph.D.|    

# right Outer join

In [14]:
joinExpression = person["graduate_program"] == graduateProgram['id']
joinType = "right_outer"
graduateProgram.join(person, joinExpression, joinType).show()



+---+------+--------------------+-----------+---+----------------+----------------+------------------+
| id|degree|          department|     school| id|            name|graduate_program|      spark_status|
+---+------+--------------------+-----------+---+----------------+----------------+------------------+
|  0|   PHD|School of Informa...|UC Berkeley|  0|   Bill Chambers|               0|             [100]|
|  1| Ph.D.|                EECS|UC Berkeley|  1|   Matei Zaharia|               1|   [500, 250, 100]|
|  1| Ph.D.|                EECS|UC Berkeley|  3|    matte Silwal|               1|  [500, 2550, 100]|
|  1| Ph.D.|                EECS|UC Berkeley|  4|  Devid Shrestha|               1|[5040, 2550, 1040]|
|  1| Ph.D.|                EECS|UC Berkeley|  5|    Matei Panday|               1| [500, 2500, 1500]|
|  1| Ph.D.|                EECS|UC Berkeley|  6|Michael Armbrust|               1|       [250, 1000]|
+---+------+--------------------+-----------+---+----------------+-------



# left semi join

In [17]:
joinExpression = person["graduate_program"] == graduateProgram['id']
joinType = "left_semi"
graduateProgram.join(person, joinExpression, joinType).show()



+---+------+--------------------+-----------+
| id|degree|          department|     school|
+---+------+--------------------+-----------+
|  0|   PHD|School of Informa...|UC Berkeley|
|  1| Ph.D.|                EECS|UC Berkeley|
+---+------+--------------------+-----------+



                                                                                

# left anti join

In [18]:
joinExpression = person["graduate_program"] == graduateProgram['id']
joinType = "left_semi"
graduateProgram.join(person, joinExpression, joinType).show()



+---+------+--------------------+-----------+
| id|degree|          department|     school|
+---+------+--------------------+-----------+
|  0|   PHD|School of Informa...|UC Berkeley|
|  1| Ph.D.|                EECS|UC Berkeley|
+---+------+--------------------+-----------+



                                                                                

# Anti join

In [20]:
joinExpression = person["graduate_program"] == graduateProgram['id']
joinType = "anti"
graduateProgram.join(person, joinExpression, joinType).show()

                                                                                

+---+---------+----------+-------------+
| id|   degree|department|       school|
+---+---------+----------+-------------+
|  2|  Masters|       KEC|  UC Berkeley|
|  3|Bachelors|      EECS|  UC Berkeley|
|  4|  Masters|       EEC|UC Berkeley w|
+---+---------+----------+-------------+



# cross join

In [21]:
joinExpression = person["graduate_program"] == graduateProgram['id']
joinType = "cross"
graduateProgram.join(person, joinExpression, joinType).show()



+---+------+--------------------+-----------+---+----------------+----------------+------------------+
| id|degree|          department|     school| id|            name|graduate_program|      spark_status|
+---+------+--------------------+-----------+---+----------------+----------------+------------------+
|  0|   PHD|School of Informa...|UC Berkeley|  0|   Bill Chambers|               0|             [100]|
|  1| Ph.D.|                EECS|UC Berkeley|  1|   Matei Zaharia|               1|   [500, 250, 100]|
|  1| Ph.D.|                EECS|UC Berkeley|  3|    matte Silwal|               1|  [500, 2550, 100]|
|  1| Ph.D.|                EECS|UC Berkeley|  4|  Devid Shrestha|               1|[5040, 2550, 1040]|
|  1| Ph.D.|                EECS|UC Berkeley|  5|    Matei Panday|               1| [500, 2500, 1500]|
|  1| Ph.D.|                EECS|UC Berkeley|  6|Michael Armbrust|               1|       [250, 1000]|
+---+------+--------------------+-----------+---+----------------+-------

                                                                                