In [None]:
import pandas as pd

data = pd.read_excel("superstore.xls")
superstore = spark.createDataFrame(data)

In [None]:
#df.printSchema() shows the schema

superstore.printSchema()

In [None]:
#df.selectExpr("`col name` as id") gives a df with col name renamed in display

superstore.selectExpr("`Order ID` as id").show(5)

In [None]:
#df.where("filter condition") is used to filter the data based on a given logical condition

superstore.where("Quantity > 10")\
          .selectExpr("`Order ID`", "`Postal Code`", "`Quantity`")\
          .show(5)

In [None]:
from pyspark.sql.functions import column

#df.sort(col("col name")).desc() orders the dataframe in descending order

superstore.where(column("Category") == "Furniture")\
          .select("Order ID", "Postal Code", "Quantity")\
          .sort(column("Quantity").desc())\
          .show(5)

In [None]:
#multiple df.where fucntions can be used to filter more conditions

superstore.where(column("Category") != "Furniture")\
          .where(column("Region") == "West")\
          .selectExpr("Category", "Region", "Quantity")\
          .show(5)

In [None]:
#df.createOrReplaceTempView("view name") can be used to create temp view which can be used to query tables using spark.sql

superstore.createOrReplaceTempView("superstore_view")

spark.sql("""DESCRIBE FORMATTED superstore_view""").show(21)

In [None]:
#spark.sql("SQL QUERY") can be used to query temp views

spark.sql("SELECT Category, Region, Quantity FROM superstore_view").show(5)

In [None]:
spark.sql("""SELECT  Category, Region, SUM(Quantity)
FROM superstore_view 
GROUP BY Category, Region
ORDER BY SUM(Quantity) DESC LIMIT 10""")

In [None]:
spark.sql(""" SELECT Category, (Region = "Furniture" AND Quantity > 10 OR SALES > 1000) as test
FROM superstore_view WHERE Region = "Furniture" AND Quantity > 10 OR SALES > 1000""").show(5)

In [None]:
#df.describe() can be used to display the summary of the dataframe
#df.toPandas() can be used to convert a spark dataframe into pandas dataframe

superstore.describe().toPandas().T

In [None]:
#df.distinct() gives the distinct values in a selected column

superstore.select("State").distinct().sort("State").show(5)

In [None]:
#df.sample() can be used to randomly select a sample of data from a dataframe

seed = 5
withReplacement = False
fraction = 0.3

superstore.sample(withReplacement, fraction, seed).count()

In [None]:
#df.randomSplit() can be used to split the data into two fractions

df = superstore.randomSplit([0.25, 0.75], seed = 5)

In [None]:
print("Test: ", df[0].count(),"\n","Train: ", df[1].count())

In [None]:
#df.union(df2) can be used to join two tables of same columns

test = df[0]
train = df[1]

new = test.union(train)
new.count()

In [None]:
from pyspark.sql.functions import desc, asc, col

newdf = superstore.select("State").distinct()

In [None]:
newdf.count()

In [None]:
newdf.show(5)

In [None]:
newdf.sort("State").show(5)

In [None]:
#df.orderBy(col("col name")) can also be used to sort a dataframe column

from pyspark.sql.functions import desc

newdf.orderBy(col("State").desc()).show(5)

In [None]:
superstore.rdd.getNumPartitions()

In [None]:
newdf = superstore.repartition(8)

print(newdf.rdd.getNumPartitions())

In [None]:
newdf = newdf.coalesce(2)

print(newdf.rdd.getNumPartitions())

In [None]:
superstore.rdd.getNumPartitions()

In [None]:
superstore.where(col("State") != "Washington")\
.select("State", "Quantity")\
.groupBy("State")\
.sum("Quantity")\
.withColumnRenamed("sum(Quantity)", "Quantity")\
.orderBy(col("Quantity").desc())\
.show(5)

In [None]:
quantFilter = col("Quantity") > 5
catFilter = col("Category") != "Furniture"
cols = ["State", "Ship Mode", "Quantity"]
superstore.where(quantFilter | catFilter)\
.select("State", "Ship Mode", "Quantity")\
.groupBy("State", "Ship Mode")\
.sum("Quantity")\
.withColumnRenamed("sum(Quantity)", "Quantity")\
.orderBy(cols, ascending = True)\
.show(5)

In [None]:
from pyspark.sql.functions import corr

superstore.stat.corr("Quantity", "Profit")
superstore.select(corr("Quantity", "Profit")).show()

In [None]:
superstore.select("Sales", "Profit", "Quantity", "Discount").describe().show()

In [None]:
colName = "Sales"
quantileProbs = [0.25, 0.5, 0.75]
relError = 0.05

for i in superstore.stat.approxQuantile(colName, quantileProbs, relError):
    print(round(i, 3))

In [None]:
from pyspark.sql.functions import monotonically_increasing_id
superstore.select(monotonically_increasing_id()).withColumnRenamed("monotonically_increasing_id()", "ROW_ID").show(5)

In [None]:
from pyspark.sql.functions import initcap, lower, upper, lit

superstore.select(initcap(lit("test row")), upper(lit("test row")), lower(lit("TEST ROW"))).show(5)

In [None]:
from pyspark.sql.functions import ltrim, rtrim, lpad, rpad, trim

superstore.select(ltrim(lit("          hello         ")).alias("ltrim"),
                 rtrim(lit("         hello         ")).alias("rtrim"),
                 trim(lit("           hello        ")).alias("trim"),
                 rpad(lit("hello"), 10, " ").alias("rpad"),
                 lpad(lit("hello"), 3, " ").alias("lpad")).show(5)


In [None]:
from pyspark.sql.functions import regexp_replace

regex_string = "2017|2018"

superstore.select(regexp_replace(col("Order ID"), regex_string, "latest").alias("latest orders")).show(5)

In [None]:
from pyspark.sql.functions import translate

superstore.select(translate(col("State"), "AEIOUaeiou", "0123456789").alias("translated state")).show(5)

In [None]:
from pyspark.sql.functions import current_date, current_timestamp

dateDF = spark.range(10)\
.withColumn("today", current_date())\
.withColumn("now", current_timestamp())

dateDF.show(5, False)

In [None]:
dateDF.printSchema()

In [None]:
type(dateDF)

In [None]:
dateDF.columns

In [None]:
dateDF.rdd.getNumPartitions()

In [None]:
from pyspark.sql.functions import date_add, date_sub, date_trunc

dateDF.select(date_add(col("today"), 5), 
              date_sub(col("today"), 5), 
              date_trunc("yyyy", col("today"))).show(5)

In [None]:
from pyspark.sql.functions import to_date

spark.range(5).withColumn("date", lit("2019-01-01")).select(to_date(col("date"))).show()

In [None]:
from pyspark.sql.functions import datediff, months_between, to_date

dateDF.withColumn("week_ago", date_sub(col("today"), 7)).select(datediff(col("week_ago"), col("today"))).show(5)

In [None]:
dateDF.select(to_date(lit("2016-04-01")).alias("start"),
              to_date(lit("2016-05-21")).alias("end"))\
              .select(months_between(col("start"), col("end"))).show(5)

In [None]:
dateDF.select(to_date(lit("01-04-1991"), "dd-MM-yyyy").alias("start"),
             to_date(lit("02-01-2019"), "dd-MM-yyyy").alias("end"))\
            .select(months_between(col("end"), col("start"))).show(5)

In [None]:
from pyspark.sql.functions import coalesce

superstore.select(coalesce(col("City"), col("Product Name"))).show(5)

In [None]:
superstore.na.drop().show(5)

In [None]:
superstore.na.drop("all").show(5)

In [None]:
superstore.na.drop("all").show(5)

In [None]:
superstore.na.drop("all", subset = ["Order Id", "Order Date"]).show(5)

In [None]:
superstore.na.fill("All null values become this string").show(5)

In [None]:
superstore.na.replace([" "], ["UNKNOWN"], "Description").show(5)

In [None]:
#struct funtion is used to create a complex column by combining multiple columns so that they can be later queried

from pyspark.sql.functions import struct

df_test = superstore.select(struct("Row ID", "Order ID").alias("complex"))

In [None]:
df_test.select("complex.Order ID").show(5)

In [None]:
#split function is used to split rows of a column into arrays

from pyspark.sql.functions import split

superstore.select(split(col("Customer Name"), " ").alias("First_and_Last_Names"))\
.selectExpr("First_and_Last_Names[1]")\
.show(5)

In [None]:
#size function can be used to find the size of the array

from pyspark.sql.functions import size, array_contains

superstore.select(size(split(col("Customer Name"), " ")).alias("name_split")).show(5)

In [None]:
#array_contains can be used to check whether the array contains a given value

superstore.select(array_contains(split(col("Customer Name"), " "), "Hoffman").alias("is_hoffman")).show(5)

In [None]:
#explode function can be used to create new rows from the indicidual values of an array

from pyspark.sql.functions import explode

superstore.withColumn("splitted", split(col("Customer Name"), " "))\
.withColumn("exploded", explode(col("splitted")))\
.select("Customer Name","splitted", "exploded").show(5)

In [None]:
#map function can be used to create key value pairs of columns

from pyspark.sql.functions import create_map

superstore.select(create_map(col("Customer Name"), col("Order ID")).alias("mapped")).show(5, False)

In [None]:
#maps can be queried

superstore.select(create_map(col("Customer Name"), col("Order ID")).alias("mapped"))\
.selectExpr("mapped['Claire Gute']").show(5)

In [None]:
#handling jason data

jsondf = spark.range(1).selectExpr(""" '{"myJsonKey": 
                                                {"myJsonValues": [1, 2, 3]}}' 
                                                    as jsonString """)

In [None]:
from pyspark.sql.functions import get_json_object, json_tuple

jsondf.select(
    get_json_object(col("jsonString"), "$.myJsonKey.myJsonValues[0]").alias("column"), 
    json_tuple(col("jsonString"), "myJsonKey").alias("jsonKey")).show(10, False)

In [None]:
from pyspark.sql.functions import to_json

superstore.selectExpr("(`Order ID`, `Customer Name`) as myStruct")\
.select(to_json(col("myStruct"))).show(5, False)

# Aggregations

In [None]:
#Count of the Order IDs

from pyspark.sql.functions import count

superstore.select(count("Order ID")).show()

In [None]:
#Count of distinct Order ID

from pyspark.sql.functions import countDistinct

superstore.select(countDistinct("Order ID")).show()

In [None]:
#find only a certain degree of of count distinct

from pyspark.sql.functions import approx_count_distinct

superstore.select(approx_count_distinct("Order ID", 0.01)).show()

In [None]:
#find the first and the last items in a columns

from pyspark.sql.functions import first, last

superstore.select(first("Order ID"), last("Order ID")).show()

In [None]:
#find the min and max for a column

from pyspark.sql.functions import min, max

superstore.select(min("Quantity"), max("Quantity")).show()

In [None]:
#find the sum of a numerical column

from pyspark.sql.functions import sum

superstore.select(sum("Sales")).show()

In [None]:
# find the disticnt sum of a column

from pyspark.sql.functions import sumDistinct

superstore.select(sumDistinct("Sales")).show()

In [None]:
#find the sum, max, min, expr of a column

from pyspark.sql.functions import avg

superstore.select(sum("Quantity").alias("total_products_sold"), 
                  countDistinct("Customer ID").alias("distinct_products_sold"),
                 sum("Sales").alias("total_sales_amount"),
                 avg("Sales").alias("mean_sales")).selectExpr("total_products_sold", 
                                                             "distinct_products_sold",
                                                             "total_sales_amount",
                                                             "mean_sales").show()

In [None]:
from pyspark.sql.functions import var_pop, stddev_pop, var_samp, stddev_samp

superstore.select(var_pop("Quantity"), var_samp("Quantity"), 
                  stddev_pop("Quantity"), stddev_samp("Quantity")).show()



In [None]:
from pyspark.sql.functions import skewness, kurtosis

superstore.select(skewness("Sales"), kurtosis("Sales")).show()

In [None]:
#aggregarte complex types e.g aggregate the number of states

from pyspark.sql.functions import collect_list, collect_set

superstore.agg(collect_list("State"), collect_set("State")).show()

In [None]:
#groupby condition on dataframes

from pyspark.sql.functions import expr

superstore.groupby("Region", "State").agg(expr("count('Order ID')").alias("Order Count"))\
.orderBy("Region", "State").show(50)

In [None]:
#group by functions 

superstore.groupby("Order ID").agg(expr("max(Quantity)").alias("max_quantity"), stddev_pop("Quantity"))\
.orderBy("max_quantity", ascending = False).show()

In [None]:
#window functions ***IMPORTANT***

dfWithDate = superstore.withColumn("date", to_date(col("Order Date"), "MM/d/yyyy H:mm"))
dfWithDate = dfWithDate.select(col("Category"), col("date"), col("Sales"))

dfWithDate.createOrReplaceTempView("dfWithDate")

In [None]:
dfWithDate.show(3)

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

windowSpec = Window\
.partitionBy("date", "Category")\
.orderBy(desc("Sales"))\
.rowsBetween(Window.unboundedPreceding, Window.currentRow)

In [None]:
maxPurchaseQuantity = max(col("Sales")).over(windowSpec)

In [None]:
from pyspark.sql.functions import dense_rank, rank, row_number

purchaseDenseRank = dense_rank().over(windowSpec)
purchaseRank = rank().over(windowSpec)
rownum = row_number().over(windowSpec)

In [None]:
dfWithDate.where("`Category` IS NOT NULL").orderBy("date")\
.select(col("Category"), 
        col("date"), 
        col("Sales"), 
        purchaseRank.alias("rank"), 
        purchaseDenseRank.alias("dense_rank"), 
        maxPurchaseQuantity.alias("max_sales"),
       rownum.alias("row id")).show()

In [None]:
#roll up

dfNoNull = dfWithDate.drop()
dfNoNull.createOrReplaceTempView("dfNoNull")

In [None]:
dfNoNull.show(5)

In [None]:
#grouping sets can be used to group data across multiple groups. Only available in SQL

spark.sql("SELECT Category, date, sum(Sales) FROM dfNoNull GROUP BY Category, date GROUPING SETS ((Category, date), ()) ORDER BY Category ASC, date DESC").show(10)

In [None]:
#roll up function

rolledUpDF = dfNoNull.rollup("date", "Category")\
.agg(sum("Sales")).selectExpr("date", "Category", "`sum(Sales)` as total_sales")\
.orderBy("date")

rolledUpDF.show()

In [None]:
rolledUpDF.where("Category IS NULL").show()

In [None]:
rolledUpDF.where("date IS NULL").show()

In [None]:
#cube function

cubeDF = dfNoNull.cube("date", "Category", "Sales")\
.agg(max("Sales"))\
.selectExpr("date", "Category", "`max(Sales)` as highest_sales")\
.orderBy("date")

In [None]:
cubeDF.sort("date", ascending = False).show()

In [None]:
cubeDF.where("Category IS NULL").show()

In [None]:
pivotDF = dfWithDate.groupby("date").pivot("Category").sum()

In [None]:
pivotDF.where("`date` > '2018-11-30'").orderBy("date", ascending = True).show(30)

In [None]:
from pyspark.sql.functions import monotonically_increasing_id, col, year

df1 = superstore.select(monotonically_increasing_id(), year("Order Date"))\
.withColumnRenamed("year(Order Date)", "date1")\
.withColumnRenamed("monotonically_increasing_id()", "ROW_ID")

In [None]:
df2 = superstore.select(monotonically_increasing_id(), year("Ship Date"))\
.withColumnRenamed("year(Ship Date)", "date2")\
.withColumnRenamed("monotonically_increasing_id()", "ROW_ID")

In [None]:
df3 = df2.join(df1, df1.ROW_ID == df2.ROW_ID)\
.withColumn("difference", df2.date2 - df1.date1)\
.selectExpr("date1", "date2", "difference")

In [None]:
df3.show(15)

In [None]:
df3.where("difference > 0").orderBy("date1", ascending = False).show()

In [None]:
df3.describe().toPandas().T

# Pair RDD

## Pair RDD with tuples

In [None]:
spark

# Joins

In [8]:
person = spark.createDataFrame([
    (0, "Bill Chambers", 0, [100]),
    (1, "Matei Zaharia", 1, [500, 250, 100]),
    (2, "Michael Armbrust", 1, [250, 100])]).toDF("id", "name", "graduate_course", "spark_status")

graduateProgram = spark.createDataFrame([
    (0, "Masters", "School of Information", "UC Berkeley"),
    (2, "Masters", "EECS", "UC Berkeley"),
    (1, "Ph.D.", "EECS", "UC Berkeley")]).toDF("id", "degree", "department", "school")

sparkStatus = spark.createDataFrame([
    (500, "Vice President"),
    (250, "PMC Member"),
    (100, "Contributor")]).toDF("id", "status")

In [9]:
person.createOrReplaceTempView("person")
graduateProgram.createOrReplaceTempView("graduateProgram")
sparkStatus.createOrReplaceTempView("sparkStatus")

### Inner Join

In [15]:
joinCondition = person["graduate_course"] == graduateProgram["id"]

joinedDf = person.join(graduateProgram, joinCondition)
joinedDf.show()

+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_course|   spark_status| id| degree|          department|     school|
+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|              0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|              1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|              1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+---------------+---------------+---+-------+--------------------+-----------+



In [13]:
spark.sql("SELECT * FROM person INNER JOIN graduateProgram ON person.graduate_course = graduateProgram.id").show()

+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_course|   spark_status| id| degree|          department|     school|
+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|              0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|              1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|              1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+---------------+---------------+---+-------+--------------------+-----------+



### Outer Join

In [16]:
joinType = "outer"

joinedDf = person.join(graduateProgram, joinCondition, joinType)
joinedDf.show()

+----+----------------+---------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_course|   spark_status| id| degree|          department|     school|
+----+----------------+---------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|              0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|              1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|              1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|           null|           null|  2|Masters|                EECS|UC Berkeley|
+----+----------------+---------------+---------------+---+-------+--------------------+-----------+



In [20]:
spark.sql("SELECT * FROM person FULL OUTER JOIN graduateProgram ON person.graduate_course = graduateProgram.id").show()

+----+----------------+---------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_course|   spark_status| id| degree|          department|     school|
+----+----------------+---------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|              0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|              1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|              1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|           null|           null|  2|Masters|                EECS|UC Berkeley|
+----+----------------+---------------+---------------+---+-------+--------------------+-----------+



### Left Outer Join

In [22]:
joinType = "left_outer"

joinedDf = person.join(graduateProgram, joinCondition, joinType)
joinedDf.show()

+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_course|   spark_status| id| degree|          department|     school|
+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|              0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|              1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|              1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+---------------+---------------+---+-------+--------------------+-----------+



In [23]:
spark.sql("SELECT * FROM person LEFT OUTER JOIN graduateProgram ON person.graduate_course = graduateProgram.id").show()

+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_course|   spark_status| id| degree|          department|     school|
+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|              0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|              1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|              1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+---------------+---------------+---+-------+--------------------+-----------+



### Right Outer Join

In [24]:
joinType = "right_outer"

joinedDf = person.join(graduateProgram, joinCondition, joinType)
joinedDf.show()

+----+----------------+---------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_course|   spark_status| id| degree|          department|     school|
+----+----------------+---------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|              0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|              1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|              1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|           null|           null|  2|Masters|                EECS|UC Berkeley|
+----+----------------+---------------+---------------+---+-------+--------------------+-----------+



In [25]:
spark.sql("SELECT * FROM person RIGHT OUTER JOIN graduateProgram ON person.graduate_course = graduateProgram.id").show()

+----+----------------+---------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_course|   spark_status| id| degree|          department|     school|
+----+----------------+---------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|              0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|              1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|              1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|           null|           null|  2|Masters|                EECS|UC Berkeley|
+----+----------------+---------------+---------------+---+-------+--------------------+-----------+



### Left Semi Joins

In [27]:
joinType = "left_semi"

joinedDf = graduateProgram.join(person, joinCondition, joinType)
joinedDf.show()

+---+-------+--------------------+-----------+
| id| degree|          department|     school|
+---+-------+--------------------+-----------+
|  0|Masters|School of Informa...|UC Berkeley|
|  1|  Ph.D.|                EECS|UC Berkeley|
+---+-------+--------------------+-----------+



In [29]:
spark.sql("SELECT * FROM graduateProgram LEFT SEMI JOIN person ON graduateProgram.id = person.graduate_course").show()

+---+-------+--------------------+-----------+
| id| degree|          department|     school|
+---+-------+--------------------+-----------+
|  0|Masters|School of Informa...|UC Berkeley|
|  1|  Ph.D.|                EECS|UC Berkeley|
+---+-------+--------------------+-----------+



### Left Anti Join

In [30]:
joinType = "left_anti"

joinedDf = graduateProgram.join(person, joinCondition, joinType)
joinedDf.show()

+---+-------+----------+-----------+
| id| degree|department|     school|
+---+-------+----------+-----------+
|  2|Masters|      EECS|UC Berkeley|
+---+-------+----------+-----------+



In [31]:
spark.sql("SELECT * FROM graduateProgram LEFT ANTI JOIN person ON graduateProgram.id = person.graduate_course").show()

+---+-------+----------+-----------+
| id| degree|department|     school|
+---+-------+----------+-----------+
|  2|Masters|      EECS|UC Berkeley|
+---+-------+----------+-----------+



### Cross Join

In [33]:
joinType = "cross"

joinedDf = person.join(graduateProgram, joinCondition, joinType)
joinedDf.show()

+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_course|   spark_status| id| degree|          department|     school|
+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|              0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|              1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|              1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+---------------+---------------+---+-------+--------------------+-----------+



In [35]:
spark.sql("SELECT * FROM person CROSS JOIN graduateProgram").show()

+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_course|   spark_status| id| degree|          department|     school|
+---+----------------+---------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|              0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  0|   Bill Chambers|              0|          [100]|  2|Masters|                EECS|UC Berkeley|
|  0|   Bill Chambers|              0|          [100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  1|   Matei Zaharia|              1|[500, 250, 100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|              1|[500, 250, 100]|  2|Masters|                EECS|UC Berkeley|
|  1|   Matei Zaharia|              1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|              1|     [250, 100]|  0|Masters|School of Informa...|UC Berkeley|
