## Spark SQL: "NOT EXISTS" AND "EXISTS" equivalent operations on dataframes



In [28]:
import findspark
findspark.init()
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [29]:
# Get (or create) the session used by every cell below, with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("test_dataframes")
    .enableHiveSupport()
    .getOrCreate()
)

## Generating data

In [30]:
# Three small (id, name) datasets. Each one extends the previous; data3 also
# repeats the (3, "Daniel") row so duplicate handling shows up in later cells.
# NOTE(review): the ids are Python ints while the schema declares StringType —
# confirm that relying on the implicit conversion is intended.
data1 = [(1, "Andre"), (2, "Rose")]
data2 = data1 + [(3, "Daniel")]
data3 = data2 + [(3, "Daniel"), (4, "Anita")]

# Both columns are nullable strings.
schema = StructType(
    [
        StructField("id", StringType(), nullable=True),
        StructField("name", StringType(), nullable=True),
    ]
)

# Build one DataFrame per dataset, all sharing the same schema, then print them.
df1, df2, df3 = (
    spark.createDataFrame(rows, schema) for rows in (data1, data2, data3)
)
df1.show()
df2.show()
df3.show()

+---+-----+
| id| name|
+---+-----+
|  1|Andre|
|  2| Rose|
+---+-----+

+---+------+
| id|  name|
+---+------+
|  1| Andre|
|  2|  Rose|
|  3|Daniel|
+---+------+

+---+------+
| id|  name|
+---+------+
|  1| Andre|
|  2|  Rose|
|  3|Daniel|
|  3|Daniel|
|  4| Anita|
+---+------+



## NOT EXISTS EQUIVALENT

### Method 1 - subtract

In [31]:
# NOT EXISTS via subtract: all rows that exist in df2 but do not exist in df1.
dfr = df2.subtract(df1)
dfr.show()

+---+------+
| id|  name|
+---+------+
|  3|Daniel|
+---+------+



In [32]:
# NOT EXISTS via subtract: all rows that exist in df3 but do not exist in df1.
# The result is distinct: the repeated (3, Daniel) row of df3 is shown only once.
dfr2 = df3.subtract(df1)
dfr2.show()



+---+------+
| id|  name|
+---+------+
|  3|Daniel|
|  4| Anita|
+---+------+



### Method 2 - left_anti. This is the 'classical' way to have something equivalent to 'NOT EXISTS'

In [33]:
# NOT EXISTS via a left anti join: rows of df2 whose id has no match in df1.
dfr = df2.join(df1, on="id", how="left_anti")
dfr.show()

+---+------+
| id|  name|
+---+------+
|  3|Daniel|
+---+------+



In [34]:
# NOT EXISTS via a left anti join: rows of df3 whose id has no match in df1.
# dropDuplicates then removes repeated rows from the joined result.
dfr2 = (
    df3.join(df1, on="id", how="left_anti")
    .dropDuplicates()
)
dfr2.show()

+---+------+
| id|  name|
+---+------+
|  3|Daniel|
|  4| Anita|
+---+------+



### Method 3 - exceptAll - similar to 'subtract', but it keeps duplicate rows ('subtract' returns distinct rows), hence the dropDuplicates() below

In [35]:
# NOT EXISTS via exceptAll: all rows that exist in df2 but do not exist in df1.
dfr = df2.exceptAll(df1)
dfr.show()

+---+------+
| id|  name|
+---+------+
|  3|Daniel|
+---+------+



In [36]:
# NOT EXISTS via exceptAll: rows that exist in df3 but do not exist in df1.
# exceptAll keeps duplicate rows (per the PySpark docs), so dropDuplicates is
# applied to collapse the repeated (3, Daniel) row of df3.
dfr2 = (
    df3.exceptAll(df1)
    .dropDuplicates()
)
dfr2.show()

+---+------+
| id|  name|
+---+------+
|  3|Daniel|
|  4| Anita|
+---+------+



## EXISTS EQUIVALENT

### Method 1 - left_semi

In [15]:
# EXISTS via a left semi join: rows of df2 whose id also appears in df1.
dfr = df2.join(df1, on="id", how="left_semi")
dfr.show()

+---+-----+
| id| name|
+---+-----+
|  1|Andre|
|  2| Rose|
+---+-----+



In [16]:
# EXISTS via a left semi join: rows of df3 whose id also appears in df1.
dfr = df3.join(df1, on="id", how="left_semi")
dfr.show()

+---+-----+
| id| name|
+---+-----+
|  1|Andre|
|  2| Rose|
+---+-----+

