In [198]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext, SQLContext

from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame

In [201]:
spark.stop()

In [202]:
spark = SparkSession.builder.master("local").appName("unionjointest").getOrCreate()

In [144]:
simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000) \
  ]

columns= ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
+-------------+----------+-----+------+---+-----+



In [145]:
simpleData2 = [("James","Sales","NY",90000,34,10000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]
columns2= ["employee_name","department","state","salary","age","bonus"]

df2 = spark.createDataFrame(data = simpleData2, schema = columns2)

df2.printSchema()
df2.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [146]:
df3 = df.union(df2)

In [147]:
df3.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [148]:
unionAllDF = df.unionAll(df2)
unionAllDF.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [149]:
df_union = df.union(df2).distinct()
df_union.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|       Robert|     Sales|   CA| 81000| 30|23000|
+-------------+----------+-----+------+---+-----+



In [150]:
df_union_drop = df.union(df2).dropDuplicates()
df_union_drop.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|       Robert|     Sales|   CA| 81000| 30|23000|
+-------------+----------+-----+------+---+-----+



In [151]:
df_union1 = df.union(df2).union(df3)
df_union1.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|


In [152]:
def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)

df5 = unionAll(df, df2, df3)
df5.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|


In [153]:
# rdd_union
rdd = spark.sparkContext.parallelize([1, 1, 2, 3])
rdd.union(rdd).collect()

[1, 1, 2, 3, 1, 1, 2, 3]

In [154]:
product = spark.read.csv("./products.csv")
print(product.rdd.getNumPartitions())
productdup = product.union(product)
print(productdup.rdd.getNumPartitions())
productdup.show()

17
34
+----------+------------+-----+
|       _c0|         _c1|  _c2|
+----------+------------+-----+
|product_id|product_name|price|
|         0|   product_0|   22|
|         1|   product_1|   30|
|         2|   product_2|   91|
|         3|   product_3|   37|
|         4|   product_4|  145|
|         5|   product_5|  128|
|         6|   product_6|   66|
|         7|   product_7|  145|
|         8|   product_8|   51|
|         9|   product_9|   44|
|        10|  product_10|   53|
|        11|  product_11|   13|
|        12|  product_12|  104|
|        13|  product_13|  102|
|        14|  product_14|   24|
|        15|  product_15|   14|
|        16|  product_16|   38|
|        17|  product_17|   72|
|        18|  product_18|   16|
+----------+------------+-----+
only showing top 20 rows



In [155]:
product = spark.read.parquet("./products_parquet")
print(product.rdd.getNumPartitions())
product.show()

8
+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         0|   product_0|   22|
|         1|   product_1|   30|
|         2|   product_2|   91|
|         3|   product_3|   37|
|         4|   product_4|  145|
|         5|   product_5|  128|
|         6|   product_6|   66|
|         7|   product_7|  145|
|         8|   product_8|   51|
|         9|   product_9|   44|
|        10|  product_10|   53|
|        11|  product_11|   13|
|        12|  product_12|  104|
|        13|  product_13|  102|
|        14|  product_14|   24|
|        15|  product_15|   14|
|        16|  product_16|   38|
|        17|  product_17|   72|
|        18|  product_18|   16|
|        19|  product_19|   46|
+----------+------------+-----+
only showing top 20 rows



In [156]:
productdup = product.join(product, on = "product_id", how = "inner")
print(productdup.rdd.getNumPartitions())
productdup.show()

200
+----------+----------------+-----+----------------+-----+
|product_id|    product_name|price|    product_name|price|
+----------+----------------+-----+----------------+-----+
|  10000108|product_10000108|  115|product_10000108|  115|
|  10000172|product_10000172|  133|product_10000172|  133|
|  10000304|product_10000304|   71|product_10000304|   71|
|  10000454|product_10000454|  135|product_10000454|  135|
|  10000472|product_10000472|   62|product_10000472|   62|
|  10000528|product_10000528|  122|product_10000528|  122|
|  10000591|product_10000591|   23|product_10000591|   23|
|  10000670|product_10000670|   30|product_10000670|   30|
|  10000720|product_10000720|  118|product_10000720|  118|
|  10000723|product_10000723|   92|product_10000723|   92|
|  10000761|product_10000761|  124|product_10000761|  124|
|  10000835|product_10000835|   74|product_10000835|   74|
|  10000989|product_10000989|    8|product_10000989|    8|
|    100010|  product_100010|   98|  product_100010|

In [157]:
heroes_data = [
    ('Deadpool', 3), 
    ('Iron man', 1),
    ('Groot', 7),
]
race_data = [
    ('Kryptonian', 5), 
    ('Mutant', 3), 
    ('Human', 1), 
]
heroes = spark.createDataFrame(heroes_data, ['name', 'id'])
races = spark.createDataFrame(race_data, ['race', 'id'])

In [158]:
heroes.show()
races.show()

+--------+---+
|    name| id|
+--------+---+
|Deadpool|  3|
|Iron man|  1|
|   Groot|  7|
+--------+---+

+----------+---+
|      race| id|
+----------+---+
|Kryptonian|  5|
|    Mutant|  3|
|     Human|  1|
+----------+---+



In [159]:
heroes.crossJoin(races).show()

+--------+---+----------+---+
|    name| id|      race| id|
+--------+---+----------+---+
|Deadpool|  3|Kryptonian|  5|
|Deadpool|  3|    Mutant|  3|
|Deadpool|  3|     Human|  1|
|Iron man|  1|Kryptonian|  5|
|Iron man|  1|    Mutant|  3|
|Iron man|  1|     Human|  1|
|   Groot|  7|Kryptonian|  5|
|   Groot|  7|    Mutant|  3|
|   Groot|  7|     Human|  1|
+--------+---+----------+---+



In [160]:
heroes.join(heroes).show()

+--------+---+--------+---+
|    name| id|    name| id|
+--------+---+--------+---+
|Deadpool|  3|Deadpool|  3|
|Deadpool|  3|Iron man|  1|
|Deadpool|  3|   Groot|  7|
|Iron man|  1|Deadpool|  3|
|Iron man|  1|Iron man|  1|
|Iron man|  1|   Groot|  7|
|   Groot|  7|Deadpool|  3|
|   Groot|  7|Iron man|  1|
|   Groot|  7|   Groot|  7|
+--------+---+--------+---+



In [161]:
heroes.join(heroes, on = "id", how = 'inner').show()

+---+--------+--------+
| id|    name|    name|
+---+--------+--------+
|  7|   Groot|   Groot|
|  1|Iron man|Iron man|
|  3|Deadpool|Deadpool|
+---+--------+--------+



In [162]:
heroes.join(races, on = "id", how = 'inner').show()

+---+--------+------+
| id|    name|  race|
+---+--------+------+
|  1|Iron man| Human|
|  3|Deadpool|Mutant|
+---+--------+------+



In [163]:
heroes.join(races).show()

+--------+---+----------+---+
|    name| id|      race| id|
+--------+---+----------+---+
|Deadpool|  3|Kryptonian|  5|
|Deadpool|  3|    Mutant|  3|
|Deadpool|  3|     Human|  1|
|Iron man|  1|Kryptonian|  5|
|Iron man|  1|    Mutant|  3|
|Iron man|  1|     Human|  1|
|   Groot|  7|Kryptonian|  5|
|   Groot|  7|    Mutant|  3|
|   Groot|  7|     Human|  1|
+--------+---+----------+---+



In [164]:
# adding a new row to a dataframe
newrow = spark.createDataFrame([('Iron man', 1)], ['name', 'id'])

In [165]:
heroes_new = heroes.union(newrow)
heroes_new.show()

+--------+---+
|    name| id|
+--------+---+
|Deadpool|  3|
|Iron man|  1|
|   Groot|  7|
|Iron man|  1|
+--------+---+



In [166]:
# inner join don't remove duplicates
heroes_new.join(races, on = "id", how = 'inner').show()

+---+--------+------+
| id|    name|  race|
+---+--------+------+
|  1|Iron man| Human|
|  1|Iron man| Human|
|  3|Deadpool|Mutant|
+---+--------+------+



In [167]:
heroes.join(races, on = "id", how = 'left').show()

+---+--------+------+
| id|    name|  race|
+---+--------+------+
|  7|   Groot|  null|
|  1|Iron man| Human|
|  3|Deadpool|Mutant|
+---+--------+------+



In [168]:
heroes.join(races, on = "id", how = 'leftouter').show()

+---+--------+------+
| id|    name|  race|
+---+--------+------+
|  7|   Groot|  null|
|  1|Iron man| Human|
|  3|Deadpool|Mutant|
+---+--------+------+



In [169]:
heroes.join(races, on='id', how='right').show()

+---+--------+----------+
| id|    name|      race|
+---+--------+----------+
|  5|    null|Kryptonian|
|  1|Iron man|     Human|
|  3|Deadpool|    Mutant|
+---+--------+----------+



In [170]:
heroes.join(races, on='id', how='rightouter').show()

+---+--------+----------+
| id|    name|      race|
+---+--------+----------+
|  5|    null|Kryptonian|
|  1|Iron man|     Human|
|  3|Deadpool|    Mutant|
+---+--------+----------+



In [171]:
heroes.join(races, on='id', how='full').show()

+---+--------+----------+
| id|    name|      race|
+---+--------+----------+
|  7|   Groot|      null|
|  5|    null|Kryptonian|
|  1|Iron man|     Human|
|  3|Deadpool|    Mutant|
+---+--------+----------+



In [172]:
heroes.join(races, on='id', how='outer').show()

+---+--------+----------+
| id|    name|      race|
+---+--------+----------+
|  7|   Groot|      null|
|  5|    null|Kryptonian|
|  1|Iron man|     Human|
|  3|Deadpool|    Mutant|
+---+--------+----------+



In [173]:
heroes.join(races, on='id', how='leftsemi').show()

+---+--------+
| id|    name|
+---+--------+
|  1|Iron man|
|  3|Deadpool|
+---+--------+



In [174]:
heroes.join(races, on='id', how='leftanti').show()

+---+-----+
| id| name|
+---+-----+
|  7|Groot|
+---+-----+



# Join with Bradcast

In [181]:
peopleDF = spark.createDataFrame(data = [("andrea", "medellin"), ("rodolfo", "medellin"), ("abdul", "bangalore")], schema = ["first_name", "city"])
peopleDF.show()

+----------+---------+
|first_name|     city|
+----------+---------+
|    andrea| medellin|
|   rodolfo| medellin|
|     abdul|bangalore|
+----------+---------+



In [182]:
peopleDF = spark.sparkContext.parallelize([("andrea", "medellin"), ("rodolfo", "medellin"), ("abdul", "bangalore")]) \
           .toDF(("first_name", "city"))
peopleDF.show()

+----------+---------+
|first_name|     city|
+----------+---------+
|    andrea| medellin|
|   rodolfo| medellin|
|     abdul|bangalore|
+----------+---------+



In [184]:
citiesDF = spark.sparkContext.parallelize([("medellin", "colombia", 2.5), ("bangalore", "india", 12.3)]) \
           .toDF(("city", "country", "population"))
citiesDF.show()

+---------+--------+----------+
|     city| country|population|
+---------+--------+----------+
| medellin|colombia|       2.5|
|bangalore|   india|      12.3|
+---------+--------+----------+



In [186]:
peopleDF.join(citiesDF, on = 'city').show()

+---------+----------+--------+----------+
|     city|first_name| country|population|
+---------+----------+--------+----------+
|bangalore|     abdul|   india|      12.3|
| medellin|    andrea|colombia|       2.5|
| medellin|   rodolfo|colombia|       2.5|
+---------+----------+--------+----------+



In [187]:
peopleDF.join(citiesDF, on = 'city').explain()

== Physical Plan ==
*(5) Project [city#3846, first_name#3845, country#3859, population#3860]
+- *(5) SortMergeJoin [city#3846], [city#3858], Inner
   :- *(2) Sort [city#3846 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(city#3846, 200), true, [id=#5431]
   :     +- *(1) Filter isnotnull(city#3846)
   :        +- *(1) Scan ExistingRDD[first_name#3845,city#3846]
   +- *(4) Sort [city#3858 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(city#3858, 200), true, [id=#5437]
         +- *(3) Filter isnotnull(city#3858)
            +- *(3) Scan ExistingRDD[city#3858,country#3859,population#3860]




In [190]:
from pyspark.sql.functions import broadcast
peopleDF.join(broadcast(citiesDF), on = 'city').show()

+---------+----------+--------+----------+
|     city|first_name| country|population|
+---------+----------+--------+----------+
| medellin|    andrea|colombia|       2.5|
| medellin|   rodolfo|colombia|       2.5|
|bangalore|     abdul|   india|      12.3|
+---------+----------+--------+----------+



In [193]:
# from pyspark.sql.functions import broadcast
# peopleDF.join(broadcast(citiesDF), (peopleDF.city == citiesDF.city)).show()

+----------+---------+---------+--------+----------+
|first_name|     city|     city| country|population|
+----------+---------+---------+--------+----------+
|    andrea| medellin| medellin|colombia|       2.5|
|   rodolfo| medellin| medellin|colombia|       2.5|
|     abdul|bangalore|bangalore|   india|      12.3|
+----------+---------+---------+--------+----------+



In [196]:
peopleDF.join(broadcast(citiesDF), on = 'city').explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner,Buffer(city))
:- LogicalRDD [first_name#3845, city#3846], false
+- ResolvedHint (strategy=broadcast)
   +- LogicalRDD [city#3858, country#3859, population#3860], false

== Analyzed Logical Plan ==
city: string, first_name: string, country: string, population: double
Project [city#3846, first_name#3845, country#3859, population#3860]
+- Join Inner, (city#3846 = city#3858)
   :- LogicalRDD [first_name#3845, city#3846], false
   +- ResolvedHint (strategy=broadcast)
      +- LogicalRDD [city#3858, country#3859, population#3860], false

== Optimized Logical Plan ==
Project [city#3846, first_name#3845, country#3859, population#3860]
+- Join Inner, (city#3846 = city#3858), rightHint=(strategy=broadcast)
   :- Filter isnotnull(city#3846)
   :  +- LogicalRDD [first_name#3845, city#3846], false
   +- Filter isnotnull(city#3858)
      +- LogicalRDD [city#3858, country#3859, population#3860], false

== Physical Plan ==
*(2) Project [city#3846, first_

In [207]:
heroes_data = [
    ('Deadpool', 3), 
    ('Iron man', 1),
    ('Groot', 7),
]
race_data = [
    ('Kryptonian', 5), 
    ('Mutant', 3), 
    ('Human', 1), 
]
heroes = spark.createDataFrame(heroes_data, ['name', 'id'])
races = spark.createDataFrame(race_data, ['race', 'id_b'])

In [208]:
print(heroes.show())
print(races.show())

+--------+---+
|    name| id|
+--------+---+
|Deadpool|  3|
|Iron man|  1|
|   Groot|  7|
+--------+---+

None
+----------+----+
|      race|id_b|
+----------+----+
|Kryptonian|   5|
|    Mutant|   3|
|     Human|   1|
+----------+----+

None


In [211]:
df = heroes.join(races, heroes["id"] == races["id_b"], how = "inner")
df.show()

+--------+---+------+----+
|    name| id|  race|id_b|
+--------+---+------+----+
|Iron man|  1| Human|   1|
|Deadpool|  3|Mutant|   3|
+--------+---+------+----+



In [232]:
data1 = [
    (123, 1), 
    (456, 2),
    (111, 3),
    (678, 4),
]

data2 = [
    (456, 2), 
    (111, 3), 
    (876, 4), 
]

i = spark.createDataFrame(data1, ['COL_A', 'ID'])
j = spark.createDataFrame(data2, ['COL_B', 'ID_B'])

In [233]:
print(i.show())
print(j.show())

+-----+---+
|COL_A| ID|
+-----+---+
|  123|  1|
|  456|  2|
|  111|  3|
|  678|  4|
+-----+---+

None
+-----+----+
|COL_B|ID_B|
+-----+----+
|  456|   2|
|  111|   3|
|  876|   4|
+-----+----+

None


In [234]:
i.subtract(j).show()

+-----+---+
|COL_A| ID|
+-----+---+
|  123|  1|
|  678|  4|
+-----+---+



In [235]:
left_join = i.join(j, j["COL_B"] == i["COL_A"],how='left')
left_join.show()

+-----+---+-----+----+
|COL_A| ID|COL_B|ID_B|
+-----+---+-----+----+
|  678|  4| null|null|
|  111|  3|  111|   3|
|  123|  1| null|null|
|  456|  2|  456|   2|
+-----+---+-----+----+



In [236]:
left_join = left_join.filter(left_join["COL_B"].isNull()).select('COL_A', 'ID')
left_join.show()

+-----+---+
|COL_A| ID|
+-----+---+
|  678|  4|
|  123|  1|
+-----+---+



In [244]:
data1 = [
    (123, 1), 
    (456, 2),
    (111, 3),
    (678, 4),
]

data2 = [
    (981, 2), 
    (876, 5), 
]

i = spark.createDataFrame(data1, ['COL_A', 'ID'])
j = spark.createDataFrame(data2, ['COL_B', 'ID'])

In [245]:
print(i.show())
print(j.show())

+-----+---+
|COL_A| ID|
+-----+---+
|  123|  1|
|  456|  2|
|  111|  3|
|  678|  4|
+-----+---+

None
+-----+---+
|COL_B| ID|
+-----+---+
|  981|  2|
|  876|  5|
+-----+---+

None


In [246]:
i.subtract(j).show()

+-----+---+
|COL_A| ID|
+-----+---+
|  123|  1|
|  456|  2|
|  111|  3|
|  678|  4|
+-----+---+



In [249]:
fulljoin = i.join(j, on = "ID", how = "full")
fulljoin.show()

+---+-----+-----+
| ID|COL_A|COL_B|
+---+-----+-----+
|  5| null|  876|
|  1|  123| null|
|  3|  111| null|
|  2|  456|  981|
|  4|  678| null|
+---+-----+-----+



In [251]:
fulljoin.filter(fulljoin["COL_B"].isNull() == False).show()

+---+-----+-----+
| ID|COL_A|COL_B|
+---+-----+-----+
|  5| null|  876|
|  2|  456|  981|
+---+-----+-----+

