In [2]:
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session that the cells below use via `spark`.
spark = (
    SparkSession.builder
    .appName('UNION')
    .getOrCreate()
)

In [3]:
# First employee data set; each tuple follows the order of `columns` below.
# No backslash continuations are needed inside the list brackets.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
]

columns = [
    "employee_name",
    "department",
    "state",
    "salary",
    "age",
    "bonus",
]

# Only column names are supplied; the printed schema shows the inferred types.
df = spark.createDataFrame(simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
+-------------+----------+-----+------+---+-----+



In [4]:
# Second employee data set; James and Maria also appear in `simpleData`,
# so a union of the two DataFrames will contain duplicate rows.
simpleData2 = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

# Same column names, in the same order, as the first DataFrame.
columns2 = [
    "employee_name",
    "department",
    "state",
    "salary",
    "age",
    "bonus",
]

df2 = spark.createDataFrame(simpleData2, schema=columns2)

df2.printSchema()
df2.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



### union() - merges two DataFrames by column position, not by column name. If the columns are in a different order in the other DataFrame, values end up in the wrong columns. Duplicate rows are kept.


In [13]:
# Union of both employee DataFrames, ordered by bonus and shown untruncated.
# Duplicate rows are kept: James and Maria each appear twice in the output.
(
    df.union(df2)
    .orderBy("bonus")
    .show(truncate=False)
)

# Same union with the operands swapped, ordered by bonus.
# NOTE: despite its name, `distdf` is not de-duplicated (no distinct() call).
distdf = df2.union(df).orderBy("bonus")
distdf.show()

# Print the schema of the union and of one input side by side for comparison.
distdf.printSchema()
df.printSchema()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|James        |Sales     |NY   |90000 |34 |10000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Robert       |Sales     |CA   |81000 |30 |23000|
+-------------+----------+-----+------+---+-----+

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|        James|     Sales|   NY| 90000| 34|10000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|

In [15]:
# Two-column sample whose tuples are ordered (name, id).
data = [
    ("James", 34),
    ("Michael", 56),
    ("Robert", 30),
    ("Maria", 24),
]

df1 = spark.createDataFrame(data, schema=["name", "id"])
df1.printSchema()
df1.show()

root
 |-- name: string (nullable = true)
 |-- id: long (nullable = true)

+-------+---+
|   name| id|
+-------+---+
|  James| 34|
|Michael| 56|
| Robert| 30|
|  Maria| 24|
+-------+---+



In [16]:
# Same two fields as `data`, but with the tuple order reversed: (id, name).
data2 = [
    (34, "James"),
    (45, "Maria"),
    (45, "Jen"),
    (34, "Jeff"),
]

# NOTE: this rebinds `df2`, replacing the employee DataFrame created earlier.
df2 = spark.createDataFrame(data2, schema=["id", "name"])
df2.printSchema()
df2.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+---+-----+
| id| name|
+---+-----+
| 34|James|
| 45|Maria|
| 45|  Jen|
| 34| Jeff|
+---+-----+



### unionByName() - merges two DataFrames by matching column names instead of column positions, so DataFrames whose columns are in a different order are still merged correctly

In [17]:
# Columns are matched by name, so df2's (id, name) order is lined up with
# df1's (name, id) and the result keeps df1's column order and types.
df3 = df1.unionByName(other=df2)
df3.printSchema()
df3.show()

root
 |-- name: string (nullable = true)
 |-- id: long (nullable = true)

+-------+---+
|   name| id|
+-------+---+
|  James| 34|
|Michael| 56|
| Robert| 30|
|  Maria| 24|
|  James| 34|
|  Maria| 45|
|    Jen| 45|
|   Jeff| 34|
+-------+---+



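### Wrong union example - union() matches columns by position, so df2's swapped column order puts ids under name and names under id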

In [23]:
# Deliberately incorrect: union() pairs columns by position, so df2's ids land
# under `name`, its names land under `id`, and `id` is widened to string.
df3 = df1.union(other=df2)
df3.printSchema()
df3.show()

root
 |-- name: string (nullable = true)
 |-- id: string (nullable = true)

+-------+-----+
|   name|   id|
+-------+-----+
|  James|   34|
|Michael|   56|
| Robert|   30|
|  Maria|   24|
|     34|James|
|     45|Maria|
|     45|  Jen|
|     34| Jeff|
+-------+-----+

