In [1]:
from pyspark.sql import SparkSession
# Local SparkSession that uses every available core; getOrCreate() reuses a
# session that is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('union')
    .getOrCreate()
)

In [2]:
# Two DataFrames with identical rows and the same (k, v) columns, so the
# de-duplicating and non-de-duplicating unions below give different counts.
df1 = spark.createDataFrame([(1, "foo1"), (2, "bar1")], ("k", "v"))
df2 = spark.createDataFrame([(1, "foo1"), (2, "bar1")], ("k", "v"))

In [3]:
# Expose both frames to Spark SQL under the names used by the queries below.
# createOrReplaceTempView is the replacement for registerTempTable, which has
# been deprecated since Spark 2.0.
df1.createOrReplaceTempView("df1")
df2.createOrReplaceTempView("df2")

## union and union all

## Spark SQL

In [4]:
# SQL UNION removes duplicate rows: df1 and df2 hold the same two rows, so the count is 2.
spark.sql("select * from df1  union select * from df2").count()

2

In [5]:
# SQL UNION ALL keeps duplicates: 2 rows + 2 rows = 4.
spark.sql("select * from df1  union all select * from df2").count()

4

## PySpark

In [6]:
# DataFrame.union does not de-duplicate: the count is 4, matching SQL UNION ALL above.
df1.union(df2).count()

4

In [7]:
# DataFrame.unionAll gives the same count as DataFrame.union here.
df1.unionAll(df2).count()

4

In [8]:
# Dropping duplicates after the union reproduces the SQL UNION result (2 rows).
df1.union(df2).dropDuplicates().count()

2

In [None]:
from functools import reduce
from pyspark.sql import DataFrame


def unionAll(*dfs):
    """
    Fold any number of DataFrames into one by chaining ``unionAll`` calls.

    The frames are combined left to right, so ``unionAll(a, b, c)`` is
    evaluated as ``a.unionAll(b).unionAll(c)``.

    reference: https://stackoverflow.com/questions/
               33743978/spark-union-of-multiple-rdds
    """
    def _pairwise(accumulated, following):
        # One folding step: append the next frame to what has been combined so far.
        return DataFrame.unionAll(accumulated, following)

    return reduce(_pairwise, dfs)

In [9]:
# Same logical columns, different layouts: df2 carries an extra "v1" column,
# and both df2 and df3 list "v" before "k".
df1 = spark.createDataFrame([(1, "foo1"), (2, "bar1")], ("k", "v"))
df2 = spark.createDataFrame(
    [("foo2", 3, "v1"), ("bar2", 4, "v1")],
    ("v", "k", "v1"),
)
df3 = spark.createDataFrame([("foo3", 5), ("bar3", 6)], ("v", "k"))

In [10]:
# union pairs columns by position, so df3's (v, k) columns are reordered to df1's (k, v) first.
df1.union(df3.select(df1.columns)).show()

+---+----+
|  k|   v|
+---+----+
|  1|foo1|
|  2|bar1|
|  5|foo3|
|  6|bar3|
+---+----+



In [11]:
# unionByName resolves columns by name, so df3's different column order needs no explicit select.
df1.unionByName(df3).show()

+---+----+
|  k|   v|
+---+----+
|  1|foo1|
|  2|bar1|
|  5|foo3|
|  6|bar3|
+---+----+



In [None]:
from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame

def unionAll(*dfs):
    """
    Union any number of DataFrames, aligning every frame to the first one's columns.

    ``DataFrame.unionAll`` pairs columns by position, so each frame is first
    projected onto the first frame's column names, in that order. Frames may
    therefore list the columns in a different order or carry extra columns;
    a frame that lacks one of those columns makes ``select`` fail.

    :param dfs: one or more DataFrames.
    :return: DataFrame with the first frame's columns and the rows of all frames.
    :raises TypeError: if no DataFrame is given (the same exception type
        ``reduce`` raises for an empty sequence).
    """
    if not dfs:
        raise TypeError("unionAll() requires at least one DataFrame")
    columns = dfs[0].columns
    # Project before the union so a positional union cannot mix up or reject columns.
    aligned = [df.select(columns) for df in dfs]
    return reduce(lambda left, right: left.unionAll(right), aligned)


# Union all three frames with the helper; note that df2 has an extra column
# and df3 lists its columns in a different order than df1.
unionAll(df1, df2, df3).show()

In [12]:
def unionAll(*dfs):
    """
    Union DataFrames through their RDDs, keeping the first frame's columns and schema.

    Every frame is projected onto the first frame's column names (so column
    order may differ and extra columns are dropped), the resulting RDDs are
    merged with a single ``SparkContext.union`` call, and the merged RDD is
    turned back into a DataFrame using the first frame's schema.

    :param dfs: one or more DataFrames; each must contain the first frame's columns.
    :return: DataFrame with the first frame's schema holding the rows of all frames.
    :raises ValueError: if no DataFrame is given.
    """
    if not dfs:
        raise ValueError("unionAll() requires at least one DataFrame")
    first = dfs[0]
    # Use the public DataFrame.sparkSession attribute when the installed PySpark
    # provides it and fall back to sql_ctx otherwise, rather than reaching into
    # the private sql_ctx._sc attribute.
    session = getattr(first, "sparkSession", None)
    if session is None:
        session = first.sql_ctx.sparkSession
    merged = session.sparkContext.union(
        [df.select(first.columns).rdd for df in dfs]
    )
    return session.createDataFrame(merged, first.schema)

In [13]:
# Union all three frames with the helper; df2's extra column is dropped and
# every frame is aligned to df1's (k, v) columns, as the output shows.
unionAll(df1, df2, df3).show()

+---+----+
|  k|   v|
+---+----+
|  1|foo1|
|  2|bar1|
|  3|foo2|
|  4|bar2|
|  5|foo3|
|  6|bar3|
+---+----+



In [15]:
# Shut down the SparkSession created at the top of the notebook.
spark.stop()