### 1. Merge Two DataFrames

In [40]:
from pyspark.sql import SparkSession

In [41]:
# Initialize Spark Session
spark = SparkSession.builder.appName("Interview_Questions").getOrCreate()

In [42]:
df1_columns = ["id", "name", "age", "salary", "city"]

In [43]:
# Sample Data for DataFrame with 5 columns
data1 = [
    (1, "Alice", 28, 70000.00, "New York"),
    (2, "Bob", 34, 85000.50, "Los Angeles"),
    (3, "Cathy", 29, 95000.00, "Chicago"),
    (4, "David", 41, 123000.00, "San Francisco"),
    (5, "Eva", 36, 110000.00, "Seattle")
]

In [44]:
df1 = spark.createDataFrame(data1, schema = df1_columns)

In [45]:
df1.show()

+---+-----+---+--------+-------------+
| id| name|age|  salary|         city|
+---+-----+---+--------+-------------+
|  1|Alice| 28| 70000.0|     New York|
|  2|  Bob| 34| 85000.5|  Los Angeles|
|  3|Cathy| 29| 95000.0|      Chicago|
|  4|David| 41|123000.0|San Francisco|
|  5|  Eva| 36|110000.0|      Seattle|
+---+-----+---+--------+-------------+



In [46]:
df1.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: double (nullable = true)
 |-- city: string (nullable = true)



In [47]:
df2_columns = ["id", "name", "age", "salary"]

In [48]:
data2 = [
    (6, "Aniket", 28, 70000.00),
    (7, "Swati", 34, 85000.50),
    (3, "Anmol", 29, 95000.00)
]

In [49]:
df2 = spark.createDataFrame(data2, schema = df2_columns)

In [50]:
df2.show()

+---+------+---+-------+
| id|  name|age| salary|
+---+------+---+-------+
|  6|Aniket| 28|70000.0|
|  7| Swati| 34|85000.5|
|  3| Anmol| 29|95000.0|
+---+------+---+-------+



In [51]:
df2.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: double (nullable = true)



In [52]:
df2.rdd.getNumPartitions()

2

In [53]:
df3 = df1.union(df2)

AnalysisException: Union can only be performed on tables with the same number of columns, but the first table has 5 columns and the second table has 4 columns;
'Union false, false
:- LogicalRDD [id#104L, name#105, age#106L, salary#107, city#108], false
+- LogicalRDD [id#135L, name#136, age#137L, salary#138], false


In [55]:
from pyspark.sql.functions import lit

In [56]:
df2 = df2.withColumn("city",lit("Null"))

In [57]:
df2.show()

+---+------+---+-------+----+
| id|  name|age| salary|city|
+---+------+---+-------+----+
|  6|Aniket| 28|70000.0|Null|
|  7| Swati| 34|85000.5|Null|
|  3| Anmol| 29|95000.0|Null|
+---+------+---+-------+----+



In [58]:
df3 = df1.union(df2)

In [59]:
df3.show()

+---+------+---+--------+-------------+
| id|  name|age|  salary|         city|
+---+------+---+--------+-------------+
|  1| Alice| 28| 70000.0|     New York|
|  2|   Bob| 34| 85000.5|  Los Angeles|
|  3| Cathy| 29| 95000.0|      Chicago|
|  4| David| 41|123000.0|San Francisco|
|  5|   Eva| 36|110000.0|      Seattle|
|  6|Aniket| 28| 70000.0|         Null|
|  7| Swati| 34| 85000.5|         Null|
|  3| Anmol| 29| 95000.0|         Null|
+---+------+---+--------+-------------+



### What a column in between is missing? Like Age.

In [60]:
df4_columns = ["id", "name", "salary", "city"]

In [61]:
data4 = [
    (6, "Aniket", 70000.00,"Pune"),
    (7, "Swati", 85000.50, "Hyderabad"),
    (3, "Anmol", 95000.00, "Ahmedabad")
]

In [62]:
df4 = spark.createDataFrame(data4, schema = df4_columns)

In [63]:
df4.show()

+---+------+-------+---------+
| id|  name| salary|     city|
+---+------+-------+---------+
|  6|Aniket|70000.0|     Pune|
|  7| Swati|85000.5|Hyderabad|
|  3| Anmol|95000.0|Ahmedabad|
+---+------+-------+---------+



In [64]:
df5 = df4.withColumn("age", lit("Null"))

In [65]:
df5.show()

+---+------+-------+---------+----+
| id|  name| salary|     city| age|
+---+------+-------+---------+----+
|  6|Aniket|70000.0|     Pune|Null|
|  7| Swati|85000.5|Hyderabad|Null|
|  3| Anmol|95000.0|Ahmedabad|Null|
+---+------+-------+---------+----+



In [66]:
new_column_order = ["id", "name", "age", "salary", "city"]

In [68]:
df5 = df5.select(new_column_order)

In [69]:
df5.show()

+---+------+----+-------+---------+
| id|  name| age| salary|     city|
+---+------+----+-------+---------+
|  6|Aniket|Null|70000.0|     Pune|
|  7| Swati|Null|85000.5|Hyderabad|
|  3| Anmol|Null|95000.0|Ahmedabad|
+---+------+----+-------+---------+



In [70]:
df6 = df1.union(df5)

In [71]:
df6.show()

+---+------+----+--------+-------------+
| id|  name| age|  salary|         city|
+---+------+----+--------+-------------+
|  1| Alice|  28| 70000.0|     New York|
|  2|   Bob|  34| 85000.5|  Los Angeles|
|  3| Cathy|  29| 95000.0|      Chicago|
|  4| David|  41|123000.0|San Francisco|
|  5|   Eva|  36|110000.0|      Seattle|
|  6|Aniket|Null| 70000.0|         Pune|
|  7| Swati|Null| 85000.5|    Hyderabad|
|  3| Anmol|Null| 95000.0|    Ahmedabad|
+---+------+----+--------+-------------+



----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 55296)
Traceback (most recent call last):
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/accumulators.py", line 262, in handle
    poll(accum_updates)
  File "/opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/accumulators.py", line 235, in poll
    if func():
  File "/opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/accumulato