In [0]:
# Load both csv and json union them properly.
#  Instead of dropping duplicates, create a boolean column is_duplicate and set False to only one row and True to rest of them

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()


In [0]:
df_csv=spark.read.format("csv").load("dbfs:////FileStore/tables/csv/batch.csv")
df_json=spark.read.format("json").load("dbfs:/FileStore/tables/json")



In [0]:
df_json.show()
df_csv.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  28|   Finance|1994-11-22|  4| Emily| 70000|
|  39|        IT|1983-10-14|  9| James| 87000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  27|        HR|1995-08-20|  8|  Lisa| 58000|
+----+----------+----------+---+------+------+

+---+-----+----------+----+------+----------+
|_c0|  _c1|       _c2| _c3|   _c4|       _c5|
+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
|  1| John|1992-

In [0]:
df_csv.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DateType

In [0]:
schema =StructType([
    StructField("id",IntegerType()),
    StructField("name",StringType()),
    StructField("dob",DateType()),
    StructField("age",IntegerType()),
    StructField("salary",IntegerType()),
    StructField("department",StringType()),

])




In [0]:
df_csv=spark.read.format("csv").schema(schema).option("header",True).load("dbfs:////FileStore/tables/csv/batch.csv")
df_json = spark.read.format("json").schema(schema).option("header",True).load("dbfs:///FileStore/tables/json")


In [0]:
df_csv.printSchema()
df_json.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_csv.show()
df_json.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-

In [0]:
# Task : Load both csv and json union them
from pyspark.sql import functions as F
from pyspark.sql.window import Window
df=df_csv.union(df_json)
display(df)


id,name,dob,age,salary,department
1,John,1992-05-12,30.0,70000,IT
2,Alice,1997-02-28,25.0,60000,HR
3,Bob,,,80000,IT
4,Emily,1994-11-22,28.0,65000,Finance
1,John,1992-05-12,30.0,70000,IT
2,Alice,1997-02-28,25.0,60000,HR
3,Bob,,,80000,IT
4,Emily,1994-11-22,28.0,65000,Finance
5,David,1981-12-18,41.0,90000,HR
6,Susan,1989-07-05,33.0,75000,Finance


In [0]:
df.withColumn( "is_duplicate", (F.row_number().over(Window.partitionBy(df.columns).orderBy("id"))) > 1 ).show()

+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|is_duplicate|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|       false|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|        true|
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|
|  3|   Bob|      null|null| 80000|        IT|       false|
|  3|   Bob|      null|null| 80000|        IT|        true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|        true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       false|
|  5| David|1981-12-18|  41| 90000|        HR|       false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|
|  7|  Mike|1976-03-15|  46| 95000|     

In [0]:
# Calculate mean salary and check if it is greater or equal to the salary of employees in each department.
mean_salary = (F.avg("salary"))

df=df.withColumn( "mean_salary", ((mean_salary).over(Window.partitionBy("department")) > df.salary ))
display(df)



id,name,dob,age,salary,department,mean_salary
4,Emily,1994-11-22,28.0,65000,Finance,True
4,Emily,1994-11-22,28.0,65000,Finance,True
6,Susan,1989-07-05,33.0,75000,Finance,False
10,Sophie,1992-06-30,30.0,62000,Finance,True
2,Alice,1997-02-28,25.0,90000,Finance,False
4,Emily,1994-11-22,28.0,70000,Finance,True
2,Alice,1997-02-28,25.0,60000,HR,True
2,Alice,1997-02-28,25.0,60000,HR,True
5,David,1981-12-18,41.0,90000,HR,False
8,Lisa,1995-08-20,27.0,58000,HR,True


In [0]:
# Calculate mean salary and check if it is greater or equal to the salary of all employees

df.withColumn( "mean_salary", (df.select(F.avg("salary")).first()[0]) <= df.salary ).show()

+---+------+----------+----+------+----------+-----------+
| id|  name|       dob| age|salary|department|mean_salary|
+---+------+----------+----+------+----------+-----------+
|  1|  John|1992-05-12|  30| 70000|        IT|      false|
|  2| Alice|1997-02-28|  25| 60000|        HR|      false|
|  3|   Bob|      null|null| 80000|        IT|       true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|      false|
|  1|  John|1992-05-12|  30| 70000|        IT|      false|
|  2| Alice|1997-02-28|  25| 60000|        HR|      false|
|  3|   Bob|      null|null| 80000|        IT|       true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|      false|
|  5| David|1981-12-18|  41| 90000|        HR|       true|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       true|
|  7|  Mike|1976-03-15|  46| 95000|        IT|       true|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|      false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|       true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|      fals