In [0]:
# Delete dbfs:/FileStore/data.zip (recursively, should the path be a directory).
# The boolean returned by the call is shown as this cell's output.
dbutils.fs.rm("dbfs:/FileStore/data.zip", recurse=True)

Out[2]: False

In [0]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (an existing one is reused if present)
# and echo it as the cell output.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
spark

In [0]:
# List the contents of the upload folder as a table (path, name, size, modificationTime).
display(dbutils.fs.ls("dbfs:///FileStore/tables/"))

path,name,size,modificationTime
dbfs:/FileStore/tables/csv/,csv/,0,0
dbfs:/FileStore/tables/data.zip,data.zip,2274,1707719091000
dbfs:/FileStore/tables/json/,json/,0,0
dbfs:/FileStore/tables/parquet/,parquet/,0,0


In [0]:
# First pass over the CSV: the first line is treated as the header row and
# no explicit schema is supplied at this point.
df_csv = spark.read.csv("dbfs:///FileStore/tables/csv/batch.csv", header=True)

In [0]:
# First pass over the JSON folder, without an explicit schema.
# The "header" option was removed: it is a CSV-only option and the JSON
# reader silently ignores it, so keeping it only misleads the reader.
df_json = spark.read.format("json").load("dbfs:///FileStore/tables/json/")

In [0]:
from pyspark.sql.types import StructType,StructField, IntegerType,StringType,DateType

In [0]:
# Explicit column layout applied when reading the CSV and JSON sources.
# Every field keeps the default nullable=True.
schema = (
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
    .add("dob", DateType())
    .add("age", IntegerType())
    .add("salary", IntegerType())
    .add("department", StringType())
)

In [0]:
# Re-read the CSV, this time enforcing the explicit schema; the first line
# is still treated as the header row.
df_csv = spark.read.csv(
    "dbfs:///FileStore/tables/csv/batch.csv",
    schema=schema,
    header=True,
)

In [0]:
# Re-read the JSON folder with the explicit schema so its columns and types
# line up with the CSV frame.
# The "header" option was removed: it is a CSV-only option and the JSON
# reader silently ignores it.
df_json = spark.read.format("json").schema(schema).load("dbfs:///FileStore/tables/json")

In [0]:
# Confirm the CSV frame picked up the column names and types from `schema`.
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
# Preview the rows loaded from the CSV file.
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
# Confirm the JSON frame has the same column names and types as the CSV frame.
df_json.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
# Preview the rows loaded from the JSON folder.
df_json.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



PySpark Task - 1 :  
Load both the CSV and JSON data and union them properly. Instead of dropping duplicates, create a boolean column `is_duplicate` that is False for exactly one row in each group of identical rows and True for the rest of them.

In [0]:
from pyspark.sql.window import Window
from pyspark.sql import functions as f

In [0]:
# Stack the CSV rows on top of the JSON rows, keeping duplicates.
# unionByName matches columns by name rather than by position, so the result
# stays correct even if one source ever yields its columns in a different order.
df = df_csv.unionByName(df_json)

In [0]:
# Preview the combined frame (CSV rows followed by JSON rows, duplicates kept).
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
# Rows that agree on every column fall into the same window partition.
# Within a partition the first row is numbered 1 and flagged False; every
# later copy is flagged True. "id" is one of the partition columns, so the
# ordering only satisfies row_number()'s need for an ORDER BY - the rows in
# a partition are identical, so which copy comes first does not matter.
df.select(
    "*",
    (
        f.row_number().over(
            Window.partitionBy(*df.columns).orderBy(f.col("id"))
        )
        > 1
    ).alias("is_duplicate"),
).show()

+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|is_duplicate|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|       false|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|        true|
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|
|  3|   Bob|      null|null| 80000|        IT|       false|
|  3|   Bob|      null|null| 80000|        IT|        true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|        true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       false|
|  5| David|1981-12-18|  41| 90000|        HR|       false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|
|  7|  Mike|1976-03-15|  46| 95000|     

PySpark Task - 2 :  
A. Calculate the mean salary of each department and check whether it is greater than or equal to the salary of each employee in that department.

In [0]:
# Show the combined frame again as the starting point for Task 2.
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
# For every row, compare the average salary of the row's department with the
# row's own salary. The task asks for "greater or equal", so the comparison
# is >= (a strict > would wrongly report False for an employee whose salary
# equals the department mean).
# Note: the "mean_salary" column holds the boolean result of the comparison,
# not the mean itself; the column name is kept as-is.
df.withColumn(
    "mean_salary",
    f.avg("salary").over(Window.partitionBy("department")) >= df.salary,
).show()

+---+------+----------+----+------+----------+-----------+
| id|  name|       dob| age|salary|department|mean_salary|
+---+------+----------+----+------+----------+-----------+
|  4| Emily|1994-11-22|  28| 65000|   Finance|       true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       true|
|  6| Susan|1989-07-05|  33| 75000|   Finance|      false|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|       true|
|  2| Alice|1997-02-28|  25| 90000|   Finance|      false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       true|
|  5| David|1981-12-18|  41| 90000|        HR|      false|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|       true|
|  1|  John|1992-05-12|  30| 70000|        IT|       true|
|  3|   Bob|      null|null| 80000|        IT|      false|
|  1|  John|1992-05-12|  30| 70000|        IT|       true|
|  3|   Bob|      null|null| 80000|        IT|      fals

PySpark Task - 2 :  
B. Calculate the mean salary across all employees and check whether it is greater than or equal to the salary of each employee.

In [0]:
# For every row, compare the average salary over the whole frame with the
# row's own salary. Window.partitionBy() with no columns puts all rows into
# a single window, so avg("salary") is the overall mean.
# The task asks for "greater or equal", so the comparison is >= (a strict >
# would wrongly report False for an employee whose salary equals the mean).
# Note: the "mean_salary" column holds the boolean result of the comparison,
# not the mean itself; the column name is kept as-is.
df.withColumn(
    "mean_salary",
    f.avg("salary").over(Window.partitionBy()) >= df.salary,
).show()

+---+------+----------+----+------+----------+-----------+
| id|  name|       dob| age|salary|department|mean_salary|
+---+------+----------+----+------+----------+-----------+
|  1|  John|1992-05-12|  30| 70000|        IT|       true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       true|
|  3|   Bob|      null|null| 80000|        IT|      false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       true|
|  1|  John|1992-05-12|  30| 70000|        IT|       true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       true|
|  3|   Bob|      null|null| 80000|        IT|      false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       true|
|  5| David|1981-12-18|  41| 90000|        HR|      false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|      false|
|  7|  Mike|1976-03-15|  46| 95000|        IT|      false|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|       true|
|  2| Alice|1997-02-28|  25| 90000|   Finance|      false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       tru