Create a new notebook
Load both the CSV and JSON files and union them properly.
Instead of dropping the duplicates, create a boolean column `is_duplicate`: set it to False for exactly one row in each group of identical rows and to True for the rest of them.

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# Explicit schema applied to both the CSV and the JSON reader below.
# Every field keeps the default nullable=True.
schema = (
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
    .add("dob", DateType())
    .add("age", IntegerType())
    .add("salary", IntegerType())
    .add("department", StringType())
)

In [0]:
# Read the CSV batch using the explicit schema; the first line of the file is
# treated as a header row. "inferSchema" is deliberately not set: it has no
# effect once a schema is supplied through .schema(...), so requesting it only
# obscures which schema is actually in force.
df_csv = (
    spark.read.format("csv")
    .schema(schema)
    .option("header", True)
    .load("dbfs:///FileStore/tables/csv/batch.csv")
)

In [0]:
# Print the column names and types of the CSV frame to confirm the explicit schema was applied.
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
# Read every JSON file in the directory using the same explicit schema.
# JSON records carry their own field names, so no header option is needed.
# The rows are sorted by id after loading.
df_json = (
    spark.read.format("json")
    .schema(schema)
    .load("dbfs:///FileStore/tables/json")
    .orderBy("id")
)


In [0]:
# Print the column names and types of the JSON frame; they should match the CSV frame before the union.
df_json.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window

In [0]:
# Stack the CSV rows and the JSON rows into a single frame.
# unionByName matches columns by name instead of by position, so the result
# stays correct even if the two readers ever yield their columns in a
# different order. Duplicate rows are kept: a union does not deduplicate.
df = df_csv.unionByName(df_json)
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  9| James|1983-10-14|  39| 87000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
+---+------+----------+----+------+----------+



In [0]:
from pyspark.sql.functions import row_number

In [0]:
# Partitioning by every column places fully identical rows in the same window
# partition. The ordering column is constant inside a partition, so it only
# serves to satisfy row_number()'s requirement for an ordered window.
window = Window.partitionBy(*df.columns).orderBy("id")

# 1 for the first row of each group of identical rows, 2..n for its copies.
duplicate_rank = F.row_number().over(window)

# The first row of each group is the one kept as "not a duplicate".
# Equivalent spelling: F.when(duplicate_rank == 1, False).otherwise(True).
df.withColumn("is_duplicate", duplicate_rank != 1).show()


+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|is_duplicate|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|       false|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|        true|
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|
|  3|   Bob|      null|null| 80000|        IT|       false|
|  3|   Bob|      null|null| 80000|        IT|        true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|        true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       false|
|  5| David|1981-12-18|  41| 90000|        HR|       false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|
|  7|  Mike|1976-03-15|  46| 95000|     

For each department, calculate the mean salary and check whether it is greater than or equal to the salary of each employee in that department.
 
Calculate the mean salary across all employees and check whether it is greater than or equal to the salary of each employee.

In [0]:
# Disabled draft (nothing in this cell executes). If enabled, it would drop
# duplicate rows, overwrite df, and add a per-department mean_salary column.
# department_window =Window.partitionBy("department")
# df = df.dropDuplicates()
# df=df.withColumn(
#     "mean_salary",
#     F.mean("salary").over(department_window)
# )

In [0]:
# Display df before the salary checks; it is still the full union, duplicate rows included.
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  9| James|1983-10-14|  39| 87000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
+---+------+----------+----+------+----------+



In [0]:
# Unordered window covering all rows that share the same department.
department_window = Window.partitionBy("department")


In [0]:

# Build the per-department mean once and reuse the expression in both columns.
# Referring to the sibling "mean_salary" column by name inside the same
# withColumns() call relies on lateral column alias resolution, which older
# Spark versions do not provide; using the expression directly avoids that.
# NOTE(review): df still contains the duplicate rows from the union, so they
# weigh into each department's mean - confirm this is intended.
department_mean_salary = F.mean("salary").over(department_window)

df.withColumns(
    {
        "mean_salary": department_mean_salary,
        "is_salary_greater_department_mean": (
            F.col("salary") >= department_mean_salary
        ),
    }
).show()

+---+------+----------+----+------+----------+-----------------+---------------------------------+
| id|  name|       dob| age|salary|department|      mean_salary|is_salary_greater_department_mean|
+---+------+----------+----+------+----------+-----------------+---------------------------------+
|  4| Emily|1994-11-22|  28| 65000|   Finance|71166.66666666667|                            false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|71166.66666666667|                             true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|71166.66666666667|                            false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|71166.66666666667|                            false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|71166.66666666667|                             true|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|71166.66666666667|                            false|
|  2| Alice|1997-02-28|  25| 60000|        HR|          67000.0|                            false|
|  2| Alic

In [0]:
# partitionBy() with no columns yields a single window over the whole frame,
# so the mean below is taken across every row of df.
window_spec = Window.partitionBy()
overall_mean_salary = F.mean("salary").over(window_spec)

# Attach the overall mean and flag rows whose salary is at or above it.
df.withColumns(
    {
        "mean": overall_mean_salary,
        "is_salary_above_mean": F.col("salary") >= overall_mean_salary,
    }
).show()


+---+------+----------+----+------+----------+-----------------+--------------------+
| id|  name|       dob| age|salary|department|             mean|is_salary_above_mean|
+---+------+----------+----+------+----------+-----------------+--------------------+
|  1|  John|1992-05-12|  30| 70000|        IT|73352.94117647059|               false|
|  2| Alice|1997-02-28|  25| 60000|        HR|73352.94117647059|               false|
|  3|   Bob|      null|null| 80000|        IT|73352.94117647059|                true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|73352.94117647059|               false|
|  1|  John|1992-05-12|  30| 70000|        IT|73352.94117647059|               false|
|  1|  John|1992-05-12|  30| 70000|        IT|73352.94117647059|               false|
|  2| Alice|1997-02-28|  25| 60000|        HR|73352.94117647059|               false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|73352.94117647059|                true|
|  3|   Bob|      null|null| 80000|        IT|73352.94