Load both the CSV and the JSON data.
Instead of dropping duplicates, create a boolean column `is_duplicate` that is False for exactly one row in each group of identical rows and True for the rest of them.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# getOrCreate() hands back the already-active SparkSession when there is one,
# and only builds a new session otherwise.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
# Bare expression: the notebook renders the session as this cell's output.
spark


In [0]:
# df_csv = spark.read.format("csv").option("header",True).option("inferSchema",True).load("dbfs:/FileStore/tables/csv/batch.csv")

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# Explicit schema shared by the CSV and JSON readers, so no type inference is needed.
# Every field is added with the default nullable=True.
schema = (
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
    .add("dob", DateType())
    .add("age", IntegerType())
    .add("salary", IntegerType())
    .add("department", StringType())
)

In [0]:
# Read the batch CSV with the explicit schema; the first line of the file is a header row.
df_csv = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load("dbfs:/FileStore/tables/csv/batch.csv")
)

In [0]:
# Read every JSON file under the directory, applying the same explicit schema as the CSV.
df_json = (
    spark.read
    .schema(schema)
    .format("json")
    .load("dbfs:/FileStore/tables/json/")
)

In [0]:
# Project the JSON frame onto the CSV frame's column order so both frames line up.
df_json = df_json.select(*df_csv.columns)

In [0]:
# Stack the CSV and JSON rows (duplicates are kept, as with UNION ALL).
# unionByName matches columns by name rather than by position, so the result stays
# correct even if the two frames ever end up with their columns in a different order.
df_union = df_csv.unionByName(df_json)

In [0]:
# Sort by id so that rows sharing an id are listed next to each other when displayed.
df_union = df_union.sort("id")

In [0]:
# Print the combined rows (show() defaults spelled out: first 20 rows, long cells truncated).
df_union.show(n=20, truncate=True)

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  9| James|1983-10-14|  39| 87000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
+---+------+----------+----+------+----------+



In [0]:
# Expose the combined frame to Spark SQL under the view name used by the SQL cells.
df_union.createOrReplaceTempView(name="table1")

In [0]:
# Spark SQL variant: number the rows inside every group of fully identical records.
# The first row of a group gets rnk = 1 and each further copy gets 2, 3, ...
df_result_duplicate = spark.sql(
    "select *,row_number() over (partition by id,name,dob,age,salary,department order by id) AS rnk from table1"
)
df_result_duplicate.show()

+---+------+----------+----+------+----------+---+
| id|  name|       dob| age|salary|department|rnk|
+---+------+----------+----+------+----------+---+
|  1|  John|1992-05-12|  30| 70000|        IT|  1|
|  1|  John|1992-05-12|  30| 70000|        IT|  2|
|  1|  John|1992-05-12|  30| 70000|        IT|  3|
|  2| Alice|1997-02-28|  25| 60000|        HR|  1|
|  2| Alice|1997-02-28|  25| 60000|        HR|  2|
|  2| Alice|1997-02-28|  25| 90000|   Finance|  1|
|  3|   Bob|      null|null| 80000|        IT|  1|
|  3|   Bob|      null|null| 80000|        IT|  2|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  1|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  2|
|  4| Emily|1994-11-22|  28| 70000|   Finance|  1|
|  5| David|1981-12-18|  41| 90000|        HR|  1|
|  6| Susan|1989-07-05|  33| 75000|   Finance|  1|
|  7|  Mike|1976-03-15|  46| 95000|        IT|  1|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|  1|
|  9| James|1983-10-14|  39| 87000|        IT|  1|
| 10|Sophie|1992-06-30|  30| 62

In [0]:
from pyspark.sql import functions as F

In [0]:
# A row is a duplicate when it is not the first one (rnk = 1) of its group of identical rows.
df_result_duplicate = df_result_duplicate.withColumn(
    "is_duplicate", F.col("rnk") != F.lit(1)
)
df_result_duplicate.show()

+---+------+----------+----+------+----------+---+------------+
| id|  name|       dob| age|salary|department|rnk|is_duplicate|
+---+------+----------+----+------+----------+---+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|  1|       false|
|  1|  John|1992-05-12|  30| 70000|        IT|  2|        true|
|  1|  John|1992-05-12|  30| 70000|        IT|  3|        true|
|  2| Alice|1997-02-28|  25| 60000|        HR|  1|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|  2|        true|
|  2| Alice|1997-02-28|  25| 90000|   Finance|  1|       false|
|  3|   Bob|      null|null| 80000|        IT|  1|       false|
|  3|   Bob|      null|null| 80000|        IT|  2|        true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  1|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  2|        true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|  1|       false|
|  5| David|1981-12-18|  41| 90000|        HR|  1|       false|
|  6| Susan|1989-07-05|  33| 75000|   Fi

In [0]:
from pyspark.sql import Window

In [0]:
# One partition per group of rows that agree on every column; ordered by id so that
# row_number() can be applied over this window.
window = Window.partitionBy(*df_union.columns).orderBy("id")

In [0]:
# DataFrame-API variant of the duplicate flag.
# The window spec is built right here instead of being read from the notebook-level
# `window` variable: that name is reassigned by later cells (per-department and
# whole-table windows), so re-running this cell after them would no longer rank
# within groups of identical rows.
df_duplicate_bool = df_union.withColumn(
    "is_duplicate",
    # The first row of each group of fully identical rows has row_number 1 -> False;
    # every additional copy -> True.
    F.row_number().over(
        Window.partitionBy(*df_union.columns).orderBy("id")
    ) != 1
)
df_duplicate_bool.show()

+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|is_duplicate|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|       false|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|        true|
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|
|  3|   Bob|      null|null| 80000|        IT|       false|
|  3|   Bob|      null|null| 80000|        IT|        true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|        true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       false|
|  5| David|1981-12-18|  41| 90000|        HR|       false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|
|  7|  Mike|1976-03-15|  46| 95000|     

Calculate the mean salary of each department and check whether each employee's salary is greater than or equal to the mean salary of their department.
 
Calculate the mean salary across all employees and check whether each employee's salary is greater than or equal to it.

In [0]:
# Spark SQL variant: attach each department's average salary (rounded to 2 decimals)
# to every row of that department.
df_result_sal_greater_avgsal_eachdept = spark.sql(
    "SELECT *,ROUND(AVG(salary) OVER (PARTITION BY department),2) AS avg_salary FROM table1"
)
df_result_sal_greater_avgsal_eachdept.show()

+---+------+----------+----+------+----------+----------+
| id|  name|       dob| age|salary|department|avg_salary|
+---+------+----------+----+------+----------+----------+
|  2| Alice|1997-02-28|  25| 90000|   Finance|  71166.67|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  71166.67|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  71166.67|
|  4| Emily|1994-11-22|  28| 70000|   Finance|  71166.67|
|  6| Susan|1989-07-05|  33| 75000|   Finance|  71166.67|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|  71166.67|
|  2| Alice|1997-02-28|  25| 60000|        HR|   67000.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|   67000.0|
|  5| David|1981-12-18|  41| 90000|        HR|   67000.0|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|   67000.0|
|  1|  John|1992-05-12|  30| 70000|        IT|  78857.14|
|  1|  John|1992-05-12|  30| 70000|        IT|  78857.14|
|  1|  John|1992-05-12|  30| 70000|        IT|  78857.14|
|  3|   Bob|      null|null| 80000|        IT|  78857.14|
|  3|   Bob|  

In [0]:
# Flag rows whose salary is greater than or equal to their department's average.
# The comparison uses the exact window average rather than the `avg_salary` column:
# that column was rounded to 2 decimals for display, and comparing against a rounded
# value can flip the result for salaries that sit right at the average. This also
# matches what the DataFrame-API cell further down computes.
# NOTE(review): the input still contains the duplicate rows, so they are included in
# the average — confirm this is intended.
df_result_sal_greater_avgsal_eachdept = df_result_sal_greater_avgsal_eachdept.withColumn(
    "is_greater",
    F.col("salary") >= F.avg("salary").over(Window.partitionBy("department"))
)
df_result_sal_greater_avgsal_eachdept.show()

+---+------+----------+----+------+----------+----------+----------+
| id|  name|       dob| age|salary|department|avg_salary|is_greater|
+---+------+----------+----+------+----------+----------+----------+
|  2| Alice|1997-02-28|  25| 90000|   Finance|  71166.67|      true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  71166.67|     false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  71166.67|     false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|  71166.67|     false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|  71166.67|      true|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|  71166.67|     false|
|  2| Alice|1997-02-28|  25| 60000|        HR|   67000.0|     false|
|  2| Alice|1997-02-28|  25| 60000|        HR|   67000.0|     false|
|  5| David|1981-12-18|  41| 90000|        HR|   67000.0|      true|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|   67000.0|     false|
|  1|  John|1992-05-12|  30| 70000|        IT|  78857.14|     false|
|  1|  John|1992-05-12|  30| 70000

In [0]:
# Spark SQL variant: attach the average salary over the whole table (empty OVER clause),
# rounded to 2 decimals, to every row.
df_result_sal_greater_alldept = spark.sql(
    "SELECT *,ROUND(AVG(salary) OVER (),2) AS avg_salary FROM table1"
)
df_result_sal_greater_alldept.show()

+---+------+----------+----+------+----------+----------+
| id|  name|       dob| age|salary|department|avg_salary|
+---+------+----------+----+------+----------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|  73352.94|
|  1|  John|1992-05-12|  30| 70000|        IT|  73352.94|
|  1|  John|1992-05-12|  30| 70000|        IT|  73352.94|
|  2| Alice|1997-02-28|  25| 60000|        HR|  73352.94|
|  2| Alice|1997-02-28|  25| 60000|        HR|  73352.94|
|  2| Alice|1997-02-28|  25| 90000|   Finance|  73352.94|
|  3|   Bob|      null|null| 80000|        IT|  73352.94|
|  3|   Bob|      null|null| 80000|        IT|  73352.94|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  73352.94|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  73352.94|
|  4| Emily|1994-11-22|  28| 70000|   Finance|  73352.94|
|  5| David|1981-12-18|  41| 90000|        HR|  73352.94|
|  6| Susan|1989-07-05|  33| 75000|   Finance|  73352.94|
|  7|  Mike|1976-03-15|  46| 95000|        IT|  73352.94|
|  8|  Lisa|19

In [0]:
# Flag rows whose salary is greater than or equal to the average over all rows.
# The comparison uses the exact window average rather than the `avg_salary` column:
# that column was rounded to 2 decimals for display, and comparing against a rounded
# value can flip the result for salaries that sit right at the average.
# NOTE(review): the input still contains the duplicate rows, so they are included in
# the average — confirm this is intended.
df_result_sal_greater_alldept = df_result_sal_greater_alldept.withColumn(
    "is_greater",
    # Window.partitionBy() with no columns spans every row of the frame.
    F.col("salary") >= F.avg("salary").over(Window.partitionBy())
)
df_result_sal_greater_alldept.show()

+---+------+----------+----+------+----------+----------+----------+
| id|  name|       dob| age|salary|department|avg_salary|is_greater|
+---+------+----------+----+------+----------+----------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|  73352.94|     false|
|  1|  John|1992-05-12|  30| 70000|        IT|  73352.94|     false|
|  1|  John|1992-05-12|  30| 70000|        IT|  73352.94|     false|
|  2| Alice|1997-02-28|  25| 60000|        HR|  73352.94|     false|
|  2| Alice|1997-02-28|  25| 60000|        HR|  73352.94|     false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|  73352.94|      true|
|  3|   Bob|      null|null| 80000|        IT|  73352.94|      true|
|  3|   Bob|      null|null| 80000|        IT|  73352.94|      true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  73352.94|     false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|  73352.94|     false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|  73352.94|     false|
|  5| David|1981-12-18|  41| 90000

In [0]:
# Rebinds the notebook-level `window` name: from here on it means one partition per department.
window = Window.partitionBy(F.col("department"))

In [0]:
# DataFrame-API variant: flag rows whose salary is greater than or equal to the
# average salary of their department.
# The per-department window is built right here instead of being read from the
# notebook-level `window` variable: that name is rebound to a whole-table window by a
# later cell, so re-running this cell afterwards would silently compare against the
# global average instead.
# NOTE(review): df_union still contains the duplicate rows, so they are included in
# the average — confirm this is intended.
df_avgsal_dept = df_union.withColumn(
    "sal_greater_than_avgsal",
    F.col("salary") >= F.avg("salary").over(Window.partitionBy("department"))
)
df_avgsal_dept.show()

+---+------+----------+----+------+----------+-----------------------+
| id|  name|       dob| age|salary|department|sal_greater_than_avgsal|
+---+------+----------+----+------+----------+-----------------------+
|  1|  John|1992-05-12|  30| 70000|        IT|                  false|
|  1|  John|1992-05-12|  30| 70000|        IT|                  false|
|  1|  John|1992-05-12|  30| 70000|        IT|                  false|
|  2| Alice|1997-02-28|  25| 60000|        HR|                  false|
|  2| Alice|1997-02-28|  25| 60000|        HR|                  false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|                   true|
|  3|   Bob|      null|null| 80000|        IT|                   true|
|  3|   Bob|      null|null| 80000|        IT|                   true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|                  false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|                  false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|                  false|
|  5| 

In [0]:
# Rebinds the notebook-level `window` name once more. partitionBy() with no columns
# defines no partitioning, so an aggregate over this window covers every row of the frame.
window = Window.partitionBy()

In [0]:
# DataFrame-API variant: flag rows whose salary is greater than or equal to the
# average salary over all rows.
# The whole-table window is built right here instead of being read from the
# notebook-level `window` variable: that name holds different window specs at
# different points of the notebook, so running this cell out of order would silently
# compare against a per-department average instead.
df_avgsal_all_dept = df_union.withColumn(
    "sal_greater_than_avgsal_alldept",
    # Window.partitionBy() with no columns spans every row of the frame.
    F.col("salary") >= F.avg("salary").over(Window.partitionBy())
)
df_avgsal_all_dept.show()

+---+------+----------+----+------+----------+-------------------------------+
| id|  name|       dob| age|salary|department|sal_greater_than_avgsal_alldept|
+---+------+----------+----+------+----------+-------------------------------+
|  1|  John|1992-05-12|  30| 70000|        IT|                          false|
|  1|  John|1992-05-12|  30| 70000|        IT|                          false|
|  1|  John|1992-05-12|  30| 70000|        IT|                          false|
|  2| Alice|1997-02-28|  25| 60000|        HR|                          false|
|  2| Alice|1997-02-28|  25| 60000|        HR|                          false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|                           true|
|  3|   Bob|      null|null| 80000|        IT|                           true|
|  3|   Bob|      null|null| 80000|        IT|                           true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|                          false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|      