### Task 1
load both csv and json union them properly.

Instead of dropping duplicates, create a boolean column is_duplicate and set False to only one row and True to rest of them

In [0]:
from pyspark.sql.types import StructType, StructField,IntegerType, StringType, DateType

In [0]:
df_csv = spark.read.format("csv").option("header", True).load("dbfs:/FileStore/tables/csv/batch.csv")
df_csv.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- department: string (nullable = true)



In [0]:
sch = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
    StructField("dob", DateType()),
    StructField("age", IntegerType()),
    StructField("salary", IntegerType()),
    StructField("department", StringType())
])

In [0]:
df_csv = spark.read.format("csv").schema(sch).option("header", True).load("dbfs:/FileStore/tables/csv/batch.csv")
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_json = spark.read.format("json").schema(sch).load("dbfs:/FileStore/tables/json").orderBy("id")
df_json.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_json.show()


+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  9| James|1983-10-14|  39| 87000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
+---+------+----------+----+------+----------+



In [0]:
# to change the order of columns 
# not needed if schema passed manually
'''
df_json = df_json.select(df_csv.columns)
df_json.show()
'''

Out[10]: '\ndf_json = df_json.select(df_csv.columns)\ndf_json.show()\n'

In [0]:
df = df_csv.union(df_json)
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  9| James|1983-10-14|  39| 87000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
+---+------+----------+----+------+----------+



In [0]:
from pyspark.sql import Window
from pyspark.sql import functions as F 


In [0]:
# create window object for duplicates
w = Window.partitionBy(df.columns).orderBy("id")

In [0]:

df1 = df.withColumn(
    "is_duplicate", 
    F.row_number().over(w) > 1   # First value = false , Duplicates = True
).show()

+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|is_duplicate|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|       false|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|        true|
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|
|  3|   Bob|      null|null| 80000|        IT|       false|
|  3|   Bob|      null|null| 80000|        IT|        true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|        true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       false|
|  5| David|1981-12-18|  41| 90000|        HR|       false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|
|  7|  Mike|1976-03-15|  46| 95000|     

### Task 2
Calculate mean salary and check if it is greater or equal to the salary of employees in each department. -1

Calculate mean salary and check if it is greater or equal to the salary of all employees. boolean

In [0]:
w2 = Window.partitionBy("department").orderBy("id")

In [0]:
df2 = df.withColumn(
    "Sal_greater_or_=_departmentwise_avg_sal",
    F.when(
        F.avg("salary").over(w2) <= F.col("salary"), 
        F.col("salary")
    ).otherwise(
        -1
    )   
).show()

+---+------+----------+----+------+----------+---------------------------------------+
| id|  name|       dob| age|salary|department|Sal_greater_or_=_departmentwise_avg_sal|
+---+------+----------+----+------+----------+---------------------------------------+
|  2| Alice|1997-02-28|  25| 90000|   Finance|                                  90000|
|  4| Emily|1994-11-22|  28| 65000|   Finance|                                     -1|
|  4| Emily|1994-11-22|  28| 65000|   Finance|                                     -1|
|  4| Emily|1994-11-22|  28| 70000|   Finance|                                     -1|
|  6| Susan|1989-07-05|  33| 75000|   Finance|                                  75000|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|                                     -1|
|  2| Alice|1997-02-28|  25| 60000|        HR|                                  60000|
|  2| Alice|1997-02-28|  25| 60000|        HR|                                  60000|
|  5| David|1981-12-18|  41| 90000|        

##### Calculate mean salary and check if it is greater or equal to the salary of all employees. boolean

In [0]:
w3 = Window.orderBy("id")

In [0]:
# not getting expected output from this
'''
df3= df.withColumn(
    "Sal_greater_or_equal_to_avgsalaryofallemployees",
    F.avg('salary').over(w3) <= F.col('salary')   
).show()
'''

Out[55]: '\ndf3= df.withColumn(\n    "Sal_greater_or_equal_to_avgsalaryofallemployees",\n    F.avg(\'salary\').over(w3) <= F.col(\'salary\')   \n).show()\n'

In [0]:
# caluculate mean sal
mean_sal = df.agg(F.avg(F.col("salary"))).first()[0]
print(mean_sal)

73352.94117647059


In [0]:
df3= df.withColumn(
    "Sal_greater_or_equal_to_avgsalaryofallemployees",
    mean_sal <= F.col('salary')    # return output in boolean
).show()

+---+------+----------+----+------+----------+-----------------------------------------------+
| id|  name|       dob| age|salary|department|Sal_greater_or_equal_to_avgsalaryofallemployees|
+---+------+----------+----+------+----------+-----------------------------------------------+
|  1|  John|1992-05-12|  30| 70000|        IT|                                          false|
|  2| Alice|1997-02-28|  25| 60000|        HR|                                          false|
|  3|   Bob|      null|null| 80000|        IT|                                           true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|                                          false|
|  1|  John|1992-05-12|  30| 70000|        IT|                                          false|
|  1|  John|1992-05-12|  30| 70000|        IT|                                          false|
|  2| Alice|1997-02-28|  25| 60000|        HR|                                          false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|    