In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark

In [0]:
df_csv = spark.read.format("csv").option("header", True).load("dbfs:/FileStore/tables/csv/batch.csv")
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType

In [0]:
schema = StructType([ StructField("id", IntegerType()),
                      StructField("name", StringType()), 
                      StructField("dob", DateType()), 
                      StructField("age", IntegerType()), 
                      StructField("salary", IntegerType()), 
                      StructField("department", StringType())])

In [0]:
df_csv = spark.read.format("csv").option("header", True).schema(schema).load("dbfs:/FileStore/tables/csv/batch.csv")
df_csv.printSchema()
df_csv.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_json = spark.read.format("json").load("dbfs:/FileStore/tables/json")

In [0]:
df = df_csv.union(df_json) 
df.printSchema() 
df.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: string (nullable = true)
 |-- department: string (nullable = true)

+----+-------+----------+----+------+----------+
|  id|   name|       dob| age|salary|department|
+----+-------+----------+----+------+----------+
|   1|   John|1992-05-12|  30| 70000|        IT|
|   2|  Alice|1997-02-28|  25| 60000|        HR|
|   3|    Bob|      null|null| 80000|        IT|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|
|  30|     IT|1992-05-12|   1|  John|     70000|
|  25|     HR|1997-02-28|   2| Alice|     60000|
|null|     IT|      null|   3|   Bob|     80000|
|  28|Finance|1994-11-22|   4| Emily|     65000|
|  41|     HR|1981-12-18|   5| David|     90000|
|  33|Finance|1989-07-05|   6| Susan|     75000|
|  46|     IT|1976-03-15|   7|  Mike|     95000|
|  30|Finance|1992-06-30|  10|Sophie|     62000|
|  25|Finance|1997-02-28|   2| Alice|     9

In [0]:
df.count()

Out[13]: 17

In [0]:
from pyspark.sql.window import Window 
from pyspark.sql.functions import col, count, avg

In [0]:
windowSpec = Window.partitionBy([col(x) for x in df.columns]) 
df = df.withColumn("count", count("*").over(windowSpec)) 
df = df.withColumn("isDuplicate", col("count") > 1) 
df = df.drop("count") 
df.show()

+----+-------+----------+----+------+----------+-----------+
|  id|   name|       dob| age|salary|department|isDuplicate|
+----+-------+----------+----+------+----------+-----------+
|null|     IT|      null|   3|   Bob|     80000|      false|
|   1|   John|1992-05-12|  30| 70000|        IT|      false|
|   2|  Alice|1997-02-28|  25| 60000|        HR|      false|
|   3|    Bob|      null|null| 80000|        IT|      false|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|      false|
|  25|Finance|1997-02-28|   2| Alice|     90000|      false|
|  25|     HR|1997-02-28|   2| Alice|     60000|      false|
|  27|     HR|1995-08-20|   8|  Lisa|     58000|      false|
|  28|Finance|1994-11-22|   4| Emily|     65000|      false|
|  28|Finance|1994-11-22|   4| Emily|     70000|      false|
|  30|Finance|1992-06-30|  10|Sophie|     62000|      false|
|  30|     IT|1992-05-12|   1|  John|     70000|       true|
|  30|     IT|1992-05-12|   1|  John|     70000|       true|
|  33|Finance|1989-07-05

In [0]:
df = df.orderBy("id") 
df.show()

+----+-------+----------+----+------+----------+-----------+
|  id|   name|       dob| age|salary|department|isDuplicate|
+----+-------+----------+----+------+----------+-----------+
|null|     IT|      null|   3|   Bob|     80000|      false|
|   1|   John|1992-05-12|  30| 70000|        IT|      false|
|   2|  Alice|1997-02-28|  25| 60000|        HR|      false|
|   3|    Bob|      null|null| 80000|        IT|      false|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|      false|
|  25|Finance|1997-02-28|   2| Alice|     90000|      false|
|  25|     HR|1997-02-28|   2| Alice|     60000|      false|
|  27|     HR|1995-08-20|   8|  Lisa|     58000|      false|
|  28|Finance|1994-11-22|   4| Emily|     65000|      false|
|  28|Finance|1994-11-22|   4| Emily|     70000|      false|
|  30|Finance|1992-06-30|  10|Sophie|     62000|      false|
|  30|     IT|1992-05-12|   1|  John|     70000|       true|
|  30|     IT|1992-05-12|   1|  John|     70000|       true|
|  33|Finance|1989-07-05

Calculate mean salary and check if it is greater or equal to the salary of the employees in each department. Calculate mean salary and check if it is greater or equal to the salary of all the employees

In [0]:
df = df.dropDuplicates() 
df.show()

+----+-------+----------+----+------+----------+-----------+
|  id|   name|       dob| age|salary|department|isDuplicate|
+----+-------+----------+----+------+----------+-----------+
|null|     IT|      null|   3|   Bob|     80000|      false|
|   1|   John|1992-05-12|  30| 70000|        IT|      false|
|   2|  Alice|1997-02-28|  25| 60000|        HR|      false|
|   3|    Bob|      null|null| 80000|        IT|      false|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|      false|
|  25|Finance|1997-02-28|   2| Alice|     90000|      false|
|  25|     HR|1997-02-28|   2| Alice|     60000|      false|
|  27|     HR|1995-08-20|   8|  Lisa|     58000|      false|
|  28|Finance|1994-11-22|   4| Emily|     65000|      false|
|  28|Finance|1994-11-22|   4| Emily|     70000|      false|
|  30|Finance|1992-06-30|  10|Sophie|     62000|      false|
|  30|     IT|1992-05-12|   1|  John|     70000|       true|
|  33|Finance|1989-07-05|   6| Susan|     75000|      false|
|  39|     IT|1983-10-14

In [0]:
windowSpec = Window.partitionBy(df.department) 
df = df.withColumn("Mean salary", avg("salary").over(windowSpec)) 
df.show()

+----+-------+----------+----+------+----------+-----------+-----------+
|  id|   name|       dob| age|salary|department|isDuplicate|Mean salary|
+----+-------+----------+----+------+----------+-----------+-----------+
|  27|     HR|1995-08-20|   8|  Lisa|     58000|      false|       null|
|  25|     HR|1997-02-28|   2| Alice|     60000|      false|       null|
|  30|Finance|1992-06-30|  10|Sophie|     62000|      false|       null|
|  28|Finance|1994-11-22|   4| Emily|     65000|      false|       null|
|  28|Finance|1994-11-22|   4| Emily|     70000|      false|       null|
|  30|     IT|1992-05-12|   1|  John|     70000|       true|       null|
|  33|Finance|1989-07-05|   6| Susan|     75000|      false|       null|
|null|     IT|      null|   3|   Bob|     80000|      false|       null|
|  39|     IT|1983-10-14|   9| James|     87000|      false|       null|
|  25|Finance|1997-02-28|   2| Alice|     90000|      false|       null|
|  41|     HR|1981-12-18|   5| David|     90000|   

In [0]:
df = df.withColumn( "Is Salary greater than mean Salary", col("Mean salary") < col("salary") ).orderBy("id") 
df.show()

+----+-------+----------+----+------+----------+-----------+-----------+----------------------------------+
|  id|   name|       dob| age|salary|department|isDuplicate|Mean salary|Is Salary greater than mean Salary|
+----+-------+----------+----+------+----------+-----------+-----------+----------------------------------+
|null|     IT|      null|   3|   Bob|     80000|      false|       null|                              null|
|   1|   John|1992-05-12|  30| 70000|        IT|      false|    75000.0|                             false|
|   2|  Alice|1997-02-28|  25| 60000|        HR|      false|    60000.0|                             false|
|   3|    Bob|      null|null| 80000|        IT|      false|    75000.0|                              true|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|      false|    65000.0|                             false|
|  25|     HR|1997-02-28|   2| Alice|     60000|      false|       null|                              null|
|  25|Finance|1997-02-28|   