In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook: getOrCreate() hands back the
# already-running session when there is one and builds a new one otherwise.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
spark  # bare last expression so the notebook renders the session summary

In [0]:
# Initial read of the CSV batch with the header row enabled and no explicit
# schema. `df_csv` is re-read with a declared schema further down.
df_csv = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("dbfs:/FileStore/tables/csv/batch.csv")
)

In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DateType

In [0]:
# Declared column layout for the CSV batch, in file order.
# Every field keeps the default nullable=True.
sch = (
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
    .add("dob", DateType())
    .add("age", IntegerType())
    .add("Salary", IntegerType())
    .add("department", StringType())
)

In [0]:
# Re-read the CSV batch, this time applying the declared schema `sch` so the
# columns come back typed (see the printSchema output below).
df_csv = (
    spark.read
    .format("csv")
    .schema(sch)
    .option("header", True)
    .load("dbfs:/FileStore/tables/csv/batch.csv")
)

In [0]:
# Sanity check: print the column names and types of the schema-applied CSV frame.
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
# Load the JSON input; no schema is supplied, so the column types are whatever
# Spark infers (printed in the next cell).
df_json = spark.read.json("dbfs:/FileStore/tables/json")

In [0]:
# Inspect the inferred JSON schema so it can be compared with the CSV schema.
df_json.printSchema()

root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
# Project the JSON frame onto the CSV layout: same column order, same column
# names (e.g. "Salary" rather than "salary") and the same declared types.
# The inferred JSON types (long / string dob) differ from the CSV schema
# (integer / date), so each column is cast explicitly instead of leaving the
# later union to reconcile the mismatch implicitly.
df_json = df_json.select(
    [
        df_json[field.name].cast(field.dataType).alias(field.name)
        for field in df_csv.schema.fields
    ]
)

In [0]:
# Show both column lists side by side to confirm the two frames now line up.
print(
    df_csv.columns,
    df_json.columns,
)

['id', 'name', 'dob', 'age', 'Salary', 'department'] ['id', 'name', 'dob', 'age', 'Salary', 'department']


In [0]:
# Stack the CSV and JSON rows into one frame. Columns are matched by name
# rather than by position, so a change in column order in either source
# cannot silently put values under the wrong header.
df = df_csv.unionByName(df_json)

In [0]:
# Preview the combined frame (show()'s defaults written out explicitly).
df.show(n=20, truncate=True)

+---+------+----------+----+------+----------+
| id|  name|       dob| age|Salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Window used for duplicate detection: rows are grouped by (name, department).
# Ordering by `id` alone leaves ties inside a group (rows for the same person
# share one id), so which row would be numbered 1 was arbitrary between runs.
# The remaining columns act as tie-breakers so the numbering is deterministic;
# rows identical in every column are interchangeable anyway.
part_by = (
    Window
    .partitionBy('name', 'department')
    .orderBy('id', 'Salary', 'dob', 'age')
)

In [0]:
# Number the rows inside each `part_by` window (1 = first row of its group).
df = df.withColumn(
    colName="row_number",
    col=F.row_number().over(part_by),
)

In [0]:
# Flag duplicates as the strings "true"/"false": the first row of each window
# (row_number == 1) is the original, every later row is a duplicate.
df = df.withColumn(
    "is_duplicate",
    F.when(F.col("row_number") == 1, "false").otherwise("true"),
)

In [0]:
# The helper column has served its purpose; keep only the duplicate flag.
df = df.drop("row_number")

In [0]:
# Task 1 output: combined rows with the is_duplicate flag.
df.show(n=20, truncate=True)

+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|Salary|department|is_duplicate|
+---+------+----------+----+------+----------+------------+
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|        true|
|  3|   Bob|      null|null| 80000|        IT|       false|
|  3|   Bob|      null|null| 80000|        IT|        true|
|  5| David|1981-12-18|  41| 90000|        HR|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|        true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|        true|
|  9| James|1983-10-14|  39| 87000|        IT|       false|
|  1|  John|1992-05-12|  30| 70000|        IT|       false|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  8|  Lisa|1995-08-20|  27| 58000|     

In [0]:
# One window per department, used for the per-department salary mean.
part_b = Window.partitionBy(F.col('department'))

In [0]:
# Task 2 columns:
#   mean_salary_department - mean Salary over the row's department window
#   mean_sal_is_greater    - "Yes" when that mean is >= the row's Salary, else "No"
# The flag reads the mean column, so the two are added in separate chained
# steps. Defining both in a single withColumns() call puts them in the same
# projection, where referring to a sibling column only resolves on Spark
# versions with lateral-column-alias support and raises an analysis error
# otherwise.
df = (
    df
    .withColumn("mean_salary_department", F.mean("Salary").over(part_b))
    .withColumn(
        "mean_sal_is_greater",
        F.when(
            F.col("mean_salary_department") >= F.col("Salary"),
            "Yes",
        ).otherwise("No"),
    )
)

In [0]:
# Task 2 output: per-department mean salary and the comparison flag.
df.show(n=20, truncate=True)

+---+------+----------+----+------+----------+------------+----------------------+-------------------+
| id|  name|       dob| age|Salary|department|is_duplicate|mean_salary_department|mean_sal_is_greater|
+---+------+----------+----+------+----------+------------+----------------------+-------------------+
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|     71166.66666666667|                 No|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|     71166.66666666667|                Yes|
|  4| Emily|1994-11-22|  28| 65000|   Finance|        true|     71166.66666666667|                Yes|
|  4| Emily|1994-11-22|  28| 70000|   Finance|        true|     71166.66666666667|                Yes|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|       false|     71166.66666666667|                Yes|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|     71166.66666666667|                 No|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|              

In [0]:
# Window with no partition columns and no ordering: every row belongs to the
# same single window, so aggregates over it span the whole frame.
part_c = Window.partitionBy()

In [0]:
# Task 3 columns:
#   mean_salary_employee      - mean Salary over the whole frame (window part_c)
#   mean_sal_employee_greater - "Yes" when that mean is >= the row's Salary, else "No"
# The flag reads the mean column, so the two are added in separate chained
# steps. Defining both in a single withColumns() call puts them in the same
# projection, where referring to a sibling column only resolves on Spark
# versions with lateral-column-alias support and raises an analysis error
# otherwise.
df = (
    df
    .withColumn("mean_salary_employee", F.mean("Salary").over(part_c))
    .withColumn(
        "mean_sal_employee_greater",
        F.when(
            F.col("mean_salary_employee") >= F.col("Salary"),
            "Yes",
        ).otherwise("No"),
    )
)

In [0]:
# Task 3 output: overall mean salary and the comparison flag.
df.show(n=20, truncate=True)

+---+------+----------+----+------+----------+------------+----------------------+-------------------+--------------------+-------------------------+
| id|  name|       dob| age|Salary|department|is_duplicate|mean_salary_department|mean_sal_is_greater|mean_salary_employee|mean_sal_employee_greater|
+---+------+----------+----+------+----------+------------+----------------------+-------------------+--------------------+-------------------------+
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|     71166.66666666667|                 No|   73352.94117647059|                       No|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|               67000.0|                Yes|   73352.94117647059|                      Yes|
|  2| Alice|1997-02-28|  25| 60000|        HR|        true|               67000.0|                Yes|   73352.94117647059|                      Yes|
|  3|   Bob|      null|null| 80000|        IT|       false|     78857.14285714286|                 N