In [0]:
from pyspark.sql import SparkSession
Spark = SparkSession.builder.getOrCreate()
spark

In [0]:
file_path_csv= "dbfs:///FileStore/tables/csv/batch.csv"


In [0]:
from pyspark.sql.types import StructType,StructField, IntegerType, StringType, DateType 
from pyspark.sql import functions as f 
from pyspark.sql.window import Window

In [0]:

schema = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
    StructField("dob", DateType()),
    StructField("age", IntegerType()),
    StructField("salary", IntegerType()),
    StructField("department", StringType()),
])

In [0]:
df_csv= spark.read.format("csv").schema(schema).option('header',True).load(file_path_csv)

In [0]:
df_csv.show()
df_csv.printSchema()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_json=spark.read.format("json").load("dbfs:///FileStore/tables/json")

In [0]:
df_json.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  28|   Finance|1994-11-22|  4| Emily| 70000|
|  39|        IT|1983-10-14|  9| James| 87000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  27|        HR|1995-08-20|  8|  Lisa| 58000|
+----+----------+----------+---+------+------+



In [0]:
df_json.printSchema()

root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
dbfs:/FileStore/shared_uploads/jayesh.tarvatkar@dctinc.com/batch.jsonl

In [0]:
df_json_batch = spark.read.format("json").schema(schema).load("dbfs:/FileStore/shared_uploads/jayesh.tarvatkar@dctinc.com/batch.jsonl")

In [0]:
df_final_json = df_json.union(df_json_batch) 
df_final_json.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  28|   Finance|1994-11-22|  4| Emily| 70000|
|  39|        IT|1983-10-14|  9| James| 87000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  27|        HR|1995-08-20|  8|  Lisa| 58000|
+----+----------+----------+---+------+------+



In [0]:

df_final_json.printSchema()

root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: string (nullable = true)



In [0]:
 df_final_json = df_final_json.select(df_csv.columns) 
 df_final_json.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df = df_csv.union(df_final_json).orderBy("id") 
df.show()


+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  9| James|1983-10-14|  39| 87000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
+---+------+----------+----+------+----------+



In [0]:
df.count()

Out[23]: 17

In [0]:
window_spec = Window.partitionBy(df.columns).orderBy(df.id)


In [0]:
result_1 = df.withColumn( "is_duplicate", (f.row_number().over(window_spec) > 1) ) 
result_1.show()


+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|is_duplicate|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|       false|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|        true|
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|
|  3|   Bob|      null|null| 80000|        IT|       false|
|  3|   Bob|      null|null| 80000|        IT|        true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|        true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       false|
|  5| David|1981-12-18|  41| 90000|        HR|       false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|
|  7|  Mike|1976-03-15|  46| 95000|     

In [0]:
df_unique = result_1.filter( f.col("is_duplicate") == False ) 
df_unique.show()


+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|is_duplicate|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|
|  3|   Bob|      null|null| 80000|        IT|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       false|
|  5| David|1981-12-18|  41| 90000|        HR|       false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|
|  7|  Mike|1976-03-15|  46| 95000|        IT|       false|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|       false|
|  9| James|1983-10-14|  39| 87000|        IT|       false|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|       false|
+---+------+----------+----+------+----------+------------+



In [0]:
df_unique.count()

Out[30]: 12

#TASK2

In [0]:

df_unique.show()

+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|is_duplicate|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|
|  3|   Bob|      null|null| 80000|        IT|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       false|
|  5| David|1981-12-18|  41| 90000|        HR|       false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|
|  7|  Mike|1976-03-15|  46| 95000|        IT|       false|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|       false|
|  9| James|1983-10-14|  39| 87000|        IT|       false|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|       false|
+---+------+----------+----+------+----------+------------+



In [0]:
window_spec = Window.partitionBy("department")

In [0]:
result1 = df_unique.withColumns( { "greater_than_mean_sal":f.col("salary")>=f.avg("salary").over(window_spec) } ) 
result1.show()


+---+------+----------+----+------+----------+------------+---------------------+
| id|  name|       dob| age|salary|department|is_duplicate|greater_than_mean_sal|
+---+------+----------+----+------+----------+------------+---------------------+
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|                 true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|                false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       false|                false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|                 true|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|       false|                false|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|                false|
|  5| David|1981-12-18|  41| 90000|        HR|       false|                 true|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|       false|                false|
|  1|  John|1992-05-12|  30| 70000|        IT|       false|                false|
|  3|   Bob|    

In [0]:
result2 = df_unique.withColumns( { "greater_than_mean_sal_all":f.col("salary")>= df.select(f.avg("salary")).first()[0] } ) 
result2.show()


+---+------+----------+----+------+----------+------------+-------------------------+
| id|  name|       dob| age|salary|department|is_duplicate|greater_than_mean_sal_all|
+---+------+----------+----+------+----------+------------+-------------------------+
|  1|  John|1992-05-12|  30| 70000|        IT|       false|                    false|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|                    false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|                     true|
|  3|   Bob|      null|null| 80000|        IT|       false|                     true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|                    false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       false|                    false|
|  5| David|1981-12-18|  41| 90000|        HR|       false|                     true|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|                     true|
|  7|  Mike|1976-03-15|  46| 95000|        IT|       f