In [0]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession: reuse the active one if it exists, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [0]:
# First pass: read the CSV with the reader's default options (no header handling, no schema).
df_csv = spark.read.csv('dbfs:///FileStore/tables/csv/batch.csv')

In [0]:
# Preview the default read: columns are auto-named _c0.._c5 and the header line appears as a data row.
df_csv.show()

+---+-----+----------+----+------+----------+
|_c0|  _c1|       _c2| _c3|   _c4|       _c5|
+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
# With no schema supplied, every column comes back as a nullable string.
df_csv.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# Explicit schema shared by the CSV and JSON readers: one (column name, Spark type)
# pair per field, in file column order. Fields keep StructField's default nullability.
schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ('id', IntegerType()),
        ('name', StringType()),
        ('dob', DateType()),
        ('age', IntegerType()),
        ('salary', IntegerType()),
        ('department', StringType()),
    )
])

In [0]:
# Second pass: re-read the same file, this time treating the first line as a header
# and applying the explicit schema. This rebinds df_csv to the typed DataFrame.
df_csv = (
    spark.read
    .schema(schema)
    .option('header', True)
    .csv('dbfs:///FileStore/tables/csv/batch.csv')
)

In [0]:
# Confirm the CSV DataFrame now carries the named, typed columns from `schema`.
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
# Load the JSON data under the given path with the same explicit schema used for the CSV.
df_json = spark.read.schema(schema).json('dbfs:///FileStore/tables/json')

In [0]:
# Confirm the JSON DataFrame has the same column names and types as the CSV DataFrame.
df_json.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
# Stack the JSON rows and the CSV rows into one DataFrame (no de-duplication).
# unionByName matches columns by name rather than by position, so the result
# stays correct even if the column order of either source ever changes.
df = df_json.unionByName(df_csv)

In [0]:
# Inspect the combined rows; note that some ids appear more than once across the two sources.
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
+---+------+----------+----+------+----------+



In [0]:
from pyspark.sql import functions as F

In [0]:
from pyspark.sql import Window

Add a new column, `is_Duplicate`, that indicates whether a row is a repeat of another row with the same `id`.

In [0]:
# Number the rows within each id. The ordering uses every column other than
# 'id': ordering by 'id' alone (the partition key) leaves all rows of a
# partition tied, so which row receives number 1 would be arbitrary and could
# change from run to run.
window = Window.partitionBy('id').orderBy(
    *[column_name for column_name in df.columns if column_name != 'id']
)

# The first row of each id is the original (False); every later row sharing
# that id is flagged as a duplicate (True).
find_duplicate_df = df.withColumn(
    'is_Duplicate',
    F.row_number().over(window) != 1
)

In [0]:
# Render the rows together with their is_Duplicate flag.
# NOTE(review): DataFrame.display() is presumably the Databricks notebook helper — confirm before running elsewhere.
find_duplicate_df.display()

id,name,dob,age,salary,department,is_Duplicate
1,John,1992-05-12,30.0,70000,IT,False
1,John,1992-05-12,30.0,70000,IT,True
1,John,1992-05-12,30.0,70000,IT,True
2,Alice,1997-02-28,25.0,60000,HR,False
2,Alice,1997-02-28,25.0,90000,Finance,True
2,Alice,1997-02-28,25.0,60000,HR,True
3,Bob,,,80000,IT,False
3,Bob,,,80000,IT,True
4,Emily,1994-11-22,28.0,65000,Finance,False
4,Emily,1994-11-22,28.0,70000,Finance,True


Compute the average salary, both per department and across all rows, and flag whether each row's salary is at or above each of those averages.

In [0]:
# Window covering all rows that share a department (no ordering needed for an average).
partition_dep = Window.partitionBy('department')

# Build the per-department average once and reuse the expression itself.
# withColumns adds all of its columns in one step, so referring to
# 'average_salary_by_department' by name inside the same call depends on the
# engine resolving a sibling alias, which the API contract does not promise.
avg_salary_by_dep = F.avg('salary').over(partition_dep)

# NOTE(review): df still contains rows with repeated ids, so those rows weigh
# into the averages — confirm this is intended.
average_salary_df = df.withColumns(
    {
        'average_salary_by_department': avg_salary_by_dep,
        # True when the salary is at or above the department average.
        'is_above_avg_by_dep': F.col('salary') >= avg_salary_by_dep
    }
)

In [0]:
# Aggregate the mean salary over every row of df and pull the single value
# back to the driver so it can be attached to each row as a constant.
average_salary = df.agg(F.avg('salary')).first()[0]

# Add the overall average and a flag that is True when the row's salary is at
# or above it.
average_salary_df = average_salary_df.withColumns({
    'avg_salary': F.lit(average_salary),
    'is_above_avg': F.col('salary') >= F.lit(average_salary),
})

In [0]:
# Render each row with its department-level and overall average columns and flags.
# NOTE(review): DataFrame.display() is presumably the Databricks notebook helper — confirm before running elsewhere.
average_salary_df.display()

id,name,dob,age,salary,department,average_salary_by_department,is_above_avg_by_dep,avg_salary,is_above_avg
4,Emily,1994-11-22,28.0,65000,Finance,71166.66666666667,False,73352.94117647059,False
6,Susan,1989-07-05,33.0,75000,Finance,71166.66666666667,True,73352.94117647059,True
10,Sophie,1992-06-30,30.0,62000,Finance,71166.66666666667,False,73352.94117647059,False
2,Alice,1997-02-28,25.0,90000,Finance,71166.66666666667,True,73352.94117647059,True
4,Emily,1994-11-22,28.0,70000,Finance,71166.66666666667,False,73352.94117647059,False
4,Emily,1994-11-22,28.0,65000,Finance,71166.66666666667,False,73352.94117647059,False
2,Alice,1997-02-28,25.0,60000,HR,67000.0,False,73352.94117647059,False
5,David,1981-12-18,41.0,90000,HR,67000.0,True,73352.94117647059,True
8,Lisa,1995-08-20,27.0,58000,HR,67000.0,False,73352.94117647059,False
2,Alice,1997-02-28,25.0,60000,HR,67000.0,False,73352.94117647059,False
