In [0]:
from pyspark.sql import SparkSession
# Attach to the active Spark session, or start one if none exists yet.
spark = (
    SparkSession.builder
    .getOrCreate()
)
spark  # last expression of the cell: display the session summary

In [0]:
%fs ls dbfs:///FileStore/tables/json

path,name,size,modificationTime
dbfs:/FileStore/tables/json/10_20220101.json,10_20220101.json,102,1707460794000
dbfs:/FileStore/tables/json/1_20220101.json,1_20220101.json,94,1707460794000
dbfs:/FileStore/tables/json/2_20220101.json,2_20220101.json,100,1707460795000
dbfs:/FileStore/tables/json/4_20220101.json,4_20220101.json,100,1707460795000
dbfs:/FileStore/tables/json/8_20220101.json,8_20220101.json,94,1707460795000
dbfs:/FileStore/tables/json/9_20220101.json,9_20220101.json,95,1707460795000
dbfs:/FileStore/tables/json/batch.jsonl,batch.jsonl,671,1707460795000


In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType, StringType, DateType

In [0]:
# Explicit schema for the employee records, declared as (column name, Spark type) pairs.
# Every field keeps StructField's default nullability.
sch = StructType([
    StructField(col_name, col_type)
    for col_name, col_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("dob", DateType()),
        ("age", IntegerType()),
        ("salary", IntegerType()),
        ("department", StringType()),
    )
])

In [0]:
# Load the batch CSV using the explicit schema; the first line of the file is a header row.
df_csv = spark.read.csv(
    "dbfs:/FileStore/tables/csv/batch.csv",
    schema=sch,
    header=True,
)

In [0]:
# Print the column names and types of the CSV frame to confirm the explicit schema was applied.
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
# Read everything under the JSON folder in one pass. No schema is supplied here,
# so the column types are whatever the reader infers.
df_json = spark.read.json("dbfs:/FileStore/tables/json")
df_json.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  28|   Finance|1994-11-22|  4| Emily| 70000|
|  39|        IT|1983-10-14|  9| James| 87000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  27|        HR|1995-08-20|  8|  Lisa| 58000|
+----+----------+----------+---+------+------+



In [0]:
# Print the inferred column names and types of the JSON frame (no explicit schema was given on read).
df_json.printSchema()

root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
# Project the JSON frame onto the CSV layout: same column order AND same types.
# The JSON reader inferred id/age/salary as long and dob as string, while df_csv
# uses integer/date; casting here keeps the later union from silently widening
# the combined schema.
new_json = df_json.select(
    [
        df_json[field.name].cast(field.dataType).alias(field.name)
        for field in df_csv.schema.fields
    ]
)
new_json.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
# Append the JSON rows to the CSV rows.
# unionByName pairs columns by name, so the result stays correct even if the two
# frames ever differ in column order (plain union() pairs columns by position only).
df = df_csv.unionByName(new_json)
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
# Print both column lists side by side to confirm the two frames share the same layout.
print(df_csv.columns, new_json.columns)

['id', 'name', 'dob', 'age', 'salary', 'department'] ['id', 'name', 'dob', 'age', 'salary', 'department']


In [0]:
# Total number of rows in the combined frame (duplicates included).
print(df.count())

17


In [0]:
# Group on every column so fully identical rows collapse into one group;
# the added "count" column is the number of times each distinct row occurs.
duplicate_count = df.groupBy(*df.columns).count()
duplicate_count.show()

+---+------+----------+----+------+----------+-----+
| id|  name|       dob| age|salary|department|count|
+---+------+----------+----+------+----------+-----+
|  4| Emily|1994-11-22|  28| 65000|   Finance|    2|
|  2| Alice|1997-02-28|  25| 60000|        HR|    2|
|  1|  John|1992-05-12|  30| 70000|        IT|    3|
|  3|   Bob|      null|null| 80000|        IT|    2|
|  7|  Mike|1976-03-15|  46| 95000|        IT|    1|
|  5| David|1981-12-18|  41| 90000|        HR|    1|
|  6| Susan|1989-07-05|  33| 75000|   Finance|    1|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|    1|
|  2| Alice|1997-02-28|  25| 90000|   Finance|    1|
|  4| Emily|1994-11-22|  28| 70000|   Finance|    1|
|  9| James|1983-10-14|  39| 87000|        IT|    1|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|    1|
+---+------+----------+----+------+----------+-----+



In [0]:
from pyspark.sql import Window
from pyspark.sql import functions as F

In [0]:
# One window partition per distinct full row (every column is a partition key).
# "id" is itself one of those keys, so the ordering cannot separate rows within a
# partition; it is only there because row_number() needs an ordered window.
w = Window.partitionBy(*df.columns).orderBy("id")

In [0]:
# Flag repeated rows: within each group of identical rows the first one gets
# is_duplicate = False and every later one gets True.
# Assign the DataFrame first and display it separately: .show() returns None,
# so chaining it onto the assignment would leave df1 bound to None.
df1 = df.withColumn(
    "is_duplicate",
    F.row_number().over(w) > 1,  # row 1 of a group -> False, rows 2+ -> True
)
df1.show()


+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|is_duplicate|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|       false|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  1|  John|1992-05-12|  30| 70000|        IT|        true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|        true|
|  2| Alice|1997-02-28|  25| 90000|   Finance|       false|
|  3|   Bob|      null|null| 80000|        IT|       false|
|  3|   Bob|      null|null| 80000|        IT|        true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|        true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|       false|
|  5| David|1981-12-18|  41| 90000|        HR|       false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|       false|
|  7|  Mike|1976-03-15|  46| 95000|     

In [0]:
# Alternative flagging on the grouped frame: a distinct row is marked True when it
# occurs more than once, otherwise False.
# NOTE: this rebinds `df` to the grouped result (one row per distinct record plus "count").
df = duplicate_count.withColumn(
    "is_duplicate",
    F.when(F.col("count") > 1, F.lit(True)).otherwise(F.lit(False)),
)