In [3]:
# Run this before the cell that imports pyspark: findspark.init() is what makes
# the local Spark installation's `pyspark` package importable from this kernel.
import findspark
findspark.init()

In [4]:
import pyspark
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one, so
# re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("PWC - Data Validation")
    .master("local[*]")
    .getOrCreate()
)

24/03/25 23:32:14 WARN Utils: Your hostname, ajith-Lenovo-G50-80 resolves to a loopback address: 127.0.1.1; using 192.168.1.40 instead (on interface wlp3s0)
24/03/25 23:32:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/03/25 23:32:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Echo the session object as the cell's output to confirm it was created.
spark

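### Data Validation between Source and Target Tables

Compare the two tables on `id` and report every row whose `name` differs or whose `id` exists in only one of them.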

In [26]:
# Sample "source" table: five (id, name) rows.
source_schema = ['id', 'name']
source_data = [
    (1, 'A'),
    (2, 'B'),
    (3, 'C'),
    (4, 'D'),
    (5, 'E'),
]
source_df = spark.createDataFrame(source_data, schema=source_schema)

# Sample "target" table: ids 3 and 4 carry different names, id 5 is absent
# and id 6 exists only here.
target_schema = ['id', 'name']
target_data = [
    (1, 'A'),
    (2, 'B'),
    (3, 'X'),
    (4, 'F'),
    (6, 'G'),
]
target_df = spark.createDataFrame(target_data, schema=target_schema)

source_df.show()
target_df.show()

+---+----+
| id|name|
+---+----+
|  1|   A|
|  2|   B|
|  3|   C|
|  4|   D|
|  5|   E|
+---+----+

+---+----+
| id|name|
+---+----+
|  1|   A|
|  2|   B|
|  3|   X|
|  4|   F|
|  6|   G|
+---+----+



## Solution

In [27]:
from pyspark.sql.functions import col, when, coalesce

# Full outer join on id: rows present on only one side are kept, with nulls
# filling the columns of the missing side. The t1/t2 aliases let the next
# cell tell the two same-named id/name column pairs apart.
df = (
    source_df.alias("t1")
    .join(
        target_df.alias("t2"),
        on=col("t1.id") == col("t2.id"),
        how='full',
    )
)
df.show()

+----+----+----+----+
|  id|name|  id|name|
+----+----+----+----+
|   1|   A|   1|   A|
|   2|   B|   2|   B|
|   3|   C|   3|   X|
|   4|   D|   4|   F|
|   5|   E|null|null|
|null|null|   6|   G|
+----+----+----+----+



In [28]:
# Give the duplicated id/name columns unambiguous names.
# NOTE: `df` is overwritten here, so this cell cannot be re-run on its own --
# after the first run `df` no longer exposes the t1/t2 aliases; re-run the
# join cell first.
df = df.select(
    col("t1.id").alias("source_id"),
    col("t2.id").alias("target_id"),
    col("t1.name").alias("source_name"),
    col("t2.name").alias("target_name"),
)
df.show()

+---------+---------+-----------+-----------+
|source_id|target_id|source_name|target_name|
+---------+---------+-----------+-----------+
|        1|        1|          A|          A|
|        2|        2|          B|          B|
|        3|        3|          C|          X|
|        4|        4|          D|          F|
|        5|     null|          E|       null|
|     null|        6|       null|          G|
+---------+---------+-----------+-----------+



In [29]:
# Classify every joined row. Rows that agree on both id and name match none of
# the branches, so their comment stays null (the next cell relies on that).
#
# The names are compared with the null-safe equality operator: a plain `!=`
# evaluates to NULL when exactly one of the two names is NULL, which would
# leave the comment null and silently report a real difference as a match.
df = df.withColumn(
    "comment",
    when(
        (col("source_id") == col("target_id"))
        & ~col("source_name").eqNullSafe(col("target_name")),
        "Mismatched",
    )
    # A null id on one side means the full join found no partner row there.
    .when(col("target_id").isNull(), "new in source")
    .when(col("source_id").isNull(), "new in target"),
)

df.show()

+---------+---------+-----------+-----------+-------------+
|source_id|target_id|source_name|target_name|      comment|
+---------+---------+-----------+-----------+-------------+
|        1|        1|          A|          A|         null|
|        2|        2|          B|          B|         null|
|        3|        3|          C|          X|   Mismatched|
|        4|        4|          D|          F|   Mismatched|
|        5|     null|          E|       null|new in source|
|     null|        6|       null|          G|new in target|
+---------+---------+-----------+-----------+-------------+



In [30]:
# Keep only the rows that received a comment, i.e. discard the rows where
# source and target agree.
df = df.where(df["comment"].isNotNull())
df.show()

+---------+---------+-----------+-----------+-------------+
|source_id|target_id|source_name|target_name|      comment|
+---------+---------+-----------+-----------+-------------+
|        3|        3|          C|          X|   Mismatched|
|        4|        4|          D|          F|   Mismatched|
|        5|     null|          E|       null|new in source|
|     null|        6|       null|          G|new in target|
+---------+---------+-----------+-----------+-------------+



In [31]:
# Collapse the two id columns into one: take the source id and fall back to
# the target id for rows that exist only in the target.
final_df = df.select(
    coalesce(col("source_id"), col("target_id")).alias("id"),
    "comment",
)
final_df.show()

+---+-------------+
| id|      comment|
+---+-------------+
|  3|   Mismatched|
|  4|   Mismatched|
|  5|new in source|
|  6|new in target|
+---+-------------+

