In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.getOrCreate()
spark

In [0]:
%fs
rm -r dbfs:/user/hive/warehouse/source_table


In [0]:
%fs
rm -r dbfs:/user/hive/warehouse/target_table


In [0]:
%fs ls dbfs:///FileStore/tables/

path,name,size,modificationTime
dbfs:/FileStore/tables/Drugs_package.csv,Drugs_package.csv,27234691,1707889107000
dbfs:/FileStore/tables/Drugs_product.csv,Drugs_product.csv,38743242,1707889121000
dbfs:/FileStore/tables/Member.csv,Member.csv,131388,1710503245000
dbfs:/FileStore/tables/cat-1.csv,cat-1.csv,66568,1708719672000
dbfs:/FileStore/tables/cat.csv,cat.csv,66568,1708719259000
dbfs:/FileStore/tables/csv/,csv/,0,0
dbfs:/FileStore/tables/data.zip,data.zip,2274,1707719091000
dbfs:/FileStore/tables/final/,final/,0,0
dbfs:/FileStore/tables/json/,json/,0,0
dbfs:/FileStore/tables/parquet/,parquet/,0,0


##Bootcamp Final Task
- Create excel file
- load the excel file into database (source table file)
- Do some transformation tasks on it 
- Convert it into target table. Now we have 2 tables one is row table(source table) and another is final table(target table)
- Do data validation task
- Do data profiling task
- Finally create menu driven programm of it for automation

### LOADING CSV FILE AND CREATING DATAFRAME OF IT
- member is a name of our dataframe

In [0]:
source_table = spark.read.format("csv").option("header", True).load("dbfs:/FileStore/tables/Member.csv")


###CREATING TABLE FROM ABOVE DATAFRAME
- we create 'member' table

In [0]:
source_table.write.mode("overwrite").saveAsTable("source_table")


###VIEW TABLE
- we can see all attributes from table here

In [0]:
%sql
select * from source_table;


member_id,first_name,middle_name,last_name,address,City,State,zip,phones,email,gender,date_of_birth,year_of_birth
1001,Rahul,Murad,Tyagi,"8780 A, Street",Silvassa,Madhya Pradesh,36535,8376757228,velit.eu@google.co,Male,8/7/2022,2022.0
1001,Rahul,Murad,Tyagi,"8780 A, Street",Silvassa,Madhya Pradesh,36535,8376757228,velit.eu@google.co,Male,8/7/2022,2022.0
1001,Rahul,Murad,Tyagi,"8780 A, Street",Silvassa,Madhya Pradesh,36535,8376757228,velit.eu@google.co,Male,8/7/2022,2022.0
1002,Ninad,Pravin,Joshi,"P.O. Box 951, 7570 Ipsum. Ave",Nagaon,Karnataka,9166,9367167681,sodales.purusyahoo.co,Male,4/22/2010,2010.0
1003,Kanchana,Lochan,Sen,"247-1088 Ornare, Rd.",Kohima,Karnataka,76525,9834595037,nunc.pulvinar.arcu@google.com,Female,7/24/2024,2024.0
1004,Ila,Nikhil,Jai,7789 Imperdiet Street,Itanagar,Uttarakhand,96048,9794127424,aliquam@outlook.co,Female,4/25/2023,2023.0
1005,Narayana,Rama,Vish,"P.O. Box 978, 1675 Nulla Road",Solapur,Daman and Diu,81542,7576378768,urna.nullam@hotmail.org,Male,1/29/2017,2017.0
1006,Drishti,Murali,Nara,479-3442 Lorem Ave,Bilaspur,Goa,96955,8061857731,tellus.justo@icloud.co,Female,7/4/2024,2024.0
1007,Vasudha,Swarna,Subramani,2626 Magna Rd.,Imphal,Uttarakhand,18912,9635113903,amet.ultricies@google.co,,6/17/2024,2024.0
1008,Mira,Dev,Sudha,130-112 Risus. Av.,Shimla,Tamil Nadu,13037,8080442784,consectetuer.mauris@outlook.co,Female,4/9/2023,2023.0


### TOTAL DUPLICATE ROWS
- we have to remove this duplicate rows and performe some transformations and then create 'target' table for further process
- we have 6 duplicate member id's (1001,1183,1262,1453,1748,1997)

In [0]:
%sql
WITH cte AS (
    SELECT
        ROW_NUMBER() OVER(PARTITION BY member_id ORDER BY member_id) AS rnk,
        *
    FROM
        source_table
)
SELECT
    *
FROM
    cte
WHERE
    rnk > 1;


rnk,member_id,first_name,middle_name,last_name,address,City,State,zip,phones,email,gender,date_of_birth,year_of_birth
2,1001,Rahul,Murad,Tyagi,"8780 A, Street",Silvassa,Madhya Pradesh,36535,8376757228,velit.eu@google.co,Male,8/7/2022,2022
3,1001,Rahul,Murad,Tyagi,"8780 A, Street",Silvassa,Madhya Pradesh,36535,8376757228,velit.eu@google.co,Male,8/7/2022,2022
2,1183,Tushar,Pravin,Raman,Ap #544-8185 Eget St.,Delhi,Uttar Pradesh,26800,7545221472,donec@outlook.net,Male,3/11/2015,2015
3,1183,Tushar,Pravin,Raman,Ap #544-8185 Eget St.,Delhi,Uttar Pradesh,26800,7545221472,donec@outlook.net,Male,3/11/2015,2015
2,1262,Khurshid,Narendra,Rana,292-3896 Feugiat. Av.,Panjim,Tripura,73084,9905858024,vel@hotmail.org,Female,12/13/2024,2024
2,1453,Pran,Amandeep,Narang,"P.O. Box 158, 4409 Aenean Rd.",Pondicherry,Sikkim,59799,9348248454,justo.nec.anteicloud.co,Male,8/16/2017,2017
2,1748,Dev,Vishal,Krishnamurthy,Ap #835-2829 Gravida Avenue,Silvassa,Karnataka,51219,9173612455,a.facilisis@hotmail.com,Male,9/4/2017,2017
3,1748,Dev,Vishal,Krishnamurthy,Ap #835-2829 Gravida Avenue,Silvassa,Karnataka,51219,9173612455,a.facilisis@hotmail.com,Male,9/4/2017,2017
2,1997,Radha,Narendra,Nagpal,Ap #567-310 Faucibus Ave,Kavaratti,Andhra Pradesh,94896,9853378512,quisque.imperdiet.erat@google.net,,12/24/2024,2024


#### SOME TRANSFORMATIONS
- Remove duplicate records and keep only valid records.
- Blank fields should have 'NULL' value.
- Zip should have length of 5 characters, if its less then 5 then put 'NULL' value and if its grater than 5 then accept only first 5.
- Phones should have length of 10 characters, if its less then 10 then put NULL value. and if its grater than 10 then accept only first 10.
- Invalid mail should be 'NULL' value.
- Invalid date_of_birth should be 'NULL' value
- Add new field (address_details) with : address, City, State, zip.
address_details nvarchar

In [0]:
%sql
CREATE TABLE target_table AS
(
with cte as (
    select 
        row_number() over(partition by member_id ORDER BY member_id) as rnk,
        *
    from source_table
) 
select 
    member_id,
    first_name,
    middle_name,
    last_name,
    address,
    City,
    State,
    case when length(zip) > 5 then left(zip,5) 
         when length(zip) = 5 then zip 
    end as zip,
    concat(
        address," ",
        City," ",
        State," ",
        (
            case when length(zip) > 5 then left(zip,5) 
                 when length(zip) = 5 then zip 
                 when length(zip) < 5 then " " 
            end
        )
    ) as address_details,
    case when length(phones) > 10 then left(phones,10) 
         when length(phones) = 10 then phones 
    end as phones,
    case when email like '%@%' AND email like '%.%' then email 
    end as email,
    gender,
    case when to_date(date_of_birth,"M/d/yyyy") < current_date() 
         then to_date(date_of_birth,"M/d/yyyy") 
    end as date_of_birth,
    year_of_birth 
from cte 
where rnk = 1
);


num_affected_rows,num_inserted_rows


####HERE ARE OUR TARGET TABLE AFTER PERFORMING TRANSFORMATIONS

In [0]:
%sql
SELECT * FROM target_table

member_id,first_name,middle_name,last_name,address,City,State,zip,address_details,phones,email,gender,date_of_birth,year_of_birth
1001,Rahul,Murad,Tyagi,"8780 A, Street",Silvassa,Madhya Pradesh,36535.0,"8780 A, Street Silvassa Madhya Pradesh 36535",8376757228.0,velit.eu@google.co,Male,2022-08-07,2022.0
1002,Ninad,Pravin,Joshi,"P.O. Box 951, 7570 Ipsum. Ave",Nagaon,Karnataka,,"P.O. Box 951, 7570 Ipsum. Ave Nagaon Karnataka",9367167681.0,,Male,2010-04-22,2010.0
1003,Kanchana,Lochan,Sen,"247-1088 Ornare, Rd.",Kohima,Karnataka,76525.0,"247-1088 Ornare, Rd. Kohima Karnataka 76525",9834595037.0,nunc.pulvinar.arcu@google.com,Female,,2024.0
1004,Ila,Nikhil,Jai,7789 Imperdiet Street,Itanagar,Uttarakhand,96048.0,7789 Imperdiet Street Itanagar Uttarakhand 96048,9794127424.0,aliquam@outlook.co,Female,2023-04-25,2023.0
1005,Narayana,Rama,Vish,"P.O. Box 978, 1675 Nulla Road",Solapur,Daman and Diu,81542.0,"P.O. Box 978, 1675 Nulla Road Solapur Daman and Diu 81542",7576378768.0,urna.nullam@hotmail.org,Male,2017-01-29,2017.0
1006,Drishti,Murali,Nara,479-3442 Lorem Ave,Bilaspur,Goa,96955.0,479-3442 Lorem Ave Bilaspur Goa 96955,8061857731.0,tellus.justo@icloud.co,Female,,2024.0
1007,Vasudha,Swarna,Subramani,2626 Magna Rd.,Imphal,Uttarakhand,18912.0,2626 Magna Rd. Imphal Uttarakhand 18912,9635113903.0,amet.ultricies@google.co,,,2024.0
1008,Mira,Dev,Sudha,130-112 Risus. Av.,Shimla,Tamil Nadu,13037.0,130-112 Risus. Av. Shimla Tamil Nadu 13037,8080442784.0,consectetuer.mauris@outlook.co,Female,2023-04-09,2023.0
1009,Krishna,Dayaram,Nara,Ap #357-275 Orci Rd.,Gwalior,Karnataka,98772.0,Ap #357-275 Orci Rd. Gwalior Karnataka 98772,9158148681.0,,Male,2011-01-29,2011.0
1010,Sona,Hardeep,Sai,"P.O. Box 562, 5835 Lacinia Rd.",Port Blair,Chandigarh,67852.0,"P.O. Box 562, 5835 Lacinia Rd. Port Blair Chandigarh 67852",9663164211.0,,Female,2023-11-07,2023.0


In [0]:
target_table = spark.read.option("header", True).option("inferSchmea",True).load("dbfs:/user/hive/warehouse/target_table")


In [0]:
source_table.display()

member_id,first_name,middle_name,last_name,address,City,State,zip,phones,email,gender,date_of_birth,year_of_birth
1001,Rahul,Murad,Tyagi,"8780 A, Street",Silvassa,Madhya Pradesh,36535,8376757228,velit.eu@google.co,Male,8/7/2022,2022.0
1001,Rahul,Murad,Tyagi,"8780 A, Street",Silvassa,Madhya Pradesh,36535,8376757228,velit.eu@google.co,Male,8/7/2022,2022.0
1001,Rahul,Murad,Tyagi,"8780 A, Street",Silvassa,Madhya Pradesh,36535,8376757228,velit.eu@google.co,Male,8/7/2022,2022.0
1002,Ninad,Pravin,Joshi,"P.O. Box 951, 7570 Ipsum. Ave",Nagaon,Karnataka,9166,9367167681,sodales.purusyahoo.co,Male,4/22/2010,2010.0
1003,Kanchana,Lochan,Sen,"247-1088 Ornare, Rd.",Kohima,Karnataka,76525,9834595037,nunc.pulvinar.arcu@google.com,Female,7/24/2024,2024.0
1004,Ila,Nikhil,Jai,7789 Imperdiet Street,Itanagar,Uttarakhand,96048,9794127424,aliquam@outlook.co,Female,4/25/2023,2023.0
1005,Narayana,Rama,Vish,"P.O. Box 978, 1675 Nulla Road",Solapur,Daman and Diu,81542,7576378768,urna.nullam@hotmail.org,Male,1/29/2017,2017.0
1006,Drishti,Murali,Nara,479-3442 Lorem Ave,Bilaspur,Goa,96955,8061857731,tellus.justo@icloud.co,Female,7/4/2024,2024.0
1007,Vasudha,Swarna,Subramani,2626 Magna Rd.,Imphal,Uttarakhand,18912,9635113903,amet.ultricies@google.co,,6/17/2024,2024.0
1008,Mira,Dev,Sudha,130-112 Risus. Av.,Shimla,Tamil Nadu,13037,8080442784,consectetuer.mauris@outlook.co,Female,4/9/2023,2023.0


In [0]:
target_table.display()

member_id,first_name,middle_name,last_name,address,City,State,zip,address_details,phones,email,gender,date_of_birth,year_of_birth
1001,Rahul,Murad,Tyagi,"8780 A, Street",Silvassa,Madhya Pradesh,36535.0,"8780 A, Street Silvassa Madhya Pradesh 36535",8376757228.0,velit.eu@google.co,Male,2022-08-07,2022.0
1002,Ninad,Pravin,Joshi,"P.O. Box 951, 7570 Ipsum. Ave",Nagaon,Karnataka,,"P.O. Box 951, 7570 Ipsum. Ave Nagaon Karnataka",9367167681.0,,Male,2010-04-22,2010.0
1003,Kanchana,Lochan,Sen,"247-1088 Ornare, Rd.",Kohima,Karnataka,76525.0,"247-1088 Ornare, Rd. Kohima Karnataka 76525",9834595037.0,nunc.pulvinar.arcu@google.com,Female,,2024.0
1004,Ila,Nikhil,Jai,7789 Imperdiet Street,Itanagar,Uttarakhand,96048.0,7789 Imperdiet Street Itanagar Uttarakhand 96048,9794127424.0,aliquam@outlook.co,Female,2023-04-25,2023.0
1005,Narayana,Rama,Vish,"P.O. Box 978, 1675 Nulla Road",Solapur,Daman and Diu,81542.0,"P.O. Box 978, 1675 Nulla Road Solapur Daman and Diu 81542",7576378768.0,urna.nullam@hotmail.org,Male,2017-01-29,2017.0
1006,Drishti,Murali,Nara,479-3442 Lorem Ave,Bilaspur,Goa,96955.0,479-3442 Lorem Ave Bilaspur Goa 96955,8061857731.0,tellus.justo@icloud.co,Female,,2024.0
1007,Vasudha,Swarna,Subramani,2626 Magna Rd.,Imphal,Uttarakhand,18912.0,2626 Magna Rd. Imphal Uttarakhand 18912,9635113903.0,amet.ultricies@google.co,,,2024.0
1008,Mira,Dev,Sudha,130-112 Risus. Av.,Shimla,Tamil Nadu,13037.0,130-112 Risus. Av. Shimla Tamil Nadu 13037,8080442784.0,consectetuer.mauris@outlook.co,Female,2023-04-09,2023.0
1009,Krishna,Dayaram,Nara,Ap #357-275 Orci Rd.,Gwalior,Karnataka,98772.0,Ap #357-275 Orci Rd. Gwalior Karnataka 98772,9158148681.0,,Male,2011-01-29,2011.0
1010,Sona,Hardeep,Sai,"P.O. Box 562, 5835 Lacinia Rd.",Port Blair,Chandigarh,67852.0,"P.O. Box 562, 5835 Lacinia Rd. Port Blair Chandigarh 67852",9663164211.0,,Female,2023-11-07,2023.0


###Validate below things on soruce and target tables(DDL validation):
- Validate column count
- Validate common columns
- Validate mistmatch columns between source and target
- Validate column data types(matched vs mistached)
- Validate row count

In [0]:
import pandas as pd
from pyspark.sql.functions import col


def column_count(source_table,target_table):
    # Assuming source_table and target_table are DataFrame objects
    source_column_count = len(source_table.columns)
    target_column_count = len(target_table.columns)

    # Creating a DataFrame with the counts
    df = pd.DataFrame({
        'source_column_count': [source_column_count],
        'target_column_count': [target_column_count]
    })

    # Displaying the DataFrame
    print("column count of tables")
    display(df)




def common_columns(source_table,target_table):
    # Assuming source_table and target_table are DataFrame objects
    source_columns = set(source_table.columns)
    target_columns = set(target_table.columns)

    # Find common columns
    common_columns = source_columns.intersection(target_columns)

    if len(common_columns) == 0:
        print("No common columns found")
    else:
        common_columns_df = pd.DataFrame(list(common_columns), columns=["Common_columns"])

    # Displaying the DataFrame
    print("Common columns between both tables")
    display(common_columns_df)




def unique_columns(source_table,target_table):
    # Assuming source_table and target_table are DataFrame objects
    source_columns = set(source_table.columns)
    target_columns = set(target_table.columns)

    # Find columns mismatched to each DataFrame
    source_unique_columns = source_columns - target_columns
    target_unique_columns = target_columns - source_columns

    # Creating DataFrames for mismatched columns
    source_unique_columns_df = pd.DataFrame(list(source_unique_columns), columns=["Mismatched_columns_in_source_table"])
    target_unique_columns_df = pd.DataFrame(list(target_unique_columns), columns=["Mismatched_columns_in_target_table"])

    # Displaying the DataFrames
    print("Mismatched Columns in Source table:")
    if len(source_unique_columns) == 0:
        print("No unique columns found")
    else:
        display(source_unique_columns_df)

    print("\nMismatched Columns in Target DataFrame:")
    if len(target_unique_columns) == 0:
        print("No unique columns found")
    else:
        display(target_unique_columns_df)




def data_types(source_table,target_table):
    # Assuming source_table and target_table are DataFrame objects
    source_data_types = source_table.dtypes
    target_data_types = target_table.dtypes

    # Checking column data types
    mismatched_columns = []
    matched_columns = []

    for source_col, target_col in zip(source_data_types, target_data_types):
        if source_col[1] != target_col[1]:
            mismatched_columns.append((source_col[0], source_col[1], target_col[1]))
        else:
            matched_columns.append((source_col[0], source_col[1]))

    # Creating DataFrames for matched and mismatched columns
    mismatched_columns_df = pd.DataFrame(mismatched_columns, columns=["Column_name", "Source_table_data_type","Target_table_data_type"])
    matched_columns_df = pd.DataFrame(matched_columns, columns=["Column_name", "Data_type"])

    # Displaying the DataFrames
    print("Mismatched column data types between source and target:")
    if len(mismatched_columns) == 0:
        print("No Mismatched columns found")
    else:
        display(mismatched_columns_df)

    print("\nMatched column data types between source and target:")
    if len(matched_columns) == 0:
        print("No Matched columns found")
    else:
        display(matched_columns_df)




def row_count(source_table,target_table):
    # Assuming source_table and target_table are DataFrame objects
    source_row_count = source_table.count()
    target_row_count = target_table.count()

    # Creating a DataFrame with row counts
    row_count_df = pd.DataFrame({
        'Table': ['Source', 'Target'],
        'Row_count': [source_row_count, target_row_count]
    })

    # Displaying the DataFrame
    print("Row count of both tables")
    display(row_count_df)




def data_validation():
    source_table = input("Enter table1: ")
    target_table = input("Enter table2: ")
    source_table = spark.read.option("header", True).option("inferSchema", True).load("dbfs:/user/hive/warehouse/" + source_table)
    target_table = spark.read.option("header", True).option("inferSchema", True).load("dbfs:/user/hive/warehouse/" + target_table)

    while True:
        print("\n1. Get column count\n2. Get columns\n3. Get unique columns\n4. Get Data Types\n5. Row count\n6. Get All")
        choice = input("Enter your choice: ")

        if choice == "1":
            column_count(source_table,target_table)
            break
        elif choice == "2":
            common_columns(source_table,target_table)
            break
        elif choice == "3":
            unique_columns(source_table,target_table)
            break
        elif choice == "4":
            data_types(source_table,target_table)
            break
        elif choice == "5":
            row_count(source_table,target_table)
            break
        elif choice == "6":
            column_count(source_table,target_table),common_columns(source_table,target_table),unique_columns(source_table,target_table),data_types(source_table,target_table),row_count(source_table,target_table)
            break
        else:
            print("Invalid choice. Please try again.")
if __name__ == "__main__":
    data_validation()




Enter table1:  source_table

Enter table2:  target_table


1. Get column count
2. Get columns
3. Get unique columns
4. Get Data Types
5. Row count
6. Get All


Enter your choice:  6

column count of tables


source_column_count,target_column_count
13,14


Common columns between both tables


Common_columns
middle_name
date_of_birth
zip
State
gender
City
email
phones
address
last_name


Mismatched Columns in Source table:
No unique columns found

Mismatched Columns in Target DataFrame:


Mismatched_columns_in_target_table
address_details


Mismatched column data types between source and target:


Column_name,Source_table_data_type,Target_table_data_type
year_of_birth,string,date



Matched column data types between source and target:


Column_name,Data_type
member_id,string
first_name,string
middle_name,string
last_name,string
address,string
City,string
State,string
zip,string
phones,string
email,string


Row count of both tables


Table,Row_count
Source,1009
Target,1000


###Validate below things on each fields on source and target table(Data profiling)
- Validate record count
- validate distinct record counts for each field.
- Validate null count for each field.
- Validate distinct record percenatge for each field.
- Validate null percentage for each field.

In [0]:
import pandas as pd
from pyspark.sql.functions import col


def record_count(source_table,target_table):
    # Assuming source_table and target_table are DataFrame objects
    source_record_count = source_table.count()
    target_record_count = target_table.count()

    # Creating a DataFrame with row counts
    record_count_df = pd.DataFrame({
        'Table': ['Source', 'Target'],
        'Record Count': [source_record_count, target_record_count]
    })

    # Displaying the DataFrame
    print("Record count for both tables")
    display(record_count_df)




def distinct_count(source_table,target_table):
    # Assuming source_table and target_table are DataFrame objects
    source_distinct_counts = {}
    target_distinct_counts = {}
    source_total_rows = source_table.count()
    target_total_rows = target_table.count()

    for col in source_table.columns:
        source_distinct_count = source_table.select(col).distinct().count()
        source_distinct_counts[col] = source_distinct_count

    for col in target_table.columns:
        target_distinct_count = target_table.select(col).distinct().count()
        target_distinct_counts[col] = target_distinct_count

    # Creating a DataFrame for distinct record counts
    data = []
    for col in source_table.columns:
        source_count = source_distinct_counts.get(col, 0)
        target_count = target_distinct_counts.get(col, 0)
        source_percentage = (source_count / source_total_rows) * 100
        target_percentage = (target_count / target_total_rows) * 100
        if source_count != target_count:
            data.append({
                'Field': col,
                'Source Count': source_count,
                'Target Count': target_count,
                'Match': 'Mismatch',
                'Source Percentage': source_percentage,
                'Target Percentage': target_percentage
            })
        else:
            data.append({
                'Field': col,
                'Source Distinct Count': source_count,
                'Target Distinct Count': target_count,
                'Match': 'Match',
                'Source Distinct Percentage': source_percentage,
                'Target Distinct Percentage': target_percentage
            })

    distinct_record_counts_df = pd.DataFrame(data)

    # Displaying the DataFrame
    print("Distinct values and its percentage of each columns from both tables")
    display(distinct_record_counts_df)




def null_count(source_table,target_table):
    from pyspark.sql.functions import col

    # Assuming source_table and target_table are DataFrame objects
    source_null_counts = {}
    target_null_counts = {}
    source_total_rows = source_table.count()
    target_total_rows = target_table.count()

    for col_name in source_table.columns:
        source_null_count = source_table.filter(col(col_name).isNull()).count()
        source_null_counts[col_name] = source_null_count

    for col_name in target_table.columns:
        target_null_count = target_table.filter(col(col_name).isNull()).count()
        target_null_counts[col_name] = target_null_count

    # Creating a DataFrame for null record counts
    data = []
    for col_name in source_table.columns:
        source_count = source_null_counts.get(col_name, 0)
        target_count = target_null_counts.get(col_name, 0)
        source_percentage = (source_count / source_total_rows) * 100
        target_percentage = (target_count / target_total_rows) * 100
        if source_count != target_count:
            data.append({
                'Field': col_name,
                'Source null Count': source_count,
                'Target null Count': target_count,
                'Match': 'Mismatch',
                'Source null Percentage': source_percentage,
                'Target null Percentage': target_percentage
            })
        else:
            data.append({
                'Field': col_name,
                'Source null Count': source_count,
                'Target null Count': target_count,
                'Match': 'Match',
                'Source null Percentage': source_percentage,
                'Target null Percentage': target_percentage
            })

    null_record_counts_df = pd.DataFrame(data)

    # Displaying the DataFrame
    print("Null values and its percentage of each columns from both tables")
    display(null_record_counts_df)





def data_profiling():
    source_table = input("Enter table1: ")
    target_table = input("Enter table2: ")
    source_table = spark.read.option("header", True).option("inferSchema", True).load("dbfs:/user/hive/warehouse/" + source_table)
    target_table = spark.read.option("header", True).option("inferSchema", True).load("dbfs:/user/hive/warehouse/" + target_table)
    while True:
        print("\n1. Get Record count\n2. Get distinct count and its percentage\n3. Get null count and its percentage\n4. Get All")
        choice = input("Enter your choice: ")

        if choice == "1":
            record_count(source_table,target_table)
            break
        elif choice == "2":
            distinct_count(source_table,target_table)
            break
        elif choice == "3":
            null_count(source_table,target_table)
            break
        elif choice == "4":
            record_count(source_table,target_table),distinct_count(source_table,target_table),null_count(source_table,target_table)
            break
        else:
            print("Invalid choice. Please try again.")
if __name__ == "__main__":
    data_profiling()


Enter table1:  source_table

Enter table2:  target_table


1. Get Record count
2. Get distinct count and its percentage
3. Get null count and its percentage
4. Get All


Enter your choice:  4

Record count for both tables


Table,Record Count
Source,1009
Target,1000


Distinct values and its percentage of each columns from both tables


Field,Source Distinct Count,Target Distinct Count,Match,Source Distinct Percentage,Target Distinct Percentage,Source Count,Target Count,Source Percentage,Target Percentage
member_id,1000.0,1000.0,Match,99.10802775024776,100.0,,,,
first_name,391.0,391.0,Match,38.75123885034688,39.1,,,,
middle_name,274.0,274.0,Match,27.15559960356789,27.4,,,,
last_name,103.0,103.0,Match,10.20812685827552,10.3,,,,
address,1000.0,1000.0,Match,99.10802775024776,100.0,,,,
City,249.0,249.0,Match,24.67789890981169,24.9,,,,
State,35.0,35.0,Match,3.468780971258672,3.5000000000000004,,,,
zip,,,Mismatch,,,996.0,974.0,98.71159563924678,97.4
phones,,,Mismatch,,,1000.0,950.0,99.10802775024776,95.0
email,,,Mismatch,,,985.0,885.0,97.62140733399404,88.5


Null values and its percentage of each columns from both tables


Field,Source null Count,Target null Count,Match,Source null Percentage,Target null Percentage
member_id,0,0,Match,0.0,0.0
first_name,11,11,Match,1.0901883052527257,1.1
middle_name,14,14,Match,1.3875123885034688,1.4
last_name,14,14,Match,1.3875123885034688,1.4
address,0,0,Match,0.0,0.0
City,0,0,Match,0.0,0.0
State,0,0,Match,0.0,0.0
zip,0,23,Mismatch,0.0,2.3
phones,0,51,Mismatch,0.0,5.1
email,0,101,Mismatch,0.0,10.1
