In [0]:
# from pyspark.sql import SparkSession

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, concat_ws, year, array, struct, current_date, to_date, when, expr,create_map,date_format,concat_ws,length,substring,coalesce
from pyspark.sql.types import StringType, TimestampType
 
sheets = ['Overview', 'Target Domain Model', 'Source Data Dictionary', 'Header', 'Detail', 'ContactInfo', 'Address']
path = '/FileStore/tables/Project_1.xlsx'
dataFormat = "com.crealytics.spark.excel"
 
def createExcelDataFrame(sheet_index, table_name):
    sheet_name = sheets[sheet_index]
    df = spark.read.format(dataFormat).option("inferschema", True).option("header", True).option("dataAddress", f"'{sheet_name}'!").load(path)
    df.createOrReplaceTempView(table_name)
    return df
 
def createExcelDataFrameForHeader():
    return createExcelDataFrame(3, "header_table")
 
def createExcelDataFrameForDetail():
    return createExcelDataFrame(4, "detail_table")
 
def createExcelDataFrameForContactInfo():
    return createExcelDataFrame(5, "contact_info_table")
 
def createExcelDataFrameForAddress():
    return createExcelDataFrame(6, "address_table")
 
# Create SQL tables
createExcelDataFrameForHeader()
createExcelDataFrameForDetail()
createExcelDataFrameForContactInfo()
createExcelDataFrameForAddress()

DataFrame[id: double, city: string, issued_date: timestamp, address_line_1: string, address_line_2: string, state: string, address_type: string, zipcode: string]

In [0]:
header_df = spark.table("header_table")
detail_df = spark.table("detail_table")
contact_info_df = spark.table("contact_info_table")
address_df = spark.table("address_table")

# Join tables using the common ID column
joined_df = (
    header_df
    .join(detail_df, on="id", how="left")
    .join(contact_info_df, on="id", how="left")
    .join(address_df, on="id", how="left")
)

# Remove duplicate rows
combined_table = joined_df.dropDuplicates(["id"])


In [0]:
# Setting a pivot year for Spark date parsing
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
 
# date format to MM/dd/yy
combined_table = combined_table.withColumn(
    "deceased_date",
    when(
        col("deceased_date").isNotNull(),
        expr("to_date(deceased_date, 'MM/dd/yyyy')")
    )
)
 
# addresses array Transformation
addresses_array = array(
    struct(
        col("address_type").alias("address_type"),
        col("address_line_1").alias("address_line_1"),
        col("address_line_2").alias("address_line_2"),
        col("city").alias("city"),
        col("state").alias("state_province"),  
        col("zipcode").alias("postal_code"),
        col("zipcode").alias("zip_code_extension"),  
        col("state").alias("country")
    )
).alias("addresses")
 
# Transformation for phones array
phones_array = array(
    struct(
        col("phone").alias("number"),
        col("usage_type").alias("phone_type"),  
    )
).alias("phones")
 
# Transformation for employment object
employment_object = struct(
    col("company").alias("employer_name"),
    # col("job_role").alias("employee_role"),
    lit("Active").alias("employee_status"),  # Assuming a default value for 'employee_status'
    date_format(col("job_hiredate"), "MM/dd/yyyy").alias("employee_hiredate")
).alias("employment")
 
# Transformation for languages array
languages_array = array(
    col("spoken_language_1").alias("language"),
    col("spoken_language_2").alias("language")
).alias("languages")
 
additional_source_value_map = create_map(
    lit("relationship").alias("key"),  
    col("relationship").alias("value"),
    lit("religion").alias("key"),  # Use the actual column name from your Excel file
    col("religion").alias("value"),
 
).alias("additional_source_value")
 
# Calculate deceased_age based on deceased_date
transformed_df = combined_table.withColumn(
    "prefix_name",
    when(
        (col("gender") == "M"), lit("Mr")
    ).when(
        (col("gender") == "F") & (col("marital_status").isin("Single", "Divorced")), lit("Miss")
    ).when(
        (col("gender") == "F") & (col("marital_status").isin("Married", "Widowed")), lit("Mrs")
    ).when(
        (col("gender") == "F") & (col("marital_status").isNull()), lit("Miss")
    )
    .otherwise(lit(""))
).withColumn(
    "suffix_name",
    when(col("job_role").like("%Doctor%"), lit("Dr"))
    .when(col("job_role").like("%Engineer%"), lit("Er"))
    .when(col("job_role").like("%Nurse%"), lit("RN"))
    .otherwise(lit(""))
).withColumn(
    "record_source",
    lit("Nova").cast(StringType())
).withColumn(
    "record_created_ts",
    current_date().cast(TimestampType())
).withColumn(
    "deceased_age",
    when(
        col("deceased_date").isNotNull(),
        year(current_date()) - year(col("deceased_date"))
    ).otherwise(lit(0))
).withColumn(
    "deceased_ind",
    when(
        col("deceased_date").isNotNull(),
        lit(True)
    ).otherwise(lit(False))
    ).withColumn(
    "date_of_birth",
    date_format(col("date_of_birth"), "MM/dd/yyyy")
). withColumn(
    "ssn",
    when(
        length(col("ssn")) == 9,
        concat_ws("-", substring(col("ssn"), 1, 3), substring(col("ssn"), 4, 2), substring(col("ssn"), 6, 4))
    ).otherwise(col("ssn"))
).select(
    col("id").cast("integer").alias("source_id"),
    col("insurer_id").cast("integer").alias("subscriber_id"),
    col("first_name"),
    col("middle_name"),
    col("last_name"),
    col("prefix_name"),
    # lit("").alias("suffix_name"),
    col("suffix_name"),
    concat_ws(" ", col("first_name"), col("middle_name"), col("last_name")).alias("name"),
    col("record_source"),
    col("record_created_ts"),
    lit(False).alias("is_verified"),
    addresses_array,
    phones_array,
    col("email"),
    lit(False).alias("privacy_preference"),
    col("ssn").alias("national_id"),
    col("gender"),
    col("marital_status"),
    col("date_of_birth"),
    year(col("date_of_birth")).cast("string").alias("year_of_birth"),
    col("deceased_ind"),
    col("deceased_age"),
    col("deceased_date"),
    languages_array,
    employment_object,
    additional_source_value_map
)
 
# # Show the resulting DataFrame
# transformed_df.show(1, truncate=False)
transformed_df.display()
print(transformed_df.count())
print(transformed_df.columns)

source_id,subscriber_id,first_name,middle_name,last_name,prefix_name,suffix_name,name,record_source,record_created_ts,is_verified,addresses,phones,email,privacy_preference,national_id,gender,marital_status,date_of_birth,year_of_birth,deceased_ind,deceased_age,deceased_date,languages,employment,additional_source_value
70001,40184,Hettie,,Keenlayside,Mrs,,Hettie Keenlayside,Nova,2024-02-01T00:00:00Z,False,"List(List(Mail, 4307 Ashley Village Suite 758, null, New Kyle, North Dakota, 87337, 87337, North Dakota))","List(List((455) 3130004, Work))",jkeenlayside0@disqus.com,False,168-92-1075,F,Widowed,08/05/1939,,False,0,,"List(West Frisian, Swahili)","List(Gabcube, Active, 01/29/1964)","Map(relationship -> child, religion -> Buddhism)"
70002,40092,Reade,,Laverenz,Mr,,Reade Laverenz,Nova,2024-02-01T00:00:00Z,False,"List(List(Residental, 737 Banks Row, Apt. 505, North Heather, Arkansas, 2341, 2341, Arkansas))","List(List((994) 4561640, Work))",dlaverenz1@senate.gov,False,782-24-9907,M,Widowed,05/14/1941,,False,0,,"List(Swati, Danish)","List(Skibox, Active, 05/18/1958)","Map(relationship -> friend, religion -> Christianity)"
70003,40233,Minnnie,,Baack,Mrs,,Minnnie Baack,Nova,2024-02-01T00:00:00Z,False,"List(List(Mail, 807 Jesus Mills Suite 598, Suite 735, Churchbury, Texas, 97223, 97223, Texas))","List(List((771) 6498755, Work))",dbaack2@sina.com.cn,False,726-01-1271,F,Married,11/20/1982,,False,0,,"List(Swati, null)","List(Dabjam, Active, 06/10/2011)","Map(relationship -> spouse, religion -> Buddhism)"
70004,40058,Tana,Agata,Aiken,Miss,,Tana Agata Aiken,Nova,2024-02-01T00:00:00Z,False,"List(List(Residental, 9831 Robert Falls, Apt. 086, Michelleland, Oregon, 05921-5281, 05921-5281, Oregon))","List(List((450) 8886723, Work))",aaiken3@nydailynews.com,False,492-62-0968,F,,02/18/1929,,False,0,,"List(New Zealand Sign Language, Punjabi)","List(Aimbu, Active, 10/08/2014)","Map(relationship -> spouse, religion -> null)"
70005,40088,Cyndia,,Tolomelli,Miss,,Cyndia Tolomelli,Nova,2024-02-01T00:00:00Z,False,"List(List(Mail, 95855 Davis Lodge, Suite 059, Kimberlymouth, Louisiana, 33733, 33733, Louisiana))","List(List((423) 1700133, Work))",ltolomelli4@istockphoto.com,False,802-24-1062,F,,05/31/1920,,False,0,,"List(null, Albanian)","List(Edgepulse, Active, 01/16/1931)","Map(relationship -> friend, religion -> null)"
70006,40170,Johnny,Renaud,Gibben,Mr,,Johnny Renaud Gibben,Nova,2024-02-01T00:00:00Z,False,"List(List(Residental, 8737 Flores Extension Suite 549, null, Jasonbury, Idaho, 20277, 20277, Idaho))","List(List((334) 1254061, Work))",rgibben5@tumblr.com,False,563-98-1576,M,Single,07/01/1958,,False,0,,"List(Georgian, null)","List(Oodoo, Active, 12/26/2021)","Map(relationship -> child, religion -> Buddhism)"
70007,40194,Judas,,Mitford,Mr,,Judas Mitford,Nova,2024-02-01T00:00:00Z,False,"List(List(Residental, 7475 Michael Land, Suite 392, New Latoyamouth, Hawaii, 89157, 89157, Hawaii))","List(List((915) 7431041, Work))",bmitford6@github.io,False,626-84-9457,M,Divorced,07/30/1993,,False,0,,"List(New Zealand Sign Language, Nepali)","List(Bluejam, Active, 06/11/2018)","Map(relationship -> parent, religion -> Hinduism)"
70008,40079,Wilden,Tobin,Huertas,Mr,,Wilden Tobin Huertas,Nova,2024-02-01T00:00:00Z,False,"List(List(Residental, 891 Frank Squares Suite 096, Apt. 809, Whitefort, Virginia, 17694, 17694, Virginia))","List(List((816) 4980330, Work))",thuertas7@yahoo.co.jp,False,667-45-8806,M,Widowed,08/02/1906,,False,0,,"List(Norwegian, null)","List(Roomm, Active, 03/07/1923)","Map(relationship -> spouse, religion -> Other)"
70009,40466,Gaelan,,Smitheman,Mr,,Gaelan Smitheman,Nova,2024-02-01T00:00:00Z,False,"List(List(Mail, 839 Garcia Highway, Apt. 915, Jermaineborough, North Dakota, 25837, 25837, North Dakota))","List(List((460) 8203658, Work))",msmitheman8@ezinearticles.com,False,854-32-5148,M,Divorced,03/04/1926,,False,0,,"List(Japanese, Catalan)","List(Trupe, Active, 12/15/1998)","Map(relationship -> sibling, religion -> Christianity)"
70010,40061,Letti,,Folkard,Miss,,Letti Folkard,Nova,2024-02-01T00:00:00Z,False,"List(List(Residental, 7357 Beck Garden Apt. 240, null, Hunterfort, Wyoming, 69080, 69080, Wyoming))","List(List((577) 3110757, Work))",tfolkard9@biblegateway.com,False,867-58-4596,F,Divorced,03/07/1900,,False,0,,"List(Tajik, Tamil)","List(Yambee, Active, 03/11/2005)","Map(relationship -> child, religion -> Other)"


1500
['source_id', 'subscriber_id', 'first_name', 'middle_name', 'last_name', 'prefix_name', 'suffix_name', 'name', 'record_source', 'record_created_ts', 'is_verified', 'addresses', 'phones', 'email', 'privacy_preference', 'national_id', 'gender', 'marital_status', 'date_of_birth', 'year_of_birth', 'deceased_ind', 'deceased_age', 'deceased_date', 'languages', 'employment', 'additional_source_value']
