
## Overview

Nova, a health insurance provider, has shared its active member details with you. You are required to ingest their data into your "member" domain model. They shared four source tables: Header, Detail, ContactInfo & Address, and their data dictionary with you. Your task is to use Databricks for data ingestion and validation tasks. Once the data is ingested, the next step is data validation. This involves performing checks and tests on the ingested data to ensure its integrity & consistency. There may be some anomaly in the data.


In [0]:

# Define a list of sheet names
sheet_names = ['Header', 'Detail', 'ContactInfo', 'Address']
file_location = "/FileStore/tables/Project_1.xlsx"
 
 
# Create an empty dictionary to store DataFrames
dfs = {}
 
# Loop through each sheet name and read data into DataFrame
for sheet_name in sheet_names:
    df = spark.read.format("com.crealytics.spark.excel") \
               .option("inferschema", True) \
               .option("header", True) \
               .option("dataAddress", f"{sheet_name}!") \
               .option("sheetName", sheet_name) \
               .load(file_location)
    dfs[sheet_name] = df
 
# Display or further process the data as needed
# for sheet_name, df in dfs.items():
#     print(f"Sheet Name: {sheet_name}")
#     display(df)

In [0]:
# TO VIEW ALL THE SHEETS

for sheet_name, df in dfs.items():
    # Replace special characters with underscores
    view_name = sheet_name.replace(" ", "_").replace("-", "_")
    df.createOrReplaceTempView(view_name)
 
# Display contents of temporary views
for sheet_name in sheet_names:
    # Replace special characters with underscores
    view_name = sheet_name.replace(" ", "_").replace("-", "_")
    # print(f"Viewing contents of temporary table: {view_name}")
    # spark.sql(f"SELECT * FROM {view_name}").show()

In [0]:
%sql

CREATE TABLE NovaTable(
    source_id STRING,
    subscriber_id STRING,
    first_name STRING,
    middle_name STRING,
    last_name STRING,
    prefix_name STRING,
    suffix_name STRING,
    name STRING,
    record_source STRING,
    record_created_ts TIMESTAMP,
    is_verified BOOLEAN,
    addresses ARRAY<STRUCT<
        address_type: STRING,
        address_line_1: STRING,
        address_line_2: STRING,
        city: STRING,
        state_province: STRING,
        postal_code: STRING,
        zip_code_extension: STRING,
        country: STRING
    >>,
    phones ARRAY<STRUCT<
        phone_type: STRING,
        number: STRING
    >>,
    email STRING,
    privacy_preference BOOLEAN,
    national_id STRING,
    gender STRING,
    maritial_status STRING,
    date_of_birth DATE,
    year_of_birth STRING,
    deceased_ind BOOLEAN,
    deceased_age STRING,
    deceased_date DATE,
    languages ARRAY<STRING>,
    employment STRUCT<
        employer_name: STRING,
        employee_role: STRING,
        employee_status: STRING,
        employee_hiredate: DATE
    >,
    additional_source_value MAP<STRING, STRING>
);

In [0]:
%sql
INSERT INTO NovaTable
SELECT
  source_id,
  subscriber_id,
  first_name,
  middle_name,
  last_name,
  prefix_name,
  suffix_name,
   CONCAT_WS(' ', first_name, middle_name, last_name) AS name,
  record_source,
  record_created_ts,
  is_verified,
  addresses,
  phones,
  email,
  privacy_preference,
  national_id,
  gender,
  marital_status,
  date_of_birth,
  year_of_birth,
  deceased_ind,
  deceased_age,
  deceased_date,
  languages,
  employment,
  additional_source_value
FROM (
  SELECT
    header.id AS source_id,
    header.insurer_id AS subscriber_id,
    detail.first_name AS first_name,
    detail.middle_name AS middle_name,
    detail.last_name AS last_name,
    CASE
      WHEN detail.gender = 'F' AND detail.marital_status IN ('Widowed', 'Divorced', 'Single') THEN 'Ms.'
      WHEN detail.gender = 'F' AND detail.marital_status = 'Married' THEN 'Mrs.'
      WHEN detail.gender = 'M' THEN 'Mr.'
      ELSE NULL
    END AS prefix_name,
  CASE
  WHEN detail.job_role IS NOT NULL AND POSITION(' ' IN detail.job_role) > 0 THEN
    CONCAT(
      SUBSTRING(detail.job_role, 1, 1),
      SUBSTRING(detail.job_role, POSITION(' ' IN detail.job_role) + 1, 1)
    )
 
  WHEN job_role LIKE '%Engineer%' THEN 'Er'
  WHEN job_role LIKE '%Nurse%' THEN 'RN'
  WHEN job_role LIKE '%Analyst%' THEN 'Analyst'
  WHEN job_role LIKE '%Manager%' THEN 'Mgr'
  WHEN job_role LIKE '%Administrator%' THEN 'Admin'
  WHEN job_role LIKE '%Developer%' THEN 'Dev'
  WHEN job_role LIKE '%Assistant%' THEN 'Asst'
  WHEN job_role LIKE '%Technician%' THEN 'Tech'
  WHEN job_role LIKE '%Account%' THEN 'Acct'
  WHEN job_role LIKE '%Biostatistician%' THEN 'BioStat'
  WHEN job_role LIKE '%Health Coach%' THEN 'HlthCoach'
  WHEN job_role LIKE '%Designer%' THEN 'Designer'
  WHEN job_role LIKE '%Statistician%' THEN 'Stat'
  WHEN job_role LIKE '%Programmer%' THEN 'Prog'
  WHEN job_role LIKE '%Professor%' THEN 'Prof'
  WHEN job_role LIKE '%Recruiter%' THEN 'RC'
  WHEN job_role LIKE '%Operator%' THEN 'OP'
  WHEN job_role LIKE '%Librarian%' THEN 'Lib'
  WHEN job_role LIKE '%Coordinator%' THEN 'Coord'
  WHEN job_role LIKE '%Automation Specialist%' THEN 'AutoSpec'
  WHEN job_role LIKE '%VP%' THEN 'VP'
  WHEN job_role LIKE '%Geologist%' THEN 'Geol'
  ELSE NULL
END AS suffix_name,

    CONCAT(detail.first_name, ' ', detail.middle_name, ' ', detail.last_name) AS name,
    'NOVA' AS record_source,
    CURRENT_TIMESTAMP() AS record_created_ts,
    true AS is_verified,
    ARRAY_AGG(STRUCT(
      address.address_type AS address_type,
      address.address_line_1 AS address_line_1,
      address.address_line_2 AS address_line_2,
      address.city AS city,
      address.state AS state_province,
      CASE
        WHEN LOCATE('-', address.zipcode) > 0 THEN SUBSTRING_INDEX(address.zipcode, '-', -1)
        WHEN LENGTH(address.zipcode) = 4 THEN address.zipcode
        ELSE NULL
      END AS postal_code,
      CASE
        WHEN LOCATE('-', address.zipcode) > 0 THEN SUBSTRING_INDEX(address.zipcode, '-', 1)
        WHEN LENGTH(address.zipcode) = 5 THEN address.zipcode
        ELSE NULL
      END AS zip_code_extension,
      'US' AS country
    )) OVER (PARTITION BY address.id) AS addresses,
    ARRAY_AGG(STRUCT(
      contactinfo.usage_type AS phone_type,
      CASE
        WHEN LENGTH(REGEXP_REPLACE(contactinfo.phone, '[^0-9]', '')) = 10 THEN contactinfo.phone
        ELSE NULL
      END AS number
    )) OVER (PARTITION BY contactinfo.id) AS phones,
    CASE
      WHEN LOCATE('@', detail.email) > 0 AND LOCATE('.', detail.email) > 0 THEN detail.email
      ELSE NULL
    END AS email,
    true AS privacy_preference,
    CASE
      WHEN LENGTH(REGEXP_REPLACE(detail.ssn, '[^0-9]', '')) = 9 THEN REGEXP_REPLACE(detail.ssn, '[^0-9]', '')
      ELSE NULL
    END AS national_id,
    detail.gender AS gender,
    detail.marital_status AS marital_status,
    DATE(detail.date_of_birth) AS date_of_birth,
    YEAR(detail.date_of_birth) AS year_of_birth,
    CASE
      WHEN detail.deceased_date IS NULL THEN false
      ELSE true
    END AS deceased_ind,
    CASE
      WHEN detail.deceased_date IS NOT NULL THEN
        CASE
          WHEN YEAR(to_date(detail.deceased_date, 'M/d/yy')) - YEAR(detail.date_of_birth) > 122 THEN
            YEAR(to_date(detail.deceased_date, 'M/d/yy')) - YEAR(detail.date_of_birth) - 100
          ELSE
            YEAR(to_date(detail.deceased_date, 'M/d/yy')) - YEAR(detail.date_of_birth)
        END
      ELSE NULL
    END AS deceased_age,
    to_date(detail.deceased_date, 'M/d/yy') AS deceased_date,
    ARRAY(detail.spoken_language_1, detail.spoken_language_2) AS languages,
  STRUCT(
  detail.company AS employer_name,
  detail.job_role AS employee_role,
  NULL AS employee_status,
  detail.job_hiredate AS employee_hiredate
) AS employment,

    map('relationship', header.relationship, 'religion', detail.religion) AS additional_source_value,
    ROW_NUMBER() OVER (PARTITION BY header.id ORDER BY detail.id) AS row_num
  FROM header
  LEFT JOIN detail ON header.id = detail.id
  LEFT JOIN contactinfo ON header.id = contactinfo.id
  LEFT JOIN address ON header.id = address.id
) temp
WHERE row_num = 1
ORDER BY first_name ASC;

num_affected_rows,num_inserted_rows
1500,1500


In [0]:
%sql
select * from NovaTable

source_id,subscriber_id,first_name,middle_name,last_name,prefix_name,suffix_name,name,record_source,record_created_ts,is_verified,addresses,phones,email,privacy_preference,national_id,gender,maritial_status,date_of_birth,year_of_birth,deceased_ind,deceased_age,deceased_date,languages,employment,additional_source_value
71233.0,41314.0,Aaren,Kristina,Paslow,Ms.,SA,Aaren Kristina Paslow,NOVA,2024-02-03T20:41:56.747+0000,True,"List(List(Residental, 92614 Robert Village, Suite 502, West Amystad, Arizona, null, 20690, US), List(Residental, 92614 Robert Village, Suite 502, West Amystad, Arizona, null, 20690, US))","List(List(Home, (990) 7094174), List(Work, (737) 5432817))",kpaslow6g@skype.com,True,723280641.0,F,Divorced,1990-05-24,1990,False,,,"List(GuaranÃ­, null)","List(Centizu, Structural Analysis Engineer, null, 1942-02-17)","Map(relationship -> spouse, religion -> Islam)"
70270.0,40425.0,Abbey,Leona,Reddick,Ms.,IS,Abbey Leona Reddick,NOVA,2024-02-03T20:41:56.747+0000,True,"List(List(Mail, 650 Brandon Alley, Suite 442, Port Bonnie, Rhode Island, null, 80518, US), List(Residental, 2836 John Harbors Apt. 809, Suite 166, Parkerton, Ohio, null, 75900, US))","List(List(Work, (764) 8054265), List(Work, (764) 8054265))",lreddick7h@jugem.jp,True,891323912.0,F,Divorced,1984-07-24,1984,False,,,"List(Georgian, Hiri Motu)","List(Zooxo, Information Systems Manager, null, 1907-04-28)","Map(relationship -> parent, religion -> Islam)"
70759.0,41156.0,Abbie,Dalila,Coppeard,Ms.,CA,Abbie Dalila Coppeard,NOVA,2024-02-03T20:41:56.747+0000,True,"List(List(Residental, 21177 Thompson Forges Apt. 335, null, East Brenda, Delaware, null, 98918, US), List(Residental, 21177 Thompson Forges Apt. 335, null, East Brenda, Delaware, null, 98918, US))","List(List(Home, (140) 8611723), List(Work, (773) 4506117))",dcoppeardl2@youtu.be,True,329112834.0,F,Divorced,1938-07-27,1938,False,,,"List(Zulu, Tetum)","List(Photospace, Compensation Analyst, null, 1918-11-19)","Map(relationship -> friend, religion -> Christianity)"
70140.0,40525.0,Abel,Hunter,Wretham,Mr.,ST,Abel Hunter Wretham,NOVA,2024-02-03T20:41:56.747+0000,True,"List(List(Residental, 71370 Hill Tunnel Apt. 762, null, Port Kellyland, Oregon, null, 64220, US), List(Mail, 17267 Butler Port Suite 112, Suite 301, Watsonbury, Alabama, 8961, null, US))","List(List(Work, (802) 5654247), List(Work, (802) 5654247))",hwretham3v@webnode.com,True,568630399.0,M,,1990-11-24,1990,False,,,"List(Marathi, French)","List(Omba, Software Test Engineer III, null, 1983-06-22)","Map(relationship -> child, religion -> null)"
70216.0,40466.0,Adair,Ingemar,Garrie,Mr.,CA,Adair Ingemar Garrie,NOVA,2024-02-03T20:41:56.747+0000,True,"List(List(Mail, 6939 Kevin Mews, Suite 594, Fieldsstad, Washington, null, 83694, US), List(Residental, 0492 Schmidt Alley Apt. 453, null, Port Donald, Delaware, null, 77631, US))","List(List(Work, (813) 4672327), List(Work, (813) 4672327))",igarrie5z@cocolog-nifty.com,True,743866551.0,M,Married,1900-10-01,1900,False,,,"List(Belarusian, null)","List(Browsetype, Compensation Analyst, null, 1951-12-14)","Map(relationship -> sibling, religion -> Christianity)"
70674.0,40771.0,Adaline,,Rockcliff,Ms.,,Adaline Rockcliff,NOVA,2024-02-03T20:41:56.747+0000,True,"List(List(Residental, 028 Gates Corners Apt. 667, Suite 367, Brendafurt, New Mexico, null, 32095, US))","List(List(Work, (374) 5615730))",wrockcliffip@istockphoto.com,True,446525807.0,F,Divorced,1955-05-15,1955,False,,,"List(Danish, Oriya)","List(Youspan, Teacher, null, 1929-03-04)","Map(relationship -> parent, religion -> Other)"
71348.0,41628.0,Adda,Alyson,Upex,Ms.,TW,Adda Alyson Upex,NOVA,2024-02-03T20:41:56.747+0000,True,"List(List(Residental, 9406 Carlson Parks Suite 077, Apt. 951, West Michaelchester, Montana, null, 15803, US), List(Residental, 9406 Carlson Parks Suite 077, Apt. 951, West Michaelchester, Montana, null, 15803, US))","List(List(Home, (248) 5348653), List(Work, (660) 7921874))",aupex9n@yahoo.co.jp,True,390852297.0,F,Single,1946-01-24,1946,False,,,"List(Assamese, Czech)","List(Babbleblab, Technical Writer, null, 1989-02-25)","Map(relationship -> spouse, religion -> Buddhism)"
70591.0,40777.0,Addie,Malvina,Dyshart,,RA,Addie Malvina Dyshart,NOVA,2024-02-03T20:41:56.747+0000,True,"List(List(Residental, 315 Vincent Spur, Apt. 741, South Larry, Tennessee, null, 93964, US))","List(List(Work, (704) 4997831))",mdyshartge@opera.com,True,703150548.0,,Married,1959-12-21,1959,False,,,"List(Kyrgyz, Papiamento)","List(Photobean, Research Assistant I, null, 1909-10-26)","Map(relationship -> spouse, religion -> Christianity)"
71083.0,41146.0,Addie,,Heffy,Mr.,SF,Addie Heffy,NOVA,2024-02-03T20:41:56.747+0000,True,"List(List(Residental, 587 Glover Manor Apt. 989, null, South Alexandra, Virginia, null, 59141, US), List(Residental, 587 Glover Manor Apt. 989, null, South Alexandra, Virginia, null, 59141, US))","List(List(Home, (451) 1058735), List(Work, (345) 8525187))",jheffy2a@howstuffworks.com,True,893579382.0,M,Divorced,1978-06-10,1978,False,,,"List(Georgian, Northern Sotho)","List(Topdrive, Senior Financial Analyst, null, 1986-06-02)","Map(relationship -> parent, religion -> Buddhism)"
71099.0,41389.0,Adela,,Eat,Mrs.,PE,Adela Eat,NOVA,2024-02-03T20:41:56.747+0000,True,"List(List(Residental, 0711 Adam Viaduct Suite 071, Apt. 434, Wileyside, Massachusetts, null, 78462, US), List(Residental, 0711 Adam Viaduct Suite 071, Apt. 434, Wileyside, Massachusetts, null, 78462, US))","List(List(Home, (506) 1816139), List(Work, (621) 7538600))",keat2q@biblegateway.com,True,787310870.0,F,Married,1928-05-15,1928,True,88.0,2016-11-01,"List(Filipino, Hiri Motu)","List(Jaxspan, Product Engineer, null, 1984-09-12)","Map(relationship -> sibling, religion -> Hinduism)"
