In [0]:
%sql
CREATE TABLE nova_healthcare(
    source_id VARCHAR(100),
    subscriber_id VARCHAR(100),
    first_name VARCHAR(100),
    middle_name VARCHAR(100),
    last_name VARCHAR(100),
    prefix_name VARCHAR(100),
    suffix_name VARCHAR(100),
    name VARCHAR(100),
    record_source VARCHAR(100),
    record_created_ts TIMESTAMP,
    is_verified BOOLEAN,
    addresses ARRAY<STRUCT<
        address_type STRING,
        address_line_1 STRING,
        address_line_2 STRING,
        city STRING,
        state_province STRING,
        postal_code STRING,
        zip_code_extension STRING,
        country STRING
    >>,
    phones ARRAY<STRUCT<
      phone_type STRING,
      number STRING
    >>,
    email VARCHAR(100),
    privacy_preference BOOLEAN,
    national_id VARCHAR(100),
    gender VARCHAR(100),
    marital_status VARCHAR(100),
    date_of_birth DATE,
    year_of_birth VARCHAR(4),
    deceased_ind BOOLEAN,
    deceased_age INTEGER,
    deceased_date DATE,
    
    languages ARRAY<STRUCT<
      spoken_language_1 STRING,
      spoken_language_2 STRING
    >>,

    employment STRUCT<
        employer_name VARCHAR(255),
        employee_role VARCHAR(255),
        employee_status VARCHAR(50),
        employee_hiredate DATE
    >,
    additional_source_value string
);



In [0]:
%sql
INSERT INTO nova_healthcare(
  source_id,
  subscriber_id,
  first_name,
  middle_name,
  last_name,
  prefix_name,
  suffix_name,
  name,
  record_source,
  record_created_ts,
  is_verified,
  addresses,
  phones,
  email,
  privacy_preference,
  national_id,
  gender,
  marital_status,
  date_of_birth,
  year_of_birth,
  deceased_ind,
  deceased_age,
  deceased_date,
  languages,
  employment,
  additional_source_value
)
SELECT 
  h.id AS source_id,
  h.insurer_id AS subscriber_id,
  d.first_name AS first_name,
  d.middle_name AS middle_name,
  d.last_name AS last_name,
  CASE
    WHEN d.gender = 'M' AND d.marital_status IN ('Married', 'Single', 'Widowed') THEN 'Mr.'
    WHEN d.gender = 'M' THEN 'Mr.'
    WHEN d.gender = 'F' AND d.marital_status = 'Single' THEN 'Miss.'
    WHEN d.gender = 'F' AND d.marital_status IN ('Married', 'Widowed', 'Divorced') THEN 'Mrs.'
    ELSE NULL
  END AS prefix_name,
  CASE
    WHEN d.job_role = 'Registered Nurse' THEN 'RN'
    WHEN d.job_role = 'Pharmacist' THEN 'PharmD'
    WHEN d.job_role = 'Assistant Professor' THEN 'PhD'
    WHEN d.job_role = 'Associate Professor' THEN 'PhD'
    ELSE NULL
  END AS suffix_name,
  COALESCE(
    CONCAT_WS(' ', d.first_name, NULLIF(d.middle_name, ''), d.last_name),
    d.first_name,
    d.last_name,
    d.middle_name
  ) AS name, 
  'Nova HealthCare' as record_source,
  CURRENT_TIMESTAMP() AS record_created_ts,
  CASE
        WHEN email LIKE '%@%.%'
        AND LENGTH(email) - LENGTH(REPLACE(email, '@', '')) = 1
        AND POSITION('@' IN email) < POSITION('.' IN email)
        AND SUBSTRING_INDEX(email, '@', 1) RLIKE '^[a-zA-Z0-9._]+$'
        AND SUBSTRING_INDEX(email, '@', -1) RLIKE '^[a-zA-Z0-9.-]+$'
        THEN 'true'
        ELSE 'false'
    END as is_verified,
  addresses_agg.addresses,
  phones_agg.phones,
  d.email,
  true as privacy_preference,
  d.ssn as national_id,
  d.gender,
  d.marital_status,
  TO_DATE(D.date_of_birth, 'MM/dd/yyyy') AS date_of_birth,
  year(TO_DATE(D.date_of_birth,'mm/dd/yyyy')) as year_of_birth,
  CASE
    WHEN deceased_date IS NOT NULL THEN true
    ELSE false
  END AS deceased_ind,
  TRY_CAST(
    date_diff(
      TO_DATE(deceased_date, 'MM/dd/yyyy'),
      TO_DATE(date_of_birth, 'MM/dd/yyyy')
    ) / 365
    AS INT
  ) as deceased_age,
  TO_DATE(d.deceased_date, 'MM/dd/yyyy') AS deceased_date,
  languages_agg.languages,
  STRUCT(
    d.company AS employer_name,
    d.job_role AS employee_role,
    CASE
    WHEN deceased_date IS NOT NULL THEN 'Deceased'
    WHEN TO_DATE(d.job_hiredate, 'MM/dd/yyyy') < TO_DATE('01/01/1964', 'MM/dd/yyyy') THEN 'Inactive'
    ELSE 'Active'
END AS employee_status,
    TO_DATE(d.job_hiredate, 'MM/dd/yyyy') AS employee_hiredate
  ) AS employment,
  h.relationship as additional_source_value
FROM header11 h
LEFT JOIN detail11 AS d ON h.id = d.id
LEFT JOIN (
  SELECT id, ARRAY_AGG(
    STRUCT(
      address_type,
      address_line_1,
      address_line_2,
      city,
      state,
      CASE WHEN LENGTH(zipcode) = 4 THEN zipcode END AS postal_code,
      CASE WHEN LENGTH(zipcode) = 5 THEN zipcode END AS zip_code_extension,
      'USA' as country
    )
  ) AS addresses
  FROM address11
  GROUP BY id
) AS addresses_agg ON h.id = addresses_agg.id
LEFT JOIN (
  SELECT id, ARRAY_AGG(
    STRUCT(
      usage_type,
      phone
    )
  ) AS phones
  FROM contact11
  GROUP BY id
) AS phones_agg ON h.id = phones_agg.id
LEFT JOIN (
  SELECT id, ARRAY_AGG(
    STRUCT(
      spoken_language_1,
      spoken_language_2 
    )
  ) AS languages
  FROM detail11
  GROUP BY id
) AS languages_agg ON h.id = languages_agg.id;


num_affected_rows,num_inserted_rows
1500,1500


In [0]:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [0]:
%sql 
select *
from nova_healthcare

source_id,subscriber_id,first_name,middle_name,last_name,prefix_name,suffix_name,name,record_source,record_created_ts,is_verified,addresses,phones,email,privacy_preference,national_id,gender,marital_status,date_of_birth,year_of_birth,deceased_ind,deceased_age,deceased_date,languages,employment,additional_source_value
70001,40184,Hettie,,Keenlayside,Mrs.,,Hettie Keenlayside,Nova HealthCare,2024-02-01T17:46:41.082+0000,True,"List(List(Residental, 046 Cox Lights, Suite 600, Griffinhaven, South Carolina, null, 71709, USA), List(Mail, 4307 Ashley Village Suite 758, null, New Kyle, North Dakota, null, 87337, USA))","List(List(Work, (455) 3130004))",jkeenlayside0@disqus.com,True,168-92-1075,F,Widowed,1939-08-05,1939,False,,,"List(List(West Frisian, Swahili))","List(Gabcube, Clinical Specialist, Active, 1964-01-29)",child
70002,40092,Reade,,Laverenz,Mr.,,Reade Laverenz,Nova HealthCare,2024-02-01T17:46:41.082+0000,True,"List(List(Mail, 183 Dalton Viaduct, Suite 844, South Natalie, Oregon, null, 77714, USA), List(Residental, 737 Banks Row, Apt. 505, North Heather, Arkansas, 2341, null, USA))","List(List(Work, (994) 4561640))",dlaverenz1@senate.gov,True,782-24-9907,M,Widowed,1941-05-14,1941,False,,,"List(List(Swati, Danish))","List(Skibox, Staff Scientist, Inactive, 1958-05-18)",friend
70003,40233,Minnnie,,Baack,Mrs.,,Minnnie Baack,Nova HealthCare,2024-02-01T17:46:41.082+0000,True,"List(List(Residental, 27634 Miller Prairie, null, West Ruth, North Dakota, null, 73151, USA), List(Mail, 807 Jesus Mills Suite 598, Suite 735, Churchbury, Texas, null, 97223, USA))","List(List(Work, (771) 6498755))",dbaack2@sina.com.cn,True,726-01-1271,F,Married,1982-11-20,1982,False,,,"List(List(Swati, null))","List(Dabjam, Paralegal, Active, 2011-06-10)",spouse
70004,40058,Tana,Agata,Aiken,,,Tana Agata Aiken,Nova HealthCare,2024-02-01T17:46:41.082+0000,True,"List(List(Mail, 36717 Philip Common, Suite 278, Thomasborough, Idaho, null, null, USA), List(Residental, 9831 Robert Falls, Apt. 086, Michelleland, Oregon, null, null, USA))","List(List(Work, (450) 8886723))",aaiken3@nydailynews.com,True,492-62-0968,F,,1929-02-18,1929,False,,,"List(List(New Zealand Sign Language, Punjabi))","List(Aimbu, VP Marketing, Active, 2014-10-08)",spouse
70005,40088,Cyndia,,Tolomelli,,,Cyndia Tolomelli,Nova HealthCare,2024-02-01T17:46:41.082+0000,True,"List(List(Residental, 0861 Caldwell Dam, Suite 783, North Robertborough, New Mexico, null, 31718, USA), List(Mail, 95855 Davis Lodge, Suite 059, Kimberlymouth, Louisiana, null, 33733, USA))","List(List(Work, (423) 1700133))",ltolomelli4@istockphoto.com,True,802-24-1062,F,,1920-05-31,1920,False,,,"List(List(null, Albanian))","List(Edgepulse, Senior Developer, Inactive, 1931-01-16)",friend
70006,40170,Johnny,Renaud,Gibben,Mr.,,Johnny Renaud Gibben,Nova HealthCare,2024-02-01T17:46:41.082+0000,True,"List(List(Mail, 1049 Riggs Stream Suite 632, Suite 465, New Christopher, Maine, null, 43342, USA), List(Residental, 8737 Flores Extension Suite 549, null, Jasonbury, Idaho, null, 20277, USA))","List(List(Work, (334) 1254061))",rgibben5@tumblr.com,True,563-98-1576,M,Single,1958-07-01,1958,False,,,"List(List(Georgian, null))","List(Oodoo, Human Resources Assistant I, Active, 2021-12-26)",child
70007,40194,Judas,,Mitford,Mr.,,Judas Mitford,Nova HealthCare,2024-02-01T17:46:41.082+0000,True,"List(List(Mail, 16915 Michelle Fields Apt. 930, Suite 488, South Pamela, New Mexico, null, 44394, USA), List(Residental, 7475 Michael Land, Suite 392, New Latoyamouth, Hawaii, null, 89157, USA))","List(List(Work, (915) 7431041))",bmitford6@github.io,True,626-84-9457,M,Divorced,1993-07-30,1993,False,,,"List(List(New Zealand Sign Language, Nepali))","List(Bluejam, Data Coordinator, Active, 2018-06-11)",parent
70008,40079,Wilden,Tobin,Huertas,Mr.,,Wilden Tobin Huertas,Nova HealthCare,2024-02-01T17:46:41.082+0000,True,"List(List(Mail, 64558 Alexis Club, Apt. 683, Lake Lindaside, New Hampshire, 7936, null, USA), List(Residental, 891 Frank Squares Suite 096, Apt. 809, Whitefort, Virginia, null, 17694, USA))","List(List(Work, (816) 4980330))",thuertas7@yahoo.co.jp,True,667-45-8806,M,Widowed,1906-08-02,1906,False,,,"List(List(Norwegian, null))","List(Roomm, Database Administrator I, Inactive, 1923-03-07)",spouse
70009,40466,Gaelan,,Smitheman,Mr.,,Gaelan Smitheman,Nova HealthCare,2024-02-01T17:46:41.082+0000,True,"List(List(Residental, 7163 Thompson Park Suite 842, null, Kevinside, Nebraska, null, 74553, USA), List(Mail, 839 Garcia Highway, Apt. 915, Jermaineborough, North Dakota, null, 25837, USA))","List(List(Work, (460) 8203658))",msmitheman8@ezinearticles.com,True,854-32-5148,M,Divorced,1926-03-04,1926,False,,,"List(List(Japanese, Catalan))","List(Trupe, Analyst Programmer, Active, 1998-12-15)",sibling
70010,40061,Letti,,Folkard,Mrs.,,Letti Folkard,Nova HealthCare,2024-02-01T17:46:41.082+0000,True,"List(List(Mail, 388 Perry Mills, Suite 521, Monroechester, Minnesota, null, 13191, USA), List(Residental, 7357 Beck Garden Apt. 240, null, Hunterfort, Wyoming, null, 69080, USA))","List(List(Work, (577) 3110757))",tfolkard9@biblegateway.com,True,867-58-4596,F,Divorced,1900-03-07,1900,False,,,"List(List(Tajik, Tamil))","List(Yambee, Staff Scientist, Active, 2005-03-11)",child
