In [0]:
df1 = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/thapaliyasuj@gmail.com/Detail.csv")
df1.write.mode("overwrite").saveAsTable("Details")


In [0]:
%sql
select * from details;

id,first_name,middle_name,last_name,email,ssn,gender,religion,marital_status,date_of_birth,deceased_date,spoken_language_1,spoken_language_2,company,job_role,job_hiredate
70001,Hettie,,Keenlayside,jkeenlayside0@disqus.com,168-92-1075,F,Buddhism,Widowed,8/5/1939,,West Frisian,Swahili,Gabcube,Clinical Specialist,1/29/1964
70002,Reade,,Laverenz,dlaverenz1@senate.gov,782-24-9907,M,Christianity,Widowed,5/14/1941,,Swati,Danish,Skibox,Staff Scientist,5/18/1958
70003,Minnnie,,Baack,dbaack2@sina.com.cn,726-01-1271,F,Buddhism,Married,11/20/1982,,Swati,,Dabjam,Paralegal,6/10/2011
70004,Tana,Agata,Aiken,aaiken3@nydailynews.com,492-62-0968,F,,,2/18/1929,,New Zealand Sign Language,Punjabi,Aimbu,VP Marketing,10/8/2014
70005,Cyndia,,Tolomelli,ltolomelli4@istockphoto.com,802-24-1062,F,,,5/31/1920,,,Albanian,Edgepulse,Senior Developer,1/16/1931
70006,Johnny,Renaud,Gibben,rgibben5@tumblr.com,563-98-1576,M,Buddhism,Single,7/1/1958,,Georgian,,Oodoo,Human Resources Assistant I,12/26/2021
70007,Judas,,Mitford,bmitford6@github.io,626-84-9457,M,Hinduism,Divorced,7/30/1993,,New Zealand Sign Language,Nepali,Bluejam,Data Coordinator,6/11/2018
70008,Wilden,Tobin,Huertas,thuertas7@yahoo.co.jp,667-45-8806,M,Other,Widowed,8/2/1906,,Norwegian,,Roomm,Database Administrator I,3/7/1923
70009,Gaelan,,Smitheman,msmitheman8@ezinearticles.com,854-32-5148,M,Christianity,Divorced,3/4/1926,,Japanese,Catalan,Trupe,Analyst Programmer,12/15/1998
70010,Letti,,Folkard,tfolkard9@biblegateway.com,867-58-4596,F,Other,Divorced,3/7/1900,,Tajik,Tamil,Yambee,Staff Scientist,3/11/2005


In [0]:
file_path = "dbfs:/FileStore/shared_uploads/thapaliyasuj@gmail.com/Address.xlsx"
df_xlsx = spark.read.format("com.crealytics.spark.excel") \
                    .option("header", "true") \
                    .option("inferSchema", "true") \
                    .load(file_path)

df_xlsx.write.mode("overwrite").saveAsTable("Address")

In [0]:
txt_path = "dbfs:/FileStore/shared_uploads/thapaliyasuj@gmail.com/contactinfo.txt"


df_txt = spark.read.format("csv") \
    .option("header", "true") \
    .option("sep", "\t") \
    .load(txt_path)

pandas_dftxt = df_txt.toPandas()
spark_dftxt = spark.createDataFrame(pandas_dftxt)
spark_dftxt.createOrReplaceTempView("temp")

# Create or replace a table using Spark SQL
spark.sql("CREATE OR REPLACE TABLE contactinfo AS SELECT * FROM temp")


Out[5]: DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
json_path = "dbfs:/FileStore/shared_uploads/thapaliyasuj@gmail.com/header.json"
df_json = spark.read.option("multiline", "true").json(json_path)
df_json.write.mode("overwrite").saveAsTable("Header")

In [0]:
%sql
CREATE TABLE IF NOT EXISTS targettable (
  source_id STRING,
  subscriber_id STRING,
  first_name STRING,
  middle_name STRING,
  last_name STRING,
  prefix_name STRING,
  suffix_name STRING,
  name STRING,
  record_source STRING,
  record_created_ts TIMESTAMP,
  is_verified BOOLEAN,
  addresses ARRAY<STRUCT<
                  address_type: STRING, 
                  address_line_1: STRING, 
                  address_line_2: STRING, 
                  city: STRING, 
                  state_province: STRING,
                  postal_code : STRING,
                  zip_code_extension : STRING,
                  country: STRING
              >>,
  phones STRUCT<
              number: STRING, 
              phone_type: STRING>,
  email STRING,
  privacy_preference BOOLEAN,
  national_id STRING,
  gender STRING,
  marital_status STRING,
  date_of_birth DATE,
  year_of_birth STRING,
  deceased_ind BOOLEAN,
  deceased_age INT,
  deceased_date DATE,
  languages ARRAY<STRING>,
  employment STRUCT<
              employee_name: STRING, 
              employee_role: STRING,
              employee_status: STRING, 
              employee_hiredate: DATE>
) USING DELTA
LOCATION '/dbfs:/user/hive/warehouse/targettable';


In [0]:
%sql
WITH prefixed_names AS (
  SELECT
    id,
    CASE 
      WHEN gender = 'M' THEN 'Mr.'
      WHEN gender = 'F' AND (marital_status = 'single' OR marital_status IS NULL) THEN 'Miss'
      WHEN gender = 'F' AND marital_status IN ('married', 'widowed', 'divorced') THEN 'Mrs.'
      WHEN gender IS NULL AND (marital_status IS NULL OR marital_status IN ('divorced', 'single', 'widowed', 'married')) THEN 'Mx.'
      ELSE 'not known'
    END AS prefix
  FROM details
),
verified_emails AS (
  SELECT
    id,
    email,
    CASE 
      WHEN CHARINDEX('@', email) > 0 
           AND CHARINDEX('.', email, CHARINDEX('@', email)) > CHARINDEX('@', email) THEN TRUE
      ELSE FALSE
    END AS is_verified
  FROM details
),


address_data AS (
  SELECT
    id,
    COLLECT_LIST(STRUCT(
      address_type,
      address_line_1,
      address_line_2,
      city,
      state AS state_province,
      LEFT(zipcode, 5) AS postal_code,
      SUBSTRING(zipcode FROM 6) AS zip_code_extension,
      'USA' AS country
    )) AS addresses
  FROM address
  GROUP BY id
),


phone_data AS (
  SELECT
    id,
    FIRST(STRUCT(phone, usage_type)) AS phones
  FROM contactinfo
  GROUP BY id
)

INSERT INTO targettable
SELECT 
  h.id AS source_id,
  h.insurer_id AS subscriber_id,
  d.first_name,
  d.middle_name,
  d.last_name,
  pn.prefix AS prefix_name,
  CASE 
    WHEN d.job_role = "Registered Nurse" THEN "RN"
    WHEN d.job_role = "Doctor" THEN "Dr."
    WHEN d.job_role = "Professor" THEN "Prof."
    WHEN d.job_role LIKE "%Engineer" THEN "Er."
    ELSE "-"
  END AS suffix_name,

  d.first_name || ' ' || d.last_name || COALESCE(' ' || d.middle_name, '') AS name,

  'Nova_Health' AS record_source,
  CURRENT_TIMESTAMP() AS record_created_ts,
  ve.is_verified,
  ad.addresses,
  pd.phones,
  d.email,
  FALSE AS privacy_preference,
  d.ssn AS national_id,
  d.gender AS gender,
  d.marital_status AS marital_status,
  TO_DATE(d.date_of_birth, 'M/d/yyyy') as date_of_birth, 
  YEAR(TO_DATE(D.date_of_birth, 'M/d/yyyy')) as year_of_birth,
  CASE 
        WHEN deceased_date IS NOT NULL THEN TRUE
        ELSE FALSE 
  END AS deceased_ind,
  YEAR(TO_DATE(d.deceased_date, 'M/d/yyyy')) - YEAR(TO_DATE(d.date_of_birth, 'M/d/yyyy')) AS deceased_age,
  TO_DATE(d.deceased_date, 'M/d/yyyy') AS deceased_date,

  ARRAY(d.spoken_language_1, d.spoken_language_2) AS languages,

  STRUCT(
    d.first_name || ' ' || d.last_name || ' ' || d.middle_name AS employee_name,
    d.job_role,
    CASE 
      WHEN d.job_hiredate IS NULL THEN 'inactive' 
      ELSE 'active' 
    END AS employee_status,
    TO_DATE(d.job_hiredate, 'M/d/yyyy') AS employee_hiredate
  ) AS employment
FROM header h
LEFT JOIN details d ON h.id = d.id
LEFT JOIN prefixed_names pn ON h.id = pn.id
LEFT JOIN verified_emails ve ON d.id = ve.id
LEFT JOIN address_data ad ON h.id = ad.id
LEFT JOIN phone_data pd ON h.id = pd.id
GROUP BY ALL

num_affected_rows,num_inserted_rows
1500,1500


In [0]:
%sql
select * from targettable;

source_id,subscriber_id,first_name,middle_name,last_name,prefix_name,suffix_name,name,record_source,record_created_ts,is_verified,addresses,phones,email,privacy_preference,national_id,gender,marital_status,date_of_birth,year_of_birth,deceased_ind,deceased_age,deceased_date,languages,employment
70002,40092,Reade,,Laverenz,Mr.,,Reade Laverenz,Nova Health,2024-02-02T15:50:01.334+0000,True,"List(List(Mail, 183 Dalton Viaduct, Suite 844, South Natalie, Oregon, 77714, , USA), List(Residental, 737 Banks Row, Apt. 505, North Heather, Arkansas, 2341, , USA))","List((994) 4561640, Work)",dlaverenz1@senate.gov,False,782-24-9907,M,Widowed,1941-05-14,1941,False,,,"List(Swati, Danish)","List(null, Staff Scientist, active, 1958-05-18)"
70194,40498,Dalston,,Bagby,Mr.,,Dalston Bagby,Nova Health,2024-02-02T15:50:01.334+0000,True,"List(List(Mail, 261 Joel Camp Apt. 145, Apt. 117, Latoyaport, New Mexico, 58491, , USA), List(Residental, 65940 Richard Plains, Apt. 120, Port Markmouth, Virginia, 15631, , USA))","List((147) 6180375, Work)",mbagby5d@newsvine.com,False,799-49-8877,M,Divorced,1998-02-26,1998,False,,,"List(Romanian, null)","List(null, Assistant Media Planner, active, 2008-06-27)"
70697,41194,Nathanial,Carter,Scryne,Mr.,,Nathanial Scryne Carter,Nova Health,2024-02-02T15:50:01.334+0000,True,"List(List(Residental, 1052 Williams Path Apt. 728, Suite 749, Kathleenside, Louisiana, 39995, , USA))","List((242) 8171222, Work)",cscrynejc@tuttocitta.it,False,199-05-0092,M,Widowed,1911-01-24,1911,False,,,"List(Spanish, null)","List(Nathanial Scryne Carter, Teacher, active, 1995-06-07)"
70878,41247,Marillin,Lorraine,Korda,not known,,Marillin Korda Lorraine,Nova Health,2024-02-02T15:50:01.334+0000,True,"List(List(Residental, 45669 Caleb Walk Suite 343, null, Michaelburgh, North Carolina, 50648, , USA))","List((283) 8578830, Work)",lkordaod@oracle.com,False,268-80-3994,F,Married,1959-01-15,1959,False,,,"List(French, null)","List(Marillin Korda Lorraine, Senior Editor, active, 1979-03-19)"
70925,41190,Wyatt,,Wye,Mx.,,Wyatt Wye,Nova Health,2024-02-02T15:50:01.334+0000,True,"List(List(Residental, 29782 Melissa Wells Suite 619, Apt. 066, Mayborough, Ohio, 19521, , USA))","List((119) 5609333, Work)",wwyepo@apache.org,False,500-21-9961,,,1987-09-15,1987,False,,,"List(Macedonian, MÄori)","List(null, Cost Accountant, active, 1948-09-18)"
70975,41420,Vivyan,Robinett,Dumphrey,not known,,Vivyan Dumphrey Robinett,Nova Health,2024-02-02T15:50:01.334+0000,True,"List(List(Residental, 049 Steven Branch Apt. 183, Apt. 259, South Brianmouth, Tennessee, 93743, , USA))","List((741) 6569990, Work)",rdumphreyr2@paginegialle.it,False,444-73-3694,F,Widowed,1992-02-03,1992,False,,,"List(Assamese, null)","List(Vivyan Dumphrey Robinett, Developer III, active, 2021-08-31)"
71100,41111,Bayard,Berkie,Martonfi,Mr.,,Bayard Martonfi Berkie,Nova Health,2024-02-02T15:50:01.334+0000,True,"List(List(Residental, 144 Brian Pass, Apt. 111, New Stephen, Virginia, 14317, , USA))","List((554) 6288822, Work)",bmartonfi2r@wisc.edu,False,770-30-7327,M,Single,1931-08-12,1931,False,,,"List(Polish, Icelandic)","List(Bayard Martonfi Berkie, Marketing Assistant, active, 2013-03-05)"
71233,41314,Aaren,Kristina,Paslow,not known,Er.,Aaren Paslow Kristina,Nova Health,2024-02-02T15:50:01.334+0000,True,"List(List(Residental, 92614 Robert Village, Suite 502, West Amystad, Arizona, 20690, , USA))","List((737) 5432817, Work)",kpaslow6g@skype.com,False,723-28-0641,F,Divorced,1990-05-24,1990,False,,,"List(GuaranÃ­, null)","List(Aaren Paslow Kristina, Structural Analysis Engineer, active, 1942-02-17)"
71385,41561,Carlee,,Currer,not known,,Carlee Currer,Nova Health,2024-02-02T15:50:01.334+0000,True,"List(List(Residental, 517 Ramos Ranch, Suite 482, Mooremouth, Hawaii, 88978, , USA))","List((230) 8283104, Work)",lcurrerao@examiner.com,False,650-68-2344,F,Married,1958-10-18,1958,False,,,"List(Hebrew, Belarusian)","List(null, Desktop Support Technician, active, 1961-04-04)"
71487,41672,Christophorus,Brennan,Dudin,Mr.,,Christophorus Dudin Brennan,Nova Health,2024-02-02T15:50:01.334+0000,True,"List(List(Residental, 20479 Johnston Highway, null, Hawkinsbury, Maryland, 35261, , USA))","List((761) 2742602, Work)",bdudindi@plala.or.jp,False,794-52-3683,M,Married,1909-04-09,1909,False,,,"List(Yiddish, Catalan)","List(Christophorus Dudin Brennan, Assistant Professor, inactive, null)"
