# Loading data from excel file

In [0]:
%python
xlsx_path = "dbfs:/FileStore/tables/Address.xlsx"
df_xlsx = spark.read.format("com.crealytics.spark.excel") \
                    .option("header", "true") \
                    .option("inferSchema", "true") \
                    .load(xlsx_path)

df_xlsx.write.mode("overwrite").saveAsTable("Address")

In [0]:
%sql
select * from Address Limit 10

id,city,issued_date,address_line_1,address_line_2,state,address_type,zipcode
70001.0,Griffinhaven,2007-06-10T00:00:00.000+0000,046 Cox Lights,Suite 600,South Carolina,Residental,71709
70001.0,New Kyle,2016-05-26T00:00:00.000+0000,4307 Ashley Village Suite 758,,North Dakota,Mail,87337
70002.0,South Natalie,2019-09-15T00:00:00.000+0000,183 Dalton Viaduct,Suite 844,Oregon,Mail,77714
70002.0,North Heather,2022-12-01T00:00:00.000+0000,737 Banks Row,Apt. 505,Arkansas,Residental,2341
70003.0,West Ruth,2003-11-25T00:00:00.000+0000,27634 Miller Prairie,,North Dakota,Residental,73151
70003.0,Churchbury,2007-03-09T00:00:00.000+0000,807 Jesus Mills Suite 598,Suite 735,Texas,Mail,97223
70004.0,Thomasborough,2010-07-05T00:00:00.000+0000,36717 Philip Common,Suite 278,Idaho,Mail,91582-4725
70004.0,Michelleland,2011-11-06T00:00:00.000+0000,9831 Robert Falls,Apt. 086,Oregon,Residental,05921-5281
70005.0,North Robertborough,2005-08-11T00:00:00.000+0000,0861 Caldwell Dam,Suite 783,New Mexico,Residental,31718
70005.0,Kimberlymouth,2019-12-29T00:00:00.000+0000,95855 Davis Lodge,Suite 059,Louisiana,Mail,33733


# Loading data from csv file

In [0]:
%python
# Assuming the CSV file is stored in DBFS
csv_path = "dbfs:/FileStore/tables/Detail.csv"

# Read CSV file into a DataFrame
df_csv = spark.read.csv(csv_path, header=True, inferSchema=True)
df_csv.write.mode("overwrite").saveAsTable("Details")


In [0]:
%sql
select * from details limit 5

id,first_name,middle_name,last_name,email,ssn,gender,religion,marital_status,date_of_birth,deceased_date,spoken_language_1,spoken_language_2,company,job_role,job_hiredate
70001,Hettie,,Keenlayside,jkeenlayside0@disqus.com,168-92-1075,F,Buddhism,Widowed,1939-08-05,,West Frisian,Swahili,Gabcube,Clinical Specialist,1964-01-29
70002,Reade,,Laverenz,dlaverenz1@senate.gov,782-24-9907,M,Christianity,Widowed,1941-05-14,,Swati,Danish,Skibox,Staff Scientist,1958-05-18
70003,Minnnie,,Baack,dbaack2@sina.com.cn,726-01-1271,F,Buddhism,Married,1982-11-20,,Swati,,Dabjam,Paralegal,2011-06-10
70004,Tana,Agata,Aiken,aaiken3@nydailynews.com,492-62-0968,F,,,1929-02-18,,New Zealand Sign Language,Punjabi,Aimbu,VP Marketing,2014-10-08
70005,Cyndia,,Tolomelli,ltolomelli4@istockphoto.com,802-24-1062,F,,,1920-05-31,,,Albanian,Edgepulse,Senior Developer,1931-01-16


# Loading data from json file

In [0]:
%python

json_path = "dbfs:/FileStore/tables/header.json"

# Read JSON file into a DataFrame
df_json = spark.read.option("multiline", "true").json(json_path)

# Perform any necessary transformations or cleaning operations on df_json

# Save DataFrame as a table in Databricks
df_json.write.mode("overwrite").saveAsTable("Header")



In [0]:
%sql
select * from header limit 10

id,insurer_id,relationship
70001,40184,child
70002,40092,friend
70003,40233,spouse
70004,40058,spouse
70005,40088,friend
70006,40170,child
70007,40194,parent
70008,40079,spouse
70009,40466,sibling
70010,40061,child


# Loading data from text file

In [0]:
%python


# Assuming the TXT file is stored in DBFS
txt_path = "dbfs:/FileStore/tables/contactinfo.txt"


df_txt = spark.read.format("csv") \
    .option("header", "true") \
    .option("sep", "\t") \
    .load(txt_path)

pandas_dftxt = df_txt.toPandas()
spark_dftxt = spark.createDataFrame(pandas_dftxt)
spark_dftxt.createOrReplaceTempView("tempo")

# Create or replace a table using Spark SQL
spark.sql("CREATE OR REPLACE TABLE contactinfo AS SELECT * FROM tempo")


Out[4]: DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
%sql
select * from contactinfo

id,phone,usage_type
70901,(297) 9784148,Home
70902,(821) 6472448,Work
70903,(691) 7308596,Work
70903,(391) 6538528,Home
70904,(576) 1030201,Work
70904,(907) 9520747,Home
70905,(893) 8095275,Work
70906,(541) 6944929,Work
70906,(162) 3551912,Home
70907,(402) 8369745,Work


In [0]:
%sql
CREATE TABLE member_table (
    source_id STRING,
    subscriber_id STRING,
    first_name STRING,
    middle_name STRING,
    last_name STRING,
    prefix_name STRING,
    suffix_name STRING,
    name STRING,
    record_source STRING,
    record_created_ts TIMESTAMP,
    is_verified BOOLEAN,
    addresses ARRAY<STRUCT<
        address_type: STRING,
        address_line_1: STRING,
        address_line_2: STRING,
        city: STRING,
        state_province: STRING,
        postal_code: STRING,
        zip_code_extension: STRING,
        country: STRING
    >>,
    phones ARRAY<STRUCT<
        phone_type: STRING,
        number: STRING
    >>,
    email STRING,
    privacy_preference BOOLEAN,
    national_id STRING,
    gender STRING,
    maritial_status STRING,
    date_of_birth DATE,
    year_of_birth INT,
    deceased_ind BOOLEAN,
    deceased_age INT,
    deceased_date DATE,
    languages ARRAY<STRING>,
    employment STRUCT<
        employer_name: STRING,
        employee_role: STRING,
        employee_status: STRING,
        employee_hiredate: DATE
    >,
    additional_source_value MAP<STRING,STRING>
);

In [0]:
%sql
Truncate table member_table

In [0]:
%sql
INSERT Into member_table
SELECT DISTINCT
    header.id AS source_id,
    header.insurer_id AS subscriber_id,
    details.first_name AS first_name,
    details.middle_name AS middle_name,
    details.last_name AS last_name,
    CASE
      WHEN details.gender = 'F' AND details.marital_status IN ('Widowed', 'Divorced', 'Single') THEN 'Ms.'
      WHEN details.gender = 'F' AND details.marital_status = 'Married' THEN 'Mrs.'
      WHEN details.gender = 'M' THEN 'Mr.'
      ELSE NULL
    END AS prefix_name,
    CASE 
      WHEN details.job_role = "Registered Nurse" THEN "RN"
      WHEN details.job_role = "Doctor" THEN "Dr."
      WHEN details.job_role = "Professor" THEN "Prof."
      WHEN details.job_role LIKE "%Engineer" THEN "Er."
      WHEN details.job_role LIKE "% I" THEN "Jr."
      WHEN details.job_role LIKE "% II" THEN "Sr."
      WHEN details.job_role LIKE "% III" THEN "III"
      WHEN details.job_role LIKE "% IV" THEN "IV"
      WHEN details.job_role LIKE "% V" THEN "V"
    ELSE ""
    END AS suffix_name,
    COALESCE(details.first_name || ' ' || details.last_name || ' ' || details.middle_name,coalesce(details.first_name,details.middle_name,details.last_name )) AS name,
    'nova' AS record_source,
    CURRENT_TIMESTAMP() AS record_created_ts,
    CASE 
      WHEN email LIKE '%@%.%'
      AND LENGTH(email) - LENGTH(REPLACE(email, '@', '')) = 1
      AND POSITION('@' IN email) < POSITION('.' IN email)
      AND SUBSTRING_INDEX(email, '@', 1) RLIKE '^[a-zA-Z0-9._]+$'
      AND SUBSTRING_INDEX(email, '@', -1) RLIKE '^[a-zA-Z0-9.-]+$'
      THEN 'true'
      ELSE 'false' 
    END AS is_verified,
    ARRAY_AGG(STRUCT(
      address.address_type AS address_type,
      address.address_line_1 AS address_line_1,
      address.address_line_2 AS address_line_2,
      address.city AS city,
      address.state AS state_province,
      CASE
        WHEN LOCATE('-', address.zipcode) > 0
          THEN SUBSTRING_INDEX(address.zipcode, '-', -1)
        WHEN LENGTH(address.zipcode) = 4  -- Assuming 4-digit postal code
          THEN address.zipcode
        ELSE NULL  -- or an appropriate default value for postalcode
      END AS postal_code,
      CASE
        WHEN LOCATE('-', address.zipcode) > 0
          THEN SUBSTRING_INDEX(address.zipcode, '-', 1)
        WHEN LENGTH(address.zipcode) = 5  -- Assuming 4-digit postal code
          THEN address.zipcode
        ELSE NULL
      END AS zip_code_extension,
      'US' AS country
    )) OVER (PARTITION BY address.id) AS addresses,
    ARRAY_AGG(STRUCT(
      contactinfo.usage_type AS phone_type,
      contactinfo.phone as number
    )) OVER (PARTITION BY contactinfo.id) as phones,
    details.email AS email,
    true as privacy_preference,
    details.ssn as national_id,
    details.gender as gender,
    details.marital_status as maritial_status,
    -- correct maritail status spell
    to_date(details.date_of_birth, 'M/d/yyyy') as date_of_birth,
    year(to_date(details.date_of_birth, 'M/d/yyyy')) as year_of_birth,
    CASE
      WHEN details.deceased_date Is NULL
        THEN false
        ELSE true
      END AS deceased_ind,
  CASE
    WHEN details.deceased_date IS NOT NULL
      THEN YEAR(to_date(details.deceased_date, 'M/d/yyyy')) - YEAR(to_date(details.date_of_birth, 'M/d/yyyy'))
      ELSE NULL
    END AS deceased_age,
 
    to_date(details.deceased_date,'M/d/yyyy') as deceased_date,
    ARRAY(details.spoken_language_1, details.spoken_language_2) AS languages,
    struct(
      details.company as employer_name,
      details.job_role as employee_role,
      NULL as employee_status,
      details.job_hiredate as employee_hiredate
    ),
    Null as additional_source_value 
  from header
  LEFT JOIN details ON header.id = details.id
  LEFT JOIN contactinfo ON header.id = contactinfo.id
  LEFT JOIN address ON header.id = address.id


num_affected_rows,num_inserted_rows
1500,1500


In [0]:
%sql
select * from member_table

source_id,subscriber_id,first_name,middle_name,last_name,prefix_name,suffix_name,name,record_source,record_created_ts,is_verified,addresses,phones,email,privacy_preference,national_id,gender,maritial_status,date_of_birth,year_of_birth,deceased_ind,deceased_age,deceased_date,languages,employment,additional_source_value
70014,40484,Skippie,Andrew,Lawles,Mr.,Sr.,Skippie Lawles Andrew,nova,2024-03-06T06:44:38.556+0000,False,"List(List(Residental, 955 Richard Alley, null, Emilybury, Maine, null, 90348, US), List(Mail, 7012 Natalie Brooks Suite 964, Suite 115, Juliestad, Hawaii, null, 42456, US))","List(List(Work, (844) 3426438), List(Work, (844) 3426438))",alawles dvistaprint.com,True,121-37-4739,M,Married,1997-03-08,1997,False,,,"List(Belarusian, null)","List(Mynte, Programmer Analyst II, null, 2001-08-18)",
70321,40672,Delcina,,Tomsu,Ms.,,Delcina,nova,2024-03-06T06:44:38.556+0000,True,"List(List(Mail, 37295 Reginald Road Suite 606, Suite 806, East Paul, Maine, null, 41730, US), List(Residental, 234 Shaffer Falls, null, South Ericaburgh, Illinois, null, 58977, US))","List(List(Work, (204) 8726919), List(Work, (204) 8726919))",ttomsu8w@yellowpages.com,True,405-52-2344,F,Divorced,1960-01-19,1960,False,,,"List(Yiddish, Norwegian)","List(Tagopia, Technical Writer, null, 1967-04-28)",
70452,40886,Marius,Braden,Hartop,Mr.,Prof.,Marius Hartop Braden,nova,2024-03-06T06:44:38.556+0000,True,"List(List(Residental, 614 Helen Cliffs Suite 220, Apt. 422, Perkinsmouth, Indiana, 5331, null, US), List(Mail, 53895 Catherine Circle Apt. 499, Apt. 564, Gonzalezbury, Washington, null, 34268, US))","List(List(Work, (919) 9564855), List(Work, (919) 9564855))",bhartopcj@ask.com,True,517-28-3726,M,Single,1987-07-02,1987,False,,,"List(West Frisian, Montenegrin)","List(Podcat, Professor, null, null)",
70574,40685,Johnnie,Richy,Kleingrub,Mr.,,Johnnie Kleingrub Richy,nova,2024-03-06T06:44:38.556+0000,True,"List(List(Residental, 331 Flores Estate, null, Lake Lisa, Texas, null, 74373, US))","List(List(Work, (898) 4841919))",rkleingrubfx@vimeo.com,True,749-31-8779,M,,1989-04-10,1989,False,,,"List(Moldovan, Dhivehi)","List(Gigabox, Community Outreach Specialist, null, 1933-09-05)",
70616,41023,Evonne,Winonah,Whitham,Mrs.,,Evonne Whitham Winonah,nova,2024-03-06T06:44:38.556+0000,True,"List(List(Residental, 36521 Ryan Parkway Suite 670, null, South Maryburgh, Mississippi, null, 98898, US))","List(List(Work, (532) 6925372))",wwhithamh3@apple.com,True,307-78-5162,F,Married,1973-11-17,1973,False,,,"List(Indonesian, Moldovan)","List(Quinu, Clinical Specialist, null, 1953-02-20)",
70849,41076,Shirlene,,Absolom,Ms.,,Shirlene,nova,2024-03-06T06:44:38.556+0000,True,"List(List(Residental, 4001 Parker Green Suite 193, Suite 532, Amandaland, Louisiana, 5510, null, US), List(Residental, 4001 Parker Green Suite 193, Suite 532, Amandaland, Louisiana, 5510, null, US))","List(List(Home, (588) 2473686), List(Work, (897) 8834671))",sabsolomnk@shutterfly.com,True,688-43-6726,F,Single,1976-05-15,1976,False,,,"List(Kashmiri, null)","List(JumpXS, Desktop Support Technician, null, 1921-04-27)",
71067,41202,Shandeigh,Jayme,Van der Velden,,,Shandeigh Van der Velden Jayme,nova,2024-03-06T06:44:38.556+0000,True,"List(List(Residental, 09282 Marc Street, null, West Noah, Wisconsin, null, 56536, US))","List(List(Work, (209) 2747575))",jvandervelden1u@cargocollective.com,True,500-31-4203,,Widowed,1963-02-16,1963,False,,,"List(Korean, null)","List(Leexo, Director of Sales, null, 2014-12-30)",
71159,41587,Antony,,Raffan,Mr.,,Antony,nova,2024-03-06T06:44:38.556+0000,True,"List(List(Residental, 215 Steven Springs Apt. 499, Suite 756, Theresahaven, Delaware, null, 56959, US), List(Residental, 215 Steven Springs Apt. 499, Suite 756, Theresahaven, Delaware, null, 56959, US))","List(List(Home, (623) 7003873), List(Work, (832) 2833341))",nraffan4e@multiply.com,True,211-95-6523,M,Married,1967-09-26,1967,False,,,"List(Irish Gaelic, null)","List(Zoomlounge, Teacher, null, null)",
71214,41258,Glynis,Kelvin,Ainge,,,Glynis Ainge Kelvin,nova,2024-03-06T06:44:38.556+0000,True,"List(List(Residental, 24910 Marshall Ville Suite 619, Apt. 487, East Jenniferton, Arkansas, null, 89071, US))","List(List(Work, (994) 8738237))",kainge5x@fc2.com,True,852-78-6696,,,1944-01-11,1944,False,,,"List(French, null)","List(Topicstorm, VP Quality Control, null, 1965-02-27)",
70036,40485,Douglass,,Dallender,Mr.,Er.,Douglass,nova,2024-03-06T06:44:38.556+0000,True,"List(List(Residental, 819 Cochran Mountains Suite 190, Apt. 871, Lake Debbiechester, Ohio, null, 75680, US), List(Mail, 69073 Amber Squares, Suite 240, North Austin, Ohio, null, 72538, US))","List(List(Work, (457) 2054505), List(Work, (457) 2054505))",vdallenderz@umn.edu,True,633-66-5388,M,Divorced,1917-04-27,1917,False,,,"List(Bengali, null)","List(Thoughtsphere, Chemical Engineer, null, 1962-07-13)",
