In [0]:
%sql

-- drop table if exists default.dimrequesterstemp;

-- Inspect the most recent run-cycle record (the row with the highest runcycleid).
SELECT *
FROM dimruncycle
ORDER BY runcycleid DESC
LIMIT 1;

In [0]:
%sql

-- Every statement in this cell is commented out. The statements are an
-- inspection query and schema changes against default.dimrequesters
-- (adding columns, enabling Delta column mapping, renaming a column).
-- NOTE(review): presumably kept as a record of one-off migrations that have
-- already been applied -- confirm before re-enabling any of them.

-- Inspect the most recently added requesters:
-- select * from default.dimrequesters
-- order by requesterid desc
-- limit 100;

-- Add two new columns
-- ALTER TABLE default.dimrequesters
-- ADD COLUMNS (
--     foirequestapplicantid INT,
--     applicantprofileid INT
-- );

-- Step 1: Enable column mapping (if not already enabled)
-- ALTER TABLE default.dimrequesters
-- SET TBLPROPERTIES (
--   'delta.minReaderVersion' = '2',
--   'delta.minWriterVersion' = '5',
--   'delta.columnMapping.mode' = 'name'
-- );

-- Step 2: Rename the column
-- ALTER TABLE default.dimrequesters
-- RENAME COLUMN applicantprofileid TO foirequestversion;


-- Add two more columns (both are written by the MERGE in the Python cell below)
-- ALTER TABLE default.dimrequesters
-- ADD COLUMNS (
--     axisapplicantid INT,
--     requestortypeid INT
-- );

In [0]:
%restart_python
%pip install boto3
import boto3
import os
from botocore.exceptions import NoCredentialsError
import datetime
import sys
sys.path.insert(0, '/Workspace/Shared')
import etl_helpers 

# Name of the target table; also used as the package name when looking up
# previous runs in dimruncycle further down.
tablename = "dimrequesters"
# Register the start of this run. The returned id is handed back to
# etl_helpers.end_run_cycle() when the load finishes or fails.
# NOTE(review): presumably this inserts a row into dimruncycle -- confirm in etl_helpers.
runcycleid = etl_helpers.start_run_cycle(tablename)
# NOTE(review): nothing visible in this notebook writes to this directory -- confirm it is still needed.
os.makedirs("/dbfs/foi/dataload", exist_ok=True)  # make sure directory exists (no error if it already does)

# Columns of the target table, in the order the source query emits them.
# The order is significant for the projection applied before the merge.
requester_columns = [
    "requesterid",
    "firstname",
    "lastname",
    "middlename",
    "fullname",
    "jobtitle",
    "addressline1",
    "addressline2",
    "city",
    "zipcode",
    "statecode",
    "statename",
    "countryname",
    "workphone1",
    "workphone2",
    "mobile",
    "home",
    "fax",
    "email",
    "company",
    "notes",
    "createddate",
    "modifieddate",
    "cactive",
    "reasonid",
    "maildue",
    "foirequestid",
    "foirequestversion",
    "axisapplicantid",
    "requestortypeid",
    "sourceoftruth",
]

# Columns compared in the MERGE condition. They identify a target row, so the
# "matched" branch never rewrites them.
merge_key_columns = [
    "requesterid",
    "foirequestid",
    "foirequestversion",
    "requestortypeid",
    "sourceoftruth",
]

# Incremental load of FOIMOD requesters into hive_metastore.default.<tablename>:
#   1. find the start time of the last successful run for this package,
#   2. select applicants whose contact information was created/updated since then,
#   3. MERGE the result into the Delta table,
#   4. record success or failure through etl_helpers.end_run_cycle().
try:
    from delta.tables import DeltaTable

    df_lastrun = spark.sql(f"SELECT runcyclestartat as createddate FROM dimruncycle WHERE packagename = \"{tablename}\" AND success = 't' ORDER BY runcycleid DESC LIMIT 1")

    # first() returns None when the result is empty, so one Spark job is enough
    # (the previous count() + first() pair executed the query twice).
    last_run = df_lastrun.first()
    if last_run is not None:
        lastruntime = last_run.createddate.strftime("%Y-%m-%d %H:%M:%S")
    else:
        # No successful run recorded yet: load everything from this date on.
        lastruntime = "2019-01-01 00:00:00"
    print(lastruntime)

    # The "contacts" subquery pivots the per-field contact rows of the latest
    # request version into one row per request; it is then joined to the
    # applicant mappings and applicants of that same version.
    # NOTE(review): the created_at/updated_at filter is applied per contact row
    # *before* the pivot. If only some fields of a version changed since the last
    # run, the other fields would come out NULL and overwrite the target on
    # merge -- confirm that all contact rows of a version share their timestamps.
    query = f"""
        SELECT
            a.foirequestapplicantid AS requesterid,
            a.firstname,
            a.lastname,
            a.middlename,
            CONCAT_WS(' ', a.firstname, a.lastname) AS fullname,
            CAST(NULL AS STRING) AS jobtitle,
            contacts.addressline1,
            contacts.addressline2,
            contacts.city,
            contacts.zipcode,
            contacts.statecode,
            contacts.statecode AS statename,
            contacts.countryname,
            contacts.workphone1,
            contacts.workphone2,
            contacts.mobile,
            contacts.home,
            CAST(NULL AS STRING) AS fax,
            contacts.email,
            a.businessname AS company,
            CAST(NULL AS STRING) AS notes,
            a.created_at as createddate,
            a.updated_at as modifieddate,
            't' AS cactive,
            CAST(NULL AS STRING) AS reasonid,
            CAST(NULL AS STRING) AS maildue,
            contacts.foirequestid,
            contacts.max_foirequestversion_id as foirequestversion,
            CASE
                WHEN a.axisapplicantid = 'NULL' THEN CAST(NULL AS INTEGER)
            ELSE
                a.axisapplicantid
            END AS axisapplicantid,
            mappings.requestortypeid,
            'FOIMOD' AS sourceoftruth
        FROM (
            SELECT
                t.foirequest_id AS foirequestid,
                MAX(CASE WHEN t.dataformat = 'email' THEN t.contactinformation END) AS email,
                MAX(CASE WHEN t.dataformat = 'country' THEN t.contactinformation END) AS countryname,
                MAX(CASE WHEN t.dataformat = 'postal' THEN t.contactinformation END) AS zipcode,
                MAX(CASE WHEN t.dataformat = 'province' THEN t.contactinformation END) AS statecode,
                MAX(CASE WHEN t.dataformat = 'city' THEN t.contactinformation END) AS city,
                MAX(CASE WHEN t.dataformat = 'address' THEN t.contactinformation END) AS addressline1,
                MAX(CASE WHEN t.dataformat = 'addressSecondary' THEN t.contactinformation END) AS addressline2,
                MAX(CASE WHEN t.dataformat = 'workPhonePrimary' THEN t.contactinformation END) AS workphone1,
                MAX(CASE WHEN t.dataformat = 'workPhoneSecondary' THEN t.contactinformation END) AS workphone2,
                MAX(CASE WHEN t.dataformat = 'phonePrimary' THEN t.contactinformation END) AS mobile,
                MAX(CASE WHEN t.dataformat = 'phoneSecondary' THEN t.contactinformation END) AS home,
                MAX(t.foirequestversion_id) AS max_foirequestversion_id
            FROM
                foi_mod.foirequestcontactinformation t
            INNER JOIN (
                -- Subquery to find the maximum foirequestversion_id for each foirequest_id
                SELECT
                    foirequest_id,
                    MAX(foirequestversion_id) AS latest_version
                FROM
                    foi_mod.foirequestapplicantmappings
                GROUP BY
                    foirequest_id
            ) AS latest_versions ON t.foirequest_id = latest_versions.foirequest_id
                                AND t.foirequestversion_id = latest_versions.latest_version
            WHERE
                t.dataformat IN ('email', 'country', 'postal', 'province', 'city', 'address', 'addressSecondary', 'workPhonePrimary', 'workPhoneSecondary', 'phonePrimary', 'phoneSecondary')
                and (t.created_at > '{lastruntime}' or t.updated_at > '{lastruntime}')
            GROUP BY
                t.foirequest_id
        ) AS contacts
        INNER JOIN
            foi_mod.foirequestapplicantmappings mappings ON contacts.foirequestid = mappings.foirequest_id AND contacts.max_foirequestversion_id = mappings.foirequestversion_id
        INNER JOIN
            foi_mod.foirequestapplicants a ON a.foirequestapplicantid = mappings.foirequestapplicantid
        ORDER BY a.foirequestapplicantid
    """

    # print(query)

    df = spark.sql(query)

    # order of columns here is important!
    # The query already emits exactly these names (statename is statecode under
    # a second name), so this projection only pins the column order explicitly.
    df_mapped = df.select(*requester_columns)

    # One preview is enough: df and df_mapped hold the same rows and columns, and
    # every show() re-executes the query.
    # NOTE(review): this prints requester names, e-mail and postal addresses into
    # the notebook output -- consider removing it for scheduled runs.
    df_mapped.show()

    # Derive the merge pieces from the column lists so that the condition, the
    # update branch and the insert branch cannot drift apart.
    merge_condition = " AND ".join(
        f"target.{column} = source.{column}" for column in merge_key_columns
    )
    update_set = {
        column: f"source.{column}"
        for column in requester_columns
        if column not in merge_key_columns
    }
    insert_values = {column: f"source.{column}" for column in requester_columns}

    # NOTE(review): the key comparison uses "=", so a source row with a NULL key
    # (e.g. requestortypeid) never matches and is inserted again on every run --
    # confirm the key columns are never NULL.
    delta_table = DeltaTable.forName(spark, f"hive_metastore.default.{tablename}")
    (
        delta_table.alias("target")
        .merge(df_mapped.alias("source"), merge_condition)
        .whenMatchedUpdate(set=update_set)
        .whenNotMatchedInsert(values=insert_values)
        .execute()
    )

    etl_helpers.end_run_cycle(runcycleid, 't', tablename)
except NoCredentialsError:
    print("Credentials not available")
    etl_helpers.end_run_cycle(runcycleid, 'f', tablename, "Credentials not available")
    dbutils.notebook.exit("Error: Something went wrong.")
except Exception as e:
    # NOTE(review): nothing in this cell raises "no changes for today"; presumably
    # it comes from etl_helpers or is left over from another loader -- confirm.
    if (str(e) == "no changes for today"):
        print("here")
        etl_helpers.end_run_cycle(runcycleid, 't', tablename)
    else:
        print(f"An error occurred: {e}")
        etl_helpers.end_run_cycle(runcycleid, 'f', tablename, f"An error occurred: {e}")
        # The error is recorded and the notebook exits with a message instead of
        # re-raising. NOTE(review): confirm the caller treats this exit value as
        # a failure.
        dbutils.notebook.exit("Error: Something went wrong.")