In [0]:
%restart_python
%pip install boto3
import boto3
import os
from botocore.exceptions import NoCredentialsError
import datetime
import sys
sys.path.insert(0, '/Workspace/Shared')
import etl_helpers 

# Incremental load of the dimaddress table: select contact-information rows
# created or updated since the last successful run, pivot them to one row per
# FOI request, and upsert the result into the Delta table.
#
# The run is registered with etl_helpers first; the returned id is handed back
# to end_run_cycle on every exit path below.
runcycleid = etl_helpers.start_run_cycle("dimaddress")

os.makedirs("/dbfs/foi/dataload", exist_ok=True)  # make sure directory exists

try:
    # Watermark: start time of the most recent successful dimaddress run.
    df_lastrun = spark.sql("SELECT runcyclestartat as createddate FROM dimruncycle WHERE packagename = 'dimaddress'  AND success = 't' ORDER BY runcycleid DESC LIMIT 1")

    # first() returns None for an empty result, so a single Spark action both
    # answers "was there a previous run?" and fetches its row (the previous
    # count() + first() pair executed the query twice).
    last_success = df_lastrun.first()
    if last_success is not None:
        lastruntime = last_success.createddate.strftime("%Y-%m-%d %H:%M:%S")
    else:
        # No successful run recorded yet: use a fixed early cut-off so the
        # created_at/updated_at filter below selects everything after it.
        lastruntime = "2019-01-01 00:00:00"
    print(lastruntime)

    # Pivot the per-dataformat contact rows into one row per foirequest_id,
    # restricted to each request's highest foirequestversion_id and to rows
    # created or updated after the watermark.
    query = f"""
        SELECT
            t.foirequest_id AS foirequestid,
            MAX(CASE WHEN t.dataformat = 'email' THEN t.contactinformation END) AS email,
            MAX(CASE WHEN t.dataformat = 'country' THEN t.contactinformation END) AS country,
            MAX(CASE WHEN t.dataformat = 'postal' THEN t.contactinformation END) AS postal,
            MAX(CASE WHEN t.dataformat = 'province' THEN t.contactinformation END) AS province,
            MAX(CASE WHEN t.dataformat = 'city' THEN t.contactinformation END) AS city,
            MAX(CASE WHEN t.dataformat = 'address' THEN t.contactinformation END) AS address1,
            MAX(CASE WHEN t.dataformat = 'addressSecondary' THEN t.contactinformation END) AS address2,
            MAX(t.foirequestversion_id) AS max_foirequestversion_id
        FROM
            foi_mod.foirequestcontactinformation t
        INNER JOIN (
            -- Subquery to find the maximum foirequestversion_id for each foirequest_id
            SELECT
                foirequest_id,
                MAX(foirequestversion_id) AS latest_version
            FROM
                foi_mod.foirequestcontactinformation
            GROUP BY
                foirequest_id
        ) AS latest_versions ON t.foirequest_id = latest_versions.foirequest_id
                            AND t.foirequestversion_id = latest_versions.latest_version
        WHERE
            t.dataformat IN ('email', 'country', 'postal', 'province', 'city', 'address', 'addressSecondary')
            and (t.created_at > '{lastruntime}' or t.updated_at > '{lastruntime}')
        GROUP BY
            t.foirequest_id
        ORDER BY
            t.foirequest_id desc    
        """

    # print(query)

    df = spark.sql(query)
    df.show()

    # Rename the source columns to the names used by the merge below
    # (province -> state, postal -> zipcode); email is not carried over.
    # order of columns here is important!
    df_mapped = df.selectExpr(
            "address1 AS address1",
            "address2 AS address2",
            "city AS city",
            "province AS state",
            "country AS country",
            # "email AS email",
            "postal AS zipcode",
            "foirequestid AS foirequestid",
            # f"{runcycleid} as runcycleid",
        )
    df_mapped.show()

    from delta.tables import DeltaTable

    # Both branches of the upsert write every target column from the
    # same-named source column, so a single mapping serves update and insert.
    merge_columns = {
        "address1": "source.address1",
        "address2": "source.address2",
        "city": "source.city",
        "state": "source.state",
        "country": "source.country",
        "zipcode": "source.zipcode",
        "foirequestid": "source.foirequestid",
    }

    # Upsert keyed on foirequestid: update the existing row when the key
    # matches, insert a new row otherwise.
    delta_table = DeltaTable.forName(spark, "hive_metastore.default.dimaddress")
    (
        delta_table.alias("target")
        .merge(
            df_mapped.alias("source"),
            "target.foirequestid = source.foirequestid",
        )
        .whenMatchedUpdate(set=merge_columns)
        .whenNotMatchedInsert(values=merge_columns)
        .execute()
    )

    etl_helpers.end_run_cycle(runcycleid, 't', "dimaddress")
except NoCredentialsError:
    print("Credentials not available")
    etl_helpers.end_run_cycle(runcycleid, 'f', "dimaddress", "Credentials not available")
except Exception as e:
    # NOTE(review): neither handler re-raises, so a failed load is recorded
    # only through end_run_cycle and the cell itself still completes --
    # confirm this is the intended contract with the scheduler.
    if str(e) == "no changes for today":
        # NOTE(review): nothing visible in this cell raises this message;
        # presumably a sentinel from a helper or legacy code path -- verify.
        # It is recorded as a successful run.
        print("here")
        etl_helpers.end_run_cycle(runcycleid, 't', "dimaddress")
    else:
        failure_message = f"An error occurred: {e}"
        print(failure_message)
        etl_helpers.end_run_cycle(runcycleid, 'f', "dimaddress", failure_message)