In [0]:
%sql

-- Ad-hoc inspection cell: show the five rows of dimruncycle with the
-- highest runcycleid values.

-- (disabled) one-off cleanup statement, kept for reference:
-- drop table if exists default.dimrequesterstemp;

SELECT *
FROM dimruncycle
ORDER BY runcycleid DESC
LIMIT 5;

-- (disabled) lookup of foirequests 12735 / 12736 joined to their raw
-- request versions, kept for reference:
-- select * from foi_mod.foirequests r
-- inner join foi_mod.foirawrequests raw on r.foirawrequestid = raw.requestid
-- where foirequestid = 12735 or foirequestid = 12736
-- order by r.foirequestid, raw.version;




In [0]:
%restart_python
%pip install boto3
import boto3
import os
from botocore.exceptions import NoCredentialsError
import datetime
import sys
sys.path.insert(0, '/Workspace/Shared')
import etl_helpers 

# Incremental load of the `dimrequests` dimension.
#
# Steps performed by this cell:
#   1. Open a run-cycle record through etl_helpers.start_run_cycle.
#   2. Read the start time of the last successful run of this package from
#      dimruncycle and use it as the change watermark (falls back to
#      2019-01-01 when no successful run exists).
#   3. Select foi_mod requests whose latest foirequests row or latest
#      ministry-request row was created after the watermark.
#   4. MERGE the result into hive_metastore.default.dimrequests on
#      foirequestid (update when matched, insert otherwise).
#   5. Close the run-cycle record with 't' (success) or 'f' (failure).
#
# Exceptions raised inside the try block are printed and recorded on the run
# cycle; they are not re-raised.
tablename = "dimrequests"
runcycleid = etl_helpers.start_run_cycle(tablename)
os.makedirs("/dbfs/foi/dataload", exist_ok=True)  # make sure directory exists

try:

    # Watermark lookup: most recent run of this package flagged success = 't'.
    df_lastrun = spark.sql(f"SELECT runcyclestartat as createddate FROM dimruncycle WHERE packagename = \"{tablename}\" AND success = 't' ORDER BY runcycleid DESC LIMIT 1")

    # first() returns None for an empty result, so one Spark action covers
    # both the emptiness check and the fetch (count() + first() executed the
    # lookup twice).
    lastrun_row = df_lastrun.first()
    if lastrun_row is not None:
        lastruntime = lastrun_row.createddate.strftime("%Y-%m-%d %H:%M:%S")
    else:
        # No successful run recorded yet: load everything from this date on.
        lastruntime = "2019-01-01 00:00:00"
    print(lastruntime)

    # NOTE(review): the join to foiministryrequests is keyed on
    #   (foirequest_id, foiministryrequestid), so a request with more than one
    #   ministry request presumably yields several rows with the same
    #   foirequestid. The max_raw subquery can likewise return several
    #   axisrequestid values per requestid. Delta MERGE rejects multiple source
    #   rows matching one target row -- confirm the data rules this out.
    # NOTE(review): `axisrequestid != 'NULL'` compares against the string
    #   literal 'NULL'; confirm whether `IS NOT NULL` was intended.
    # NOTE(review): requests 12735 and 12736 are excluded by hard-coded id;
    #   the reason is not visible in this notebook.
    query = f"""
        SELECT
            r.foirequestid,
            'FOIMOD' AS sourceoftruth,
            2 AS sourceoftruthuniqueid,
            max_raw.axisrequestid AS visualrequestfilenumber,
            CASE
                WHEN r.isactive = TRUE THEN 'Y'
            ELSE
                'N'
            END AS cactive,
            first_versions.created_at AS createddate,
            CASE
                WHEN r.created_at > ministry.created_at THEN r.created_at
            ELSE
                ministry.created_at
            END AS modifieddate
        FROM
            foi_mod.foirequests r
        INNER JOIN (
            -- Subquery to find the first version of foirequest
            SELECT
                foirequestid,
                created_at
            FROM foi_mod.foirequests
            WHERE version = 1
        ) AS first_versions ON r.foirequestid = first_versions.foirequestid
        INNER JOIN (
            -- Subquery to find the max version of foirequest
            SELECT
                foirequestid,
                MAX(version) AS max_version
            FROM foi_mod.foirequests
            GROUP BY foirequestid
        ) AS max_versions ON r.foirequestid = max_versions.foirequestid AND r.version = max_versions.max_version
        INNER JOIN (
            -- Subquery to find the max version of each foiministryrequest
            SELECT
                foirequest_id,
                foiministryrequestid,
                MAX(version) AS max_version
            FROM foi_mod.foiministryrequests
            GROUP BY foirequest_id, foiministryrequestid
        ) AS max_ministry_versions ON r.foirequestid = max_ministry_versions.foirequest_id
        INNER JOIN foi_mod.foiministryrequests ministry ON ministry.foiministryrequestid = max_ministry_versions.foiministryrequestid AND ministry.version = max_ministry_versions.max_version
        INNER JOIN (
            SELECT
                requestid,
                axisrequestid
            FROM
                foi_mod.foirawrequests
            WHERE
                axisrequestid != 'NULL'
            GROUP BY
                requestid, axisrequestid
        ) AS max_raw ON r.foirawrequestid = max_raw.requestid
        WHERE
            (r.created_at > '{lastruntime}' or ministry.created_at > '{lastruntime}') and r.foirequestid NOT IN (12735, 12736)
        ORDER BY
            r.foirequestid
        """

    # print(query)

    df = spark.sql(query)
    df.show()

    # order of columns here is important!
    df_mapped = df.selectExpr(
        "foirequestid AS foirequestid",
        "sourceoftruth AS sourceoftruth",
        "sourceoftruthuniqueid AS sourceoftruthuniqueid",
        "visualrequestfilenumber AS visualrequestfilenumber",
        "cactive AS cactive",
        "createddate AS createddate",
        "modifieddate AS modifieddate"
    )
    df_mapped.show()

    # Upsert into the target Delta table, matching rows on foirequestid:
    # matched rows get every non-key column overwritten, unmatched rows are
    # inserted.
    from delta.tables import DeltaTable
    delta_table = DeltaTable.forName(spark, f"hive_metastore.default.{tablename}")
    delta_table.alias("target").merge(
        df_mapped.alias("source"),
        "target.foirequestid = source.foirequestid"
    ).whenMatchedUpdate(set = {
        "sourceoftruth": "source.sourceoftruth",
        "sourceoftruthuniqueid": "source.sourceoftruthuniqueid",
        "visualrequestfilenumber": "source.visualrequestfilenumber",
        "cactive": "source.cactive",
        "createddate": "source.createddate",
        "modifieddate": "source.modifieddate"
    }).whenNotMatchedInsert(values = {
        "foirequestid": "source.foirequestid",
        "sourceoftruth": "source.sourceoftruth",
        "sourceoftruthuniqueid": "source.sourceoftruthuniqueid",
        "visualrequestfilenumber": "source.visualrequestfilenumber",
        "cactive": "source.cactive",
        "createddate": "source.createddate",
        "modifieddate": "source.modifieddate"
    }).execute()

    # Record the run as successful.
    etl_helpers.end_run_cycle(runcycleid, 't', tablename)
except NoCredentialsError:
    # NOTE(review): no boto3 call is visible in this cell, so this handler
    # looks unreachable from here -- confirm whether it is still needed.
    print("Credentials not available")
    etl_helpers.end_run_cycle(runcycleid, 'f', tablename, "Credentials not available")
except Exception as e:
    if (str(e) == "no changes for today"):
        # NOTE(review): nothing in this cell raises this message; presumably
        # a sentinel raised elsewhere. It is recorded as a successful run.
        print("here")
        etl_helpers.end_run_cycle(runcycleid, 't', tablename)
    else:
        # Any other failure is printed and recorded on the run cycle; the
        # exception is not re-raised.
        print(f"An error occurred: {e}")    
        etl_helpers.end_run_cycle(runcycleid, 'f', tablename, f"An error occurred: {e}")