In [0]:
# %sql

# select * from dimruncycle
# order by runcycleid desc
# limit 1;

In [0]:
%restart_python
%pip install boto3
import boto3
import os
from botocore.exceptions import NoCredentialsError
import datetime
import sys
sys.path.insert(0, '/Workspace/Shared')
import etl_helpers 
from pyspark.sql.functions import col, lit

# Incremental load of requester rows from the foi_mod tables into the
# hive_metastore.default.factrequestrequesters Delta table.
#
# Flow: open a run cycle -> read the start time of the last successful run ->
# select applicant mappings created/updated since then -> MERGE into the target
# -> close the run cycle with 't' (success) or 'f' (failure) plus a message.

# Target table name; also used as the packagename filter against dimruncycle.
tablename = "factrequestrequesters"
# The id returned here is stamped on every loaded row and passed to end_run_cycle.
runcycleid = etl_helpers.start_run_cycle(tablename)
os.makedirs("/dbfs/foi/dataload", exist_ok=True)  # make sure directory exists

try:

    # Watermark: start time of the most recent successful run for this table.
    df_lastrun = spark.sql(f"SELECT runcyclestartat as createddate FROM dimruncycle WHERE packagename = \"{tablename}\" AND success = 't' ORDER BY runcycleid DESC LIMIT 1")

    # first() returns None for an empty result, so a single Spark action both
    # tests for a previous run and fetches it (no separate count() job needed).
    lastrun_row = df_lastrun.first()
    if lastrun_row is not None:
        lastruntime = lastrun_row.createddate.strftime("%Y-%m-%d %H:%M:%S")
    else:
        # No successful run recorded yet: fall back to a fixed early date.
        lastruntime = "2019-01-01 00:00:00"
    print(lastruntime)

    # Source rows: for each request, the applicant mappings at the highest
    # mapping version, joined to the applicant record and to the request row of
    # the same version, restricted to applicants created or updated after the
    # watermark. updated_at is compared against the literal string 'NULL' and
    # mapped to a real NULL in that case.
    query = f"""
        SELECT
            mappings.foirequest_id as foirequestid,
            {runcycleid} as runcycleid,
            CASE
                WHEN r.requesttype = 'personal' THEN 33
            ELSE
                31
            END AS requesttypeid,
            r.applicantcategoryid as applicantcategoryid,
            mappings.requestortypeid as requestertypeid,
            a.foirequestapplicantid AS requesterid,
            a.created_at as createddate,
            CASE
                WHEN a.updated_at = 'NULL' THEN NULL
            ELSE
                a.updated_at
            END as modifieddate,
            't' AS activeflag,
            'FOIMOD' AS sourceoftruth
        FROM foi_mod.foirequestapplicantmappings mappings
        INNER JOIN (
            SELECT
                foirequest_id,
                MAX(foirequestversion_id) AS max_foirequestversion_id
            FROM foi_mod.foirequestapplicantmappings
            GROUP BY foirequest_id
        ) AS maxversion ON mappings.foirequest_id = maxversion.foirequest_id AND mappings.foirequestversion_id = maxversion.max_foirequestversion_id
        INNER JOIN
            foi_mod.foirequestapplicants a ON a.foirequestapplicantid = mappings.foirequestapplicantid
        INNER JOIN
            foi_mod.foirequests r ON r.foirequestid = mappings.foirequest_id AND r.version = mappings.foirequestversion_id
        WHERE
            a.created_at > '{lastruntime}' OR a.updated_at > '{lastruntime}'
        ORDER BY a.foirequestapplicantid
    """

    # print(query)

    df = spark.sql(query)
    df.show()

    # order of columns here is important!
    df_mapped = df.selectExpr(
            "foirequestid AS foirequestid",
            "runcycleid AS runcycleid",
            "requesttypeid AS requesttypeid",
            "applicantcategoryid AS applicantcategoryid",
            "requestertypeid AS requestertypeid",
            "requesterid AS requesterid",
            "createddate AS createddate",
            "modifieddate AS modifieddate",
            "activeflag AS activeflag",
            "sourceoftruth AS sourceoftruth"
        )
    df_mapped.show()

    # Imported inside the try so an import failure is recorded on the run cycle.
    from delta.tables import DeltaTable
    delta_table = DeltaTable.forName(spark, f"hive_metastore.default.{tablename}")

    # Merge key: (foirequestid, sourceoftruth, requestertypeid).
    #  - matched and currently active -> target row is flagged inactive ('f')
    #  - not matched                  -> source row is inserted
    # NOTE(review): a matched source row only deactivates the existing target
    # row; the new version of that row is not inserted by this merge. Confirm
    # that leaving no active row for an updated requester is intended.
    delta_table.alias("target").merge(
        df_mapped.alias("source"),
        "target.foirequestid = source.foirequestid AND target.sourceoftruth = source.sourceoftruth AND target.requestertypeid = source.requestertypeid"
    ).whenMatchedUpdate(
        condition = "target.activeflag = 't'",
        set = {
            "activeflag": lit("f"),
        }
    ).whenNotMatchedInsert(
        values = {
            "foirequestid": "source.foirequestid",
            "runcycleid": "source.runcycleid",
            "requesttypeid": "source.requesttypeid",
            "applicantcategoryid": "source.applicantcategoryid",
            "requestertypeid": "source.requestertypeid",
            "requesterid": "source.requesterid",
            "createddate": "source.createddate",
            "modifieddate": "source.modifieddate",
            "activeflag": "source.activeflag",
            "sourceoftruth": "source.sourceoftruth"
        }
    ).execute()

    etl_helpers.end_run_cycle(runcycleid, 't', tablename)
except NoCredentialsError:
    print("Credentials not available")
    etl_helpers.end_run_cycle(runcycleid, 'f', tablename, "Credentials not available")
except Exception as e:
    # NOTE(review): nothing visible in this cell raises "no changes for today";
    # presumably it originates elsewhere (e.g. a helper) - verify or remove.
    if str(e) == "no changes for today":
        print("here")
        etl_helpers.end_run_cycle(runcycleid, 't', tablename)
    else:
        # The failure is recorded on the run cycle and not re-raised, so the
        # cell itself still completes without error.
        print(f"An error occurred: {e}")
        etl_helpers.end_run_cycle(runcycleid, 'f', tablename, f"An error occurred: {e}")