In [0]:
%sql

-- Return the single dimruncycle row with the highest runcycleid.
SELECT *
FROM dimruncycle
ORDER BY runcycleid DESC
LIMIT 1;

-- Disabled DDL, kept for reference only: it would drop dimrequestfordocumentsstatus
-- and recreate it as a Delta table from foi_mod.programareadivisionstages.
-- DROP TABLE IF EXISTS dimrequestfordocumentsstatus;

-- CREATE TABLE dimrequestfordocumentsstatus
-- USING DELTA AS
--     SELECT
--         stageid AS reqfordocstatusid,
--         name AS reqfordocstatus,
--         name AS description,
--         CASE
--             WHEN isactive = true THEN 'Y'
--         ELSE
--             'N'
--         END AS cactive,
--         CAST(NULL AS STRING) AS cimport,
--         createdby,
--         created_at AS createddate,
--         CAST(NULL AS STRING) AS modifiedby,
--         CAST(NULL AS TIMESTAMP) AS modifieddate,
--         CAST(NULL AS STRING) AS ctype,
--         CAST(NULL AS STRING) AS cmarkcompleted,
--         'FOIMOD' AS sourceoftruth
--     FROM foi_mod.programareadivisionstages
--     order by stageid;


In [0]:
%restart_python
%pip install boto3
# NOTE(review): the Python restart is issued before the install rather than after it,
# and boto3 is installed without a version pin — confirm both are intentional.
import boto3
import os
from botocore.exceptions import NoCredentialsError
import datetime
import sys
# Put /Workspace/Shared first on the import path before importing etl_helpers
# (presumably that is where etl_helpers lives — confirm).
sys.path.insert(0, '/Workspace/Shared')
import etl_helpers
from pyspark.sql.functions import col #lit

# Name of the target table; it is passed to the etl_helpers run-cycle calls and used
# to build the fully qualified Delta table name further down in this cell.
tablename = "dimrequestfordocumentsstatus"
# Open a run cycle for this table; the returned id is handed back to
# etl_helpers.end_run_cycle when the load finishes or fails.
runcycleid = etl_helpers.start_run_cycle(tablename)
os.makedirs("/dbfs/foi/dataload", exist_ok=True)  # make sure directory exists

try:
    # Source projection: reshape foi_mod.programareadivisionstages into the column
    # layout of the target table. isactive is mapped to 'Y'/'N', the columns with no
    # source counterpart are typed NULLs, and every row is tagged 'FOIMOD'.
    query = """
        SELECT
            stageid AS reqfordocstatusid,
            name AS reqfordocstatus,
            name AS description,
            CASE
                WHEN isactive = true THEN 'Y'
            ELSE
                'N'
            END AS cactive,
            CAST(NULL AS STRING) AS cimport,
            createdby,
            created_at AS createddate,
            CAST(NULL AS STRING) AS modifiedby,
            CAST(NULL AS TIMESTAMP) AS modifieddate,
            CAST(NULL AS STRING) AS ctype,
            CAST(NULL AS STRING) AS cmarkcompleted,
            'FOIMOD' AS sourceoftruth
        FROM foi_mod.programareadivisionstages
        order by stageid;
        """

    df = spark.sql(query)
    df.show()

    # Column layout of the target table. The order is significant: it drives the
    # projection below and the insert mapping of the merge.
    target_columns = [
        "reqfordocstatusid",
        "reqfordocstatus",
        "description",
        "cactive",
        "cimport",
        "createdby",
        "createddate",
        "modifiedby",
        "modifieddate",
        "ctype",
        "cmarkcompleted",
        "sourceoftruth",
    ]
    # The two columns used in the merge condition; they are matched on, not updated.
    merge_key_columns = ("reqfordocstatusid", "sourceoftruth")

    df_mapped = df.selectExpr(
        *[f"{column_name} AS {column_name}" for column_name in target_columns]
    )
    df_mapped.show()

    from delta.tables import DeltaTable

    delta_table = DeltaTable.forName(spark, f"hive_metastore.default.{tablename}")

    merge_condition = (
        "target.reqfordocstatusid = source.reqfordocstatusid"
        " AND target.sourceoftruth = source.sourceoftruth"
    )
    # Matched rows: overwrite every non-key column with the source value.
    update_mapping = {
        column_name: col(f"source.{column_name}")
        for column_name in target_columns
        if column_name not in merge_key_columns
    }
    # Unmatched rows: insert all columns from the source.
    insert_mapping = {
        column_name: col(f"source.{column_name}")
        for column_name in target_columns
    }

    merge_builder = delta_table.alias("target").merge(
        df_mapped.alias("source"),
        merge_condition,
    )
    merge_builder = merge_builder.whenMatchedUpdate(set=update_mapping)
    merge_builder = merge_builder.whenNotMatchedInsert(values=insert_mapping)
    merge_builder.execute()

    # Close the run cycle with the 't' flag once the merge has completed.
    etl_helpers.end_run_cycle(runcycleid, 't', tablename)
except NoCredentialsError:
    # Close the run cycle with the 'f' flag and a fixed message.
    print("Credentials not available")
    etl_helpers.end_run_cycle(runcycleid, 'f', tablename, "Credentials not available")
except Exception as exc:
    if str(exc) != "no changes for today":
        # Any other failure: report it and close the run cycle with the 'f' flag.
        failure_message = f"An error occurred: {exc}"
        print(failure_message)
        etl_helpers.end_run_cycle(runcycleid, 'f', tablename, failure_message)
    else:
        # An exception carrying exactly this text closes the run cycle with 't',
        # the same flag as the normal path.
        print("here")
        etl_helpers.end_run_cycle(runcycleid, 't', tablename)