In [0]:
# %sql

# CREATE TABLE dimextensiontypes
# USING DELTA AS
#     SELECT
#         extensionreasonid AS extensiontypeid,
#         extensiontype AS extensiontypename,
#         CAST(NULL AS STRING) AS createdby,
#         CAST(NULL AS TIMESTAMP) AS createddate,
#         CAST(NULL AS STRING) AS modifiedby,
#         CAST(NULL AS TIMESTAMP) AS modifieddate,
#         CASE
#             WHEN a.isactive = TRUE THEN 'Y'
#         ELSE
#             'N'
#         END AS cactive,
#         CAST(NULL AS STRING) AS cdelete,
#         CAST(NULL AS STRING) AS cacttype,
#         reason AS extensionreason,
#         CAST(NULL AS STRING) AS extensionactdesc,
#         CAST(NULL AS STRING) AS vcshowninsr,
#         CAST(NULL AS STRING) AS clsthirdparty,
#         CAST(NULL AS STRING) AS vcdaysallowed,
#         defaultextendedduedays AS iextensiondays,
#         "FOIMOD" AS sourceoftruth
#     FROM foi_mod.extensionreasons a
#     order by extensionreasonid;

In [0]:
%restart_python
%pip install boto3
import boto3
import os
from botocore.exceptions import NoCredentialsError
import datetime
import sys
sys.path.insert(0, '/Workspace/Shared')
import etl_helpers
from pyspark.sql.functions import col #lit
# from pyspark.sql.types import StringType, TimestampType #IntegerType, BooleanType

# Load job for the dimextensiontypes dimension: read foi_mod.extensionreasons,
# reshape it to the dimension's column layout and MERGE it into the Delta table.
# The whole run is bracketed by etl_helpers.start_run_cycle / end_run_cycle.
tablename = "dimextensiontypes"
runcycleid = etl_helpers.start_run_cycle(tablename)

# Create the dataload directory when it is missing; nothing else in this cell
# reads from or writes to it.
os.makedirs("/dbfs/foi/dataload", exist_ok=True)

try:
    # Source extract. Attributes with no counterpart in foi_mod.extensionreasons
    # are emitted as explicitly typed NULLs, and isactive is folded into a
    # 'Y'/'N' flag.
    extract_sql = """
        SELECT
            extensionreasonid AS extensiontypeid,
            extensiontype AS extensiontypename,
            CAST(NULL AS STRING) AS createdby,
            CAST(NULL AS TIMESTAMP) AS createddate,
            CAST(NULL AS STRING) AS modifiedby,
            CAST(NULL AS TIMESTAMP) AS modifieddate,
            CASE
                WHEN a.isactive = TRUE THEN 'Y'
            ELSE
                'N'
            END AS cactive,
            CAST(NULL AS STRING) AS cdelete,
            CAST(NULL AS STRING) AS cacttype,
            reason AS extensionreason,
            CAST(NULL AS STRING) AS extensionactdesc,
            CAST(NULL AS STRING) AS vcshowninsr,
            CAST(NULL AS STRING) AS clsthirdparty,
            CAST(NULL AS STRING) AS vcdaysallowed,
            defaultextendedduedays AS iextensiondays,
            'FOIMOD' AS sourceoftruth
        FROM foi_mod.extensionreasons a
        order by extensionreasonid;  
        """

    extension_reasons = spark.sql(extract_sql)
    extension_reasons.show()

    # Projection onto the dimension layout. The order of the expressions is
    # significant and must not be changed.
    projection = [
        "extensiontypeid AS extensiontypeid",
        "extensiontypename AS extensiontypename",
        "createdby AS createdby",
        "createddate AS createddate",
        "modifiedby AS modifiedby",
        "modifieddate AS modifieddate",
        "cactive AS cactive",
        "cdelete as cdelete",
        "cacttype AS cacttype",
        "extensionreason AS extensionreason",
        "extensionactdesc AS extensionactdesc",
        "vcshowninsr AS vcshowninsr",
        "clsthirdparty AS clsthirdparty",
        "vcdaysallowed as vcdaysallowed",
        "iextensiondays as iextensiondays",
        "sourceoftruth AS sourceoftruth",
    ]
    staged = extension_reasons.selectExpr(*projection)
    staged.show()

    from delta.tables import DeltaTable
    target_table = DeltaTable.forName(spark, f"hive_metastore.default.{tablename}")

    # Columns overwritten when a row already exists in the target. The two
    # match keys (extensiontypeid, sourceoftruth) are deliberately absent here
    # and are only written on insert.
    payload_columns = [
        "extensiontypename",
        "createdby",
        "createddate",
        "modifiedby",
        "modifieddate",
        "cactive",
        "cdelete",
        "cacttype",
        "extensionreason",
        "extensionactdesc",
        "vcshowninsr",
        "clsthirdparty",
        "vcdaysallowed",
        "iextensiondays",
    ]
    insert_columns = ["extensiontypeid", *payload_columns, "sourceoftruth"]

    update_assignments = {name: col(f"source.{name}") for name in payload_columns}
    insert_assignments = {name: col(f"source.{name}") for name in insert_columns}

    # Upsert: rows are matched on the business id plus the originating system.
    merge_builder = target_table.alias("target").merge(
        staged.alias("source"),
        "target.extensiontypeid = source.extensiontypeid AND target.sourceoftruth = source.sourceoftruth",
    )
    merge_builder = merge_builder.whenMatchedUpdate(set=update_assignments)
    merge_builder = merge_builder.whenNotMatchedInsert(values=insert_assignments)
    merge_builder.execute()

    # 't' / 'f' are the status flags handed to etl_helpers.end_run_cycle;
    # presumably success / failure - confirm against etl_helpers.
    etl_helpers.end_run_cycle(runcycleid, 't', tablename)
except NoCredentialsError:
    print("Credentials not available")
    etl_helpers.end_run_cycle(runcycleid, 'f', tablename, "Credentials not available")
except Exception as err:
    # Any error other than the "no changes for today" sentinel is reported and
    # recorded with status 'f'; the exception is not re-raised.
    if str(err) != "no changes for today":
        print(f"An error occurred: {err}")
        etl_helpers.end_run_cycle(runcycleid, 'f', tablename, f"An error occurred: {err}")
    else:
        # The sentinel message is closed out with the same 't' status as a
        # normal run.
        print("here")
        etl_helpers.end_run_cycle(runcycleid, 't', tablename)