In [0]:
%restart_python
%pip install boto3
import boto3
import os
from botocore.exceptions import NoCredentialsError
from pyspark.sql import functions as F
import datetime
import sys
sys.path.insert(0, '/Workspace/Shared')
import etl_helpers 

# Incremental load of docreviewer document details into the
# factmodrequestdocumentdetails Delta table.
#
# Flow:
#   1. Open a run cycle via etl_helpers.start_run_cycle.
#   2. Find the start time of the last successful run recorded in dimruncycle
#      (falls back to 2019-01-01 when there is none).
#   3. Select documents created or updated after that time.
#   4. Flag the existing active rows for those documents as isactive = 'N'.
#   5. Append the freshly selected rows (isactive = 'Y').
#   6. Close the run cycle with 't' on success or 'f' plus a message on failure.
#
# Any failure is re-raised as Exception("notebook failed") chained to the
# original error so the notebook run itself is reported as failed.
tablename = "factmodrequestdocumentdetails"
runcycleid = etl_helpers.start_run_cycle(tablename)

os.makedirs("/dbfs/foi/dataload", exist_ok=True)  # make sure directory exists

try:
    df_lastrun = spark.sql(f"SELECT runcyclestartat as createddate FROM dimruncycle WHERE packagename = \"{tablename}\" AND success = 't' ORDER BY runcycleid DESC LIMIT 1")

    # first() returns None for an empty result, so a single Spark action both
    # checks for a previous successful run and fetches its start time.
    lastrun_row = df_lastrun.first()
    if lastrun_row is not None:
        lastruntime = lastrun_row.createddate.strftime("%Y-%m-%d %H:%M:%S")
    else:
        # No successful run recorded yet: load everything since this date.
        lastruntime = "2019-01-01 00:00:00"
    print(lastruntime)

    query = f"""
        SELECT
            DISTINCT d.documentid,
            d.pagecount,
            d.originalpagecount,
            d.filename,
            dm.ministryrequestid,
            dm.documentmasterid,
            dm.filepath,
            dh.rank1hash,
            dh.rank2hash,
            dd.deleted,
            d.created_at,
            d.updated_at,
            0 AS runcycleid,
            'Y' AS isactive,
            CASE
                WHEN d.incompatible = 'f' OR d.incompatible = 'false' THEN 'Y'
            ELSE
                'N'
            END AS compatible,
            'FOIMOD' AS sourceoftruth
        FROM docreviewer.Documents d
        JOIN docreviewer.DocumentMaster dm ON dm.documentmasterid = d.documentmasterid
        JOIN docreviewer.DocumentHashCodes dh ON d.documentid = dh.documentid
        LEFT JOIN docreviewer.DocumentDeleted dd ON dm.filepath LIKE CONCAT(dd.filepath, '%')
        WHERE d.created_at > CAST('{lastruntime}' AS TIMESTAMP)
        or CAST(d.updated_at AS TIMESTAMP) > CAST('{lastruntime}' AS TIMESTAMP)
        """

    print(query)

    df = spark.sql(query)
    df.show()

    # order of columns here is important!
    # The placeholder runcycleid (0) from the query is replaced with the id of
    # the current run cycle.
    df_mapped = df.selectExpr(
        "documentid AS documentid",
        "pagecount AS pagecount",
        "originalpagecount AS originalpagecount",
        "filename AS filename",
        "ministryrequestid AS foiministryrequestid",
        "documentmasterid AS documentmasterid",
        "filepath AS filepath",
        "rank1hash AS rank1hash",
        "rank2hash AS rank2hash",
        "deleted AS deleted",
        "created_at AS created_at",
        "updated_at AS updated_at",
        f"{runcycleid} as runcycleid",
        "'Y' as isactive",
        "compatible AS compatible",
        "sourceoftruth AS sourceoftruth"
    )
    df_mapped.show()

    from delta.tables import DeltaTable
    delta_table = DeltaTable.forName(spark, f"hive_metastore.default.{tablename}")

    # Only the key is needed to deactivate old rows. De-duplicating it keeps
    # the merge from failing when the joins above yield several source rows
    # for the same documentid (Delta rejects a merge in which multiple source
    # rows update the same target row).
    changed_ids = df_mapped.select("documentid").distinct()

    delta_table.alias("target").merge(
        changed_ids.alias("source"),
        "target.documentid = source.documentid"
    ).whenMatchedUpdate(
        condition="target.isactive = 'Y'",
        set={
            # F.lit: `lit` is not imported on its own in this notebook.
            "isactive": F.lit("N"),
        }
    ).execute()

    print("Matched records deactivated.")

    # Append the new versions; they carry isactive = 'Y'.
    df_mapped.write.format("delta").mode("append").saveAsTable(f"hive_metastore.default.{tablename}")

    etl_helpers.end_run_cycle(runcycleid, 't', tablename)
except NoCredentialsError as e:
    print("Credentials not available")
    etl_helpers.end_run_cycle(runcycleid, 'f', tablename, "Credentials not available")
    raise Exception("notebook failed") from e
except Exception as e:
    # NOTE: nothing in this block currently raises "no changes for today";
    # the branch is kept so an early-exit check can be reinstated without
    # the run cycle being marked as failed.
    if str(e) == "no changes for today":
        print("here")
        etl_helpers.end_run_cycle(runcycleid, 't', tablename)
    else:
        print(f"An error occurred: {e}")
        etl_helpers.end_run_cycle(runcycleid, 'f', tablename, f"An error occurred: {e}")
        raise Exception("notebook failed") from e