In [0]:
%restart_python
%pip install boto3
import boto3
import os
from botocore.exceptions import NoCredentialsError
from pyspark.sql import functions as F
import datetime
import sys
sys.path.insert(0, '/Workspace/Shared')
import etl_helpers 

# Load of docreviewer document details into the factmodrequestdocumentdetails
# table. Steps: open a run cycle, read the current watermarks from the target
# table, query the source documents, flag any existing rows for those documents
# as inactive, append the fresh rows, then close the run cycle with a
# success ('t') or failure ('f') status.
runcycleid = etl_helpers.start_run_cycle("factmodrequestdocumentdetails")

os.makedirs("/dbfs/foi/dataload", exist_ok=True)  # make sure directory exists

try:
    # Fallback used when the target table has no rows yet (max() yields NULL),
    # so that .strftime is never called on None.
    fallback_watermark_str = "1900-01-01 00:00:00"

    # Fetch both watermarks with a single Spark job; the second column now
    # really aggregates updated_at instead of repeating created_at.
    df_existing = spark.sql("""
        SELECT max(cast(created_at as timestamp)) as created_at,
               max(cast(updated_at as timestamp)) as updated_at
        FROM factmodrequestdocumentdetails
    """)
    watermark_row = df_existing.first()

    maxcreatedate = watermark_row.created_at
    print(maxcreatedate)
    maxcreatedate_str = (
        maxcreatedate.strftime("%Y-%m-%d %H:%M:%S")
        if maxcreatedate is not None
        else fallback_watermark_str
    )
    maxupdatedate = watermark_row.updated_at
    maxupdatedate_str = (
        maxupdatedate.strftime("%Y-%m-%d %H:%M:%S")
        if maxupdatedate is not None
        else fallback_watermark_str
    )

    # NOTE(review): the watermark predicates below are commented out inside the
    # SQL text and the active WHERE clause pins the load to 2025-05-16 and two
    # file names. This looks like a leftover test filter -- confirm before
    # relying on this notebook for incremental loads.
    query = f"""
        SELECT
        DISTINCT d.documentid,
        d.pagecount,
        d.originalpagecount,
        d.filename,
        dm.ministryrequestid,
        dm.documentmasterid,
        dm.filepath,
        dh.rank1hash,
        dh.rank2hash,
        dd.deleted,
        d.created_at,
        d.updated_at,
        0 AS runcycleid,
        'Y' AS isactive
        FROM docreviewer.Documents d
        JOIN docreviewer.DocumentMaster dm ON dm.documentmasterid = d.documentmasterid
        JOIN docreviewer.DocumentHashCodes dh ON d.documentid = dh.documentid
        LEFT JOIN docreviewer.DocumentDeleted dd ON dm.filepath LIKE CONCAT(dd.filepath, '%')
        -- WHERE d.created_at > CAST('{maxcreatedate_str}' AS TIMESTAMP)
        --or DATE(d.updated_at) > CAST('{maxupdatedate_str}' AS TIMESTAMP)
        --OR DATE(d.updated_at) = DATE('2025-05-16')
        
        WHERE DATE(d.created_at) = DATE('2025-05-16') and d.filename in ('test2.pdf', 'lorem.pdf');
        """

    df = spark.sql(query)
    df.show()

    # Comma-separated list of the document ids returned by the source query.
    documentids = df.agg(F.concat_ws(",", F.collect_list("documentid"))).first()[0]
    print(documentids)

    # An empty source result would render "in ()" below, which is invalid SQL.
    # Raise the sentinel message the generic handler already recognises so the
    # run cycle is closed as a successful no-op instead of a failure.
    if not documentids:
        raise Exception("no changes for today")

    # Deactivate the rows currently stored for these documents before the
    # replacement rows are appended.
    spark.sql(f"""
        update factmodrequestdocumentdetails
        set isactive = 'N'
        where documentid in ({documentids})
    """)

    # order of columns here is important!
    # insertInto matches columns by position, not by name.
    df_mapped = df.selectExpr(
        "documentid AS documentid",
        "pagecount AS pagecount",
        "originalpagecount AS originalpagecount",
        "filename AS filename",
        "ministryrequestid AS foiministryrequestid",
        "documentmasterid AS documentmasterid",
        "filepath AS filepath",
        "rank1hash AS rank1hash",
        "rank2hash AS rank2hash",
        "deleted AS deleted",
        "created_at AS created_at",
        "updated_at AS updated_at",
        f"{runcycleid} as runcycleid",
        "'Y' as isactive",
    )
    df_mapped.show()
    df_mapped.write.format("delta").mode("append").option("mergeSchema", "false").insertInto("factmodrequestdocumentdetails")
    etl_helpers.end_run_cycle(runcycleid, 't', "factmodrequestdocumentdetails")
except NoCredentialsError:
    print("Credentials not available")
    etl_helpers.end_run_cycle(runcycleid, 'f', "factmodrequestdocumentdetails", "Credentials not available")
    dbutils.notebook.exit("Error: Something went wrong.")
except Exception as e:
    if (str(e) == "no changes for today"):
        # Nothing to load: record the run cycle as successful.
        print("here")
        etl_helpers.end_run_cycle(runcycleid, 't', "factmodrequestdocumentdetails")
    else:
        print(f"An error occurred: {e}")
        etl_helpers.end_run_cycle(runcycleid, 'f', "factmodrequestdocumentdetails", f"An error occurred: {e}")
        dbutils.notebook.exit("Error: Something went wrong.")