In [0]:
%restart_python
%pip install boto3
import boto3
import os
from botocore.exceptions import NoCredentialsError
import sys
sys.path.insert(0, '/Workspace/Shared')
import etl_helpers 
from datetime import datetime, timedelta

# Incremental load of per-document page-flag counts into
# factmodrequestdocumentpageflags.
#
# Flow:
#   1. Open a run cycle through etl_helpers.
#   2. Read the end time of the last successful run (the watermark).
#   3. Explode the JSON `pageflag` column of docreviewer.DocumentPageflags for
#      rows created or updated after the watermark and count flags per
#      (documentid, flagid, foiministryrequestid).
#   4. Append the result to the fact table and close the run cycle.
#
# `spark` is expected to be supplied by the notebook runtime.

# Name under which this notebook opens and closes its run cycle.
PACKAGE_NAME = "factMODRequestDocumentPageFlags"

# Package whose last successful run end time is used as the watermark.
# NOTE(review): this differs from PACKAGE_NAME -- confirm that reading another
# package's watermark is intentional and not a copy-paste leftover.
WATERMARK_PACKAGE = "factRequestDocumentsDetails"

runcycleid = etl_helpers.start_run_cycle(PACKAGE_NAME)

os.makedirs("/dbfs/foi/dataload", exist_ok=True)  # make sure directory exists

try:
    # max() always yields exactly one row; its value is NULL (None) when no
    # successful run has been recorded for WATERMARK_PACKAGE.
    df_existing = spark.sql(
        "SELECT max(TIMESTAMP(runcycleendat)) as runcycleendat "
        "from dimruncycle "
        f"where packagename = '{WATERMARK_PACKAGE}' and success = 't'"
    )
    df_existing.show()
    maxcreatedate = df_existing.first().runcycleendat
    print(maxcreatedate)
    if maxcreatedate is None:
        # Fail with a clear message instead of an AttributeError on .strftime.
        raise RuntimeError(
            f"no successful run cycle found for package '{WATERMARK_PACKAGE}'; "
            "cannot determine the incremental watermark"
        )
    maxcreatedate_str = maxcreatedate.strftime("%Y-%m-%d %H:%M:%S")

    # Both predicates compare full timestamps. Wrapping updated_at in DATE()
    # would truncate it to midnight and silently skip rows updated later on
    # the same day as the watermark.
    #
    # NOTE(review): Step 3 joins back to DocumentPageflags on documentid only.
    # If that table can hold several rows per documentid, pageflagcount is
    # multiplied by the number of matching rows -- verify against the schema.
    query = f"""
        -- Step 1: Parse pageflag as JSON array of structs and explode it
        WITH exploded_flags AS (
        SELECT
            documentid,
            EXPLODE(from_json(pageflag, 'ARRAY<STRUCT<flagid: INT, page: INT, programareaid: ARRAY<INT>, other: ARRAY<STRING>>>')) AS flag
        FROM docreviewer.DocumentPageflags
        WHERE created_at > CAST('{maxcreatedate_str}' AS TIMESTAMP) OR updated_at > CAST('{maxcreatedate_str}' AS TIMESTAMP)
        ),

        -- Step 2: Select the needed fields
        parsed_flags AS (
        SELECT
            documentid,
            flag.flagid
        FROM exploded_flags
        WHERE flag.flagid IS NOT NULL  -- only count flags that have a flagid
        )

        -- Step 3: Aggregate
        SELECT
        pf.documentid,
        pf.flagid,
        COUNT(*) AS pageflagcount,
        dpf.foiministryrequestid,
        'Y' AS isactive
        FROM parsed_flags pf
        JOIN docreviewer.DocumentPageflags dpf
        ON pf.documentid = dpf.documentid
        GROUP BY pf.documentid, pf.flagid, dpf.foiministryrequestid
        ORDER BY pf.documentid DESC;
        """

    df = spark.sql(query)
    df.show()

    # order of columns here is important! insertInto() matches columns by
    # position, not by name.
    df_mapped = df.selectExpr(
        "documentid AS documentid",
        "flagid AS foidocreviewerpageflagid",
        "pageflagcount AS pageflagcount",
        "foiministryrequestid AS foiministryrequestid",
        f"{runcycleid} as runcycleid",
        "isactive as isactive",
    )
    df_mapped.show()
    df_mapped.write.format("delta").mode("append").option("mergeSchema", "false").insertInto("factmodrequestdocumentpageflags")
    etl_helpers.end_run_cycle(runcycleid, 't', PACKAGE_NAME)
except NoCredentialsError as e:
    # `as e` is required: the chained raise below references the exception.
    print("Credentials not available")
    etl_helpers.end_run_cycle(runcycleid, 'f', PACKAGE_NAME, "Credentials not available")
    raise Exception("notebook failed") from e
except Exception as e:
    if str(e) == "no changes for today":
        # Sentinel message treated as a successful no-op run.
        print("here")
        etl_helpers.end_run_cycle(runcycleid, 't', PACKAGE_NAME)
    else:
        print(f"An error occurred: {e}")
        etl_helpers.end_run_cycle(runcycleid, 'f', PACKAGE_NAME, f"An error occurred: {e}")
        raise Exception("notebook failed") from e