In [0]:
# %run /Workspace/Shared/etl_helpers

In [0]:
%restart_python
%pip install boto3
import boto3
import os
from botocore.exceptions import NoCredentialsError
from pyspark.sql import functions as F
import datetime
import sys
sys.path.insert(0, '/Workspace/Shared')
import etl_helpers 
from pyspark.sql.functions import collect_list, concat_ws

# Start a run cycle for this package through etl_helpers; the returned id is
# used further down to tag the inserted rows and to close the cycle.
runcycleid = etl_helpers.start_run_cycle("factRequestDocumentsDetails")

# Create the DBFS data-load directory if it is missing; an existing directory
# is not an error.
dataload_dir = "/dbfs/foi/dataload"
os.makedirs(dataload_dir, exist_ok=True)

# Incremental load of default.factRequestDocumentsDetails.
#
# Steps:
#   1. Read the start time of the last successful run from dimruncycle.
#   2. Collect ministry requests created/updated after that time into the
#      temp view `temp_requests`.
#   3. Flag the matching FOIMOD rows in the fact table as inactive.
#   4. Recompute the page counts for those requests and append them as
#      active rows.
#   5. Close the run cycle with 't' (success, including "nothing changed")
#      or 'f' (failure, after which the notebook is failed by re-raising).
#
# Relies on names defined outside this block: `runcycleid`, `etl_helpers`,
# `NoCredentialsError` and the notebook-provided `spark` session.

PACKAGE_NAME = "factRequestDocumentsDetails"

# Lower bound used when dimruncycle holds no successful run for this package.
DEFAULT_WATERMARK = "2025-07-11 00:00:00"


class _NoChangesToday(Exception):
    """Signals that no request changed since the last successful run."""


try:
    # Start time of the most recent successful run of this package.
    df_lastrun = spark.sql(
        f"SELECT runcyclestartat as createddate FROM dimruncycle WHERE packagename = '{PACKAGE_NAME}' AND success = 't' ORDER BY runcycleid DESC LIMIT 1"
    )

    # first() returns None on an empty DataFrame, so one Spark job is enough
    # (previously count() and first() each triggered their own job).
    last_run = df_lastrun.first()
    if last_run is not None:
        maxcreatedate_str = last_run.createddate.strftime("%Y-%m-%d %H:%M:%S")
    else:
        maxcreatedate_str = DEFAULT_WATERMARK
    print(maxcreatedate_str)

    # Temp view with the ministry requests created or updated since the
    # watermark.
    # NOTE(review): updated_at is truncated to a DATE before being compared
    # with a timestamp string, so an update made later on the same calendar
    # day as the previous run may not be picked up - confirm whether a
    # TIMESTAMP cast is intended.
    df_changed = spark.sql(f"""
        SELECT DISTINCT
            foiministryrequestid,
            foirequest_id
        FROM
            foi_mod.foiministryrequests
        WHERE
            created_at > '{maxcreatedate_str}' OR try_cast(updated_at AS DATE) > '{maxcreatedate_str}'
    """)
    df_changed.createOrReplaceTempView("temp_requests")

    # Count once and reuse the value; each count() call is a separate job.
    changed_count = df_changed.count()
    print(changed_count)
    df_changed.show()

    if changed_count == 0:
        # Nothing to load: handled below as a successful run.
        raise _NoChangesToday("no changes for today")

    # Set the existing FOIMOD rows of the changed requests to inactive.
    # temp_requests is distinct on (foiministryrequestid, foirequest_id), so
    # one foirequest_id can appear several times; DISTINCT keeps the MERGE
    # from seeing multiple source rows for the same target row.
    df_merge_result = spark.sql("""MERGE INTO default.factRequestDocumentsDetails dd
        USING (
            SELECT DISTINCT foirequest_id AS foirequestid
            FROM temp_requests
        ) AS temp
        ON temp.foirequestid = dd.foirequestid
        WHEN MATCHED and dd.sourceoftruth = 'FOIMOD' THEN

        UPDATE 
        SET dd.activeflag = 'N'
    """)
    df_merge_result.show()

    # Page statistics per changed ministry request:
    #   sq1 - sum of the page-flag array sizes per ministry request, keeping
    #         rows whose documentdeleted match is absent or not deleted
    #   sq2 - highest-version row of each changed ministry request
    #   sq3 - highest-version fee row per cfrfeeid with the estimated page
    #         counts taken from the feedata JSON (left join, may be missing)
    #   sq4 - total pagecount per ministry request plus dedupepagecount, the
    #         pagecount summed over the lowest documentid of each rank1hash
    query = """
        select
            sq1.foiministryrequestid,
            count, 
            case when requeststatuslabel = 'closed' then count else 0 end as pagesreleased,
            sq2.created_at,
            sq2.updated_at,
            sq2.foirequest_id,
            estimatedelectronicpages,
            estimatedhardcopypages,
            pagecount,
            dedupepagecount
        from 
        (
          (
            SELECT
                dpf.foiministryrequestid,
                SUM(
                    SIZE(
                        FROM_JSON(
                            pageflag,
                            'ARRAY<STRUCT<flagid: INT, page: INT, programareaid: ARRAY<INT>, other: ARRAY<STRING>>>'
                        )
                    )
                ) AS count
            FROM docreviewer.documentpageflags dpf
            JOIN temp_requests ON temp_requests.foiministryrequestid = dpf.foiministryrequestid
            JOIN docreviewer.documents d ON dpf.documentid = d.documentid
            JOIN docreviewer.documentmaster dm ON dm.documentmasterid = d.documentmasterid
            LEFT JOIN docreviewer.documentdeleted dd ON dd.filepath LIKE CONCAT(dm.filepath, '%')
            WHERE dd.deleted IS NULL OR dd.deleted IS FALSE
            GROUP BY
                dpf.foiministryrequestid
          ) sq1

          join (
            WITH ranked AS (
              SELECT
                *,
                ROW_NUMBER() OVER (PARTITION BY foiministryrequestid ORDER BY version DESC) AS rn
              FROM foi_mod.FOIMinistryRequests fmr
            )
            SELECT ranked.foiministryrequestid, ranked.requeststatuslabel, ranked.created_at, ranked.updated_at, ranked.foirequest_id
            FROM ranked
            join temp_requests on temp_requests.foiministryrequestid = ranked.foiministryrequestid
            WHERE rn = 1
          ) sq2 on sq1.foiministryrequestid = sq2.foiministryrequestid
        )

        left join
        (
          WITH ranked AS (
            SELECT
              ministryrequestid,
              get_json_object(feedata, '$.estimatedelectronicpages') AS estimatedelectronicpages,
              get_json_object(feedata, '$.estimatedhardcopypages') AS estimatedhardcopypages,
              ROW_NUMBER() OVER (PARTITION BY cfrfeeid ORDER BY version DESC) AS rn
            FROM foi_mod.foirequestcfrfees
          )
          SELECT *
          FROM ranked
          WHERE rn = 1
        ) sq3 on sq3.ministryrequestid = sq1.foiministryrequestid

        join (
          select sq3.*, sq2.dedupepagecount
          from 
          (
            select foiministryrequestid, sum(pagecount) as pagecount from docreviewer.documents d
            group by foiministryrequestid
          ) sq3
          join
          (
            select sum(pagecount) as dedupepagecount, foiministryrequestid
            from docreviewer.documents d1
            join 
            (
              select rank1hash, min(d.documentid) as docid  from docreviewer.documenthashcodes dhc
              join docreviewer.documents d on d.documentid = dhc.documentid
              group by rank1hash
            ) sq on sq.docid = d1.documentid
            group by foiministryrequestid
          ) sq2 on sq2.foiministryrequestid = sq3.foiministryrequestid
        ) sq4 on sq4.foiministryrequestid = sq1.foiministryrequestid
        """

    print(query)

    df = spark.sql(query)
    df.show()

    # Order of columns here is important: insertInto() resolves columns by
    # position, not by name.
    # NOTE(review): the query returns `pagecount`, yet noofpagesintherequest
    # is written as NULL - confirm that this is intended.
    df_mapped = df.selectExpr(
        "foirequest_id AS foirequestid",
        f"{runcycleid} as runcycleid",
        "count AS noofpagesreviewed",
        "pagesreleased AS noofpagesreleased",
        "dedupepagecount AS noofpagesdeduplicated",
        "NULL AS noofpagesintherequest",
        "estimatedelectronicpages AS electronicpageestimate",
        "estimatedhardcopypages AS physicalpageestimate",
        "'' AS noofpagesinreviewlog",
        "'' AS noofpagesinredactionlayer",
        "'Y' AS activeflag",
        "'FOIMOD' AS sourceoftruth",
    )
    df_mapped.show()
    df_mapped.write.format("delta").mode("append").option("mergeSchema", "false").insertInto("factrequestdocumentsdetails")
    etl_helpers.end_run_cycle(runcycleid, 't', PACKAGE_NAME)
except NoCredentialsError as e:
    # `e` must be bound here; without it the `raise ... from e` below fails
    # with a NameError instead of raising the chained exception.
    print("Credentials not available")
    etl_helpers.end_run_cycle(runcycleid, 'f', PACKAGE_NAME, "Credentials not available")
    raise Exception("notebook failed") from e
except _NoChangesToday:
    # An empty change set is recorded as a successful run and not re-raised.
    etl_helpers.end_run_cycle(runcycleid, 't', PACKAGE_NAME)
except Exception as e:
    print(f"An error occurred: {e}")
    etl_helpers.end_run_cycle(runcycleid, 'f', PACKAGE_NAME, f"An error occurred: {e}")
    # Re-raise so the notebook run itself is reported as failed.
    raise Exception("notebook failed") from e