In [0]:
%restart_python
%pip install boto3
import boto3
import os
from botocore.exceptions import NoCredentialsError
from pyspark.sql import functions as F
import datetime
import sys
sys.path.insert(0, '/Workspace/Shared')
import etl_helpers 
from pyspark.sql.functions import collect_list, concat_ws

# Name under which this load is recorded in (and looked up from) dimruncycle.
PACKAGE_NAME = "factRequestDocumentsDetails"


def _build_document_details_query(ministry_request_ids):
    """Return the Spark SQL that computes page statistics per ministry request.

    The query joins four sub-results on ``foiministryrequestid``:

    * ``sq1`` - number of exploded ``pageflag`` entries per ministry request,
      for documents whose ``documentdeleted`` join row is absent or not
      flagged as deleted.
    * ``sq2`` - the highest-``version`` row of ``foi_mod.FOIMinistryRequests``
      for each id in ``ministry_request_ids``.
    * ``sq3`` (left join) - estimated electronic / hard-copy pages taken from
      the highest-``version`` ``feedata`` JSON per ``cfrfeeid``.
    * ``sq4`` - total ``pagecount`` per request, plus ``dedupepagecount``
      summed over only the lowest ``documentid`` of each ``rank1hash``.

    ``pagesreleased`` equals the ``sq1`` count when the request status label
    is ``'closed'`` and 0 otherwise.

    NOTE(review): in the ``documentdeleted`` join the '%' wildcard is appended
    to ``dd.filepath`` on the left of ``ilike`` and ``dm.filepath`` is used as
    the pattern. This looks reversed (a prefix match would be
    ``dm.filepath ilike dd.filepath || '%'``) - confirm before changing,
    because it alters the reviewed-page counts.

    Args:
        ministry_request_ids: Non-empty, comma-separated list of
            ``foiministryrequestid`` values, interpolated into an ``IN (...)``
            clause. The values come from this notebook's own query result,
            not from user input.

    Returns:
        The SQL text, ready for ``spark.sql``.
    """
    return f"""
        select sq1.foiministryrequestid, count, 
        case when requeststatuslabel = 'closed' then count else 0 end as pagesreleased,
        sq2.created_at, sq2.updated_at, sq2.foirequest_id,

        estimatedelectronicpages, estimatedhardcopypages,
        pagecount, dedupepagecount
        from 
        ((select foiministryrequestid, count(*) as count
        from (
        select 
        EXPLODE(
            FROM_JSON(
                pageflag,
                'ARRAY<STRUCT<flagid: INT, page: INT, programareaid: ARRAY<INT>, other: ARRAY<STRING>>>'
            )
            ) as pageflags,
        -- explode(split(pageflag, ',')) as pageflags,
        -- pageflag, 
        dpf.foiministryrequestid from docreviewer.documentpageflags dpf
        join docreviewer.documents d on dpf.documentid = d.documentid
        join docreviewer.documentmaster dm on dm.documentmasterid = d.documentmasterid
        left join docreviewer.documentdeleted dd on dd.filepath || '%' ilike dm.filepath
        where dd.deleted is null or dd.deleted is false
        )
        group by foiministryrequestid ) sq1

        join (WITH ranked AS (
        SELECT *,
                ROW_NUMBER() OVER (PARTITION BY foiministryrequestid ORDER BY version DESC) AS rn
        FROM foi_mod.FOIMinistryRequests fmr
        )
        SELECT ranked.foiministryrequestid, ranked.requeststatuslabel, ranked.created_at, ranked.updated_at, ranked.foirequest_id
        FROM ranked
        WHERE rn = 1 and (foiministryrequestid in ({ministry_request_ids}))) sq2 on sq1.foiministryrequestid = sq2.foiministryrequestid)
        left join

        (WITH ranked AS (
        SELECT ministryrequestid, get_json_object(feedata, '$.estimatedelectronicpages') AS estimatedelectronicpages,
        get_json_object(feedata, '$.estimatedhardcopypages') AS estimatedhardcopypages,
                ROW_NUMBER() OVER (PARTITION BY cfrfeeid ORDER BY version DESC) AS rn
        FROM foi_mod.foirequestcfrfees
        )
        SELECT *
        FROM ranked
        WHERE rn = 1) sq3 on sq3.ministryrequestid = sq1.foiministryrequestid

        join (

        select sq3.*, sq2.dedupepagecount from 
        (select foiministryrequestid, sum(pagecount) as pagecount from docreviewer.documents d
        group by foiministryrequestid) sq3
        join

        (select sum(pagecount) as dedupepagecount, foiministryrequestid from docreviewer.documents d1
        join 
        (select rank1hash, min(d.documentid) as docid  from docreviewer.documenthashcodes dhc
        join docreviewer.documents d on d.documentid = dhc.documentid
        group by rank1hash) sq on sq.docid = d1.documentid
        group by foiministryrequestid) sq2 on sq2.foiministryrequestid = sq3.foiministryrequestid) sq4 on sq4.foiministryrequestid = sq1.foiministryrequestid
        """


# Incremental load of factrequestdocumentsdetails:
#   1. open a run cycle,
#   2. find ministry requests created/updated since the last successful run,
#   3. compute their page statistics and append them to the fact table,
#   4. close the run cycle as success ('t') or failure ('f').
runcycleid = etl_helpers.start_run_cycle(PACKAGE_NAME)

os.makedirs("/dbfs/foi/dataload", exist_ok=True)  # make sure directory exists

try:
    # Watermark: end time of the most recent successful run of this package.
    last_run_df = spark.sql(
        "SELECT max(TIMESTAMP(runcycleendat)) as runcycleendat from dimruncycle "
        f"where packagename = '{PACKAGE_NAME}' and success = 't'"
    )
    last_run_df.show()
    maxcreatedate = last_run_df.first().runcycleendat
    print(maxcreatedate)
    if maxcreatedate is None:
        # max() over zero rows is NULL; fail with a readable message instead
        # of an AttributeError on None.strftime. Still recorded as a failure.
        raise ValueError(
            f"no successful run cycle found for {PACKAGE_NAME}; "
            "cannot determine the incremental load window"
        )
    maxcreatedate_str = maxcreatedate.strftime("%Y-%m-%d %H:%M:%S")

    # Compare updated_at as a full timestamp. The earlier DATE(updated_at)
    # comparison discarded the time of day, so a row updated later on the
    # same calendar day as the previous run was never picked up.
    changed_df = spark.sql(
        "SELECT distinct foiministryrequestid, min(foirequest_id) as foirequest_id "
        "from foi_mod.foiministryrequests "
        f"where (created_at > '{maxcreatedate_str}' or updated_at > '{maxcreatedate_str}') "
        "group by foiministryrequestid"
    )
    changed_df.show()

    # Collapse both id columns into comma-separated strings in one Spark
    # action; an aggregate without GROUP BY always yields exactly one row.
    id_lists = changed_df.select(
        concat_ws(",", collect_list("foirequest_id")).alias("request_ids"),
        concat_ws(",", collect_list("foiministryrequestid")).alias("ministry_request_ids"),
    ).first()
    request_ids = id_lists["request_ids"]
    ministry_request_ids = id_lists["ministry_request_ids"]

    print(request_ids)

    if not request_ids or not ministry_request_ids:
        # Nothing changed since the last run: an empty list would also make
        # the IN (...) clause invalid. The run still counts as successful.
        print("here")
    else:
        # NOTE(review): this MERGE (flag previously loaded rows of the changed
        # requests as inactive) has only ever been built, never executed, so
        # earlier rows keep activeflag = 'Y' when new ones are appended below.
        # It is left unexecuted here to preserve current behaviour; confirm
        # the intent (and that dimRequests has a single row per foirequestid,
        # which MERGE needs) before enabling spark.sql(deactivate_query).
        deactivate_query = f"""MERGE INTO default.factRequestDocumentsDetails dd
        USING dimRequests r
        ON r.foirequestid = dd.foirequestid
        WHEN MATCHED and dd.foirequestid in ({request_ids}) and sourceoftruth = 'FOIMOD' THEN

        UPDATE 
        SET dd.activeflag = 'N'"""

        query = _build_document_details_query(ministry_request_ids)
        print(query)

        df = spark.sql(query)
        df.show()

        # order of columns here is important! insertInto matches columns by
        # position, not by name, so this list must follow the table's order.
        # NOTE(review): the query also returns pagecount, but
        # noofpagesintherequest is loaded as NULL - confirm that is intended.
        df_mapped = df.selectExpr(
            "foirequest_id AS foirequestid",
            f"{runcycleid} as runcycleid",
            "count AS noofpagesreviewed",
            "pagesreleased AS noofpagesreleased",
            "dedupepagecount AS noofpagesdeduplicated",
            "NULL AS noofpagesintherequest",
            "estimatedelectronicpages AS electronicpageestimate",
            "estimatedhardcopypages AS physicalpageestimate",
            "'' AS noofpagesinreviewlog",
            "'' AS noofpagesinredactionlayer",
            "'Y' AS activeflag",
        )
        df_mapped.show()
        df_mapped.write.format("delta").mode("append").option("mergeSchema", "false").insertInto("factrequestdocumentsdetails")

    etl_helpers.end_run_cycle(runcycleid, 't', PACKAGE_NAME)
except NoCredentialsError as e:
    # Bind the exception so it can be chained; without "as e" the raise
    # below failed with NameError instead of reporting the notebook failure.
    print("Credentials not available")
    etl_helpers.end_run_cycle(runcycleid, 'f', PACKAGE_NAME, "Credentials not available")
    raise Exception("notebook failed") from e
except Exception as e:
    # Record the failure on the run cycle, then fail the notebook run.
    print(f"An error occurred: {e}")    
    etl_helpers.end_run_cycle(runcycleid, 'f', PACKAGE_NAME, f"An error occurred: {e}")
    raise Exception("notebook failed") from e