In [0]:
from pyspark.sql import SparkSession, DataFrame, functions as F
from pyspark.sql.window import Window
import uuid
from pyspark.sql.functions import lit, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from delta.tables import DeltaTable



def append_with_incrementing_id(
    new_df: DataFrame,
    table_name: str,
    id_column: str = "id",
    order_by_column: str | None = None,
    database: str = "default"
):
    """
    Append new_df to an existing table, assigning sequential IDs that continue
    after the largest numeric ID suffix already present in the table.

    The generated ID values are numeric row numbers (max existing + 1, + 2, ...).
    The ID column is moved to the first position and the rows are written with
    ``insertInto``, which resolves columns by position rather than by name, so
    new_df's remaining columns must be in the same order as the target table's.

    Parameters:
    - new_df: The new data to insert.
    - table_name: Target Delta table name (must exist).
    - id_column: Name of the ID column (default: 'id'). If new_df already has a
      column with this name, its values are overwritten.
    - order_by_column: Optional column used to order rows before numbering
      (for deterministic IDs). Ignored if it is not a column of new_df.
    - database: Databricks database containing the table.

    Raises:
    - RuntimeError: if there is no active SparkSession.
    - Exception: if the target table does not exist.
    """
    spark = SparkSession.getActiveSession()
    if spark is None:
        # getActiveSession() returns None outside of a running Spark context;
        # fail with a clear message instead of an AttributeError further down.
        raise RuntimeError("No active SparkSession found; cannot append to table.")

    full_table_name = f"{database}.{table_name}"

    if not spark._jsparkSession.catalog().tableExists(full_table_name):
        raise Exception(f"Table {full_table_name} does not exist. Please create it first.")

    existing_df = spark.table(full_table_name)

    if id_column in existing_df.columns:
        # Take the trailing run of digits of each existing ID and find the maximum.
        numeric_part_expr = F.regexp_extract(F.col(id_column), r"(\d+)$", 1).cast("long")
        max_id_row = existing_df.select(F.max(numeric_part_expr)).collect()[0][0]
        max_id = max_id_row if max_id_row is not None else 0
    else:
        max_id = 0

    if order_by_column and order_by_column in new_df.columns:
        windowSpec = Window.orderBy(F.col(order_by_column))
    else:
        # No usable ordering column: order by a constant, so numbering is arbitrary.
        windowSpec = Window.orderBy(F.lit(1))

    # Number the new rows 1..N and shift them past the current maximum ID.
    new_df_with_number = new_df.withColumn(
        "__rownum", F.row_number().over(windowSpec) + max_id
    )

    new_df_with_id = new_df_with_number.withColumn(
        id_column, F.col("__rownum")
    ).drop("__rownum")

    # Put the ID column first; insertInto below is position-based.
    cols = new_df_with_id.columns
    ordered_cols = [id_column] + [c for c in cols if c != id_column]
    new_df_with_id = new_df_with_id.select(ordered_cols)

    new_df_with_id.printSchema()

    new_df_with_id.write.format("delta").mode("append").option("mergeSchema", "false").insertInto(full_table_name)

    # NOTE: count() here runs a separate Spark action after the write.
    print(f"✅ Appended {new_df_with_id.count()} rows to {full_table_name} with IDs like '1', '2', '3', ...")


def start_run_cycle(
    packagename: str,
):
    """
    Inserts a new row into default.dimruncycle to mark the start of a run cycle.

    The new run cycle ID is the current maximum runcycleid plus one (1 when the
    table has no rows yet). A fresh upper-case UUID is generated as the
    package ID and the start timestamp is set to the current timestamp.

    Parameters:
    - packagename: package name (string); also used to build the description
      "package: <packagename> started".

    Returns:
    - The integer runcycleid of the inserted row.
    """
    spark = SparkSession.getActiveSession()
    full_table = "default.dimruncycle"

    # NOTE(review): this read uses the unqualified name `dimruncycle` while the
    # write below targets `default.dimruncycle` — confirm both always resolve
    # to the same table.
    df_existing = spark.sql("SELECT max(cast(runcycleid as int)) as runcycleid FROM dimruncycle")
    current_max = df_existing.first().runcycleid
    # max() yields NULL on an empty table; start numbering at 1 in that case
    # instead of failing on `None + 1`.
    runcycleid = (current_max if current_max is not None else 0) + 1

    description = "package: " + packagename + " started"

    # Generate a UUID object
    uuid_obj = uuid.uuid4()

    # Convert the UUID object to a string and make it uppercase
    packageid = str(uuid_obj).upper()

    # Build single-row DataFrame
    data = spark.createDataFrame(
        [
            (
                runcycleid,
                None,  # runcyclestartat (will be filled below)
                description,
                packageid,
                packagename,
                None,  # runcycleendat
                "NULL"  # success: the literal string "NULL", not a SQL NULL
            )
        ],
        schema = StructType([
                StructField("runcycleid", IntegerType(), False),         # int
                StructField("runcyclestartat", TimestampType(), True),   # timestamp
                StructField("description", StringType(), True),          # string
                StructField("packageid", StringType(), True),            # string
                StructField("packagename", StringType(), True),          # string
                StructField("runcycleendat", StringType(), True),        # string (you may want to make this a TimestampType too)
                StructField("success", StringType(), True),              # string
        ])
    )

    # Set current timestamp as runcyclestartat
    df_with_timestamp = data.withColumn("runcyclestartat", current_timestamp())

    # Append to table
    df_with_timestamp.write.mode("append").saveAsTable(full_table)

    print(f"✅ Run cycle '{runcycleid}' inserted into {full_table}.")
    return runcycleid

def end_run_cycle(
    runcycleid: str,
    success: str,
    packagename: str,
    error: str = None,
):
    """
    Updates the run cycle row in default.dimruncycle to mark the end of the run.

    Sets the row's description, stores the current timestamp (as a string) in
    runcycleendat, and stores the lower-cased success flag.

    Parameters:
    - runcycleid: ID of the run cycle to update
    - success: 't' marks the run as complete; any other value is treated as a
      failure and the description is built from `error`
    - packagename: package name used in the description text
    - error: optional error text appended to the description on failure
    """
    spark = SparkSession.getActiveSession()
    full_table = "default.dimruncycle"

    delta_table = DeltaTable.forName(spark, full_table)

    if success == 't':
        description = "package: " + packagename + " complete"
    elif error is None:
        # `error` is optional; previously a failure without an error message
        # raised TypeError on string concatenation and the row was never updated.
        description = "package: " + packagename + " error"
    else:
        description = "package: " + packagename + " error " + str(error)

    # Perform update
    delta_table.update(
        condition=f"runcycleid = '{runcycleid}'",
        set={
            "description": lit(str(description)),
            "runcycleendat": current_timestamp().cast("string"),
            "success": lit(str(success).lower()),
        }
    )

    print(f"✅ Run cycle '{runcycleid}' marked as ended with success={success}.")

In [0]:
# src/utils/job_tracker.py

import uuid
from datetime import datetime, timedelta
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, max
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

DEFAULT_START_DATE = datetime(2025, 7, 11).date()

def _ensure_job_tracker_table_exists(spark: SparkSession, job_tracker_table_path: str):
    """
    Create the hive_metastore.default.job_tracker Delta table if it is missing.

    The table name is fixed; only its storage LOCATION comes from
    job_tracker_table_path. Any failure is printed and then re-raised so the
    calling job does not continue without a tracker table.
    """
    ddl = f"""
        CREATE TABLE IF NOT EXISTS hive_metastore.default.job_tracker (
            job_name STRING NOT NULL,
            run_id STRING NOT NULL,
            start_time TIMESTAMP NOT NULL,
            end_time TIMESTAMP,
            status STRING NOT NULL,
            message STRING
        ) USING DELTA
        LOCATION '{job_tracker_table_path}'
    """
    try:
        spark.sql(ddl)
    except Exception as exc:
        print(f"Error ensuring job tracker table exists: {exc}")
        # Propagate: the job must not proceed without the tracker table.
        raise
    else:
        print(f"Ensured job tracker table exists at: {job_tracker_table_path}")


def get_last_successful_run_time(spark: SparkSession, job_tracker_table_path: str, job_name: str) -> datetime | None:
    """
    Return the start_time of the most recent SUCCEEDED run of job_name.

    Reads the Delta table at job_tracker_table_path (creating the tracker table
    first if needed) and picks the row with the latest start_time whose status
    is "SUCCEEDED".

    Returns None when no such run exists. Any error while reading the tracker
    is printed and also results in None (best-effort lookup), so callers cannot
    distinguish "no previous run" from "tracker unreadable".
    """
    try:
        _ensure_job_tracker_table_exists(spark, job_tracker_table_path)
        tracker_df = spark.read.format("delta").load(job_tracker_table_path)

        # first() returns None on an empty result, so a single Spark action
        # replaces the previous count() + first() pair.
        latest_row = tracker_df.filter(
            (col("job_name") == job_name) & (col("status") == "SUCCEEDED")
        ).orderBy(col("start_time").desc()).first()

        if latest_row is None:
            print(f"No previous successful run found for '{job_name}'.")
            return None

        last_successful_time = latest_row["start_time"]
        print(f"Found last successful run for '{job_name}' at: {last_successful_time}")
        return last_successful_time
    except Exception as e:
        print(f"Error reading job tracker for last successful run for '{job_name}': {e}")
        return None


def record_job_status(spark: SparkSession, job_tracker_table_path: str, job_name: str, run_id: str, status: str,
                      start_time: datetime, end_time: datetime | None = None,
                      message: str | None = None):
    """
    Upsert one status row for (job_name, run_id) into the job tracker table.

    If a row with the same job_name and run_id already exists, its end_time,
    status and message are updated (start_time is left as stored); otherwise a
    new row is inserted with all six fields.

    Parameters:
    - spark: active SparkSession.
    - job_tracker_table_path: storage path of the Delta job tracker table.
    - job_name / run_id: key identifying the run.
    - status: status text to record.
    - start_time: run start; only written when the row is first inserted.
    - end_time: optional run end time.
    - message: optional free-text message.

    Raises:
    - Re-raises any exception from the MERGE after printing it, so the job
      failure is propagated.
    """
    _ensure_job_tracker_table_exists(spark, job_tracker_table_path) # Ensure table before writing

    schema = StructType([
        StructField("job_name", StringType(), False),
        StructField("run_id", StringType(), False),
        StructField("start_time", TimestampType(), False),
        StructField("end_time", TimestampType(), True),
        StructField("status", StringType(), False),
        StructField("message", StringType(), True)
    ])

    data = [(job_name, run_id, start_time, end_time, status, message)]

    new_status_df = spark.createDataFrame(data, schema=schema)
    new_status_df.createOrReplaceTempView("new_status_df_temp_view") # Create a temp view for MERGE

    try:
        merge_sql = f"""
            MERGE INTO delta.`{job_tracker_table_path}` AS target
            USING new_status_df_temp_view AS source
            ON target.job_name = source.job_name AND target.run_id = source.run_id
            WHEN MATCHED THEN
                UPDATE SET
                    end_time = source.end_time,
                    status = source.status,
                    message = source.message
            WHEN NOT MATCHED THEN
                INSERT (job_name, run_id, start_time, end_time, status, message)
                VALUES (source.job_name, source.run_id, source.start_time, source.end_time, source.status, source.message)
        """
        spark.sql(merge_sql)
        print(f"Job status for '{job_name}' (run_id: {run_id}) recorded as: {status}")
    except Exception as e:
        print(f"ERROR: Failed to record job status for '{job_name}' (run_id: {run_id}): {e}")
        raise # Re-raise to ensure the job failure is propagated
    finally:
        # The temp view is only needed for the MERGE; drop it so it does not
        # linger in the session after this call.
        spark.catalog.dropTempView("new_status_df_temp_view")

def get_current_run_id(spark: SparkSession) -> str:
    """
    Return the run ID stored in the Spark conf key
    "spark.databricks.driver.runId".

    If reading that key raises for any reason, a random UUID4 string is
    generated instead, a warning is printed, and that UUID is returned.
    """
    try:
        run_id = spark.conf.get("spark.databricks.driver.runId")
    except Exception:
        # Conf key unavailable: fall back to a locally generated identifier.
        run_id = str(uuid.uuid4())
        print(f"Warning: spark.databricks.driver.runId not found. Using generated UUID: {run_id}")
    return run_id

def generate_date_range_json(last_successful_run_date: datetime | None, current_job_date: datetime) -> list[str]:
    """
    Generates a JSON array of dates (YYYY-MM-DD) between the last successful run date and the current job date.
    """
    date_list = []
    
    # Ensure it's date only, not time
    end_date = current_job_date.date()

    if last_successful_run_date:
        # start from a day before the last successful run date
        start_date = last_successful_run_date.date()
        # Ensure start_date is not after end_date
        if start_date > end_date:
            start_date = end_date
    else:
        # If no last successful run, start from DEFAULT_START_DATE.
        start_date = DEFAULT_START_DATE

    current_date = start_date
    while current_date <= end_date:
        date_list.append(current_date.strftime("%Y-%m-%d"))
        current_date += timedelta(days=1)

    return date_list

In [0]:
%restart_python
%pip install boto3
import boto3
import os
from botocore.exceptions import NoCredentialsError
from pyspark.sql import functions as F
import datetime
import sys
sys.path.insert(0, '/Workspace/Shared')
# import etl_helpers
from pyspark.sql.functions import lit, col

# Load docreviewer document details into factmodrequestdocumentdetails:
# 1. open a run cycle, 2. select documents created/updated inside the time
# window, 3. deactivate existing active rows for those documents, 4. append
# the new rows, 5. close the run cycle with success or failure.
tablename = "factmodrequestdocumentdetails"
runcycleid = start_run_cycle(tablename)

os.makedirs("/dbfs/foi/dataload", exist_ok=True)  # make sure directory exists

try:
    # NOTE: df_lastrun is currently not used; the incremental window logic
    # below is commented out and the window is fixed by lastruntime/endtime.
    df_lastrun = spark.sql(f"SELECT runcyclestartat as createddate FROM dimruncycle WHERE packagename = \"{tablename}\" AND success = 't' ORDER BY runcycleid DESC LIMIT 1")
    
    # if df_lastrun.count() > 0:
    #     lastruntime = df_lastrun.first().createddate.strftime("%Y-%m-%d %H:%M:%S")
    # else:
    #     lastruntime = "2019-01-01 00:00:00"

    lastruntime = "2019-01-01 00:00:00"
    endtime = "2026-03-01 00:00:00"
    print(lastruntime)

    query = f"""
        SELECT
            DISTINCT d.documentid,
            d.pagecount,
            d.originalpagecount,
            d.filename,
            dm.ministryrequestid,
            dm.documentmasterid,
            dm.filepath,
            dh.rank1hash,
            dh.rank2hash,
            CASE
                WHEN dd.deleted is not NULL THEN 'Y'
            ELSE
                'N'
            END AS deleted,
            TRY_CAST(d.created_at AS DATE) AS created_at,
            TRY_CAST(d.updated_at AS STRING) AS updated_at,
            0 AS runcycleid,
            'Y' AS isactive,
            CASE
                WHEN d.incompatible = 'f' OR d.incompatible = 'false' THEN 'Y'
            ELSE
                'N'
            END AS compatible,
            'FOIMOD' AS sourceoftruth,
            mr.foirequestid
        FROM docreviewer.Documents d
        JOIN docreviewer.DocumentMaster dm ON dm.documentmasterid = d.documentmasterid
        JOIN docreviewer.DocumentHashCodes dh ON d.documentid = dh.documentid
        LEFT JOIN docreviewer.DocumentDeleted dd ON dm.ministryrequestid = dd.ministryrequestid AND dm.filepath LIKE CONCAT(dd.filepath, '%')
        LEFT JOIN (
            SELECT
                foiministryrequestid,
                foirequest_id AS foirequestid
            FROM foi_mod.foiministryrequests
            QUALIFY ROW_NUMBER() OVER (PARTITION BY foiministryrequestid ORDER BY version DESC) = 1
        ) mr on dm.ministryrequestid = mr.foiministryrequestid
        WHERE ( d.created_at > TRY_CAST('{lastruntime}' AS TIMESTAMP) and d.created_at <= TRY_CAST('{endtime}' AS TIMESTAMP) )
        or ( TRY_CAST(d.updated_at AS TIMESTAMP) > TRY_CAST('{lastruntime}' AS TIMESTAMP) and TRY_CAST(d.updated_at AS TIMESTAMP) <= TRY_CAST('{endtime}' AS TIMESTAMP) )
        """

    print(query)

    df = spark.sql(query)
    df.show()

    # if (df.count() == 0):
    #     raise Exception("no changes for today")

    # documentids = df.agg(F.concat_ws(",", F.collect_list("documentid"))).first()[0]
    # print(documentids)

    # result_df = spark.sql("""
    #     update factmodrequestdocumentdetails
    #     set isactive = 'N' 
    #     where documentid in (""" + documentids + """)
    # """)

    # order of columns here is important!
    # runcycleid and isactive from the query are replaced here with the real
    # run cycle ID and a constant 'Y'.
    df_mapped = df.selectExpr(
        "documentid AS documentid",
        "pagecount AS pagecount",
        "originalpagecount AS originalpagecount",
        "filename AS filename",
        "ministryrequestid AS foiministryrequestid",
        "documentmasterid AS documentmasterid",
        "filepath AS filepath",
        "rank1hash AS rank1hash",
        "rank2hash AS rank2hash",
        "deleted AS deleted",
        "created_at AS created_at",
        "updated_at AS updated_at",
        f"{runcycleid} as runcycleid",
        "'Y' as isactive",
        "compatible AS compatible",
        "sourceoftruth AS sourceoftruth",
        "foirequestid AS foirequestid"
    )
    df_mapped.show()
    # df_mapped.write.format("delta").mode("append").option("mergeSchema", "false").insertInto(tablename)  

    from delta.tables import DeltaTable
    delta_table = DeltaTable.forName(spark, f"hive_metastore.default.{tablename}")

    # Only the key is needed to deactivate old rows. Reducing the source to
    # distinct documentids prevents the merge from failing when the query
    # yields several rows for one document (a Delta merge errors out if
    # multiple source rows try to update the same target row).
    deactivation_keys = df_mapped.select("documentid").distinct()
    delta_table.alias("target").merge(
        deactivation_keys.alias("source"),
        "target.documentid = source.documentid"
    ).whenMatchedUpdate(
        condition = "target.isactive = 'Y'",
        set = {
            "isactive": lit("N"),
        }
    ).execute()

    print("Matched records deactivated.")

    df_mapped.write.format("delta").mode("append").saveAsTable(f"hive_metastore.default.{tablename}")

    end_run_cycle(runcycleid, 't', tablename)
except NoCredentialsError as e:
    # `e` must be bound here: it is used as the cause of the re-raised
    # exception below (previously this raised NameError instead).
    print("Credentials not available")
    end_run_cycle(runcycleid, 'f', tablename, "Credentials not available")
    raise Exception("notebook failed") from e
except Exception as e:
    if (str(e) == "no changes for today"):
        # Sentinel raised by the (currently commented-out) empty-result check:
        # nothing to load counts as a successful run.
        print("here")
        end_run_cycle(runcycleid, 't', tablename)
    else:
        print(f"An error occurred: {e}")
        end_run_cycle(runcycleid, 'f', tablename, f"An error occurred: {e}")
        raise Exception("notebook failed") from e