In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, LongType, DoubleType,
    BooleanType, TimestampType, DateType, ArrayType, MapType
)
from delta.tables import DeltaTable # For merge operations
from pyspark.sql.functions import current_timestamp, col
from datetime import datetime

import sys
repo_root_path = "/Workspace/Users/richard.qi@gov.bc.ca/foi-reporting/databricks"
# Add the repository root to sys.path if it's not already there
if repo_root_path not in sys.path:
    sys.path.insert(0, repo_root_path) # Insert at the beginning to prioritize

from Shared.job_tracker import (
    get_last_successful_run_time,
    record_job_status,
    get_current_run_id,
    generate_date_range_json
)

# -------------------- Configuration ----------------------
# Name passed to the Shared.job_tracker helpers to look up and record this job's runs.
JOB_NAME = "Daily_DOCREVIEWER_DB_Loading"
# Path handed to the Shared.job_tracker helpers as the job tracker table location.
JOB_TRACKER_TABLE_PATH = "/mnt/delta/tables/job_tracker"

# Initialize Spark Session (getOrCreate reuses an already running session if there is one)
spark = SparkSession.builder.appName(f"{JOB_NAME}_APP").getOrCreate()

# Get the current Databricks run ID (or a generated UUID)
current_run_id = get_current_run_id(spark)
print(f"Starting job: {JOB_NAME} with run_id: {current_run_id}")
# Captured once up front: passed to generate_date_range_json together with the last
# successful run time, and recorded as the start time of this run.
job_start_timestamp = datetime.now()
# Set by the main try/except further down; stays None if neither branch gets to assign it.
status_message = None

# --- S3 Configuration ---
# The bucket name is read from the Spark conf key "spark.s3_bucket" (no default is supplied).
s3_bucket = spark.conf.get("spark.s3_bucket")
# Root folder of the daily exports; the main loop appends "<date>/<file name>" to it.
# NOTE(review): this points at ".../dev/foidb/" while the job name, the Delta target
# (docreviewer.db) and the commented-out alternatives below all refer to docreviewer --
# confirm the DocReviewer exports really live under "foidb".
S3_BASE_PATH = f"s3a://{s3_bucket}/daily_exports/dev/foidb/"
# now = datetime.now()
# formatted_date = now.strftime("%Y-%m-%d")
# S3_BASE_PATH = f"s3a://{s3_bucket}/daily_exports/dev/docreviewer/{formatted_date}/"
# S3_BASE_PATH = f"s3a://{s3_bucket}/daily_exports/dev/docreviewer/2025-06-03/"

# Base path for your Delta tables (each table lives at "<DELTA_BASE_PATH>/<table name>")
DELTA_BASE_PATH = "dbfs:/user/hive/warehouse/docreviewer.db"

# File names of the JSON exports to load, processed in this order for every date.
# Each name is appended to "<S3_BASE_PATH><date>/" to build the full S3 path and is
# matched (lower-cased) against the keys of table_mappings to find its target table.
json_files_to_process = [
    "Annotations.json",
    "AnnotationSections.json",
    "DeduplicationJob.json",
    "DocumentAttributes.json",
    "DocumentDeleted.json",
    "DocumentDeletedPages.json",
    "DocumentExtractionJob.json",
    "DocumentHashCodes.json",
    "DocumentMaster.json",
    "DocumentPageFlagHistory.json",
    "DocumentPageFlags.json",
    "DocumentPathMapper.json",
    "Documents.json",
    "DocumentStatus.json",
    "FileConversionJob.json",
    "Keywords.json",
    "OperatingTeamS3ServiceAccounts.json",
    "PageCalculatorJob.json",
    "PageFlags.json",
    "PdfStitchJob.json",
    "PdfStitchJobAttributes.json",
    "PdfStitchPackage.json",
    "ProgramAreaDivisions.json",
    "ProgramAreas.json",
    "RedactionLayers.json",
    "RedlineContents.json",
    "Sections.json",
    # New tables
    "CompressionJob.json",
    "DocumentOCRJob.json",
    "DocumentProcesses.json",
    "OCRActiveMQJob.json"
]

# --- table_mapping ---
# Maps the lower-cased JSON file stem (file name without ".json") to its load configuration:
#   target_table_name : name of the Delta table folder under DELTA_BASE_PATH
#   primary_keys      : columns turned into "target.<pk> = source.<pk>" predicates, ANDed
#                       together, to form the MERGE condition
#   transform_map     : target column -> (source column or None, declared type[, zero-arg callable])
#                       When a callable is present, its result is used as the column value
#                       (load_timestamp uses current_timestamp()).
# NOTE: get_dynamic_select_expressions casts to the type found in the existing Delta table's
# schema; the declared type in each tuple is unpacked there but not otherwise used.
table_mappings = {
    "annotations": {
        "target_table_name": "annotations",
        "primary_keys": ["annotationid", "version"],
        "transform_map": {
            "annotationid": ("annotationid", IntegerType()),
            "annotationname": ("annotationname", StringType()),
            "documentid": ("documentid", IntegerType()),
            "documentversion": ("documentversion", IntegerType()),
            "annotation": ("annotation", StringType()),
            "pagenumber": ("pagenumber", IntegerType()),
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "createdby": ("createdby", StringType()),
            "created_at": ("created_at", TimestampType()),
            "updatedby": ("updatedby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "redactionlayerid": ("redactionlayerid", IntegerType()),
            "version": ("version", IntegerType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "annotationsections": {
        "target_table_name": "annotationsections",
        "primary_keys": ["id", "version"],
        "transform_map": {
            "id": ("id", IntegerType()),
            "annotationname": ("annotationname", StringType()),
            "foiministryrequestid": ("foiministryrequestid", IntegerType()),
            "section": ("section", StringType()),
            "created_at": ("created_at", TimestampType()),
            "createdby": ("createdby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "updatedby": ("updatedby", StringType()),
            "version": ("version", IntegerType()),
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "redactionlayerid": ("redactionlayerid", IntegerType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "deduplicationjob": {
        "target_table_name": "deduplicationjob",
        "primary_keys": ["deduplicationjobid", "version"],
        "transform_map": {
            "deduplicationjobid": ("deduplicationjobid", IntegerType()),
            "version": ("version", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "createdat": ("createdat", TimestampType()),
            "batch": ("batch", StringType()),
            "trigger": ("trigger", StringType()),
            "type": ("type", StringType()),
            "filename": ("filename", StringType()),
            "status": ("status", StringType()),
            "message": ("message", StringType()),
            "documentmasterid": ("documentmasterid", IntegerType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documentattributes": {
        "target_table_name": "documentattributes",
        "primary_keys": ["attributeid"],
        "transform_map": {
            "attributeid": ("attributeid", IntegerType()),
            "documentmasterid": ("documentmasterid", IntegerType()),
            "attributes": ("attributes", StringType()),
            "createdby": ("createdby", StringType()),
            "created_at": ("created_at", TimestampType()),
            "version": ("version", IntegerType()),
            "updatedby": ("updatedby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documentdeleted": {
        "target_table_name": "documentdeleted",
        "primary_keys": ["documentdeletedid"],
        "transform_map": {
            "documentdeletedid": ("documentdeletedid", IntegerType()),
            "filepath": ("filepath", StringType()),
            "deleted": ("deleted", StringType()), # Consider BooleanType
            "createdby": ("createdby", StringType()),
            "created_at": ("created_at", TimestampType()),
            "updatedby": ("updatedby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "removedfromsolr": ("removedfromsolr", StringType()), # Consider BooleanType
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documentdeletedpages": {
        "target_table_name": "documentdeletedpages",
        "primary_keys": ["id"],
        "transform_map": {
            "id": ("id", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "redactionlayerid": ("redactionlayerid", IntegerType()),
            "documentid": ("documentid", IntegerType()),
            "pagemetadata": ("pagemetadata", StringType()),
            "created_at": ("created_at", TimestampType()),
            "createdby": ("createdby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documentextractionjob": {
        "target_table_name": "documentextractionjob",
        "primary_keys": ["extractionjobid"],
        "transform_map": {
            "extractionjobid": ("extractionjobid", IntegerType()),
            "version": ("version", IntegerType()),
            "documentid": ("documentid", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "status": ("status", StringType()),
            "message": ("message", StringType()),
            "createdat": ("createdat", TimestampType()),
            "createdby": ("createdby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documenthashcodes": {
        "target_table_name": "documenthashcodes",
        "primary_keys": ["documentid"],
        "transform_map": {
            "documentid": ("documentid", IntegerType()),
            "rank1hash": ("rank1hash", StringType()),
            "rank2hash": ("rank2hash", StringType()),
            "created_at": ("created_at", TimestampType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documentmaster": {
        "target_table_name": "documentmaster",
        "primary_keys": ["documentmasterid"],
        "transform_map": {
            "documentmasterid": ("documentmasterid", IntegerType()),
            "filepath": ("filepath", StringType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "recordid": ("recordid", StringType()),
            "processingparentid": ("processingparentid", StringType()),
            "parentid": ("parentid", StringType()),
            "isredactionready": ("isredactionready", StringType()), # Consider BooleanType
            "created_at": ("created_at", TimestampType()),
            "createdby": ("createdby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "updatedby": ("updatedby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documentpageflaghistory": {
        "target_table_name": "documentpageflaghistory",
        "primary_keys": ["id"],
        "transform_map": {
            "id": ("id", IntegerType()),
            "documentpageflagid": ("documentpageflagid", IntegerType()),
            "foiministryrequestid": ("foiministryrequestid", IntegerType()),
            "documentid": ("documentid", IntegerType()),
            "documentversion": ("documentversion", IntegerType()),
            "pageflag": ("pageflag", StringType()),
            "attributes": ("attributes", StringType()),
            "created_at": ("created_at", TimestampType()),
            "createdby": ("createdby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "updatedby": ("updatedby", StringType()),
            "redactionlayerid": ("redactionlayerid", IntegerType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documentpageflags": {
        "target_table_name": "documentpageflags",
        "primary_keys": ["id"],
        "transform_map": {
            "id": ("id", IntegerType()),
            "foiministryrequestid": ("foiministryrequestid", IntegerType()),
            "documentid": ("documentid", IntegerType()),
            "documentversion": ("documentversion", IntegerType()),
            "pageflag": ("pageflag", StringType()),
            "attributes": ("attributes", StringType()),
            "created_at": ("created_at", TimestampType()),
            "createdby": ("createdby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "updatedby": ("updatedby", StringType()),
            "redactionlayerid": ("redactionlayerid", IntegerType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documentpathmapper": {
        "target_table_name": "documentpathmapper",
        "primary_keys": ["documentpathid"],
        "transform_map": {
            "documentpathid": ("documentpathid", IntegerType()),
            "category": ("category", StringType()),
            "bucket": ("bucket", StringType()),
            "attributes": ("attributes", StringType()),
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "createdby": ("createdby", StringType()),
            "created_at": ("created_at", TimestampType()), # Changed to TimestampType
            "updatedby": ("updatedby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documents": {
        "target_table_name": "documents",
        "primary_keys": ["documentid", "version"],
        "transform_map": {
            "documentid": ("documentid", IntegerType()),
            "version": ("version", IntegerType()),
            "filename": ("filename", StringType()),
            "attributes": ("attributes", StringType()),
            "foiministryrequestid": ("foiministryrequestid", IntegerType()),
            "createdby": ("createdby", StringType()),
            "created_at": ("created_at", TimestampType()),
            "updatedby": ("updatedby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "statusid": ("statusid", IntegerType()),
            "pagecount": ("pagecount", IntegerType()),
            "documentmasterid": ("documentmasterid", IntegerType()),
            "incompatible": ("incompatible", StringType()), # Consider BooleanType
            "originalpagecount": ("originalpagecount", IntegerType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documentstatus": {
        "target_table_name": "documentstatus",
        "primary_keys": ["statusid"],
        "transform_map": {
            "statusid": ("statusid", IntegerType()),
            "name": ("name", StringType()),
            "description": ("description", StringType()),
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "fileconversionjob": {
        "target_table_name": "fileconversionjob",
        "primary_keys": ["fileconversionjobid", "version"],
        "transform_map": {
            "fileconversionjobid": ("fileconversionjobid", IntegerType()),
            "version": ("version", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "createdat": ("createdat", TimestampType()),
            "batch": ("batch", StringType()),
            "trigger": ("trigger", StringType()),
            "filename": ("filename", StringType()),
            "status": ("status", StringType()),
            "message": ("message", StringType()),
            "inputdocumentmasterid": ("inputdocumentmasterid", IntegerType()),
            "outputdocumentmasterid": ("outputdocumentmasterid", StringType()), # Consider IntegerType if it's an ID
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "keywords": {
        "target_table_name": "keywords",
        "primary_keys": ["keywordid"],
        "transform_map": {
            "keywordid": ("keywordid", IntegerType()),
            "version": ("version", IntegerType()),
            "keyword": ("keyword", StringType()),
            "category": ("category", StringType()),
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "created_at": ("created_at", TimestampType()),
            "createdby": ("createdby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "updatedby": ("updatedby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "operatingteams3serviceaccounts": {
        "target_table_name": "operatingteams3serviceaccounts",
        "primary_keys": ["teamid"],
        "transform_map": {
            "teamid": ("teamid", IntegerType()),
            "usergroup": ("usergroup", StringType()),
            # NOTE(review): accesskey and secret are declared IntegerType, but an access
            # key / secret is presumably text -- confirm against the source table. Also
            # confirm that credentials are meant to be copied into this warehouse at all.
            "accesskey": ("accesskey", IntegerType()),
            "secret": ("secret", IntegerType()),
            "type": ("type", StringType()),
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "pagecalculatorjob": {
        "target_table_name": "pagecalculatorjob",
        "primary_keys": ["pagecalculatorjobid", "version"], # Added primary key based on common patterns
        "transform_map": {
            "pagecalculatorjobid": ("pagecalculatorjobid", IntegerType()),
            "version": ("version", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "inputmessage": ("inputmessage", StringType()),
            "pagecount": ("pagecount", StringType()), # Consider IntegerType
            "status": ("status", StringType()),
            "message": ("message", StringType()),
            "createdat": ("createdat", TimestampType()),
            "createdby": ("createdby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "pageflags": {
        "target_table_name": "pageflags",
        "primary_keys": ["pageflagid"],
        "transform_map": {
            "pageflagid": ("pageflagid", IntegerType()),
            "name": ("name", StringType()),
            "description": ("description", StringType()),
            "sortorder": ("sortorder", IntegerType()),
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "created_at": ("created_at", TimestampType()),
            "createdby": ("createdby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "updatedby": ("updatedby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "pdfstitchjob": {
        "target_table_name": "pdfstitchjob",
        "primary_keys": ["pdfstitchjobid", "version"],
        "transform_map": {
            "pdfstitchjobid": ("pdfstitchjobid", IntegerType()),
            "version": ("version", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "category": ("category", StringType()),
            "inputfiles": ("inputfiles", StringType()),
            "outputfiles": ("outputfiles", StringType()),
            "status": ("status", StringType()),
            "message": ("message", StringType()),
            "createdat": ("createdat", TimestampType()),
            "createdby": ("createdby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "pdfstitchjobattributes": {
        "target_table_name": "pdfstitchjobattributes",
        "primary_keys": ["attributesid"],
        "transform_map": {
            "attributesid": ("attributesid", IntegerType()),
            "pdfstitchjobid": ("pdfstitchjobid", IntegerType()),
            "version": ("version", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "attributes": ("attributes", StringType()),
            "createdat": ("createdat", TimestampType()),
            "createdby": ("createdby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "pdfstitchpackage": {
        "target_table_name": "pdfstitchpackage",
        "primary_keys": ["pdfstitchpackageid"],
        "transform_map": {
            "pdfstitchpackageid": ("pdfstitchpackageid", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "category": ("category", StringType()),
            "finalpackagepath": ("finalpackagepath", StringType()),
            "createdat": ("createdat", TimestampType()),
            "createdby": ("createdby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "programareadivisions": {
        "target_table_name": "programareadivisions",
        "primary_keys": ["divisionid"],
        "transform_map": {
            "divisionid": ("divisionid", IntegerType()),
            "programareaid": ("programareaid", IntegerType()),
            "name": ("name", StringType()),
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "createdby": ("createdby", StringType()),
            "created_at": ("created_at", TimestampType()),
            "sortorder": ("sortorder", StringType()), # Consider IntegerType
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "programareas": {
        "target_table_name": "programareas",
        "primary_keys": ["programareaid"],
        "transform_map": {
            "programareaid": ("programareaid", IntegerType()),
            "name": ("name", StringType()),
            "type": ("type", StringType()),
            "bcgovcode": ("bcgovcode", StringType()),
            "iaocode": ("iaocode", StringType()),
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "redactionlayers": {
        "target_table_name": "redactionlayers",
        "primary_keys": ["redactionlayerid"],
        "transform_map": {
            "redactionlayerid": ("redactionlayerid", IntegerType()),
            "name": ("name", StringType()),
            "description": ("description", StringType()),
            "sortorder": ("sortorder", IntegerType()),
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "created_at": ("created_at", TimestampType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "createdby": ("createdby", StringType()),
            "updatedby": ("updatedby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "redlinecontents": {
        "target_table_name": "redlinecontents",
        "primary_keys": ["id"],
        "transform_map": {
            "id": ("id", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "redlineid": ("redlineid", StringType()), # Consider IntegerType if it's an ID
            "annotationid": ("annotationid", StringType()), # Consider IntegerType if it's an ID
            "pagenumber": ("pagenumber", IntegerType()),
            "documentid": ("documentid", IntegerType()),
            "type": ("type", StringType()),
            "section": ("section", StringType()),
            "content": ("content", StringType()),
            "category": ("category", StringType()),
            "createdat": ("createdat", TimestampType()),
            "createdby": ("createdby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "sections": {
        "target_table_name": "sections",
        "primary_keys": ["sectionid"],
        "transform_map": {
            "sectionid": ("sectionid", IntegerType()),
            "section": ("section", StringType()),
            "description": ("description", StringType()),
            "sortorder": ("sortorder", IntegerType()),
            "isactive": ("isactive", StringType()), # Consider BooleanType
            "created_at": ("created_at", TimestampType()),
            "createdby": ("createdby", StringType()),
            "updated_at": ("updated_at", TimestampType()), # Changed to TimestampType
            "updatedby": ("updatedby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    # New tables (CompressionJob, DocumentOCRJob, DocumentProcesses, OCRActiveMQJob)
    "compressionjob": {
        "target_table_name": "compressionjob",
        "primary_keys": ["compressionjobid", "version"],
        "transform_map": {
            "compressionjobid": ("compressionjobid", IntegerType()),
            "version": ("version", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "createdat": ("createdat", TimestampType()),
            "batch": ("batch", StringType()),
            "trigger": ("trigger", StringType()),
            "filename": ("filename", StringType()),
            "status": ("status", StringType()),
            "message": ("message", StringType()),
            "documentmasterid": ("documentmasterid", IntegerType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documentocrjob": {
        "target_table_name": "documentocrjob",
        "primary_keys": ["ocrjobid"],
        "transform_map": {
            "ocrjobid": ("ocrjobid", IntegerType()),
            "version": ("version", IntegerType()),
            "documentid": ("documentid", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "documentmasterid": ("documentmasterid", IntegerType()),
            "status": ("status", StringType()),
            "message": ("message", StringType()),
            "createdat": ("createdat", TimestampType()),
            "createdby": ("createdby", StringType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "documentprocesses": {
        "target_table_name": "documentprocesses",
        "primary_keys": ["processid"],
        "transform_map": {
            "processid": ("processid", IntegerType()),
            "name": ("name", StringType()),
            "description": ("description", StringType()),
            # Declared BooleanType here; every other table in this mapping declares isactive as StringType.
            "isactive": ("isactive", BooleanType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    },
    "ocractivemqjob": {
        "target_table_name": "ocractivemqjob",
        "primary_keys": ["ocractivemqjobid", "version"],
        "transform_map": {
            "ocractivemqjobid": ("ocractivemqjobid", IntegerType()),
            "version": ("version", IntegerType()),
            "ministryrequestid": ("ministryrequestid", IntegerType()),
            "createdat": ("createdat", TimestampType()),
            "batch": ("batch", StringType()),
            "trigger": ("trigger", StringType()),
            "filename": ("filename", StringType()),
            "status": ("status", StringType()),
            "message": ("message", StringType()),
            "documentmasterid": ("documentmasterid", IntegerType()),
            "load_timestamp": (None, TimestampType(), lambda: current_timestamp())
        }
    }
}

In [0]:
# Helper Functions

def get_table_config_from_path(file_path, mappings):
    """
    Returns the mapping entry whose key matches the JSON file named in file_path.

    The file name (last path segment, lower-cased) is first compared exactly
    against "<key>.json". Only when no key matches exactly does the lookup fall
    back to a substring search over the whole lower-cased path, and then the
    longest matching key wins. This keeps "AnnotationSections.json" from ever
    resolving to "sections", and "DocumentPageFlags.json" from resolving to
    "pageflags", regardless of the order of the entries in mappings.

    Args:
        file_path: File name or full path of the JSON export.
        mappings: Dict keyed by lower-case file stem (file name without ".json").

    Returns:
        The value stored in mappings for the matching key.

    Raises:
        ValueError: If no key of mappings matches file_path.
    """
    lowered_path = file_path.lower()

    # 1. Exact match on the file name itself.
    file_name = lowered_path.replace("\\", "/").rsplit("/", 1)[-1]
    for key, config in mappings.items():
        if f"{key}.json" == file_name:
            return config

    # 2. Fallback for paths that merely contain "<key>.json" (e.g. with a suffix):
    #    prefer the longest key so a short key cannot shadow a longer one that ends with it.
    candidates = [key for key in mappings if f"{key}.json" in lowered_path]
    if candidates:
        return mappings[max(candidates, key=len)]

    raise ValueError(f"No table mapping found for file path: {file_path}")

def get_dynamic_select_expressions(source_df, target_schema_actual: StructType, transform_map):
    """
    Builds one select expression per column of the target schema, in schema order.

    For every target column the rule in transform_map (keyed by the target column
    name) decides where the value comes from:
      * rule with a callable  -> the callable's result, aliased to the target name
      * rule with a source column -> that column cast to the target column's type;
        a dotted name is used as-is, any other name must exist in source_df
        (compared lower-cased), otherwise NULL is used and a warning is printed
      * rule without source column or callable -> NULL, with a warning
      * no rule -> the same-named source column if present, otherwise NULL

    Casts always use the data type from target_schema_actual; the type declared in
    the rule is unpacked but not used.

    Returns:
        List of column expressions suitable for source_df.select(*exprs).
    """
    source_names = {field.name.lower() for field in source_df.schema.fields}
    expressions = []

    def cast_from(source_name, name, data_type):
        # Source column cast to the target type and renamed to the target column.
        return col(source_name).cast(data_type).alias(name)

    def null_as(name, data_type):
        # Typed NULL placeholder for a target column with no usable source.
        return lit(None).cast(data_type).alias(name)

    for field in target_schema_actual.fields:
        name = field.name
        data_type = field.dataType
        rule = transform_map.get(name)

        if not rule:
            # Column exists in the Delta table but has no entry in the transform map.
            if name.lower() in source_names:
                expressions.append(cast_from(name, name, data_type))
                print(f"Info: Target column '{name}' in actual schema found in source but not in transform_map. Selecting directly.")
            else:
                expressions.append(null_as(name, data_type))
                print(f"Info: Target column '{name}' in actual schema not found in source or transform_map. Adding as NULL.")
            continue

        source_name, _declared_type, *custom = rule

        if custom:
            # A custom callable takes precedence over any source column.
            expressions.append(custom[0]().alias(name))
        elif not source_name:
            print(f"Warning: No source column specified for target column '{name}' and no custom transform. Adding as NULL.")
            expressions.append(null_as(name, data_type))
        elif "." in source_name or source_name.lower() in source_names:
            # Dotted names are passed through without an existence check.
            expressions.append(cast_from(source_name, name, data_type))
        else:
            print(f"Warning: Source column '{source_name}' not found for target column '{name}'. Adding as NULL.")
            expressions.append(null_as(name, data_type))

    return expressions


def create_or_get_delta_table(spark_session, base_path, table_name):
    """
    Retrieves the DeltaTable object and its schema if the table exists.

    Despite its name, this function never creates a table: when
    "<base_path>/<table_name>" is not a Delta table it prints a message and
    returns (None, None) so the caller can skip the file.

    Args:
        spark_session: Active SparkSession.
        base_path: Folder that contains the Delta table folders.
        table_name: Name of the table folder under base_path.

    Returns:
        (DeltaTable, StructType) for an existing table, otherwise (None, None).
    """
    full_table_path = f"{base_path}/{table_name}"

    if not DeltaTable.isDeltaTable(spark_session, full_table_path):
        print(f"  Delta table '{table_name}' does NOT exist at {full_table_path}. Skipping this file.")
        return None, None # Signal that the table was not found

    print(f"  Delta table '{table_name}' already exists at {full_table_path}. Retrieving schema.")
    # Resolve the table once and take the schema from its DataFrame view, instead of
    # loading the path a second time and building an empty DataFrame just to print it.
    delta_table_ref = DeltaTable.forPath(spark_session, full_table_path)
    table_df = delta_table_ref.toDF()
    actual_table_schema = table_df.schema
    print(f"  Retrieved actual schema for {table_name}:")
    table_df.printSchema()
    return delta_table_ref, actual_table_schema

In [0]:
# Main job: for every date since the last successful run, merge each JSON export into
# its Delta table, then record the outcome of the run in the job tracker.

# Default status recorded if the run is cut short by something that is not an Exception
# (e.g. KeyboardInterrupt); without it the finally block would fail with a NameError
# and hide the original cause.
job_status = "FAILED"

try:
    last_successful_run_time = get_last_successful_run_time(spark, JOB_TRACKER_TABLE_PATH, JOB_NAME)
    # NOTE(review): the loop below iterates date_range and uses each item as a folder
    # name -- presumably a list of date strings; confirm against generate_date_range_json.
    date_range = generate_date_range_json(last_successful_run_time, job_start_timestamp)
    # print(f"Type of date_range ({date_range}): {type(date_range)}")

    # Main Processing Loop
    print(f"Starting JSON to Delta merge process... Date range: {date_range}")

    for date_str in date_range:
        print(f"Processing data for date: {date_str}")

        for json_file_relative_path in json_files_to_process:
            full_json_path = f"{S3_BASE_PATH}{date_str}/{json_file_relative_path}"
            print(f"\n--- Processing file: {full_json_path} ---")

            try:
                # 1. get target table configuration
                table_config = get_table_config_from_path(json_file_relative_path, table_mappings)
                target_table_name = table_config["target_table_name"]
                primary_keys = table_config["primary_keys"]
                transform_map = table_config["transform_map"]

                # Display only; built the same way create_or_get_delta_table builds the
                # path it actually opens (with a "/" between base path and table name).
                full_delta_table_path = f"{DELTA_BASE_PATH}/{target_table_name}"

                print(f"  Target table: {target_table_name} at {full_delta_table_path}")
                print(f"  Primary Keys: {primary_keys}")

                # 2. Get the Delta table reference and schema
                delta_table, actual_target_schema = create_or_get_delta_table(
                    spark, DELTA_BASE_PATH, target_table_name # No initial_schema_for_creation passed here
                )

                # --- Skip if table does not exist ---
                if delta_table is None:
                    continue # Skip to the next file in the loop

                # 3. Read the JSON file
                try:
                    source_df = spark.read.json(full_json_path)
                    print("  Source JSON schema (inferred):")
                    source_df.printSchema()
                except Exception as e:
                    # Skip to next file/table if JSON file not exist
                    if "Path does not exist" in str(e) or "No such file or directory" in str(e):
                        print(f" S3 JSON file '{full_json_path}' not found. Skipping this file.")
                        continue # Skip to the next file in the loop
                    # Anything else is a real read failure: re-raise with the original traceback.
                    raise

                # 4. Transform source data to match target schema
                select_expressions = get_dynamic_select_expressions(source_df, actual_target_schema, transform_map)

                transformed_df = source_df.select(*select_expressions)
                print("  Transformed DataFrame schema (before merge):")
                transformed_df.printSchema()

                # 5. UPSERT/MERGE
                # Rows are matched on all primary key columns; matches are overwritten,
                # everything else is inserted.
                merge_condition = " AND ".join([f"target.{pk} = source.{pk}" for pk in primary_keys])

                print(f"  Merge condition: {merge_condition}")

                (delta_table.alias("target")
                    .merge(
                        transformed_df.alias("source"),
                        merge_condition
                    )
                    .whenMatchedUpdateAll()
                    .whenNotMatchedInsertAll()
                    .execute()
                )
                print(f"  Successfully merged data from {json_file_relative_path} into {target_table_name}.")

            except ValueError as ve:
                # A ValueError (e.g. no table mapping for the file) is logged and the
                # file is skipped; the job carries on with the next file.
                print(f"Error processing {json_file_relative_path}: {ve}")
                # raise ve
            except Exception as e:
                # Any other failure aborts the whole job (handled by the outer except).
                print(f"An unexpected error occurred for {json_file_relative_path}: {e}")
                raise

        print(f"Processing date: {date_str} - completed")

    print("\nJSON to Delta merge process completed.")
    job_status = "SUCCEEDED"
    status_message = f"Job completed successfully. Dates processed: {date_range}" # Optionally add to message

except Exception as e:
    job_status = "FAILED"
    status_message = f"Job failed: {e}"
    print(f"ERROR: {status_message}")
    # sys.exit raises SystemExit; the finally block below still runs before it propagates.
    sys.exit(1) # Exit with a non-zero code to indicate failure in Databricks Jobs

finally:
    # 3. After the job, save job status (add end time and update the status)
    job_end_timestamp = datetime.now()

    record_job_status(
        spark=spark,
        job_tracker_table_path=JOB_TRACKER_TABLE_PATH,
        job_name=JOB_NAME,
        run_id=current_run_id,
        status=job_status,
        start_time=job_start_timestamp,
        end_time=job_end_timestamp,
        message=status_message # Include the message with date range if desired
    )

    # spark.stop()
    print(f"Job '{JOB_NAME}' (run_id: {current_run_id}) finished with status: {job_status}")