In [0]:
import sys, os

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

from datetime import datetime
import os

### Framework specific imports

In [None]:
# This is required because there is currently no way to pass a library to DLT.
sys.path.append(os.path.abspath('../../src'))

from framework.udf_utils import *
from framework.utils import *

### Get task variables


In [0]:
def _get_setup_task_value(key):
    """Read one value published by the upstream "setup" task.

    An empty string is supplied as debugValue for every key, matching the
    original per-key calls.
    """
    return dbutils.jobs.taskValues.get(taskKey = "setup", key = key, debugValue = "")


dimension_table = _get_setup_task_value("dimension_table")
dlt_runtime_config_table = _get_setup_task_value("dlt_runtime_config_table")
# source_system is not read here; the main flow takes it from source_params per source.
catalog = _get_setup_task_value("catalog")
schema = _get_setup_task_value("schema")
job_id = _get_setup_task_value("job_id")
run_id = _get_setup_task_value("run_id")
# only used for testing
table_suffix = _get_setup_task_value("table_suffix")
test_mode = _get_setup_task_value("test_mode")

# NOTE(review): hard-coded to the dev catalog volume and only logged in this notebook —
# confirm whether it should be derived from `catalog` or removed.
schema_location = "/Volumes/edl_dev_ctlg/rawfiles/raw/dlt_pilot/schema"

dlt_metadata_cdc_config = "dlt_metadata_cdc_config"

#  Possible values: normal, apply_correction
# NOTE(review): the main flow compares scenario_type against "delta_correction" and
# "file_correction", and refresh_type against "incremental_refresh" and "full_refresh" —
# confirm the documented value lists for both variables.
scenario_type = _get_setup_task_value("scenario_type")

# Possible values: full_refresh, incremental_refresh, delta_correction, file_correction
refresh_type = _get_setup_task_value("refresh_type")

# One variable per line; every fragment except the last ends with "\n" so entries do not run together.
print(f"job_load_landing_zone_data_notebook::Getting task variables::\n"
      f"dimension_table: {dimension_table}\n"
      f"dlt_runtime_config_table: {dlt_runtime_config_table}\n"
      f"scenario_type: {scenario_type}\n"
      f"refresh_type: {refresh_type}\n"
      f"catalog: {catalog}\n"
      f"schema: {schema}\n"
      f"job_id: {job_id}\n"
      f"run_id: {run_id}\n"
      f"table_suffix: {table_suffix}\n"
      f"test_mode: {test_mode}\n"
      f"schema_location: {schema_location}")

### Static variables

In [0]:
# Prefix shared by the landing tables written by this notebook.
landing_table_name_prefix = "dlt_landing"
# Landing table name template. "[TABLE_NAME]" is replaced with each derived input
# table name in the main flow; table_suffix is only used for testing.
delta_landing_table_name = f"{landing_table_name_prefix}_[TABLE_NAME]{table_suffix}"
# Metadata table name passed to get_source_metadata in the main flow.
source_metadata_table = "dlt_meta_source_config"
# for historical load for DF_Group and DF_GROUP_COUNT
# NOTE(review): currently only referenced from a commented-out line in the main flow.
default_business_date = datetime(1900, 1, 1)

### API: delete_data_from_landing_table
This API is called for `apply_correction` use cases. It deletes the existing rows for the given business date from a landing table.

In [0]:
def delete_data_from_landing_table(p_catalog, p_schema, p_landing_table, p_business_date):
    """Delete all rows for one business date from a landing table.

    Args:
        p_catalog: Catalog that contains the landing table.
        p_schema: Schema that contains the landing table.
        p_landing_table: Landing table name (without catalog/schema).
        p_business_date: Business date to delete. A ``date`` or ``datetime``
            is rendered as ``YYYY-MM-DD``; any other value (e.g. a string) is
            interpolated unchanged.

    Returns:
        The first row of the DataFrame returned by ``spark.sql`` for the
        DELETE statement (presumably the affected-row metrics — confirm).
    """
    # Local import keeps the helper self-contained without touching the import cell.
    from datetime import date

    print (f"Calling delete_data_from_landing_table")

    # A datetime would otherwise be interpolated as 'YYYY-MM-DD HH:MM:SS' and
    # rely on implicit casting when compared with the business_date column.
    if isinstance(p_business_date, date):
        business_date = p_business_date.strftime("%Y-%m-%d")
    else:
        business_date = p_business_date

    # NOTE(review): identifiers and the date are interpolated into the SQL text;
    # they are expected to come from trusted job configuration.
    query = f"""
        Delete from {p_catalog}.{p_schema}.{p_landing_table} where business_date = '{business_date}'
    """
    df = spark.sql (query)
    return df.first()


### Main Flow

In [0]:

# Main flow: for every source configured for this dimension table, read the source
# files and write them to one landing table per derived input table.
runtime_params_row = get_runtime_parameters(
    spark, catalog, schema, dlt_runtime_config_table, dimension_table
)

dimension_table = runtime_params_row["dimension_table"]
source_params = runtime_params_row["source_params"]  # this is a dict
scenario = runtime_params_row["scenario"]

volume_path = None
file_format = None

# refresh_type -> Delta write mode used for the landing table.
write_mode_by_refresh_type = {
    "incremental_refresh": "append",
    # CAUTION: This should only be used when we want to do a full refresh of the pipeline.
    # This will reset the DLT pipeline data. Only used the first time the data load starts (historical data load).
    "full_refresh": "overwrite",
}


if scenario == "apply_correction" and scenario_type == "delta_correction":
    print (f"scenario::{scenario}, scenario_type::{scenario_type}. This use case is for production support where we update the data directly in the landing table")
else:

    for key, param_value_map in source_params.items():

        print (f"key:: {key} param_value_map::{param_value_map}")
        # CDM table E.g. DF_GROUP, DF_GROUP_COUNT
        target_table = param_value_map["target_table"]
        # Facets, VHP
        source_system = param_value_map["source_system"]

        # business date for which the data is loaded
        business_date_as_str = param_value_map["business_date"]
        business_date_lit = datetime.strptime(business_date_as_str, "%Y-%m-%d")

        # source metadata row
        source_metadata_row = get_source_metadata(
            spark,
            catalog,
            schema,
            dimension_table,
            source_metadata_table,
            target_table,
            business_date_as_str,
            scenario,
            source_system,
            p_test_mode=test_mode
        )

        # source name (E.g. dbo, audit, cdc)
        source_schema = source_metadata_row.source_schema

        # Comma-separated list of input tables; blank entries (e.g. from a trailing
        # comma) are dropped so they do not yield a landing table with an empty name.
        derived_input_table_list = [
            item.strip()
            for item in source_metadata_row.derived_input_tables.split(",")
            if item.strip()
        ]
        # business_date_column_name is specifically used for historical load. This column will be mapped to business_date
        business_date_column_name = source_metadata_row.business_date_column_to_load

        # This will be passed from the runtime_params if scenario = 'apply_correction' or 'apply_test';
        # otherwise fall back to the source metadata.
        if "source_details" in param_value_map and param_value_map["source_details"] != "":
            volume_path = param_value_map["source_details"]
        else:
            volume_path = source_metadata_row.source_details

        if "file_format" in param_value_map and param_value_map["file_format"] != "":
            file_format = param_value_map["file_format"]
        else:
            file_format = source_metadata_row.file_format

        scd_row = get_scd_attributes(
            p_spark=spark,
            p_catalog=catalog,
            p_schema=schema,
            p_dlt_metadata_cdc_config=dlt_metadata_cdc_config,
            p_target_table=target_table,
        )

        print(
            f"load_landing_zone::for \nsource_schema::{source_schema} \nbusiness_date::{business_date_lit}\nvolume_path::{volume_path}"
        )

        # Read data from source. This is a batch read in the configured file format;
        # the previous leading .format("cloudFiles") was overridden by the second
        # .format() call and had no effect.
        header_option = "true" if file_format == "csv" else "false"
        print(f"file_format:{file_format}, header_option:{header_option}")
        df = (
            spark.read.format(file_format)
            .option("header", header_option)
            .load(volume_path)
            .withColumn("source", F.lit("Not Used"))
            .withColumn("source_schema", F.lit(source_schema))
        )

        # Legacy source has some columns with __$.
        # This step to cast is required as otherwise some fields (__$lstart etc) throw error as they are not casted to string.
        # It does not depend on the landing table, so it is done once per source.
        for col_name in df.columns:
            if col_name.startswith("__$"):
                df = df.withColumn(col_name, F.col(col_name).cast("string"))

        #  Process each CDM entity
        for input_table_name in derived_input_table_list:

            landing_table_name = delta_landing_table_name.replace(
                "[TABLE_NAME]", input_table_name
            )
            print(f"landing_table_name:{landing_table_name}")

            # Start every landing table from the untouched source frame so that columns
            # and rows added for one landing table never leak into the next one.
            landing_df = df

            # if the source does not send CDC information (E.g. DF_GROUP_COUNT), then we have to compute inserts, updates and deletes
            if "__$operation" not in landing_df.columns:
                landing_df = landing_df.withColumn("__$operation", F.lit("5"))
                # compute delete against this landing table
                landing_df = compute_deletes_from_snapshots(spark, scenario, catalog, schema, landing_table_name, landing_df, scd_row)

            #  While loading historical data, this is useful to get the business date from that column value.
            if business_date_column_name and business_date_column_name.strip():
                landing_df = landing_df.withColumn(
                    "business_date", F.col(business_date_column_name).cast("date")
                )
            else:
                landing_df = landing_df.withColumn(
                    "business_date", F.lit(business_date_lit).cast("date")
                )

            # Prod support use case: Handle scenario='apply_correction'
            # Delete the existing data in the landing table
            if scenario == "apply_correction" and scenario_type == "file_correction":
                delete_result = delete_data_from_landing_table (catalog, schema, landing_table_name, business_date_lit)
                print (f" delete_data_from_landing_table for delete_result::{delete_result} for scenario:{scenario}")

            print (f"refresh_type::{refresh_type}")
            write_mode = write_mode_by_refresh_type.get(refresh_type)
            if write_mode is None:
                # Nothing is written for any other refresh_type; say so instead of skipping silently.
                print (f"refresh_type::{refresh_type} has no write mode; nothing written to landing_table_name:{landing_table_name}")
                continue

            (
                landing_df.write.format("delta")
                .mode(write_mode)
                .option("mergeSchema", "true")
                .option("delta.columnMapping.mode", "name")
                .option("delta.enableChangeDataFeed", "false")
                .saveAsTable(f"{catalog}.{schema}.{landing_table_name}")
            )



        