## Control-M to pass these values to the Databricks workflow

### Global imports

In [0]:
import requests

### Dynamic variables

In [0]:
# The secret scope and token were created with:
#   databricks secrets create-scope --scope dlt-pilot-scope
#   databricks secrets put-secret dlt-pilot-scope api_access_pat_token
DATABRICKS_TOKEN = dbutils.secrets.get(scope="dlt-pilot-scope", key="api_access_pat_token")

# Run parameters supplied through notebook widgets.
job_id = dbutils.widgets.getArgument("job_id")  # job id of the DLT workflow
db_workspace_url = dbutils.widgets.getArgument("db_workspace_url")
catalog = dbutils.widgets.getArgument("catalog")
bronze_schema = dbutils.widgets.getArgument("bronze_schema")

# Name of the table that holds the runtime configuration rows.
dlt_runtime_config_table = "dlt_runtime_config"

# "N/A" means no suffix. Any other value (only set for testing, or by an
# individual developer testing their DLT's) is turned into "_<value>".
table_suffix = dbutils.widgets.getArgument("table_suffix")
table_suffix = "" if table_suffix == "N/A" else f"_{table_suffix}"

# Dimension / scenario pair that selects the test scenario rows to run.
selected_dimension = dbutils.widgets.getArgument("dimension_table")
selected_scenario = dbutils.widgets.getArgument("scenario")

In [None]:
# get the distinct scenarios and dimensions from the dim_integration_test_scenarios table
# df = spark.sql("SELECT dimension_table, scenario FROM edl_dev_ctlg.structured.dim_integration_test_scenarios")
# dimensions_list = sorted([row['dimension_table'] for row in df.select("dimension_table").distinct().collect()])
# scenario_list = sorted([row['scenario'] for row in df.select("scenario").distinct().collect()])

# dbutils.widgets.dropdown("dimension_table", dimensions_list[0], dimensions_list)
# dbutils.widgets.dropdown("scenario", scenario_list[0], scenario_list)

# selected_dimension = dbutils.widgets.get("dimension_table")
# selected_scenario = dbutils.widgets.get("scenario")
# print(f"selected_dimension::{selected_dimension}")
# print(f"selected_scenario::{selected_scenario}")

# job_id = None
# if selected_dimension == "D_GROUP":
#     job_id = d_group_job_id


### cleanup

In [0]:
def cleanup(p_catalog, p_schema, cleanup_table_list, cleanup_audit_table='Y', cleanup_runtime_config_table='Y'):
    """Reset test state in ``p_catalog.p_schema`` before a scenario run.

    Relies on the notebook-level names ``spark``, ``dlt_runtime_config_table``
    and ``table_suffix``.

    Args:
        p_catalog: Catalog that contains the tables to clean.
        p_schema: Schema that contains the tables to clean.
        cleanup_table_list: Table names (without suffix) to drop. ``None`` or
            an empty list means no table is dropped.
        cleanup_audit_table: ``'Y'`` to delete all rows from ``dlt_dq_audit``.
        cleanup_runtime_config_table: ``'Y'`` to delete all rows from the
            runtime config table.
    """
    qualified_schema = f"{p_catalog}.{p_schema}"

    if cleanup_runtime_config_table == 'Y':
        print(f"Deleting data from {qualified_schema}.{dlt_runtime_config_table}")
        spark.sql(f"DELETE FROM {qualified_schema}.{dlt_runtime_config_table}")

    # cleanup audit table
    if cleanup_audit_table == 'Y':
        print(f"Deleting data from {qualified_schema}.dlt_dq_audit")
        spark.sql(f"DELETE FROM {qualified_schema}.dlt_dq_audit")

    # A missing list (None) is treated as "nothing to drop" instead of
    # failing with a TypeError after the DELETEs above have already run.
    tables_to_drop = cleanup_table_list or []

    # Logged once up front rather than once per dropped table.
    print(f"cleanup_table_list::{cleanup_table_list}")
    for table in tables_to_drop:
        qualified_table = f"{qualified_schema}.{table}{table_suffix}"
        print(f"Dropping table {qualified_table}")
        spark.sql(f"DROP TABLE IF EXISTS {qualified_table}")

### API: populate_runtime_metadata

Populate the runtime config table (passed in as `p_dlt_runtime_config_table`) with the inputs needed to test different scenarios

In [0]:
import uuid

def populate_runtime_metadata(
    p_catalog,
    p_schema,
    p_dlt_runtime_config_table,
    p_dimension_table,
    p_scenario,
    p_input_list,
):
    """Insert one runtime-config row describing the sources of a scenario run.

    The row is written to ``p_catalog.p_schema.p_dlt_runtime_config_table``
    with ``run_date`` set to ``current_timestamp()`` and ``source_params`` set
    to a SQL ``map`` that has one ``named_struct`` per element of
    ``p_input_list``, keyed by a freshly generated UUID.

    Relies on the notebook-level ``spark`` session.

    Args:
        p_catalog: Target catalog.
        p_schema: Target schema.
        p_dlt_runtime_config_table: Target table name.
        p_dimension_table: Value for the ``dimension_table`` column.
        p_scenario: Value for the ``scenario`` column.
        p_input_list: Iterable of mappings (or rows) that expose the keys
            ``scenario_input_source_system``, ``scenario_input_target_table``,
            ``scenario_input_business_date``, ``scenario_input_source_details``
            and ``scenario_input_format``.

    Raises:
        KeyError: If a mapping in ``p_input_list`` lacks one of those keys
            (other row types may raise a different lookup error).
    """

    def _sql_text(value):
        # Make the value safe inside a single-quoted Spark SQL string literal:
        # backslash is the escape character, so escape it first, then the quote.
        # A None value is rendered as the text 'None', as it always has been here.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    # Build sources_metadata part
    sources_metadata_entries = []
    for entry in p_input_list:
        source_system = _sql_text(entry["scenario_input_source_system"])
        target_table = _sql_text(entry["scenario_input_target_table"])
        business_date = _sql_text(entry["scenario_input_business_date"])
        source_details = _sql_text(entry["scenario_input_source_details"])
        source_format = _sql_text(entry["scenario_input_format"])
        key = uuid.uuid4()

        sources_metadata_entries.append(
            f"'{key}', named_struct('business_date', '{business_date}', 'source_system', '{source_system}' , 'table_name', '{target_table}', 'source_details', '{source_details}','format', '{source_format}')"
        )

    sources_metadata_map = "map(\n  " + ",\n  ".join(sources_metadata_entries) + "\n)"

    print(f"populate_runtime_metadata::sources_metadata_entries::\n{sources_metadata_entries}")

    # Final query
    query = f"""
      INSERT INTO {p_catalog}.{p_schema}.{p_dlt_runtime_config_table} (
        dimension_table,
        scenario,
        run_date,
        source_params
      )
      VALUES (
        '{_sql_text(p_dimension_table)}',
        '{_sql_text(p_scenario)}',
        current_timestamp(),
        {sources_metadata_map}
      )
    """
    spark.sql(query)

In [0]:
def get_full_refresh_selection(cdm_source):
    """Return the table names registered for ``cdm_source``.

    An unknown source yields an empty list.
    """
    tables_by_source = dict(
        DF_GROUP=["table1", "table2", "table3"],
        DF_GROUP_HISTORY=["table1", "table2", "table3"],
        DF_GROUP_COUNT=["table6", "table7", "table8", "table9"],
    )
    try:
        return tables_by_source[cdm_source]
    except KeyError:
        return []

#### Trigger DLT pipeline API

In [0]:
# def run_pipeline(id):
#     print(f"Running pipeline {id}")

#     if cleanup == "Y":
#         print("Cleaning up dim_group_runtime_metadata table")
#         spark.sql(f"DELETE FROM {catalog}.{schema}.{dlt_runtime_config_table}")

#     populate_runtime_metadata(catalog, schema, dlt_runtime_config_table, scenario)

#     if scenario == "normal_run" and business_date == "N/A":
#         payload = {
#             "full_refresh": True,
#             "cause": f"Triggered from ControlM simulator for scenario {scenario}",
#         }
#     elif scenario == "normal_run" and business_date != "N/A":
#         payload = {
#             "full_refresh": False,
#             "cause": f"Triggered from ControlM simulator for scenario {scenario}",
#         }
#     elif scenario == "full_refresh":
#         payload = {
#             "full_refresh": True,
#             "cause": f"Triggered from ControlM simulator for scenario {scenario}",
#         }
#     elif scenario == "full_refresh_selection":
#         payload = {
#             "full_refresh_selection": get_full_refresh_selection(scenario),
#             "cause": f"Triggered from ControlM simulator for scenario {scenario}",
#         }
#     else:
#         raise Exception(f"Invalid scenario: {scenario}")

#     response = requests.post(
#         f"{DATABRICKS_INSTANCE}/api/2.0/pipelines/{id}/updates",
#         headers={
#             "Authorization": f"Bearer {DATABRICKS_TOKEN}",
#             "Content-Type": "application/json",
#         },
#         json=payload,
#     )

#     if response.status_code == 200:
#         print(
#             f"message: Pipleline triggered successfully with update_id: {response.json()['update_id']}"
#         )
#     else:
#         print(f"error: {response.text}, {response.status_code}")

In [0]:
def run_job(
    p_db_url,
    p_api_token,
    p_catalog,
    p_schema,
    p_dlt_runtime_config_table,
    p_job_id,
    selected_dimension,
    selected_scenario,
):
    """Prepare the test data for a scenario and trigger the Databricks job.

    Steps:
      1. Read the rows for ``selected_dimension`` / ``selected_scenario`` from
         ``dim_integration_test_scenarios``.
      2. Run ``cleanup`` once per row with that row's ``cleanup_table_list``.
      3. Write a single runtime-config row covering all scenario rows via
         ``populate_runtime_metadata``.
      4. POST to ``{p_db_url}/api/2.1/jobs/run-now`` and print the outcome.

    Relies on the notebook-level ``spark`` session and on the ``cleanup`` and
    ``populate_runtime_metadata`` functions defined in this notebook.

    Args:
        p_db_url: Base URL of the Databricks workspace.
        p_api_token: Bearer token used for the REST call.
        p_catalog: Catalog passed to cleanup / runtime-config and to the job.
        p_schema: Schema passed to cleanup / runtime-config.
        p_dlt_runtime_config_table: Runtime config table name.
        p_job_id: Id of the job to trigger.
        selected_dimension: Dimension table that selects the scenario rows.
        selected_scenario: Scenario name that selects the scenario rows.

    Raises:
        ValueError: If no scenario row matches the selection.
    """
    print(f"run_job: Running job {p_job_id}")

    # NOTE(review): the scenario table location is hard-coded to
    # edl_dev_ctlg.structured and does not follow p_catalog / p_schema.
    query = f"""
    Select  
    scenario_type, 
    refresh_type, 
    cleanup_table_list, 
    scenario_input_source_system, 
    scenario_input_target_table,
    scenario_input_business_date,
    scenario_input_source_details,
    scenario_input_format 
    FROM  edl_dev_ctlg.structured.dim_integration_test_scenarios 
    WHERE dimension_table = '{selected_dimension}' and scenario = '{selected_scenario}'"""

    scenario_rows = spark.sql(query).collect()
    if not scenario_rows:
        # Fail before any cleanup happens; previously this surfaced later as an
        # UnboundLocalError while building the payload.
        raise ValueError(
            f"run_job: no test scenario found for dimension_table='{selected_dimension}' "
            f"and scenario='{selected_scenario}'"
        )

    for row in scenario_rows:
        # The previous guard (`is not None or is not []`) was always true, so
        # cleanup ran for every row. That is kept, but a missing list is passed
        # as an empty one so cleanup never iterates over None.
        # TODO handle properly later
        cleanup(p_catalog, p_schema, row["cleanup_table_list"] or [])

    # The last row decides the scenario / refresh type sent to the job.
    last_row = scenario_rows[-1]
    scenario_type = last_row["scenario_type"]
    refresh_type = last_row["refresh_type"]

    # Written once with every scenario row. Doing this inside the loop only
    # produced intermediate rows that the next iteration's cleanup deleted.
    populate_runtime_metadata(
        p_catalog,
        p_schema,
        p_dlt_runtime_config_table,
        selected_dimension,
        scenario_type,
        list(scenario_rows),
    )

    payload = {
        "job_id": p_job_id,
        "job_parameters": {
            "dimension_table": selected_dimension,
            # Use the table name this function was given, not the notebook global.
            "dlt_runtime_config_table": p_dlt_runtime_config_table,
            "catalog": p_catalog,
            # NOTE(review): hard-coded; does not follow p_schema.
            "schema": "structured",
            "scenario": selected_scenario,
            "scenario_type": scenario_type,
            "refresh_type": refresh_type,
            "source_system": "",
        },
    }

    response = requests.post(
        f"{p_db_url}/api/2.1/jobs/run-now",
        headers={
            "Authorization": f"Bearer {p_api_token}",
            "Content-Type": "application/json",
        },
        json=payload,
        # Without a timeout an unresponsive endpoint blocks the notebook forever.
        timeout=60,
    )

    if response.status_code == 200:
        print(
            f"run_job: message: Job triggered successfully with run_id: {response.json()['run_id']}"
        )
    else:
        print(f"run_job: error: {response.text}, {response.status_code}")

### Run pipeline
Multiple DLT pipelines cannot be triggered at the same time


In [0]:
# Trigger the workflow for the widget-selected dimension / scenario.
run_job(
    p_db_url=db_workspace_url,
    p_api_token=DATABRICKS_TOKEN,
    p_catalog=catalog,
    p_schema=bronze_schema,
    p_dlt_runtime_config_table=dlt_runtime_config_table,
    p_job_id=job_id,
    selected_dimension=selected_dimension,
    selected_scenario=selected_scenario,
)

run_job: Running job 825821145657127
dimension_table: D_GROUP
  scenario: apply_correction
  refresh_type: incremental_refresh
  source_info: {'key1': {'business_date': '20250408', 'source_system': 'Facets', 'target_table': 'DF_GROUP'}} 
 scenario_type: delta_correction
populate_runtime_metadata::sources_metadata_entries::
["'key1', named_struct('business_date', '20250408', 'source_system', 'Facets' , 'table_name', 'DF_GROUP', 'source_details', 'None','format', 'None')"]
run_job: message: Job triggered successfully with run_id: 587934046247698
