In [0]:
%pip install databricks-labs-dqx==0.6.0


In [0]:
import json
from pathlib import Path
from functions.utility import get_function, create_bad_records_table, apply_job_type, schema_exists, print_settings
from functions.history import build_and_merge_file_history, transaction_history
from functions.quality import apply_dqx_checks, count_records

# Variables
# `color` and `job_settings` are read from notebook widgets; `job_settings`
# is parsed as JSON and must contain a 'table' key.
color = dbutils.widgets.get('color')
job_settings = json.loads(dbutils.widgets.get('job_settings'))
table = job_settings['table']

# Locate the table's settings file relative to the current working directory.
# Matches are sorted so the same file is chosen on every run if more than one
# layer directory matches, and a missing file fails with an explicit error
# instead of a bare StopIteration escaping from next().
settings_pattern = f'./layer_*_{color}/{table}.json'
settings_paths = sorted(Path().glob(settings_pattern))
if not settings_paths:
    raise FileNotFoundError(
        f'No settings file matching {settings_pattern!r} found under {Path().resolve()}'
    )

# Read the JSON explicitly as UTF-8 rather than with the locale default.
settings = json.loads(settings_paths[0].read_text(encoding='utf-8'))
settings = apply_job_type(settings)
dst_table_name = settings['dst_table_name']

# Print job and table settings
print_settings(job_settings, settings, color, table)

# Run the ingest with whichever function names the table settings provide.
# Each name is resolved to a callable through get_function.

# One function for pipeline
if 'pipeline_function' in settings:
    # NOTE: no DQX checks are applied in this cell on this path; any quality
    # checks would have to happen inside the pipeline function itself.
    pipeline_function = get_function(settings['pipeline_function'])
    pipeline_function(settings, spark)

# Individual functions for each step
elif all(k in settings for k in ('read_function', 'transform_function', 'write_function')):
    # Resolve all three functions before doing any work so that an unknown
    # function name fails before data is read.
    read_function = get_function(settings['read_function'])
    transform_function = get_function(settings['transform_function'])
    write_function = get_function(settings['write_function'])

    df = read_function(settings, spark)
    df = transform_function(df, settings, spark)

    # apply_dqx_checks returns the frame to write plus the failing records;
    # nothing is written if any record failed.
    df, bad_df = apply_dqx_checks(df, settings, spark)
    n_bad = count_records(bad_df, spark)
    if n_bad > 0:
        raise Exception(f"DQX checks failed: {n_bad} failing records")
    write_function(df, settings, spark)

# Neither a pipeline function nor the complete read/transform/write trio
else:
    raise Exception('Could not find any ingest function name in settings.')

# The bad records table is only created for the bronze layer.
# (Presumably create_bad_records_table itself checks whether bad records
# files exist - confirm against its implementation.)
is_bronze_layer = color == 'bronze'
if is_bronze_layer:
    create_bad_records_table(settings, spark)

# Build history if history settings are provided.
# NOTE: when 'build_history' is absent it is treated as 'false' here for
# every layer; any layer-specific default (e.g. enabling it for bronze) is
# not applied in this cell - presumably it is set upstream in job_settings;
# confirm.
# `or {}` also covers an explicit JSON null for 'history', which would
# otherwise fail on the .get() calls below.
history = job_settings.get('history') or {}
# str(...).lower() accepts both a JSON boolean true and the string 'true'.
if str(history.get('build_history', 'false')).lower() == 'true':
    full_table_name = history.get('full_table_name', dst_table_name)
    history_schema = history.get('history_schema')
    # Assumes full_table_name is a three-part 'catalog.schema.table' name, so
    # the first component is the catalog - TODO confirm for all callers.
    catalog = full_table_name.split('.')[0]
    if history_schema is None:
        print('Skipping history build: no history_schema provided')
    elif schema_exists(catalog, history_schema, spark):
        build_and_merge_file_history(full_table_name, history_schema, spark)
        transaction_history(full_table_name, history_schema, spark)
    else:
        # A missing history schema skips the history build without failing the job.
        print(f'Skipping history build: schema {catalog}.{history_schema} not found')
