In [0]:
import json
from glob import glob

from pathlib import Path

# Build the choices for the 'table' widget: one entry per JSON file found in
# ../layer_02_silver, plus a 'None Selected' placeholder used as the default.
paths = glob('../layer_02_silver/*.json')
# Path.stem removes only the final ".json" suffix, so a file name that itself
# contains dots (e.g. "a.b.json" -> "a.b") is kept intact and the selected
# value still maps back to "<value>.json" when the file is read.
tables = sorted(Path(path).stem for path in paths)
# Appended after sorting so the placeholder is always the last choice.
tables.append('None Selected')

dbutils.widgets.combobox('table', 'None Selected', tables)

In [0]:
%pip install databricks-labs-dqx==0.6.0

In [0]:
# Restart the Python process after the %pip install in the previous cell.
# NOTE(review): per Databricks behaviour this presumably clears Python state
# defined earlier (which would explain why `json` is imported again below) -
# confirm against the runtime in use.
dbutils.library.restartPython()

In [0]:
# Read the current value of the 'table' widget; 'None Selected' is the
# widget's default and is treated as "nothing to do" by the cell below.
table_name = dbutils.widgets.get('table')


In [0]:
import os
import sys
import json
from pathlib import Path

sys.path.append(f"{os.getcwd()}/..")

from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngineCore
from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.profiler.profiler import DQProfiler
from pyspark.sql.functions import col, to_date, when, lit
from pyspark.sql.types import TimestampType, DateType
from functions.utility import apply_job_type

if table_name == 'None Selected':
    print('No table selected.')
else:
    import html

    # Columns for which no check should be emitted. The original cell asked
    # for this exclusion only on a generator result that was then discarded,
    # so it never reached the displayed JSON; it is applied to the real
    # output below.
    excluded_columns = {"_rescued_data"}

    # Profile name (p.name) -> function name written into the generated check.
    # Names missing from this map are passed through unchanged.
    function_map = {
        'in_list': 'is_in',
        'range': 'min_max',
        'not_null_or_empty': 'is_not_null_or_empty',
        'max_length': 'max_length',
        'matches_regex_list': 'matches_regex_list',
        'nonzero': 'is_nonzero',
        'starts_with_prefixes': 'starts_with_prefixes',
    }

    def _profile_to_check(p):
        """Convert one profile object into a check dictionary.

        Args:
            p: A profile exposing ``name``, ``column`` and ``parameters``
               attributes (as returned by ``DQProfiler.profile``).

        Returns:
            dict: ``{'name': '<column>_<function>', 'check': {'function': ...,
            'arguments': {...}}}`` where ``arguments`` always contains the
            column and, when present, every entry of ``p.parameters``.
        """
        function_name = function_map.get(p.name, p.name)
        args = {'column': p.column}
        if p.parameters:
            args.update(p.parameters)
        return {
            'name': f"{p.column}_{function_name}",
            'check': {
                'function': function_name,
                'arguments': args,
            }
        }

    # Resolve the destination table from the selected table's JSON settings.
    settings = json.loads(Path(f'../layer_02_silver/{table_name}.json').read_text())
    settings = apply_job_type(settings)
    dst_table_name = settings['dst_table_name']

    df = spark.table(dst_table_name)
    if "valid_to" in df.columns:
        # Null out valid_to values later than 2100-01-01 before profiling.
        # NOTE(review): presumably these are open-ended sentinel dates that
        # would otherwise skew the profiled range - confirm.
        df = df.withColumn(
            "valid_to",
            when(col("valid_to") > to_date(lit("2100-01-01")), None).otherwise(col("valid_to"))
        )

    profiler = DQProfiler(spark)
    # The first element of the returned pair is not used here.
    _, profiles = profiler.profile(df)

    checks = [
        _profile_to_check(p)
        for p in profiles
        if p.column not in excluded_columns
    ]
    # default=str keeps non-JSON-native parameter values serialisable.
    json_str = json.dumps(checks, indent=4, default=str)

    # Escape once and reuse for both the visible <pre> and the hidden textarea.
    escaped = html.escape(json_str)

    # displayHTML is the Databricks notebook global for rendering raw HTML;
    # the previous display(HTML(...)) referenced an `HTML` name that is never
    # imported in this notebook.
    displayHTML(f"""
    <button onclick="copyJson()">Copy JSON to Clipboard</button>
    <textarea id="json-text" style="display:none;">{escaped}</textarea>
    <pre>{escaped}</pre>
    <script>
    function copyJson() {{
        var textArea = document.getElementById("json-text");
        textArea.style.display = "block";
        textArea.select();
        document.execCommand("copy");
        textArea.style.display = "none";
    }}
    </script>
    """)