In [0]:
# THIS DLT PIPELINE DEMOS HOW TO LOAD A DLT TABLE USING AUTOLOADER
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
from datetime import datetime
import dlt

# Pipeline parameters. Each value is looked up in the Spark conf under the key
# shown and falls back to the paired default when that key is not set.
table1, table2, catalog, schema = (
    spark.conf.get(conf_key, fallback)
    for conf_key, fallback in (
        ("table1", "cust1"),
        ("table2", "order1"),
        ("catalog", "jri"),
        ("schema", "default"),
    )
)

In [0]:
# Data quality rules, one row per rule: (tag, name, constraint).
# The tag is the name of the table the rule applies to.
data = [
    ("bronze_cust1", "correct_schema", "_rescued_data IS NULL"),
    ("bronze_order1", "correct_schema", "_rescued_data IS NULL"),
    ("silver_cust1", "valid_id", "customer_id IS NOT NULL AND customer_id > 0"),
    ("silver_order1", "valid_order_id", "order_id IS NOT NULL AND order_id > 0"),
    ("silver_order1", "valid_cust_id", "customer_id IS NOT NULL AND customer_id > 0"),
]

# Typically only run once; this doesn't have to be part of the DLT pipeline.
# Overwrites the expectations table in the configured catalog/schema.
(
    spark.createDataFrame(data=data, schema=["tag", "name", "constraint"])
    .write.mode("overwrite")
    .saveAsTable(f"{catalog}.{schema}.expectations")
)

In [0]:
# Return the rules matching a tag as a {name: constraint} dict, ready to pass to a DLT expectation decorator.
from pyspark.sql.functions import expr, col

def get_rules(tag):
  """
    Load the data quality rules for one tag from the expectations table.

    Reads the table ``{catalog}.{schema}.expectations`` and keeps the rows
    whose ``tag`` column equals ``tag``.

    :param tag: tag to match; compared as a plain value, so quotes or other
                SQL-significant characters in it are safe
    :return: dictionary mapping rule name -> constraint expression for the
             matching rows (empty when no row matches)
  """
  expectations = spark.table(f"{catalog}.{schema}.expectations")
  # Filter with a column comparison instead of splicing the tag into SQL text:
  # a tag containing a single quote would otherwise break (or alter) the filter.
  matching = expectations.where(expectations["tag"] == tag)
  # If several rows share a name, the last one collected wins.
  return {row["name"]: row["constraint"] for row in matching.collect()}

In [0]:
def _autoloader_csv_stream(table):
    """
    Build the Auto Loader (cloudFiles) streaming reader for one raw table.

    Reads CSV files from the ``raw`` container under ``{table}/``, with column
    type inference enabled and files already present included. The schema
    location is the ``schema`` container under ``bronze_{table}/``.

    :param table: raw table / folder name (e.g. the value of ``table1``)
    :return: the streaming DataFrame returned by ``load``
    """
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.schemaLocation", f"abfss://schema@jridatalakesng.dfs.core.windows.net/bronze_{table}/")
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("cloudFiles.includeExistingFiles", "true")
        .load(f"abfss://raw@jridatalakesng.dfs.core.windows.net/{table}/")
    )


# NOTE(review): both bronze functions share the Python name `incremental_view`,
# so the second definition rebinds it. Presumably harmless because each table
# name is given explicitly via `name=` — confirm before relying on the name.
@dlt.table(name=f"bronze_{table1}",temporary=False)
@dlt.expect_all_or_drop(get_rules(f'bronze_{table1}')) #get the rules from our centralized table.
def incremental_view():
    """Bronze table for ``table1``: Auto Loader stream of its raw CSV files."""
    return _autoloader_csv_stream(table1)


@dlt.table(name=f"bronze_{table2}",temporary=False)
@dlt.expect_all_or_drop(get_rules(f'bronze_{table2}')) #get the rules from our centralized table.
def incremental_view():
    """Bronze table for ``table2``: Auto Loader stream of its raw CSV files."""
    return _autoloader_csv_stream(table2)

@dlt.table(name=f"silver_{table1}")
@dlt.expect_all_or_drop(get_rules(f'silver_{table1}'))  # rules come from the centralized expectations table
def incremental_silver():
    """Silver table for ``table1``: selected columns streamed from its bronze table."""
    query = f"SELECT customer_id,name,mobile_number,A,B FROM STREAM LIVE.BRONZE_{table1}"
    return spark.sql(query)

@dlt.table(name=f"silver_{table2}")
@dlt.expect_all_or_drop(get_rules(f'silver_{table2}'))  # rules come from the centralized expectations table
def incremental_silver():
    """Silver table for ``table2``: selected columns streamed from its bronze table."""
    query = f"SELECT order_id,customer_id,product_id,cob FROM STREAM LIVE.BRONZE_{table2}"
    return spark.sql(query)

@dlt.table(name="gold_aggr")
def incremental_gold():
    """
    Gold table: the ``table1`` silver stream LEFT JOINed to the ``table2``
    silver table on ``customer_id``.
    """
    query = f"SELECT A.customer_id, order_id,product_id,cob,name,mobile_number  FROM STREAM LIVE.SILVER_{table1} A LEFT JOIN LIVE.SILVER_{table2} B on A.customer_id = B.customer_id;"
    return spark.sql(query)