In [0]:
import dlt
from pyspark.sql import functions as F
from pyspark.sql.functions import col,expr,lit
from pyspark.sql.types import StructType, StringType, StructField, IntegerType
from pyspark.sql.session import SparkSession

In [0]:
# Explicit schema applied to the circuit CSV files read by the bronze loader.
#
# Every field is declared nullable. The input is CSV, where any value may be
# missing, and the data-quality rules defined later in this notebook test
# circuitId and name for NULL -- a non-nullable declaration contradicts
# those checks.
# NOTE(review): streaming file sources are understood to force user schemas to
# nullable by default, so this is expected to be a no-op at runtime; confirm
# for Auto Loader on the target runtime.
# NOTE(review): lat / lng / alt are kept as strings exactly as before; confirm
# whether numeric types are wanted downstream.
schema = StructType(
    [
        StructField("circuitId", IntegerType(), True),
        StructField("circuitRef", StringType(), True),
        StructField("name", StringType(), True),
        StructField("location", StringType(), True),
        StructField("country", StringType(), True),
        StructField("lat", StringType(), True),
        StructField("lng", StringType(), True),
        StructField("alt", StringType(), True),
        StructField("url", StringType(), True),
    ]
)

# Reader options handed to the "cloudFiles" stream source in bronze_load:
# the incoming files are CSV and the header option is switched on.
cloud_file_options = {}
cloud_file_options["cloudFiles.format"] = "csv"
cloud_file_options["header"] = True

In [0]:
@dlt.table(name="brz_table_circuit")
def bronze_load():
    """Define the bronze table ``brz_table_circuit``.

    Opens a streaming read in ``cloudFiles`` format over the volume directory
    below, using the module-level ``cloud_file_options`` and ``schema``, and
    appends a ``file_processed_date`` column.

    Returns:
        The streaming DataFrame with the extra ``file_processed_date`` column,
        a string formatted as ``yyyy-MM-dd HH:mm:ss`` from the current
        timestamp.
    """
    raw_stream = (
        spark.readStream.format("cloudFiles")
        .options(**cloud_file_options)
        .schema(schema)
        .load("/Volumes/awsdbx_w1_2358208440317044/default/end2end/")
    )
    # Processing-time stamp, rendered as text; used later as the sequencing column.
    processed_at = F.date_format(F.current_timestamp(), "yyyy-MM-dd HH:mm:ss")
    return raw_stream.withColumn("file_processed_date", processed_at)


In [0]:
# Data-quality expectations: expectation name -> SQL boolean constraint.
# The names are the labels passed to dlt.expect_all, so they are spelled out
# carefully (the first one previously read "vlaues").
checks = {
    "validate col circuitId for null values": "(circuitId is not null)",
    "validate col name for null values": "(name is not null)",
}

# Single SQL predicate that is true only when every constraint above holds;
# used to compute the dq_check flag that splits rows between the staging
# and error tables.
dq_rules = "({0})".format(" and ".join(checks.values()))

In [0]:
@dlt.table(name="stag_silver_load_circuit")
@dlt.expect_all(checks)
def stag_silver_table():
    """Define the staging table ``stag_silver_load_circuit``.

    Streams from ``brz_table_circuit``, adds a boolean ``dq_check`` column
    computed from the combined ``dq_rules`` predicate, and keeps only the rows
    where that flag is true.

    Returns:
        The streaming DataFrame of rows that satisfy every rule, including the
        ``dq_check`` column.
    """
    bronze_stream = dlt.readStream("brz_table_circuit")
    flagged = bronze_stream.withColumn("dq_check", F.expr(dq_rules))
    # Only rows passing all rules continue towards the silver table.
    return flagged.where("dq_check=true")

In [0]:
# Target streaming table for the silver layer.
dlt.create_streaming_table(name="silver_load_circuit")

# Apply the staged rows onto the silver table: one row per circuitId,
# ordered by file_processed_date, stored with SCD type "1".
# NOTE(review): file_processed_date is a processing timestamp with one-second
# resolution, so several updates for the same key may share a sequence value;
# confirm this ordering is acceptable.
dlt.apply_changes(
    source="stag_silver_load_circuit",
    target="silver_load_circuit",
    keys=["circuitId"],
    sequence_by="file_processed_date",
    stored_as_scd_type="1",
)

In [0]:
@dlt.table(name="err_silver_load_circuit")
@dlt.expect_all(checks)
def err_silver_load_circuit():
    """Define the error table ``err_silver_load_circuit``.

    Streams from ``brz_table_circuit``, adds a boolean ``dq_check`` column
    computed from the combined ``dq_rules`` predicate, and keeps only the rows
    where that flag is false, i.e. rows that break at least one rule.

    Returns:
        The streaming DataFrame of rejected rows, including the ``dq_check``
        column.
    """
    bronze_stream = dlt.readStream("brz_table_circuit")
    flagged = bronze_stream.withColumn("dq_check", F.expr(dq_rules))
    # Complement of the staging filter: rows failing the combined predicate.
    return flagged.where("dq_check=false")