## Silver Layer

In [0]:
import dlt
from pyspark.sql import functions as F

In [0]:
# Shared preprocessing helper
def not_null_and_not_blank(col_name: str):
    """Build a Spark Column predicate that is true for usable values.

    The predicate combines two checks on the column named ``col_name``:
    the value is not NULL, and the value is not the empty string after
    ``trim``.

    Args:
        col_name: Name of the column to test.

    Returns:
        A boolean Column expression suitable for ``where`` / ``filter``.
    """
    column = F.col(col_name)
    has_value = column.isNotNull()
    has_text = F.trim(column) != ""
    return has_value & has_text

In [0]:
@dlt.table(
    name="customers_cleansed",
    comment="Cleansed customers table (silver).",
    table_properties={"quality": "silver"},
)
@dlt.expect_or_fail("pk_must_exist", "customer_id IS NOT NULL AND trim(customer_id) != ''")
@dlt.expect_all_or_drop({
    "state_present": "state IS NOT NULL AND trim(state) != ''",
    "city_present": "city IS NOT NULL AND trim(city) != ''",
    "units_present": "units_purchased IS NOT NULL AND trim(units_purchased) != ''",
    "loyalty_present": "loyalty_segment IS NOT NULL AND trim(loyalty_segment) != ''"
})
def customers_silver():
    """Define the ``customers_cleansed`` silver table.

    Reads ``mlops.bronze.customers_raw`` and projects the five columns kept
    in the silver layer. Row-level validation is left entirely to the
    expectations declared on this function:

    * ``pk_must_exist`` fails the update when ``customer_id`` is NULL/blank.
    * the ``*_present`` expectations drop rows whose other columns are
      NULL/blank.

    The rows are deliberately NOT pre-filtered here. Expectations are
    evaluated on the DataFrame this function returns, so filtering out the
    invalid rows first would make every expectation trivially pass: the
    fail-fast primary-key check could never trigger and the data-quality
    metrics would always report zero violations.

    Returns:
        DataFrame with columns ``customer_id``, ``state``, ``city``,
        ``units_purchased`` and ``loyalty_segment``.
    """
    return spark.table("mlops.bronze.customers_raw").select(
        "customer_id",
        "state",
        "city",
        "units_purchased",
        "loyalty_segment",
    )

In [0]:
@dlt.table(
    name="sales_cleansed",
    comment="Cleansed sales table (silver).",
    table_properties={"quality": "silver"},
)
@dlt.expect_or_fail("pk_must_exist", "customer_id IS NOT NULL AND trim(customer_id) != ''")
@dlt.expect_all_or_drop({
    "state_present": "state IS NOT NULL AND trim(state) != ''",
    "city_present": "city IS NOT NULL AND trim(city) != ''",
    "units_present": "units_purchased IS NOT NULL AND trim(units_purchased) != ''",
    "loyalty_present": "loyalty_segment IS NOT NULL AND trim(loyalty_segment) != ''"
})
# NOTE(review): this function reuses the name `customers_silver`, which is
# already used by the function defining `customers_cleansed` in this file.
# The table is still registered as `sales_cleansed` via `name=`, but the
# Python name is misleading -- consider renaming it (e.g. `sales_silver`).
def customers_silver():
    """Define the ``sales_cleansed`` silver table.

    Reads ``mlops.bronze.customers`` and projects five columns. Row-level
    validation is left entirely to the expectations declared on this
    function:

    * ``pk_must_exist`` fails the update when ``customer_id`` is NULL/blank.
    * the ``*_present`` expectations drop rows whose other columns are
      NULL/blank.

    The rows are deliberately NOT pre-filtered here. Expectations are
    evaluated on the DataFrame this function returns, so filtering out the
    invalid rows first would make every expectation trivially pass: the
    fail-fast key check could never trigger and the data-quality metrics
    would always report zero violations.

    Returns:
        DataFrame with columns ``customer_id``, ``state``, ``city``,
        ``units_purchased`` and ``loyalty_segment``.
    """
    # NOTE(review): the source table and the selected columns are
    # customer-shaped although the target is a sales table; this looks like
    # a copy-paste from the customers cell -- confirm the intended bronze
    # sales source and its schema.
    return spark.table("mlops.bronze.customers").select(
        "customer_id",
        "state",
        "city",
        "units_purchased",
        "loyalty_segment",
    )