In [None]:
from datetime import timedelta

from pyspark.sql import functions as F


# Target location: every table in this notebook lives under <catalog>.<schema>.
CATALOG = "main"
SCHEMA = "retail_p1"
NAMESPACE = ".".join([CATALOG, SCHEMA])

# Silver inputs read by the cells below.
SILVER_ORDERS_TABLE = NAMESPACE + ".silver_orders_clean"
SILVER_PRODUCTS_LATEST_TABLE = NAMESPACE + ".silver_products_latest"

# Gold marts written by the cells below.
GOLD_DAILY_REVENUE_TABLE = NAMESPACE + ".gold_daily_revenue"
GOLD_CUSTOMER_LTV_TABLE = NAMESPACE + ".gold_customer_ltv"
GOLD_CATEGORY_PERFORMANCE_TABLE = NAMESPACE + ".gold_category_performance"

# Status values are listed in lower case; the orders cell lower-cases the
# status column before comparing against these lists.
CANCELED_STATUSES = ["canceled", "cancelled", "unavailable"]
RETURNED_STATUSES = ["returned", "return_requested"]
CANCELED_OR_RETURNED_STATUSES = [*CANCELED_STATUSES, *RETURNED_STATUSES]

# Make sure the target schema exists before any table is written into it.
spark.sql("CREATE SCHEMA IF NOT EXISTS " + NAMESPACE)




In [None]:
# Project the silver orders onto the columns used downstream, with explicit types.
# Status is lower-cased so it can be matched against the lower-case status lists.
silver_orders_typed = spark.table(SILVER_ORDERS_TABLE).select(
    F.col("order_id"),
    F.col("customer_id"),
    F.col("product_id"),
    F.to_timestamp("order_ts").alias("order_ts"),
    F.col("quantity").cast("int").alias("quantity"),
    F.col("price").cast("double").alias("price"),
    F.lower(F.col("status")).alias("status"),
    F.col("channel").cast("string").alias("channel"),
)

# True for order lines whose status is in the canceled/returned lists.
is_canceled_or_returned = F.col("status").isin(*CANCELED_OR_RETURNED_STATUSES)

# Net revenue is zero for canceled/returned lines and the gross line revenue otherwise.
net_line_revenue = F.when(is_canceled_or_returned, F.lit(0.0)).otherwise(
    F.col("line_revenue")
)

orders = (
    silver_orders_typed
    .withColumn("dt", F.to_date("order_ts"))
    .withColumn("line_revenue", F.col("quantity") * F.col("price"))
    .withColumn("net_line_revenue", net_line_revenue)
)




In [None]:
# One row per order date: gross and net revenue plus the number of distinct orders.
daily_totals = orders.groupBy("dt").agg(
    F.round(F.sum("line_revenue"), 2).alias("gross_revenue"),
    F.round(F.sum("net_line_revenue"), 2).alias("net_revenue"),
    F.countDistinct("order_id").alias("order_count"),
)

# Average order value = net revenue / distinct orders; 0.0 when no order was counted.
average_order_value = F.round(
    F.when(
        F.col("order_count") > 0,
        F.col("net_revenue") / F.col("order_count"),
    ).otherwise(0.0),
    2,
)

gold_daily_revenue = daily_totals.withColumn("aov", average_order_value).select(
    "dt", "gross_revenue", "net_revenue", "order_count", "aov"
)

# Full refresh: replace both the data and the schema of the Delta table.
gold_daily_revenue.write.saveAsTable(
    GOLD_DAILY_REVENUE_TABLE,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)




In [None]:
# The look-back windows are anchored on the newest order timestamp found in the
# data, not on wall-clock time.
max_order_ts = orders.agg(F.max("order_ts")).first()[0]


def _ltv_since(cutoff_ts):
    """Return the rounded sum of net revenue for order lines at or after ``cutoff_ts``."""
    in_window = F.col("order_ts") >= F.lit(cutoff_ts)
    windowed_revenue = F.when(in_window, F.col("net_line_revenue")).otherwise(0.0)
    return F.round(F.sum(windowed_revenue), 2)


if max_order_ts is not None:
    ltv_cutoff_90d = max_order_ts - timedelta(days=90)
    ltv_cutoff_180d = max_order_ts - timedelta(days=180)

    # Lines without a customer_id cannot be attributed to anyone, so they are dropped.
    attributable_orders = orders.filter(F.col("customer_id").isNotNull())

    gold_customer_ltv = (
        attributable_orders.groupBy("customer_id")
        .agg(
            _ltv_since(ltv_cutoff_90d).alias("ltv_90d"),
            _ltv_since(ltv_cutoff_180d).alias("ltv_180d"),
            F.round(F.sum("net_line_revenue"), 2).alias("ltv_total"),
        )
        .select("customer_id", "ltv_90d", "ltv_180d", "ltv_total")
    )
else:
    # No non-null order_ts at all: write an empty table with the expected columns.
    gold_customer_ltv = spark.createDataFrame(
        [],
        "customer_id string, ltv_90d double, ltv_180d double, ltv_total double",
    )

# Full refresh: replace both the data and the schema of the Delta table.
gold_customer_ltv.write.saveAsTable(
    GOLD_CUSTOMER_LTV_TABLE,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)




In [None]:
# Product -> category lookup; a missing category is labelled "unknown".
# NOTE(review): the join below assumes one row per product_id in this table;
# duplicates would fan out order lines — confirm against the silver layer.
products = spark.table(SILVER_PRODUCTS_LATEST_TABLE).select(
    F.col("product_id"),
    F.coalesce(F.col("category"), F.lit("unknown")).alias("category"),
)

# Left join keeps order lines whose product is absent from the lookup; those
# lines get a null category from the join, which is also mapped to "unknown".
orders_with_category = orders.join(products, "product_id", "left").withColumn(
    "category", F.coalesce(F.col("category"), F.lit("unknown"))
)

# order_id for returned lines and null otherwise, so countDistinct only counts returns.
returned_order_id = F.when(
    F.col("status").isin(*RETURNED_STATUSES),
    F.col("order_id"),
)

# Note: revenue is net of canceled/returned lines, while units_sold sums the
# quantity of every line regardless of status.
category_daily_totals = orders_with_category.groupBy("dt", "category").agg(
    F.round(F.sum("net_line_revenue"), 2).alias("revenue"),
    F.sum("quantity").alias("units_sold"),
    F.countDistinct("order_id").alias("total_orders"),
    F.countDistinct(returned_order_id).alias("returned_orders"),
)

# Share of distinct orders with a returned status; 0.0 when no order was counted.
return_rate = F.round(
    F.when(
        F.col("total_orders") > 0,
        F.col("returned_orders") / F.col("total_orders"),
    ).otherwise(0.0),
    4,
)

gold_category_performance = category_daily_totals.withColumn(
    "return_rate", return_rate
).select("dt", "category", "revenue", "units_sold", "return_rate")

# Full refresh: replace both the data and the schema of the Delta table.
gold_category_performance.write.saveAsTable(
    GOLD_CATEGORY_PERFORMANCE_TABLE,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)




In [None]:
# Post-refresh maintenance: compact / Z-order the tables and refresh statistics.
# Note that the first statement also rewrites the silver orders table, not just
# the gold marts produced by this notebook.
optimization_sql = [
    f"OPTIMIZE {SILVER_ORDERS_TABLE} ZORDER BY (order_ts, customer_id)",
    f"OPTIMIZE {GOLD_DAILY_REVENUE_TABLE} ZORDER BY (dt)",
    f"OPTIMIZE {GOLD_CUSTOMER_LTV_TABLE} ZORDER BY (customer_id)",
    f"OPTIMIZE {GOLD_CATEGORY_PERFORMANCE_TABLE} ZORDER BY (dt, category)",
    f"ANALYZE TABLE {GOLD_DAILY_REVENUE_TABLE} COMPUTE STATISTICS",
    f"ANALYZE TABLE {GOLD_CUSTOMER_LTV_TABLE} COMPUTE STATISTICS",
    f"ANALYZE TABLE {GOLD_CATEGORY_PERFORMANCE_TABLE} COMPUTE STATISTICS",
]

# Maintenance is best-effort: a failing statement is logged and the loop moves
# on, but failures are collected so the final status line does not claim that
# everything was executed when it was not.
failed_statements = []
for statement in optimization_sql:
    try:
        spark.sql(statement)
    except Exception as exc:
        failed_statements.append(statement)
        print(f"[WARN] {statement} failed: {exc}")
    else:
        print(f"[OK] {statement}")

if failed_statements:
    print(
        f"[DONE] Gold marts refreshed; {len(failed_statements)} of "
        f"{len(optimization_sql)} optimization commands failed (see [WARN] lines above)."
    )
else:
    print("[DONE] Gold marts refreshed and optimization commands executed.")
