In [0]:
from pyspark.sql.functions import *

In [0]:
# Peek at the first 10 rows of silver.orders before building the fact table.
(
    spark.table("silver.orders")
    .limit(10)
    .display()
)

In [0]:
# Peek at the first 10 rows of silver.order_items (source of the per-order item rollup).
(
    spark.table("silver.order_items")
    .limit(10)
    .display()
)

In [0]:
# Peek at the first 10 rows of silver.payments (source of the per-order payment total).
(
    spark.table("silver.payments")
    .limit(10)
    .display()
)

In [0]:
# Per-order rollup of silver.order_items: exactly one output row per order_id.
#   total_items       - number of item rows on the order
#   primary_seller_id - a single representative seller for the order
#
# min() replaces first(): first() after a groupBy depends on the row order that
# follows the shuffle, so a multi-seller order could get a different seller on
# each run. min() skips NULLs and always returns the same value for the same data.
# `min` here is pyspark.sql.functions.min (the star import shadows the builtin).
# NOTE(review): if silver.order_items carries an item-sequence column, picking the
# seller of the lowest sequence number may match the intended meaning of
# "primary" more closely - confirm against the silver schema.
order_item_agg = (
    spark.table("silver.order_items")
    .groupBy("order_id")
    .agg(
        count("*").alias("total_items"),
        min("seller_id").alias("primary_seller_id"),
    )
)

In [0]:
# Per-order payment total: one row per order_id with the summed payment_value.
# `sum` is pyspark.sql.functions.sum (the star import shadows the builtin).
payments_agg = spark.table("silver.payments").groupBy("order_id").agg(
    sum("payment_value").alias("total_order_amount"),
)


In [0]:
# Order-grain fact table.
# silver.orders is the driving side; both rollups are grouped by order_id, so the
# two left joins cannot multiply rows - the result has one row per silver.orders
# row. Orders without items or payments keep NULLs in the corresponding
# columns (primary_seller_id / total_items / total_order_amount).
fact_orders = (
    spark.table("silver.orders")
    .alias("o")
    .join(order_item_agg.alias("oi"), on="order_id", how="left")
    .join(payments_agg.alias("p"), on="order_id", how="left")
    .select(
        # Keys
        col("o.order_id"),
        col("o.customer_id"),
        col("oi.primary_seller_id"),
        # Purchase date as an integer yyyyMMdd key.
        date_format(col("o.order_purchase_timestamp"), "yyyyMMdd")
        .cast("int")
        .alias("order_date_key"),
        # Measures
        lit(1).alias("order_count"),
        col("p.total_order_amount"),
        col("oi.total_items"),
        col("o.delivery_days"),
        # Whole days from approval to customer delivery (datediff(end, start)).
        datediff(
            col("o.order_delivered_customer_date"),
            col("o.order_approved_at"),
        ).alias("shipping_days"),
        # Status flags: 1 when the status matches, 0 otherwise (including NULL status).
        when(col("o.order_status") == "delivered", lit(1))
        .otherwise(lit(0))
        .alias("is_delivered"),
        when(col("o.order_status") == "canceled", lit(1))
        .otherwise(lit(0))
        .alias("is_canceled"),
    )
)

In [0]:
# Total number of rows in fact_orders; triggers execution of the joins above.
fact_orders.count()

In [0]:
# Number of distinct order_id values in fact_orders. Compare with the total row
# count: the two are equal only when order_id is unique in the fact table.
(
    fact_orders
    .select("order_id")
    .dropDuplicates()
    .count()
)

In [0]:
# Null count per column, returned as a single row with one column per fact column.
# when() without otherwise() yields NULL unless the value is NULL, and count()
# only tallies non-NULL inputs - so each aggregate is the number of NULLs.
fact_orders.select(
    *[
        count(when(col(name).isNull(), name)).alias(name)
        for name in fact_orders.columns
    ]
).display()

In [0]:
# Row count of the order_count column.
# NOTE(review): this counts rows, not the sum of order_count, so it returns the
# same number as fact_orders.count() - confirm whether a sum was intended.
(
    fact_orders
    .select("order_count")
    .count()
)

In [0]:
# Sanity check: number of rows with a negative order amount or a non-positive
# item count. Rows where a compared column is NULL only match if the other
# condition is true, so orders with no items/payments are not flagged here.
fact_orders.where(
    (col("total_order_amount") < 0) | (col("total_items") <= 0)
).count()

In [0]:
# Bounded preview of the fact table, matching the limit(10) used by the other
# preview cells; avoids rendering the full result set just to eyeball rows.
fact_orders.limit(10).display()

In [0]:
# Persist fact_orders as a Delta table at the gold path.
# mode("overwrite") replaces the existing data and overwriteSchema=true lets the
# stored schema be replaced as well if the column set/types have changed.
(
    fact_orders.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("abfss://olist-data@retailds.dfs.core.windows.net/gold/fact_orders")
)

In [0]:
# Row count of the published table, read through the catalog.
# NOTE(review): the write in this notebook saves by path, not via saveAsTable, so
# this query presumably relies on gold.fact_orders already being registered over
# that storage location - confirm, otherwise it fails on a fresh workspace.
(
    spark.sql("""select count(*) from gold.fact_orders""")
    .display()
)