In [0]:
# ============================================================
# CELL 1: INFRASTRUCTURE CONNECTION & AUTHENTICATION
# ============================================================
# Compatible with: Shared Clusters / Unity Catalog
# ============================================================

# 1. Pull the Service Principal credentials from the Databricks secret scope.
#    Nothing sensitive is hard-coded in the notebook.
_secret_scope = "shopsmart-scope"
client_id     = dbutils.secrets.get(scope=_secret_scope, key="datalake-sp-client-id")
client_secret = dbutils.secrets.get(scope=_secret_scope, key="datalake-sp-client-secret")
tenant_id     = dbutils.secrets.get(scope=_secret_scope, key="datalake-sp-tenant-id")

storage_account_name = "dlsshopsmartdev123"

# 2. Session-level OAuth 2.0 (client credentials) settings for this storage account.
#    Every key is scoped to the account by appending its DFS host name.
_account_host = f"{storage_account_name}.dfs.core.windows.net"
_oauth_settings = [
    ("fs.azure.account.auth.type", "OAuth"),
    ("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    ("fs.azure.account.oauth2.client.id", client_id),
    ("fs.azure.account.oauth2.client.secret", client_secret),
    ("fs.azure.account.oauth2.client.endpoint", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"),
]
for _conf_prefix, _conf_value in _oauth_settings:
    spark.conf.set(f"{_conf_prefix}.{_account_host}", _conf_value)

# 3. Root URI of each medallion layer (one storage container per layer)
def _container_uri(container):
    """Return the abfss:// root URI of `container` in the configured storage account."""
    return f"abfss://{container}@{storage_account_name}.dfs.core.windows.net"

BRONZE = _container_uri("bronze")
SILVER = _container_uri("silver")
GOLD   = _container_uri("gold")

# Echo the resolved locations so a misconfigured account name is obvious at a glance.
print(f"✅ Authentication configured for: {storage_account_name}")
for _label, _uri in (("Bronze: ", BRONZE), ("Silver: ", SILVER), ("Gold:   ", GOLD)):
    print(f"   📁 {_label}{_uri}")

In [0]:
# ============================================================
# CELL 11: GOLD LAYER - dim_date (Date Dimension)
# ============================================================
#
# WHAT IS A DATE DIMENSION?
# -------------------------
# A pre-built calendar table with one row per day.
# Instead of calculating "is this a weekend?" or 
# "what quarter is this?" in every query, we calculate
# it ONCE and store it.
#
# Every fact table joins to dim_date via a date key.
# Example: fact_sales.order_date_key -> dim_date.date_key
#
# WHY NOT JUST USE THE DATE COLUMN DIRECTLY?
# 1. Performance: pre-computed attributes avoid runtime functions
# 2. Consistency: everyone uses the same fiscal year definition
# 3. Filtering: "Show Q4 2025" is just WHERE quarter = 4
# 4. Custom attributes: holidays, fiscal periods, pay days
#    can't be derived from a raw date
#
# THIS IS ASKED IN EVERY DATA ENGINEERING INTERVIEW.
# "How would you design a date dimension?" is a classic question.
#
# We generate dates from 2024-01-01 to 2026-12-31 (3 years)
# to cover all historical and future data.
# ============================================================

from pyspark.sql.functions import *
from pyspark.sql.types import *

# ----------------------------------------------------------
# Step 1: Generate a sequence of dates
# ----------------------------------------------------------
# HOW THIS WORKS:
# 1. sequence() creates an array of dates from start to end
#    sequence(2024-01-01, 2026-12-31) = [2024-01-01, 2024-01-02, ...]
# 2. explode() converts the array into individual rows
#    One array of 1096 dates -> 1096 rows
#
# This is a common Spark pattern for generating reference data
# without needing an external source.

# Calendar spine for the date dimension: sequence() yields one array holding
# every day in the range, explode() fans that array out to one row per day.
_calendar_query = """
    SELECT explode(sequence(
        to_date('2024-01-01'),
        to_date('2026-12-31'),
        interval 1 day
    )) as date
"""
df_dates = spark.sql(_calendar_query)

# count() is a Spark action, so a job runs here to report the number of days.
total_dates = df_dates.count()
print(f"STEP 1: Generated {total_dates} dates (2024-2026)")


# ----------------------------------------------------------
# Step 2: Build all date attributes
# ----------------------------------------------------------
# Each attribute serves a specific analytics purpose:
#
# date_key (int): 20250115 format
#   Used as JOIN key with fact tables.
#   Integer keys are faster than date JOINs.
#   Format: YYYYMMDD
#
# year, quarter, month, day: Basic components
#   "Revenue by quarter" -> GROUP BY quarter
#
# month_name, day_name: Human-readable labels
#   Dashboards show "January" not "1"
#
# week_of_year: For weekly reporting
#   "Week-over-week growth" needs this
#
# is_weekend: Saturday/Sunday flag
#   "Weekend vs weekday sales patterns"
#   dayofweek returns 1=Sunday, 7=Saturday
#
# is_month_start, is_month_end: Boundary flags
#   Financial reporting often focuses on month boundaries
#
# quarter_label: "Q1-2025" format
#   Clean label for dashboard filters
#
# day_of_year: 1-365 (1-366 in leap years such as 2024)
#   Useful for year-over-year comparison at the day level

# Shared building blocks: each expression is defined once and reused below.
_d = col("date")
_year = year(_d)
_quarter = quarter(_d)
_month = month(_d)
_day_of_month = dayofmonth(_d)
_day_of_week = dayofweek(_d)          # 1 = Sunday ... 7 = Saturday
_month_abbrev = date_format(_d, "MMM")
_year_text = _year.cast("string")
_in_first_half = _month <= 6


def _flag(condition):
    """Wrap a boolean condition so the resulting column is strictly True/False."""
    return when(condition, lit(True)).otherwise(lit(False))


# One row per calendar day; column order is part of the table contract.
df_dim_date = df_dates.select(
    # Keys
    date_format(_d, "yyyyMMdd").cast("int").alias("date_key"),
    _d.alias("full_date"),
    # Basic calendar components
    _year.alias("year"),
    _quarter.alias("quarter"),
    _month.alias("month"),
    _day_of_month.alias("day"),
    # Human-readable month / weekday labels
    date_format(_d, "MMMM").alias("month_name"),
    _month_abbrev.alias("month_short"),
    _day_of_week.alias("day_of_week"),
    date_format(_d, "EEEE").alias("day_name"),
    date_format(_d, "EEE").alias("day_short"),
    weekofyear(_d).alias("week_of_year"),
    dayofyear(_d).alias("day_of_year"),
    # Boolean flags
    _flag(_day_of_week.isin(1, 7)).alias("is_weekend"),
    _flag(_day_of_month == 1).alias("is_month_start"),
    _flag(_d == last_day(_d)).alias("is_month_end"),
    _flag(_month.isin(11, 12, 1)).alias("is_holiday_season"),
    # Dashboard labels
    concat(lit("Q"), _quarter.cast("string"), lit("-"), _year_text).alias("quarter_label"),
    concat(_month_abbrev, lit("-"), _year_text).alias("month_year_label"),
    # Half-year bucket (January-June = first half)
    when(_in_first_half, lit(1)).otherwise(lit(2)).alias("half_year"),
    when(_in_first_half, lit("H1")).otherwise(lit("H2")).alias("half_year_label"),
)

print(f"STEP 2: Date attributes built - {len(df_dim_date.columns)} columns")


# ----------------------------------------------------------
# Step 3: Write to Gold layer
# ----------------------------------------------------------
# Target location of the date dimension inside the Gold container.
gold_dim_date_path = f"{GOLD}/dim_date"

# Full refresh: replace both the data and the schema of the Delta table.
(
    df_dim_date.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", True)
    .save(gold_dim_date_path)
)


# ----------------------------------------------------------
# Step 4: Verify
# ----------------------------------------------------------
# Read the table back from storage so the checks reflect what was persisted.
df_verify = spark.read.format("delta").load(gold_dim_date_path)
final_count = df_verify.count()

_banner = "=" * 65
print()
print(_banner)
print("GOLD dim_date - COMPLETE")
print(_banner)
print(f"  Total dates:  {final_count} rows")
print("  Date range:   2024-01-01 to 2026-12-31")
print(f"  Columns:      {len(df_verify.columns)}")
print(f"  Path:         {gold_dim_date_path}")

print("\n  Schema:")
df_verify.printSchema()

# Spot check: the first five days of January 2025.
print("\n  Sample (first 5 days of 2025):")
_first_days_2025 = (
    df_verify
    .where(col("year") == 2025)
    .where(col("month") == 1)
    .where(col("day") <= 5)
    .orderBy("full_date")
)
_first_days_2025.show(truncate=False)

print("\n  Weekend vs Weekday count:")
_weekend_split = df_verify.groupBy("is_weekend").count()
_weekend_split.show()

print("\n  Records per year:")
_rows_per_year = df_verify.groupBy("year").count().orderBy("year")
_rows_per_year.show()

print("\n  Quarter labels sample:")
_quarter_labels = df_verify.select("quarter_label").distinct().orderBy("quarter_label")
_quarter_labels.show(12)

print("[DONE] Gold dim_date complete!")
print("[NEXT] Cell 12 - Gold dim_customer")

In [0]:
# ============================================================
# CELL 12: GOLD LAYER - dim_customer (Customer Dimension)
# ============================================================
#
# WHAT IS dim_customer?
# ---------------------
# The customer dimension describes WHO made the purchase.
# Every row in fact_sales will JOIN to dim_customer to answer:
#   "Revenue by loyalty tier"
#   "Orders by age group"
#   "Customer count by state"
#
# SOURCE: silver/customers (already cleaned & PII masked)
#
# WHAT WE ADD IN GOLD:
# Gold layer is about BUSINESS PERSPECTIVE, not cleaning.
# We select only the columns analysts need and add
# surrogate keys.
#
# SURROGATE KEY vs NATURAL KEY:
# - Natural key: customer_id ("CUST001") - from source system
# - Surrogate key: customer_sk (1, 2, 3...) - generated by us
#
# WHY SURROGATE KEYS?
# 1. Performance: integer JOINs faster than string JOINs
# 2. Independence: if source changes CUST001 to C-001, 
#    our surrogate key stays the same
# 3. History: enables SCD Type 2 (tracking changes over time)
# 4. Standard practice: every data warehouse uses them
#
# SCD TYPE 2 (Slowly Changing Dimension):
# When a customer moves from "Silver" to "Gold" loyalty tier,
# we want to keep BOTH versions:
#   customer_sk=1, CUST001, Silver, effective 2024-01-01 to 2025-06-01
#   customer_sk=2, CUST001, Gold,   effective 2025-06-01 to 9999-12-31
#
# This lets us analyze: "What was the customer's tier WHEN 
# they placed that order in March?" — historical accuracy.
#
# For this project, we implement a SIMPLIFIED SCD Type 2
# (single version per customer since we have point-in-time data).
# In production, you'd run this incrementally with MERGE.
# ============================================================

from pyspark.sql.functions import *
from pyspark.sql.types import *

# ----------------------------------------------------------
# Step 1: Read Silver Customers
# ----------------------------------------------------------
# Source for the customer dimension: the Silver customers Delta table.
_silver_customers_path = f"{SILVER}/customers"
df_cust_silver = spark.read.format("delta").load(_silver_customers_path)

# Row count is kept so the verification step can compare Silver against Gold.
silver_count = df_cust_silver.count()
print(f"STEP 1: Silver Customers read - {silver_count} rows")


# ----------------------------------------------------------
# Step 2: Build dim_customer with surrogate key
# ----------------------------------------------------------
# monotonically_increasing_id() generates unique IDs.
# NOTE: These IDs are NOT sequential (1, 2, 3...).
# They're globally unique across partitions.
# For a true sequential key, we use row_number() instead.
#
# WHY row_number() OVER orderBy(customer_id)?
# - Guarantees sequential: 1, 2, 3, 4, ...
# - Deterministic: same input always gives same keys (as long as customer_id is unique; ties are numbered in arbitrary order)
# - Sorted: customer_sk order matches customer_id order
# This is important for debugging and testing.

from pyspark.sql.window import Window

# Global ordering by the natural key; row_number() over it yields 1, 2, 3, ...
window_sk = Window.orderBy("customer_id")

# Silver columns carried into the dimension unchanged, in output order.
_customer_passthrough = [
    "customer_id",
    "first_name",
    "last_name",
    "first_name_initial",
    "last_name_initial",
    "email_hash",
    "email_domain",
    "phone_masked",
    "gender",
    "date_of_birth",
    "age",
    "age_group",
    "loyalty_tier",
    "registration_date",
    "customer_tenure_days",
    "tenure_category",
    "address_city",
    "address_state",
    "address_zip",
    "address_country",
    "pref_categories",
    "pref_communication",
    "has_email",
]

df_dim_customer = df_cust_silver.select(
    # Surrogate key first
    row_number().over(window_sk).alias("customer_sk"),
    *_customer_passthrough,
    # SCD Type 2 columns: a single open-ended, current version per customer
    col("registration_date").alias("effective_start_date"),
    to_date(lit("9999-12-31")).alias("effective_end_date"),
    lit(True).alias("is_current"),
    # Gold metadata
    current_timestamp().alias("_gold_processed_at"),
    lit("1.0").alias("_gold_version"),
)

print("STEP 2: dim_customer built with surrogate key and SCD2 columns")
print(f"  Columns: {len(df_dim_customer.columns)}")


# ----------------------------------------------------------
# Step 3: Write to Gold layer
# ----------------------------------------------------------
# Target location of the customer dimension inside the Gold container.
gold_dim_customer_path = f"{GOLD}/dim_customer"

# Full refresh: replace both the data and the schema of the Delta table.
(
    df_dim_customer.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", True)
    .save(gold_dim_customer_path)
)


# ----------------------------------------------------------
# Step 4: Verify
# ----------------------------------------------------------
# Read the table back from storage so the checks reflect what was persisted.
df_verify = spark.read.format("delta").load(gold_dim_customer_path)
final_count = df_verify.count()

_banner = "=" * 65
print()
print(_banner)
print("GOLD dim_customer - COMPLETE")
print(_banner)
print(f"  Source (Silver):  {silver_count} rows")
print(f"  Final Gold:       {final_count} rows")
print(f"  Columns:          {len(df_verify.columns)}")
print(f"  Path:             {gold_dim_customer_path}")

print("\n  Schema:")
df_verify.printSchema()

# Surrogate keys next to the natural key and a few descriptive attributes
print("\n  Surrogate key sample:")
_sk_sample_cols = [
    "customer_sk", "customer_id", "first_name", "last_name",
    "gender", "age_group", "loyalty_tier",
]
df_verify.select(*_sk_sample_cols).orderBy("customer_sk").show(5, truncate=False)

# Validity window columns added for SCD Type 2
print("\n  SCD Type 2 columns:")
_scd_cols = [
    "customer_sk", "customer_id", "loyalty_tier",
    "effective_start_date", "effective_end_date", "is_current",
]
df_verify.select(*_scd_cols).orderBy("customer_sk").show(5, truncate=False)

# Geographic distribution, largest states first
print("\n  Top 10 states by customer count:")
_by_state = df_verify.groupBy("address_state").count()
_by_state.orderBy(col("count").desc()).show(10)

# Cross-tab of age group and loyalty tier
print("\n  Loyalty tier by age group:")
_tier_by_age = (
    df_verify
    .groupBy("age_group", "loyalty_tier")
    .count()
    .orderBy("age_group", "loyalty_tier")
)
_tier_by_age.show(25)

print("[DONE] Gold dim_customer complete!")
print("[NEXT] Cell 13 - Gold dim_product")

In [0]:
# ============================================================
# CELL 13: GOLD LAYER - dim_product (Product Dimension)
# ============================================================
#
# WHAT IS dim_product?
# --------------------
# Describes WHAT was sold. Every fact_sales row joins here.
# Business questions this enables:
#   "Revenue by category" 
#   "Top 10 brands by sales"
#   "Average margin by price tier"
#   "Products to discontinue (low rating + low sales)"
#
# SOURCE: silver/products (already cleaned, attributes flattened)
#
# WHAT WE ADD IN GOLD:
# - Surrogate key (product_sk)
# - Only business-relevant columns (drop technical metadata)
# - Consistent column ordering (keys first, then attributes)
#
# DESIGN PRINCIPLE:
# Dimension tables should be WIDE (many columns) but SHORT 
# (few rows). Our dim_product has 50 rows and ~25 columns.
# This is typical - a retail company might have 50,000 products
# with 30+ attributes each.
#
# The "width" of dimensions is what makes star schema powerful.
# One JOIN to dim_product gives you access to category, brand,
# price tier, rating, margin - all in one hop.
# ============================================================

from pyspark.sql.functions import *
from pyspark.sql.window import Window

# ----------------------------------------------------------
# Step 1: Read Silver Products
# ----------------------------------------------------------
# Source for the product dimension: the Silver products Delta table.
_silver_products_path = f"{SILVER}/products"
df_prod_silver = spark.read.format("delta").load(_silver_products_path)

# Row count is kept so the verification step can compare Silver against Gold.
silver_count = df_prod_silver.count()
print(f"STEP 1: Silver Products read - {silver_count} rows")


# ----------------------------------------------------------
# Step 2: Build dim_product with surrogate key
# ----------------------------------------------------------
# Same row_number() approach as dim_customer.
# Ordered by product_id for deterministic key assignment.

# Global ordering by the natural key; row_number() over it yields 1, 2, 3, ...
window_sk = Window.orderBy("product_id")

# Output columns in their final order. Plain strings are pass-through columns;
# Column expressions are computed or renamed.
_product_columns = [
    # Keys (always first in a dimension)
    row_number().over(window_sk).alias("product_sk"),
    "product_id",
    # Descriptive attributes
    "product_name",
    "category",
    "sub_category",
    "brand",
    "supplier_id",
    # Price and cost
    col("price").alias("current_price"),
    "cost_price",
    "profit_margin",
    "margin_pct",
    "price_tier",
    # Product characteristics
    "weight_kg",
    "rating",
    "review_count",
    "rating_category",
    "is_active",
    # Flattened attributes
    "attr_battery_life",
    "attr_colors",
    "attr_connectivity",
    # Time attributes (renamed so they cannot be confused with Gold metadata)
    col("created_at").alias("product_created_at"),
    col("updated_at").alias("product_updated_at"),
    "product_age_days",
    # Gold metadata
    current_timestamp().alias("_gold_processed_at"),
    lit("1.0").alias("_gold_version"),
]

df_dim_product = df_prod_silver.select(*_product_columns)

print(f"STEP 2: dim_product built - {len(df_dim_product.columns)} columns")


# ----------------------------------------------------------
# Step 3: Write to Gold layer
# ----------------------------------------------------------
# Target location of the product dimension inside the Gold container.
gold_dim_product_path = f"{GOLD}/dim_product"

# Full refresh: replace both the data and the schema of the Delta table.
(
    df_dim_product.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", True)
    .save(gold_dim_product_path)
)


# ----------------------------------------------------------
# Step 4: Verify
# ----------------------------------------------------------
# Read the table back from storage so the checks reflect what was persisted.
df_verify = spark.read.format("delta").load(gold_dim_product_path)
final_count = df_verify.count()

_banner = "=" * 65
print()
print(_banner)
print("GOLD dim_product - COMPLETE")
print(_banner)
print(f"  Source (Silver):  {silver_count} rows")
print(f"  Final Gold:       {final_count} rows")
print(f"  Columns:          {len(df_verify.columns)}")
print(f"  Path:             {gold_dim_product_path}")

print("\n  Schema:")
df_verify.printSchema()

# Surrogate keys next to the natural key and headline attributes
print("\n  Surrogate key sample:")
_sk_sample_cols = [
    "product_sk", "product_id", "product_name",
    "category", "brand", "current_price", "price_tier",
]
df_verify.select(*_sk_sample_cols).orderBy("product_sk").show(5, truncate=False)


def _avg_2dp(column_name):
    """Average of `column_name`, rounded to two decimals (Spark round/avg)."""
    return round(avg(column_name), 2)


# Per-category product count, price, margin, rating and number of active products
print("\n  Category analysis:")
_category_stats = df_verify.groupBy("category").agg(
    count("*").alias("products"),
    _avg_2dp("current_price").alias("avg_price"),
    _avg_2dp("margin_pct").alias("avg_margin_pct"),
    _avg_2dp("rating").alias("avg_rating"),
    sum(col("is_active").cast("int")).alias("active_count"),
)
_category_stats.orderBy(desc("products")).show()

# Price range and margin of each price tier, cheapest tier first
print("\n  Price tier analysis:")
_tier_stats = df_verify.groupBy("price_tier").agg(
    count("*").alias("products"),
    round(min("current_price"), 2).alias("min_price"),
    round(max("current_price"), 2).alias("max_price"),
    _avg_2dp("margin_pct").alias("avg_margin_pct"),
)
_tier_stats.orderBy("min_price").show()

print("[DONE] Gold dim_product complete!")
print("[NEXT] Cell 14 - Gold fact_sales (THE MAIN FACT TABLE)")

In [0]:
# ============================================================
# CELL 14: GOLD LAYER - fact_sales (Main Fact Table)
# ============================================================
#
# THIS IS THE HEART OF THE ENTIRE DATA PLATFORM.
#
# WHAT IS A FACT TABLE?
# ---------------------
# A fact table records BUSINESS EVENTS (things that happened).
# Each row = one item sold in one order.
# This is called the "grain" of the fact table.
#
# GRAIN = "The most atomic level of detail in the fact table"
# Our grain: ONE ORDER ITEM (not one order!)
#
# WHY ORDER ITEM LEVEL (not order level)?
# If we aggregate to order level, we lose product-level detail:
#   - "Which product sold the most?" needs item-level
#   - "Revenue by category" needs item + product JOIN
#   - "Average items per order" needs item count per order
#
# Rule: Always choose the LOWEST useful grain. You can always
# aggregate UP (items -> orders), but you can't disaggregate 
# DOWN (orders -> items) without the source data.
#
# HOW FACT TABLE CONNECTS TO DIMENSIONS:
#
#   dim_date -------- date_key -------- fact_sales
#   dim_customer ---- customer_id ----- fact_sales  
#   dim_product ----- product_id ------ fact_sales
#
# This is the STAR shape:
#           dim_date
#              |
#   dim_customer -- fact_sales -- dim_product
#
# MEASURES (numeric values we analyze):
#   quantity, unit_price, line_total, discount_amount,
#   net_line_total, shipping_amount
#
# These are the numbers we SUM, AVG, COUNT in dashboards.
#
# DEGENERATE DIMENSIONS:
#   order_id and item_id live in the fact table directly.
#   They don't have their own dimension table because
#   there's nothing more to describe about them.
#   This is called a "degenerate dimension" - a dimension
#   key without a dimension table.
# ============================================================

from pyspark.sql.functions import *
from pyspark.sql.types import *

# ----------------------------------------------------------
# Step 1: Read Silver tables we need to JOIN
# ----------------------------------------------------------
# The two Silver inputs of the fact table: order headers and their line items.
_silver_delta = spark.read.format("delta")
df_orders = _silver_delta.load(f"{SILVER}/orders")
df_items = _silver_delta.load(f"{SILVER}/order_items")

# Input sizes are reported here and again in the final summary.
orders_count = df_orders.count()
items_count = df_items.count()
print("STEP 1: Silver data read")
print(f"  Orders:      {orders_count} rows")
print(f"  Order Items: {items_count} rows")


# ----------------------------------------------------------
# Step 2: JOIN orders with order_items
# ----------------------------------------------------------
# WHY JOIN IN GOLD (not Silver)?
# Silver tables are INDEPENDENT cleaned tables.
# Gold is where we COMBINE them for analytics.
#
# JOIN TYPE: INNER JOIN
# We only want items that belong to valid orders.
# If an item has an order_id that doesn't exist in orders,
# it's orphaned data and should not be in the fact table.
#
# WHAT COLUMNS COME FROM WHERE:
# From orders: customer_id, order_date, order_status, 
#              payment_method, channel, shipping_amount
# From items:  product_id, quantity, unit_price,
#              discount_percent, line_total, net_line_total
#
# order_id exists in both tables, so we alias them ("o" = orders, "i" = items)
# and qualify every column reference to keep it unambiguous.

# Alias both sides so every downstream column reference can be qualified.
_orders_o = df_orders.alias("o")
_items_i = df_items.alias("i")

# Inner join: only line items that belong to an existing order survive.
df_joined = _orders_o.join(
    _items_i,
    on=col("o.order_id") == col("i.order_id"),
    how="inner",
)

joined_count = df_joined.count()
print(f"STEP 2: Orders JOIN Order Items = {joined_count} rows")

# Items whose order_id has no match in orders (dropped by the inner join above)
orphan_items = df_items.join(df_orders, on="order_id", how="left_anti").count()
print(f"  Orphaned items (no matching order): {orphan_items}")


# ----------------------------------------------------------
# Step 3: Build fact_sales
# ----------------------------------------------------------
# SELECT COLUMNS carefully:
#
# KEYS (for joining to dimensions):
#   order_date_key -> joins to dim_date.date_key
#   customer_id    -> joins to dim_customer.customer_id
#   product_id     -> joins to dim_product.product_id
#
# DEGENERATE DIMENSIONS (identifiers without dim tables):
#   order_id       -> for drill-down to specific order
#   item_id        -> for uniqueness (primary key of fact)
#
# MEASURES (numbers we aggregate):
#   quantity              -> SUM for total units sold
#   unit_price            -> for price analysis
#   line_total            -> quantity * unit_price (gross)
#   discount_percent      -> for discount analysis
#   discount_amount       -> SUM for total discounts given
#   net_line_total        -> revenue after discount
#   shipping_amount       -> order-level value repeated on every line item (do not SUM across items)
#
# ATTRIBUTES (for filtering/grouping):
#   order_status, payment_method, channel
#   item_status
#
# FLAGS (pre-computed boolean filters):
#   is_cancelled, is_returned, has_discount
#   is_weekend
#
# TIME ATTRIBUTES (from orders, for quick filtering):
#   order_year, order_month, order_hour, day_name

# Local import: this cell only star-imports functions/types, and the allocation
# below needs a window specification.
from pyspark.sql.window import Window

# All line items that belong to the same order. Used to spread order-level
# amounts across the items of that order.
_order_items_window = Window.partitionBy(col("o.order_id"))

# fact_sales: one row per order line item (grain = order item).
#
# NOTE on shipping: `shipping_amount` comes from the order header, so the same
# value appears on every line item of an order. Summing it at this grain
# over-counts shipping for multi-item orders. The column is kept unchanged for
# backward compatibility; `shipping_amount_allocated` is the additive version
# (order shipping split equally across the order's items, left unrounded so the
# per-order total is preserved as closely as the numeric type allows).
df_fact_sales = df_joined.select(
    # Primary key
    col("i.item_id").alias("sales_key"),
    # Dimension keys
    date_format(col("o.order_date"), "yyyyMMdd").cast("int").alias("order_date_key"),
    col("o.customer_id"),
    col("i.product_id"),
    # Degenerate dimensions
    col("o.order_id"),
    col("i.item_id"),
    # Measures
    col("i.quantity"),
    col("i.unit_price"),
    col("i.line_total"),
    col("i.discount_percent"),
    col("i.discount_amount"),
    col("i.net_line_total"),
    col("o.shipping_amount"),  # order-level, repeated per item: NOT additive
    (
        col("o.shipping_amount") / count(lit(1)).over(_order_items_window)
    ).alias("shipping_amount_allocated"),  # additive: safe to SUM at any level
    # Order attributes
    col("o.order_date"),
    col("o.order_status"),
    col("o.payment_method"),
    col("o.channel"),
    col("i.item_status"),
    # Pre-computed flags
    col("o.is_cancelled"),
    col("o.is_returned"),
    col("i.has_discount"),
    col("o.is_weekend"),
    col("o.has_free_shipping"),
    # Time parts (for quick filtering without joining dim_date)
    col("o.order_year"),
    col("o.order_month"),
    col("o.order_day"),
    col("o.order_hour"),
    col("o.day_name"),
    # Metadata
    current_timestamp().alias("_gold_processed_at"),
    lit("1.0").alias("_gold_version")
)

print("STEP 3: fact_sales built - " + str(len(df_fact_sales.columns)) + " columns")


# ----------------------------------------------------------
# Step 4: Write to Gold layer (partitioned by year, month)
# ----------------------------------------------------------
# WHY PARTITION fact_sales?
# Fact tables are LARGE (millions of rows in production).
# Partitioning by year/month means:
#   - Query "SELECT * FROM fact_sales WHERE order_year = 2025" only reads 
#     2025 files, skipping all other years
#   - This is called PARTITION PRUNING
#   - Can reduce query time by 90%+
#
# WHY year + month (not just date)?
# - Partitioning by date creates too many small files
#   (365 partitions per year, each with few rows)
# - year + month = 12 partitions per year (optimal)
# - This is a common production pattern

# Target location of the fact table inside the Gold container.
gold_fact_sales_path = f"{GOLD}/fact_sales"

# Full refresh, laid out on disk by order year and month.
_fact_partition_cols = ("order_year", "order_month")
(
    df_fact_sales.write
    .format("delta")
    .mode("overwrite")
    .partitionBy(*_fact_partition_cols)
    .option("overwriteSchema", True)
    .save(gold_fact_sales_path)
)


# ----------------------------------------------------------
# Step 5: Verify and analyze
# ----------------------------------------------------------
# Read the table back from storage so the checks reflect what was persisted.
df_verify = spark.read.format("delta").load(gold_fact_sales_path)
final_count = df_verify.count()

_banner = "=" * 65
print()
print(_banner)
print("GOLD fact_sales - COMPLETE")
print(_banner)
print(f"  Orders (Silver):    {orders_count}")
print(f"  Items (Silver):     {items_count}")
print(f"  Joined rows:        {joined_count}")
print(f"  Final fact_sales:   {final_count} rows")
print(f"  Columns:            {len(df_verify.columns)}")
print("  Partitioned by:     order_year, order_month")
print(f"  Path:               {gold_fact_sales_path}")

print("\n  Schema:")
df_verify.printSchema()

# A few rows showing keys, the main measures and two order attributes
print("\n  Sample data:")
_sample_cols = [
    "sales_key", "order_date_key", "order_id", "customer_id",
    "product_id", "quantity", "unit_price", "net_line_total",
    "order_status", "channel",
]
df_verify.select(*_sample_cols).show(5, truncate=False)

# ============================================================
# BUSINESS KPI QUERIES (This is what the Gold layer enables!)
# ============================================================

_kpi_banner = "=" * 65
print("\n" + _kpi_banner)
print("BUSINESS KPIs FROM fact_sales")
print(_kpi_banner)

# Aggregate expressions shared by several KPIs below.
# Orders are counted DISTINCT because one order spans several line items.
_orders = countDistinct("order_id").alias("orders")
_revenue = round(sum("net_line_total"), 2).alias("revenue")

# KPI 1: overall totals across the whole fact table
print("\n  KPI 1: Revenue Summary")
_overall = df_verify.agg(
    count("*").alias("total_line_items"),
    countDistinct("order_id").alias("total_orders"),
    countDistinct("customer_id").alias("total_customers"),
    round(sum("net_line_total"), 2).alias("total_revenue"),
    round(avg("net_line_total"), 2).alias("avg_line_item_value"),
    round(sum("discount_amount"), 2).alias("total_discounts"),
)
_overall.show(truncate=False)

# KPI 2: revenue per sales channel, highest first
print("\n  KPI 2: Revenue by Channel")
_by_channel = df_verify.groupBy("channel").agg(
    _orders,
    _revenue,
    round(avg("net_line_total"), 2).alias("avg_item_value"),
)
_by_channel.orderBy(desc("revenue")).show()

# KPI 3: chronological monthly trend
print("\n  KPI 3: Monthly Revenue Trend")
_by_month = df_verify.groupBy("order_year", "order_month").agg(_orders, _revenue)
_by_month.orderBy("order_year", "order_month").show(15)

# KPI 4: order volume per status, most common first
print("\n  KPI 4: Order Status Breakdown")
_by_status = df_verify.groupBy("order_status").agg(_orders, _revenue)
_by_status.orderBy(desc("orders")).show()

# KPI 5: revenue per payment method, highest first
print("\n  KPI 5: Payment Method Analysis")
_by_payment = df_verify.groupBy("payment_method").agg(_orders, _revenue)
_by_payment.orderBy(desc("revenue")).show()

print("[DONE] Gold fact_sales complete!")
print("[NEXT] Cell 15 - Gold agg_daily_sales (Pre-aggregated metrics)")

In [0]:
# ============================================================
# CELL 15: GOLD LAYER - agg_daily_sales (Pre-Aggregated Table)
# ============================================================
#
# WHAT IS A PRE-AGGREGATED TABLE?
# --------------------------------
# Instead of running heavy GROUP BY queries on 4780 rows 
# (or millions in production) every time a dashboard loads,
# we pre-compute daily summaries.
#
# Dashboard query WITHOUT pre-aggregation:
#   SELECT date, SUM(net_line_total) FROM fact_sales 
#   GROUP BY date
#   -> Scans ALL rows every time (slow at scale)
#
# Dashboard query WITH pre-aggregation:
#   SELECT * FROM agg_daily_sales WHERE date = '2025-01-15'
#   -> Reads ONE pre-computed row (instant)
#
# WHY IS THIS IMPORTANT?
# - Power BI dashboards refresh every 15-30 minutes
# - Each refresh runs all queries
# - Without pre-agg: 50 queries * millions of rows = slow
# - With pre-agg: 50 queries * hundreds of rows = instant
#
# WHAT METRICS DO WE PRE-COMPUTE?
# Everything a business executive looks at daily:
#   - Total revenue, orders, customers
#   - Average order value
#   - Cancellation and return rates
#   - Revenue by channel
#   - Units sold
#
# GRAIN: One row per day per channel
# This lets us analyze both:
#   - Total daily metrics (GROUP BY date)
#   - Channel comparison (GROUP BY date, channel)
# ============================================================

from pyspark.sql.functions import *

# ----------------------------------------------------------
# Step 1: Load the Gold fact_sales Delta table
# ----------------------------------------------------------
# The row count is kept because the summary printed after the write
# compares it with the number of aggregated rows.
fact_sales_path = f"{GOLD}/fact_sales"
df_fact = spark.read.load(fact_sales_path, format="delta")
fact_count = df_fact.count()
print(f"STEP 1: fact_sales read - {fact_count} rows")


# ----------------------------------------------------------
# Step 2: Build daily aggregation by channel
# ----------------------------------------------------------
# WHY GROUP BY date AND channel?
# 
# If we only group by date:
#   2025-01-15 | revenue: $50,000 | orders: 200
#   (Can't drill down by channel)
#
# If we group by date + channel:
#   2025-01-15 | web        | revenue: $15,000 | orders: 60
#   2025-01-15 | mobile_app | revenue: $18,000 | orders: 75
#   2025-01-15 | in_store   | revenue: $10,000 | orders: 40
#   2025-01-15 | marketplace| revenue: $7,000  | orders: 25
#   (Can see channel breakdown AND roll up to daily total)
#
# METRICS EXPLAINED:
#   total_orders: COUNT DISTINCT order_id (not line items!)
#     One order with 3 items = 1 order, not 3
#
#   total_customers: unique buyers that day in that channel
#     Important for "customer acquisition" tracking; summing it across
#     channels or days counts a repeat buyer more than once
#
#   total_items_sold: SUM of quantity
#     Physical units moved (logistics metric)
#
#   gross_revenue: before discounts
#   total_discount: money given away
#   net_revenue: what we actually earned
#   avg_order_value: net_revenue / total_orders
#     Key e-commerce KPI, target is to increase this
#
#   cancel_rate_pct: % of orders cancelled
#     High cancel rate = UX problem or fraud
#
#   return_rate_pct: % of orders returned
#     High return rate = product quality issue

def _orders_where(condition, alias):
    """Count the distinct order_ids among rows that satisfy `condition`."""
    # when() without otherwise() yields NULL for non-matching rows, and
    # countDistinct ignores NULLs, so only matching orders are counted.
    return countDistinct(when(condition, col("order_id"))).alias(alias)


# Grain of the aggregate: one row per calendar day per sales channel.
daily_channel_keys = [
    col("order_year"),
    col("order_month"),
    col("order_day"),
    to_date(col("order_date")).alias("order_date"),
    col("channel"),
]

# The output columns appear in the order listed here, after the keys.
daily_channel_metrics = [
    # Volume metrics
    countDistinct("order_id").alias("total_orders"),
    countDistinct("customer_id").alias("total_customers"),
    count("*").alias("total_line_items"),
    sum("quantity").alias("total_items_sold"),
    # Revenue metrics
    round(sum("line_total"), 2).alias("gross_revenue"),
    round(sum("discount_amount"), 2).alias("total_discount"),
    round(sum("net_line_total"), 2).alias("net_revenue"),
    round(sum("shipping_amount"), 2).alias("total_shipping"),
    # Per-line-item averages (not per order)
    round(avg("net_line_total"), 2).alias("avg_item_value"),
    round(avg("quantity"), 2).alias("avg_quantity_per_item"),
    # Distinct orders per status
    _orders_where(col("order_status") == "CANCELLED", "cancelled_orders"),
    _orders_where(col("order_status") == "RETURNED", "returned_orders"),
    _orders_where(col("order_status") == "DELIVERED", "delivered_orders"),
    # Distinct orders that carry a discount
    _orders_where(col("has_discount") == True, "orders_with_discount"),
    # Calendar attributes: first() keeps the value from one row of the group.
    # NOTE(review): assumes is_weekend/day_name are constant for a given
    # order_date, so any row is representative - confirm against fact_sales.
    first("is_weekend").alias("is_weekend"),
    first("day_name").alias("day_name"),
]

df_agg_daily = df_fact.groupBy(*daily_channel_keys).agg(*daily_channel_metrics)

# Derived per-order metrics. Every ratio divides by total_orders, so each one
# is guarded: groups without orders get 0.0 instead of a division result.
has_orders = col("total_orders") > 0


def _rounded_or_zero(value):
    """Round `value` to 2 decimals, or return 0.0 when the group has no orders."""
    return when(has_orders, round(value, 2)).otherwise(lit(0.0))


def _pct_of_orders(order_count_column):
    """Share of total_orders represented by `order_count_column`, in percent."""
    return _rounded_or_zero(col(order_count_column) / col("total_orders") * 100)


# (column name, expression) pairs, appended to the frame in this order.
derived_columns = [
    ("avg_order_value", _rounded_or_zero(col("net_revenue") / col("total_orders"))),
    ("cancel_rate_pct", _pct_of_orders("cancelled_orders")),
    ("return_rate_pct", _pct_of_orders("returned_orders")),
    ("discount_rate_pct", _pct_of_orders("orders_with_discount")),
    # Audit columns
    ("_gold_processed_at", current_timestamp()),
    ("_gold_version", lit("1.0")),
]

df_agg_enriched = df_agg_daily
for derived_name, derived_expression in derived_columns:
    df_agg_enriched = df_agg_enriched.withColumn(derived_name, derived_expression)

agg_count = df_agg_enriched.count()
print(f"STEP 2: Daily aggregation built - {agg_count} rows")


# ----------------------------------------------------------
# Step 3: Persist the aggregate to the Gold layer
# ----------------------------------------------------------
# "overwrite" replaces the existing table contents on every run, and
# overwriteSchema lets the stored schema be replaced along with the data.
gold_agg_path = f"{GOLD}/agg_daily_sales"

(
    df_agg_enriched.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", True)
    .save(gold_agg_path)
)


# ----------------------------------------------------------
# Step 4: Read the table back and report what was written
# ----------------------------------------------------------
# Reading from storage (rather than reusing df_agg_enriched) shows what was
# actually persisted at gold_agg_path.
df_verify = spark.read.load(gold_agg_path, format="delta")
final_count = df_verify.count()

banner = "=" * 65
print("")
print(banner)
print("GOLD agg_daily_sales - COMPLETE")
print(banner)
print(f"  Fact rows aggregated: {fact_count}")
print(f"  Aggregated rows:      {final_count}")
print(f"  Compression ratio:    {fact_count} -> {final_count} rows")
print(f"  Path:                 {gold_agg_path}")

print("\n  Schema:")
df_verify.printSchema()

# Most recent days first; only the headline columns are shown.
print("\n  Sample daily data:")
sample_columns = [
    "order_date",
    "channel",
    "total_orders",
    "total_customers",
    "net_revenue",
    "avg_order_value",
    "cancel_rate_pct",
]
df_verify.select(*sample_columns).orderBy(desc("order_date")).show(10, truncate=False)

# ============================================================
# EXECUTIVE DASHBOARD QUERIES
# ============================================================
# Every ratio below (average order value, cancel rate, return rate) is
# recomputed from summed numerators and summed order counts. Averaging the
# stored per-day-per-channel ratios instead would give a (date, channel) row
# with 2 orders the same weight as one with 200 and misstate the roll-up.
#
# NOTE(review): summing total_orders across rows assumes an order_id belongs
# to exactly one (order_date, channel) pair - confirm against fact_sales.
# `sum` and `round` here are the Spark column functions brought in by the
# wildcard import at the top of this cell, not the Python builtins.


def _per_order_ratio(numerator_column, as_percent=False):
    """Build SUM(numerator_column) / SUM(total_orders), rounded to 2 decimals.

    Args:
        numerator_column: name of the agg_daily_sales column to sum on top.
        as_percent: when True the ratio is multiplied by 100.

    Returns:
        An aggregate Column that is 0.0 when the summed order count is zero
        or NULL, mirroring the fallback used for the stored per-row rates.
    """
    orders_in_group = sum("total_orders")
    ratio = sum(numerator_column) / orders_in_group
    if as_percent:
        ratio = ratio * 100
    return when(orders_in_group > 0, round(ratio, 2)).otherwise(lit(0.0))


print("\n" + "=" * 65)
print("EXECUTIVE DASHBOARD METRICS")
print("=" * 65)

# Overall platform metrics (roll up all channels)
print("\n  PLATFORM TOTALS:")
df_verify.agg(
    sum("total_orders").alias("total_orders"),
    # total_customers is distinct per (day, channel); its sum counts a repeat
    # buyer once per row, hence "visits" rather than unique customers.
    sum("total_customers").alias("total_customer_visits"),
    round(sum("net_revenue"), 2).alias("total_net_revenue"),
    round(sum("total_discount"), 2).alias("total_discounts_given"),
    round(sum("total_items_sold")).alias("total_units_sold"),
    _per_order_ratio("net_revenue").alias("platform_avg_order_value")
).show(truncate=False)

# Monthly trend (what executives look at first)
print("\n  MONTHLY REVENUE TREND:")
df_verify.groupBy("order_year", "order_month").agg(
    sum("total_orders").alias("orders"),
    round(sum("net_revenue"), 2).alias("revenue"),
    _per_order_ratio("net_revenue").alias("avg_order_value"),
    _per_order_ratio("cancelled_orders", as_percent=True).alias("avg_cancel_rate")
).orderBy("order_year", "order_month").show(15)

# Channel performance comparison
print("\n  CHANNEL PERFORMANCE:")
df_verify.groupBy("channel").agg(
    sum("total_orders").alias("orders"),
    round(sum("net_revenue"), 2).alias("revenue"),
    _per_order_ratio("net_revenue").alias("avg_order_value"),
    _per_order_ratio("cancelled_orders", as_percent=True).alias("avg_cancel_rate"),
    _per_order_ratio("returned_orders", as_percent=True).alias("avg_return_rate")
).orderBy(desc("revenue")).show()

# Weekend vs Weekday
print("\n  WEEKEND vs WEEKDAY:")
df_verify.groupBy("is_weekend").agg(
    sum("total_orders").alias("orders"),
    round(sum("net_revenue"), 2).alias("revenue"),
    _per_order_ratio("net_revenue").alias("avg_order_value")
).show()

# ============================================================
# FINAL GOLD LAYER SUMMARY
# ============================================================
# Only the fact_sales and agg_daily_sales counts come from this run.
# NOTE(review): the three dimension row counts are fixed text in the strings
# below, not values read from the tables - verify they are still accurate.
gold_summary_lines = [
    "\n" + "=" * 65,
    "GOLD LAYER - ALL TABLES COMPLETE!",
    "=" * 65,
    "  1. gold/dim_date         - 1096 rows  (calendar dimension)",
    "  2. gold/dim_customer     - 500 rows   (customer dimension)",
    "  3. gold/dim_product      - 50 rows    (product dimension)",
    f"  4. gold/fact_sales       - {fact_count} rows  (main fact table)",
    f"  5. gold/agg_daily_sales  - {final_count} rows   (pre-aggregated)",
    "",
    "  STAR SCHEMA:",
    "        dim_date",
    "           |",
    "  dim_customer --- fact_sales --- dim_product",
    "                       |",
    "                 agg_daily_sales",
    "",
    "[NEXT] Cell 16+ : ML Models (Customer Segmentation, Forecasting)",
]
for gold_summary_line in gold_summary_lines:
    print(gold_summary_line)