In [0]:
%pip install azure-eventhub
dbutils.library.restartPython()

In [0]:
%run ./config

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Gold Layer - Business Aggregations
# MAGIC Creates aggregated metrics tables for business intelligence

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.types import *

# COMMAND ----------

# MAGIC %md
# MAGIC ## Read Silver Table

# COMMAND ----------

# Banner first, so the section header is visible even if the table read fails.
print("=" * 70, "READING SILVER TABLE", "=" * 70, sep="\n")

# `silver_table` is not defined in this notebook; presumably it comes from
# `%run ./config` -- confirm. `silver_df` feeds every Gold aggregation below.
silver_df = spark.table(silver_table)
# .count() is a Spark action: it scans the Silver table to get the row count.
total_records = silver_df.count()

print(
    f"\nSilver Table: {silver_table}",
    f"  Total Records: {total_records}",
    "=" * 70,
    sep="\n",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Table 1: Sales by Brand and Category

# COMMAND ----------

print("\n" + "="*70)
print("CREATING: Sales by Brand and Category")
print("="*70)

# Gold table 1: one row per (brand, category) with order, revenue, customer and
# discount metrics.
# NOTE: count/sum/avg/min/max/round here are the Spark column functions pulled in
# by `from pyspark.sql.functions import *`, not the Python builtins they shadow.
sales_by_brand_category = (silver_df
    .groupBy("brand", "category")
    .agg(
        count("*").alias("order_count"),
        sum("quantity").alias("total_quantity"),
        sum("total_amount").alias("total_revenue"),
        avg("total_amount").alias("avg_order_value"),
        min("total_amount").alias("min_order_value"),
        max("total_amount").alias("max_order_value"),
        countDistinct("customer_id").alias("unique_customers"),
        countDistinct("product_id").alias("unique_products"),
        sum("discount_amount").alias("total_discounts")
    )
    .select(
        "brand",
        "category",
        "order_count",
        "total_quantity",
        # Monetary columns are rounded to 2 decimals for presentation.
        round(col("total_revenue"), 2).alias("total_revenue"),
        round(col("avg_order_value"), 2).alias("avg_order_value"),
        round(col("min_order_value"), 2).alias("min_order_value"),
        round(col("max_order_value"), 2).alias("max_order_value"),
        "unique_customers",
        "unique_products",
        round(col("total_discounts"), 2).alias("total_discounts"),
        # Load timestamp of this Gold refresh.
        current_timestamp().alias("gold_timestamp")
    )
    .orderBy(desc("total_revenue"))
)

# Write to table (full refresh: existing contents are replaced)
sales_by_brand_category.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable(gold_brand_category_table)

# Report from the persisted table instead of the lazy DataFrame: calling
# .count()/display() on `sales_by_brand_category` would re-run the whole
# aggregation over Silver twice more, and could show rows/timestamps that differ
# from what was actually written.
sales_by_brand_category_gold = spark.table(gold_brand_category_table)

count_brand_cat = sales_by_brand_category_gold.count()
print(f"✓ Created {gold_brand_category_table}")
print(f"  Records: {count_brand_cat}")

# Row order is not guaranteed when reading a table back, so sort for display.
display(sales_by_brand_category_gold.orderBy(desc("total_revenue")))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Table 2: Location Performance

# COMMAND ----------

print("\n" + "="*70)
print("CREATING: Location Performance")
print("="*70)

# Gold table 2: one row per location with volume, revenue, assortment breadth
# and order-status counts.
# NOTE: count/sum/avg/round here are the Spark column functions pulled in by
# `from pyspark.sql.functions import *`, not the Python builtins they shadow.
location_performance = (silver_df
    .groupBy("location")
    .agg(
        count("*").alias("order_count"),
        sum("total_amount").alias("total_revenue"),
        avg("total_amount").alias("avg_order_value"),
        sum("quantity").alias("total_items_sold"),
        countDistinct("customer_id").alias("unique_customers"),
        countDistinct("product_id").alias("unique_products"),
        countDistinct("brand").alias("unique_brands"),
        # Array of the distinct categories seen at this location.
        collect_set("category").alias("categories_sold"),
        # Conditional counts: 1 per matching row, 0 otherwise. The comparison is
        # an exact, case-sensitive match on the status string.
        sum(when(col("order_status") == "delivered", 1).otherwise(0)).alias("delivered_orders"),
        sum(when(col("order_status") == "pending", 1).otherwise(0)).alias("pending_orders")
    )
    .select(
        "location",
        "order_count",
        round(col("total_revenue"), 2).alias("total_revenue"),
        round(col("avg_order_value"), 2).alias("avg_order_value"),
        "total_items_sold",
        "unique_customers",
        "unique_products",
        "unique_brands",
        "categories_sold",
        "delivered_orders",
        "pending_orders",
        # order_count is count(*) within a group, so it is always >= 1 here and
        # the division cannot hit zero.
        round((col("delivered_orders") / col("order_count") * 100), 2).alias("delivery_rate_pct"),
        current_timestamp().alias("gold_timestamp")
    )
    .orderBy(desc("total_revenue"))
)

# Write to table (full refresh: existing contents are replaced)
location_performance.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable(gold_location_table)

# Report from the persisted table instead of the lazy DataFrame: calling
# .count()/display() on `location_performance` would re-run the whole
# aggregation over Silver twice more, and could show rows/timestamps that differ
# from what was actually written.
location_performance_gold = spark.table(gold_location_table)

count_location = location_performance_gold.count()
print(f"✓ Created {gold_location_table}")
print(f"  Records: {count_location}")

# Row order is not guaranteed when reading a table back, so sort for display.
display(location_performance_gold.orderBy(desc("total_revenue")))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Table 3: Product Performance

# COMMAND ----------

print("\n" + "="*70)
print("CREATING: Product Performance")
print("="*70)

# Gold table 3: one row per product with sales volume, realised pricing and
# discount metrics.
# NOTE: `base_price` is part of the grouping key, so a product whose base price
# differs between Silver rows yields one output row per distinct price.
# NOTE: count/sum/avg/min/max/round here are the Spark column functions pulled in
# by `from pyspark.sql.functions import *`, not the Python builtins they shadow.
product_performance = (silver_df
    .groupBy("product_id", "product_name", "brand", "category", "base_price")
    .agg(
        count("*").alias("order_count"),
        sum("quantity").alias("total_quantity_sold"),
        sum("total_amount").alias("total_revenue"),
        avg("unit_price").alias("avg_selling_price"),
        min("unit_price").alias("min_selling_price"),
        max("unit_price").alias("max_selling_price"),
        countDistinct("customer_id").alias("unique_customers"),
        countDistinct("location").alias("locations_sold"),
        avg("discount_pct").alias("avg_discount_pct"),
        sum("discount_amount").alias("total_discount_amount")
    )
    .select(
        "product_id",
        "product_name",
        "brand",
        "category",
        "base_price",
        "order_count",
        "total_quantity_sold",
        round(col("total_revenue"), 2).alias("total_revenue"),
        round(col("avg_selling_price"), 2).alias("avg_selling_price"),
        round(col("min_selling_price"), 2).alias("min_selling_price"),
        round(col("max_selling_price"), 2).alias("max_selling_price"),
        "unique_customers",
        "locations_sold",
        # Scaled by 100 for display; assumes discount_pct is stored as a
        # fraction (0-1) in Silver -- TODO confirm.
        round(col("avg_discount_pct") * 100, 2).alias("avg_discount_pct"),
        round(col("total_discount_amount"), 2).alias("total_discount_amount"),
        # Guard the divisor: when total quantity is 0 the `when` yields NULL, so
        # the result is NULL instead of a DIVIDE_BY_ZERO error under ANSI SQL
        # mode. col("total_revenue") here is the unrounded aggregate.
        round(
            col("total_revenue")
            / when(col("total_quantity_sold") != 0, col("total_quantity_sold")),
            2
        ).alias("revenue_per_unit"),
        current_timestamp().alias("gold_timestamp")
    )
    .orderBy(desc("total_revenue"))
)

# Write to table (full refresh: existing contents are replaced)
product_performance.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable(gold_product_table)

# Report from the persisted table instead of the lazy DataFrame: calling
# .count()/display() on `product_performance` would re-run the whole
# aggregation over Silver twice more, and could show rows/timestamps that differ
# from what was actually written.
product_performance_gold = spark.table(gold_product_table)

count_product = product_performance_gold.count()
print(f"✓ Created {gold_product_table}")
print(f"  Records: {count_product}")

# Row order is not guaranteed when reading a table back, so sort for display.
display(product_performance_gold.orderBy(desc("total_revenue")))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Table 4: Customer Insights

# COMMAND ----------

print("\n" + "="*70)
print("CREATING: Customer Insights")
print("="*70)

# Minimum total spend (inclusive) for each customer segment; anything below the
# medium threshold is "Low Value".
HIGH_VALUE_MIN_SPEND = 5000
MEDIUM_VALUE_MIN_SPEND = 2000

# Gold table 4: one row per (customer_id, customer_name, location) with spend,
# tenure, basket breadth and a spend-based segment.
# NOTE: `location` is part of the grouping key, so a customer appearing under
# several locations gets one row (and one segment) per location.
# NOTE: count/sum/avg/min/max/round here are the Spark column functions pulled in
# by `from pyspark.sql.functions import *`, not the Python builtins they shadow.
customer_insights = (silver_df
    .groupBy("customer_id", "customer_name", "location")
    .agg(
        count("*").alias("order_count"),
        sum("total_amount").alias("total_spent"),
        avg("total_amount").alias("avg_order_value"),
        sum("quantity").alias("total_items_purchased"),
        min("order_timestamp").alias("first_order_date"),
        max("order_timestamp").alias("last_order_date"),
        countDistinct("product_id").alias("unique_products_bought"),
        collect_set("brand").alias("brands_purchased"),
        collect_set("category").alias("categories_purchased"),
        avg("discount_pct").alias("avg_discount_rate"),
        collect_set("payment_method").alias("payment_methods_used")
    )
    # Days between first and last order; 0 for single-order customers.
    .withColumn("days_as_customer",
                datediff(col("last_order_date"), col("first_order_date")))
    # Segment on the unrounded total spend; branches are evaluated top-down.
    .withColumn("customer_segment",
                when(col("total_spent") >= HIGH_VALUE_MIN_SPEND, "High Value")
                .when(col("total_spent") >= MEDIUM_VALUE_MIN_SPEND, "Medium Value")
                .otherwise("Low Value"))
    .select(
        "customer_id",
        "customer_name",
        "location",
        "order_count",
        round(col("total_spent"), 2).alias("total_spent"),
        round(col("avg_order_value"), 2).alias("avg_order_value"),
        "total_items_purchased",
        "first_order_date",
        "last_order_date",
        "days_as_customer",
        "unique_products_bought",
        "brands_purchased",
        "categories_purchased",
        # Scaled by 100 for display; assumes discount_pct is stored as a
        # fraction (0-1) in Silver -- TODO confirm.
        round(col("avg_discount_rate") * 100, 2).alias("avg_discount_pct"),
        "payment_methods_used",
        "customer_segment",
        current_timestamp().alias("gold_timestamp")
    )
    .orderBy(desc("total_spent"))
)

# Write to table (full refresh: existing contents are replaced)
customer_insights.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable(gold_customer_table)

# Report from the persisted table instead of the lazy DataFrame: calling
# .count()/display() on `customer_insights` would re-run the whole aggregation
# over Silver twice more, and could show rows/timestamps that differ from what
# was actually written.
customer_insights_gold = spark.table(gold_customer_table)

count_customer = customer_insights_gold.count()
print(f"✓ Created {gold_customer_table}")
print(f"  Records: {count_customer}")

# Preview only the top 20 spenders; sort explicitly because row order is not
# guaranteed when reading a table back.
display(customer_insights_gold.orderBy(desc("total_spent")).limit(20))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Summary Statistics

# COMMAND ----------

# Recap of the four Gold tables written above, with their row counts.
print("\n" + "=" * 70, "GOLD LAYER SUMMARY", "=" * 70, sep="\n")

print("\n✓ Created 4 Gold aggregation tables:")

# (table name, row count, what a row represents, one-line description)
gold_outputs = [
    (gold_brand_category_table, count_brand_cat, "brand-category combinations",
     "Sales metrics by brand and category"),
    (gold_location_table, count_location, "locations",
     "Performance metrics by city"),
    (gold_product_table, count_product, "products",
     "Product-level sales and pricing metrics"),
    (gold_customer_table, count_customer, "customers",
     "Customer behavior and segmentation"),
]

for position, (table_name, row_count, row_unit, blurb) in enumerate(gold_outputs, start=1):
    print(f"\n{position}. {table_name}")
    print(f"   - {row_count} {row_unit}")
    print(f"   - {blurb}")

# COMMAND ----------

# Overall business metrics: a single global aggregate over the Silver table.
print("\n" + "="*70)
print("OVERALL BUSINESS METRICS")
print("="*70)

# A select() made only of aggregates returns exactly one row, even when the
# Silver table is empty, so .first() always yields a Row here.
# NOTE: count/sum/avg are the Spark column functions pulled in by
# `from pyspark.sql.functions import *`, not the Python builtins.
summary = silver_df.select(
    count("*").alias("total_orders"),
    sum("total_amount").alias("total_revenue"),
    avg("total_amount").alias("avg_order_value"),
    sum("quantity").alias("total_items_sold"),
    countDistinct("customer_id").alias("unique_customers"),
    countDistinct("product_id").alias("unique_products")
).first()

# sum()/avg() return NULL (None in Python) when there are no non-null inputs,
# e.g. on an empty Silver table. Formatting None with ",.2f" or "," raises
# TypeError, so fall back to 0 for the numeric fields.
print(f"\nTotal Orders:      {summary['total_orders']:,}")
print(f"Total Revenue:     ${(summary['total_revenue'] or 0):,.2f}")
print(f"Avg Order Value:   ${(summary['avg_order_value'] or 0):.2f}")
print(f"Total Items Sold:  {(summary['total_items_sold'] or 0):,}")
print(f"Unique Customers:  {summary['unique_customers']}")
print(f"Unique Products:   {summary['unique_products']}")

# COMMAND ----------

# Final banner and a short recap of the pipeline stages.
print("\n" + "="*70)
print("✓ GOLD LAYER COMPLETE")
print("="*70)
print("\nAll tables created successfully!")
print("\nData Pipeline Summary:")
# `total_records` is the Silver row count measured at the top of this notebook.
# The Bronze event count is never measured here, so it is no longer reported as
# if it were equal to the Silver count.
print("  Bronze → raw events split into orders & products")
print(f"  Silver → {total_records} enriched orders (joined)")
print("  Gold   → 4 aggregation tables for business intelligence")
print("\n" + "="*70)