In [0]:
%pip install azure-eventhub
dbutils.library.restartPython()

In [0]:
%run ./config

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Silver Layer - Join Orders with Products
# MAGIC Creates enriched order_details table by joining Bronze tables

# COMMAND ----------


from pyspark.sql.functions import *
from pyspark.sql.types import *

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Read Bronze Tables

# COMMAND ----------

# Stage banner: a 70-character rule above and below the title.
section_rule = "=" * 70
print(section_rule, "READING BRONZE TABLES", section_rule, sep="\n")

# Bronze orders. The table name is not defined in this notebook; presumably
# it comes from the `%run ./config` cell — verify there.
# .count() is a Spark action, so this line triggers a scan of the table.
orders_df = spark.table(bronze_orders_table)
orders_count = orders_df.count()
print(f"\n✓ Orders: {orders_count} records")
print(f"  Table: {bronze_orders_table}")

# Bronze products: same load-then-count pattern as orders.
products_df = spark.table(bronze_products_table)
products_count = products_df.count()
print(f"\n✓ Products: {products_count} records")
print(f"  Table: {bronze_products_table}")

print("\n" + section_rule)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Join Orders with Products

# COMMAND ----------

print("\nJoining orders with products on product_id...")

# Parse the raw order timestamp once; the same Column expression feeds the
# typed timestamp and every derived date/time field below.
order_ts = to_timestamp(orders_df.order_timestamp)

# quantity x unit_price, reused for line_total and discount_amount.
line_total_expr = orders_df.quantity * products_df.unit_price

# LEFT JOIN so that every order is kept, even when no product row matches.
order_details_df = (
    orders_df
    .join(products_df, on="product_id", how="left")
    .select(
        # Order fields
        orders_df.order_id,
        orders_df.customer_id,
        orders_df.customer_name,
        orders_df.location,
        orders_df.order_status,
        orders_df.payment_method,
        orders_df.quantity,
        orders_df.discount_pct,
        orders_df.total_amount,
        order_ts.alias("order_timestamp"),

        # Join key: taken from the ORDERS side on purpose. In a left join the
        # products-side key is NULL for unmatched orders, which would hide
        # exactly the product_id values needed to diagnose a failed join.
        orders_df.product_id,

        # Product fields (NULL when the order's product_id has no match)
        products_df.product_name,
        products_df.category,
        products_df.brand,
        products_df.base_price,
        products_df.unit_price,

        # Calculated fields
        to_date(order_ts).alias("order_date"),
        hour(order_ts).alias("order_hour"),
        dayofweek(order_ts).alias("day_of_week"),
        line_total_expr.alias("line_total"),
        # NOTE(review): total_amount - line_total is negative whenever the
        # order total is below list price, so a "discount" shows up as a
        # negative number. Sign kept as-is because downstream consumers may
        # rely on it — confirm the intended convention.
        (orders_df.total_amount - line_total_expr).alias("discount_amount"),
    )
    # Processing time of this Silver run.
    .withColumn("silver_timestamp", current_timestamp())
)

# .count() is a Spark action: it executes the join.
joined_count = order_details_df.count()
print(f"✓ Created {joined_count} enriched order records")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Verify Join Quality

# COMMAND ----------

print("\n" + "="*70)
print("JOIN VERIFICATION")
print("="*70)

# A row whose product_name is NULL is treated as a failed join.
# Both tallies are computed in ONE aggregation pass: separate .count() actions
# would each re-execute the join, and could disagree with one another if the
# Bronze tables receive new rows between the two actions.
#   - count(when(<is null>, 1)) counts only rows where the condition holds
#   - count(<column>) counts only non-null values
join_quality = order_details_df.agg(
    count(when(col("product_name").isNull(), 1)).alias("null_products"),
    count(col("product_name")).alias("successful_joins"),
).first()
null_products = join_quality["null_products"]
successful_joins = join_quality["successful_joins"]

print(f"\nTotal records:        {joined_count}")
print(f"Successful joins:     {successful_joins}")
print(f"Failed joins (nulls): {null_products}")

if null_products > 0:
    print("\n⚠️  Warning: Some orders have missing product information")
    print("Missing products:")
    # List the offending orders so the missing catalogue entries can be traced.
    display(order_details_df.filter(col("product_name").isNull()).select("order_id", "product_id"))
else:
    print("\n✓ All orders successfully joined with product data!")

print("="*70)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Write to Silver Table

# COMMAND ----------

print(f"\nWriting to Silver table: {silver_table}")

# Append the enriched orders to the Silver Delta table; mergeSchema lets the
# write add columns that the existing table does not have yet.
# NOTE(review): this notebook reads the whole Bronze orders table and appends
# the result, so re-running it appears to write the same orders again.
# Confirm whether Bronze is cleared between runs; if not, consider a
# key-based MERGE or an overwrite instead of a plain append.
silver_writer = (
    order_details_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
)
silver_writer.saveAsTable(silver_table)

print(f"✓ Saved {joined_count} enriched orders to {silver_table}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Query Silver Table

# COMMAND ----------

summary_rule = "=" * 70
print("\n" + summary_rule)
print("SILVER LAYER SUMMARY")
print(summary_rule)

# Re-read the table that was just written so the totals reflect what is
# actually stored, not just this run's DataFrame.
silver_df = spark.table(silver_table)
total_records = silver_df.count()

print(f"\nSilver Table: {silver_table}")
print(f"  Total Records: {total_records}")
print(f"  New Records: {joined_count}")

print("\nSample enriched orders:")
# Columns shown in the preview; silver_timestamp is used only for ordering.
sample_columns = [
    "order_id", "customer_name", "location", "product_name", "brand",
    "category", "quantity", "unit_price", "total_amount", "order_timestamp",
]
latest_orders = (
    silver_df
    .select(*sample_columns)
    .orderBy(col("silver_timestamp").desc())  # most recently processed first
    .limit(20)
)
display(latest_orders)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 6: Analysis Queries

# COMMAND ----------

analysis_rule = "=" * 70
print(analysis_rule, "SILVER LAYER ANALYSIS", analysis_rule, sep="\n")

# 1. Revenue and order volume per brand, highest revenue first.
print("\n1. Orders by Brand:")
# `sum` here is the Spark aggregate brought in by the wildcard
# pyspark.sql.functions import, not Python's builtin.
brand_metrics = [
    count("*").alias("order_count"),
    sum("total_amount").alias("total_revenue"),
    avg("total_amount").alias("avg_order_value"),
]
revenue_by_brand = (
    silver_df
    .groupBy("brand")
    .agg(*brand_metrics)
    .orderBy(col("total_revenue").desc())
)
display(revenue_by_brand)

# COMMAND ----------

# 2. Order count, revenue and units sold per product category,
#    highest revenue first.
print("\n2. Orders by Category:")
category_metrics = [
    count("*").alias("order_count"),
    sum("total_amount").alias("total_revenue"),  # Spark sum, not the builtin
    sum("quantity").alias("total_quantity"),
]
revenue_by_category = (
    silver_df
    .groupBy("category")
    .agg(*category_metrics)
    .orderBy(col("total_revenue").desc())
)
display(revenue_by_category)

# COMMAND ----------

# 3. Top 20 (location, category) pairs ranked by revenue.
print("\n3. Orders by Location and Category:")
orders_per_location_category = silver_df.groupBy("location", "category").agg(
    count("*").alias("order_count"),
    sum("total_amount").alias("total_revenue"),  # Spark sum, not the builtin
)
top_location_categories = (
    orders_per_location_category
    .orderBy(col("total_revenue").desc())
    .limit(20)
)
display(top_location_categories)

# COMMAND ----------

# 4. The 20 highest-spending customers. Grouping includes name and location
#    alongside the id so they can be shown in the result.
print("\n4. Top Customers by Revenue:")
customer_keys = ["customer_id", "customer_name", "location"]
customer_spend = silver_df.groupBy(*customer_keys).agg(
    count("*").alias("order_count"),
    sum("total_amount").alias("total_spent"),  # Spark sum, not the builtin
    avg("total_amount").alias("avg_order_value"),
)
top_customers = customer_spend.orderBy(col("total_spent").desc()).limit(20)
display(top_customers)

# COMMAND ----------

# 5. Every product ranked by revenue (no row limit), with its brand and
#    category carried through the grouping for display.
print("\n5. Top Products by Sales:")
product_keys = ["product_id", "product_name", "brand", "category"]
product_metrics = [
    count("*").alias("order_count"),
    sum("quantity").alias("total_quantity_sold"),  # Spark sum, not the builtin
    sum("total_amount").alias("total_revenue"),
]
product_sales = (
    silver_df
    .groupBy(*product_keys)
    .agg(*product_metrics)
    .orderBy(col("total_revenue").desc())
)
display(product_sales)

# COMMAND ----------

# Closing report for the Silver stage: totals, join success rate, and a
# pointer to the next notebook.
print("\n" + "="*70)
print("✓ SILVER LAYER COMPLETE")
print("="*70)
print(f"Total enriched orders: {total_records}")

# Guard the percentage: with an empty Bronze orders table joined_count is 0
# and the division would raise ZeroDivisionError at the very end of the run.
# (An explicit conditional is used rather than max(): the wildcard
# pyspark.sql.functions import shadows Python's builtin max.)
if joined_count:
    join_success_pct = f"{100*successful_joins/joined_count:.1f}%"
else:
    join_success_pct = "n/a"
print(f"Join success rate: {successful_joins}/{joined_count} ({join_success_pct})")

print(f"\nData transformation:")
print(f"  - Bronze Orders ({orders_count}) + Bronze Products ({products_count})")
print(f"  - → Silver Order Details ({total_records}) with full product info")
print("="*70)
print("\nNext: Run 04_Gold_Layer to create business aggregations")