# Silver Layer - Data Processing - FINAL VERSION

In [0]:
# Azure Storage configuration for the Silver layer.
STORAGE_ACCOUNT = "dataworks"  # Storage account name; interpolated into the abfss:// URIs built later in this notebook.

# No credentials are set by this cell: the account-key line below is commented out.
# NOTE(review): if key-based auth is needed, read the key from a secret scope
# instead of pasting it into the notebook source.
#spark.conf.set(f"fs.azure.account.key.{STORAGE_ACCOUNT}.dfs.core.windows.net", "your-actual-access-key-here")

print(f"Starting Silver Layer with Azure Storage: {STORAGE_ACCOUNT}")

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from datetime import datetime

# Metrics published by the upstream "bronze_ingestion" job task.
#
# Every metric is bound to a fallback *before* the lookup so the names always
# exist, even when the lookup raises part-way through. A count of 0 means
# "unknown": the data-retention calculation at the end of this notebook only
# divides by bronze_orders_count when it is > 0.
bronze_customers_count = 0
bronze_products_count = 0
bronze_orders_count = 0
bronze_order_items_count = 0
bronze_quality_score = None
bronze_validation_passed = True

try:
    # debugValue is what taskValues.get returns when the notebook is run
    # interactively, outside of a job.
    bronze_customers_count = dbutils.jobs.taskValues.get(taskKey="bronze_ingestion", key="bronze_customers_count", debugValue=500)
    bronze_products_count = dbutils.jobs.taskValues.get(taskKey="bronze_ingestion", key="bronze_products_count", debugValue=100)
    bronze_orders_count = dbutils.jobs.taskValues.get(taskKey="bronze_ingestion", key="bronze_orders_count", debugValue=1000)
    bronze_order_items_count = dbutils.jobs.taskValues.get(taskKey="bronze_ingestion", key="bronze_order_items_count", debugValue=2000)
    bronze_quality_score = dbutils.jobs.taskValues.get(taskKey="bronze_ingestion", key="bronze_data_quality_score", debugValue=100.0)
    bronze_validation_passed = dbutils.jobs.taskValues.get(taskKey="bronze_ingestion", key="bronze_validation_passed", debugValue=True)

    print(f"📋 Bronze layer metrics received:")
    print(f"   Customers: {bronze_customers_count}")
    print(f"   Products: {bronze_products_count}")
    print(f"   Orders: {bronze_orders_count}")
    print(f"   Order Items: {bronze_order_items_count}")
    print(f"   Quality Score: {bronze_quality_score}%")
    print(f"   Validation Passed: {bronze_validation_passed}")

except Exception as e:
    # Best effort: keep going with whatever was retrieved plus the fallbacks above.
    print(f"Could not retrieve Bronze task values (running standalone): {e}")

# The flag is informational only; processing continues either way, but a failed
# upstream validation should at least be visible in the run output.
if not bronze_validation_passed:
    print("⚠️ Bronze validation did not pass - Silver results may be unreliable")

# Azure Storage Path Configuration: root under which each Silver table gets its
# own sub-directory.
SILVER_PATH = f'abfss://silver@{STORAGE_ACCOUNT}.dfs.core.windows.net/delta/'

# Don't create the generic silver directory - let Delta handle individual table paths
# Database setup
spark.sql("CREATE DATABASE IF NOT EXISTS ecommerce_silver")
spark.sql("USE ecommerce_silver")

print(f"✅ Silver layer configured: {SILVER_PATH}")
print("📋 Delta will create individual table paths as needed")

In [0]:
# Bronze inputs: one DataFrame per source entity in the ecommerce_bronze database.
bronze_customers, bronze_products, bronze_orders, bronze_order_items = [
    spark.table(f"ecommerce_bronze.{entity}")
    for entity in ("customers", "products", "orders", "order_items")
]

print("✅ Bronze tables loaded successfully")

In [0]:
# Customers: keep active rows that have an id, derive name / email-domain /
# tenure columns, then drop rows whose email does not contain "@".
print("🔄 Processing customers...")

active_customers = bronze_customers.where(
    F.col("customer_id").isNotNull() & (F.col("is_active") == True)
)

silver_customers = (
    active_customers
    .withColumn("full_name", F.concat("first_name", F.lit(" "), "last_name"))
    .withColumn("email_domain", F.split("email", "@")[1])
    .withColumn("customer_since_days", F.datediff(F.current_date(), "registration_date"))
    .select(
        "customer_id", "full_name", "email", "email_domain",
        "country", "city", "segment",
        "registration_date", "customer_since_days",
    )
    # Remove bad emails
    .where(F.col("email").contains("@"))
)

# Persist as a Delta table located in its own directory under SILVER_PATH.
customers_silver_path = f"{SILVER_PATH}customers/"
(
    silver_customers.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("path", customers_silver_path)
    .saveAsTable("ecommerce_silver.customers")
)

customers_silver_count = silver_customers.count()
print(f"✅ Silver customers: {customers_silver_count} records")

In [0]:
# Products: keep active, positively-priced rows that have an id and attach
# price band, margin and inventory band.
print("🔄 Processing products...")

product_price = F.col("price")
product_stock = F.col("stock_quantity")

# Price bands: < 50 Budget, < 200 Standard, everything else Premium.
price_category_expr = (
    F.when(product_price < 50, "Budget")
    .when(product_price < 200, "Standard")
    .otherwise("Premium")
)
# Margin as a percentage of price, rounded to 2 decimals.
profit_margin_expr = F.round((product_price - F.col("cost")) / product_price * 100, 2)
# Stock bands: < 20 Low Stock, < 100 Normal, everything else High Stock.
inventory_status_expr = (
    F.when(product_stock < 20, "Low Stock")
    .when(product_stock < 100, "Normal")
    .otherwise("High Stock")
)

silver_products = (
    bronze_products
    .where(
        F.col("product_id").isNotNull()
        & (product_price > 0)
        & (F.col("is_active") == True)
    )
    .withColumn("price_category", price_category_expr)
    .withColumn("profit_margin", profit_margin_expr)
    .withColumn("inventory_status", inventory_status_expr)
    .select(
        "product_id", "product_name", "category", "subcategory", "price", "cost",
        "price_category", "profit_margin", "stock_quantity", "inventory_status",
    )
)

# Persist as a Delta table located in its own directory under SILVER_PATH.
products_silver_path = f"{SILVER_PATH}products/"
(
    silver_products.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("path", products_silver_path)
    .saveAsTable("ecommerce_silver.products")
)

products_silver_count = silver_products.count()
print(f"✅ Silver products: {products_silver_count} records")

In [0]:
# Orders: keep rows with an id and a positive total, attach the customer's
# country/segment, and derive calendar and size attributes.
print("🔄 Processing orders...")

order_date = F.col("order_date")
order_total = F.col("total_amount")

# Only the customer attributes that are carried onto each order.
customer_attributes = silver_customers.select("customer_id", "country", "segment")

# Size bands: < 100 Small, < 500 Medium, everything else Large.
revenue_category_expr = (
    F.when(order_total < 100, "Small")
    .when(order_total < 500, "Medium")
    .otherwise("Large")
)
# dayofweek values 1 and 7 are treated as the weekend.
is_weekend_expr = F.when(F.col("order_dayofweek").isin(1, 7), True).otherwise(False)

silver_orders = (
    bronze_orders
    .where(F.col("order_id").isNotNull() & (order_total > 0))
    # Inner join: orders whose customer is not in silver_customers are dropped.
    .join(customer_attributes, "customer_id", "inner")
    .withColumn("order_year", F.year(order_date))
    .withColumn("order_month", F.month(order_date))
    .withColumn("order_quarter", F.quarter(order_date))
    .withColumn("order_dayofweek", F.dayofweek(order_date))
    .withColumn("revenue_category", revenue_category_expr)
    .withColumn("days_since_order", F.datediff(F.current_date(), order_date))
    .withColumn("is_weekend", is_weekend_expr)
    .select(
        "order_id", "customer_id", "order_date", "order_year", "order_month", "order_quarter",
        "order_dayofweek", "is_weekend", "status", "payment_method", "subtotal", "shipping_cost",
        "tax_amount", "total_amount", "revenue_category", "days_since_order", "country", "segment",
    )
)

# Persist as a Delta table located in its own directory under SILVER_PATH.
orders_silver_path = f"{SILVER_PATH}orders/"
(
    silver_orders.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("path", orders_silver_path)
    .saveAsTable("ecommerce_silver.orders")
)

orders_silver_count = silver_orders.count()
print(f"✅ Silver orders: {orders_silver_count} records")

In [0]:
# Clean order items and join with product info
print("🔄 Processing order items...")

# NOTE(review): "product_name" and "category" are selected at the end of this
# chain but are not among the columns brought in from silver_products, so they
# must already exist on ecommerce_bronze.order_items - confirm against the
# Bronze schema.
silver_order_items = (bronze_order_items
    .filter(F.col("order_id").isNotNull())
    .filter(F.col("product_id").isNotNull())
    .filter(F.col("quantity") > 0)
    # Inner join: items whose product is not in silver_products are dropped.
    .join(silver_products.select("product_id", "price", "cost", "price_category", "profit_margin"), "product_id", "inner")
    .withColumn("total_cost", F.col("quantity") * F.col("cost"))
    .withColumn("profit", F.col("line_total") - F.col("total_cost"))
    # NOTE(review): the discount is computed from the product's "price", not the
    # item's "unit_price" - confirm this is intended.
    .withColumn("discount_amount", F.col("quantity") * F.col("price") * F.col("discount_percent") / 100)
    # Guard the division: line_total is not filtered above, and a zero divisor
    # raises an error when ANSI SQL mode is enabled. With no otherwise(), rows
    # with a zero (or null) line_total get a NULL margin, which is also what the
    # unguarded division yields when ANSI mode is off.
    .withColumn("profit_margin_item",
        F.when(F.col("line_total") != 0,
               F.round(F.col("profit") / F.col("line_total") * 100, 2)))
    .select("order_id", "product_id", "product_name", "category", "quantity", 
            "unit_price", "discount_percent", "discount_amount", "line_total", 
            "total_cost", "profit", "profit_margin_item", "price_category")
)

# Save to Silver container with specific table path
order_items_silver_path = f"{SILVER_PATH}order_items/"
silver_order_items.write \
    .format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .option("path", order_items_silver_path) \
    .saveAsTable("ecommerce_silver.order_items")

order_items_silver_count = silver_order_items.count()
print(f"✅ Silver order items: {order_items_silver_count} records")

In [0]:
# Customer summary: one row per (customer_id, country, segment) with order
# counts, spend, recency, tenure, frequency and a value tier.
print("🔄 Creating customer summary...")

summary_keys = ["customer_id", "country", "segment"]
order_aggregates = [
    F.count("order_id").alias("total_orders"),
    F.sum("total_amount").alias("total_spent"),
    F.avg("total_amount").alias("avg_order_value"),
    F.max("order_date").alias("last_order_date"),
    F.min("order_date").alias("first_order_date"),
]

total_spent = F.col("total_spent")
tenure_days = F.col("customer_tenure_days")

# Orders per 30 days of tenure; 0 when first and last order fall on the same day.
order_frequency_expr = (
    F.when(tenure_days > 0, F.round(F.col("total_orders") / (tenure_days / 30.0), 2))
    .otherwise(0)
)
# Tiers: >= 1000 High Value, >= 500 Medium Value, everything else Low Value.
value_tier_expr = (
    F.when(total_spent >= 1000, "High Value")
    .when(total_spent >= 500, "Medium Value")
    .otherwise("Low Value")
)

customer_summary = (
    silver_orders
    .groupBy(*summary_keys)
    .agg(*order_aggregates)
    # Lifetime value is a copy of total_spent.
    .withColumn("customer_lifetime_value", total_spent)
    .withColumn("days_since_last_order", F.datediff(F.current_date(), F.col("last_order_date")))
    .withColumn("customer_tenure_days", F.datediff(F.col("last_order_date"), F.col("first_order_date")))
    .withColumn("order_frequency", order_frequency_expr)
    .withColumn("customer_value_tier", value_tier_expr)
)

# Persist as a Delta table located in its own directory under SILVER_PATH.
customer_summary_path = f"{SILVER_PATH}customer_summary/"
(
    customer_summary.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("path", customer_summary_path)
    .saveAsTable("ecommerce_silver.customer_summary")
)

customer_summary_count = customer_summary.count()
print(f"✅ Customer summary: {customer_summary_count} records")

In [0]:
# Data Quality Validation
print("🔍 SILVER LAYER DATA QUALITY CHECKS")
print("=" * 50)

# Referential-integrity checks. The first and third are expected to be 0
# because silver_orders / silver_order_items are built with inner joins on
# silver_customers / silver_products; they act as regression guards.
orders_without_customers = (silver_orders
    .join(silver_customers, "customer_id", "left_anti")
    .count())

items_without_orders = (silver_order_items
    .join(silver_orders.select("order_id"), "order_id", "left_anti")
    .count())

items_without_products = (silver_order_items
    .join(silver_products.select("product_id"), "product_id", "left_anti")
    .count())

negative_profits = silver_order_items.filter(F.col('profit') < 0).count()
invalid_emails = silver_customers.filter(~F.col("email").contains("@")).count()

print(f"🔍 Data Integrity Checks:")
print(f"   Orders without customers: {orders_without_customers}")
print(f"   Items without orders: {items_without_orders}")
print(f"   Items without products: {items_without_products}")
print(f"   Negative profits: {negative_profits}")
print(f"   Invalid emails: {invalid_emails}")

# Revenue validation.
# sum() yields None for an empty (or all-null) input, so coalesce to 0 before
# doing arithmetic or formatting with the totals.
bronze_revenue = bronze_orders.agg(F.sum("total_amount")).collect()[0][0] or 0
silver_revenue = silver_orders.agg(F.sum("total_amount")).collect()[0][0] or 0

if bronze_revenue:
    revenue_variance = abs(bronze_revenue - silver_revenue) / bronze_revenue * 100
elif silver_revenue:
    # No Bronze revenue to compare against but Silver has some: treat as a full mismatch.
    revenue_variance = 100.0
else:
    # Both totals are zero: nothing was lost.
    revenue_variance = 0.0

print(f"\n💰 Revenue Validation:")
print(f"   Bronze revenue: ${bronze_revenue:,.2f}")
print(f"   Silver revenue: ${silver_revenue:,.2f}")
print(f"   Variance: {revenue_variance:.2f}%")

# Calculate Silver quality score
total_silver_issues = (orders_without_customers + items_without_orders + 
                      items_without_products + negative_profits + invalid_emails)
# Total rows across the four core Silver tables; not used by the score below.
total_silver_records = (customers_silver_count + products_silver_count + 
                       orders_silver_count + order_items_silver_count)

# Score: 100 when there are no issues and variance is under 1%; otherwise each
# issue costs 2 points and the variance percentage is subtracted, floored at 0.
if total_silver_issues == 0 and revenue_variance < 1:
    silver_quality_score = 100.0
else:
    silver_quality_score = max(0, 100 - (total_silver_issues * 2) - revenue_variance)

print(f"\n🎯 Silver Quality Score: {silver_quality_score:.1f}%")

In [0]:
# Publish Silver metrics as job task values and print the run summary.
silver_processing_timestamp = str(datetime.now())
silver_container_uri = f"abfss://silver@{STORAGE_ACCOUNT}.dfs.core.windows.net/"

# Insertion order is the order in which the values are published.
gold_task_values = {
    "silver_customers_count": customers_silver_count,
    "silver_products_count": products_silver_count,
    "silver_orders_count": orders_silver_count,
    "silver_order_items_count": order_items_silver_count,
    "silver_customer_summary_count": customer_summary_count,
    "silver_quality_score": float(silver_quality_score),
    "silver_total_issues": total_silver_issues,
    "silver_revenue": float(silver_revenue),
    "revenue_variance_pct": float(revenue_variance),
    "silver_processing_timestamp": silver_processing_timestamp,
    "silver_container_path": silver_container_uri,
}
for task_value_key, task_value in gold_task_values.items():
    dbutils.jobs.taskValues.set(key=task_value_key, value=task_value)

# Share of Bronze orders that reached Silver; reported as 100 when the Bronze
# order count is not positive.
if bronze_orders_count > 0:
    data_retention_pct = orders_silver_count / bronze_orders_count * 100
else:
    data_retention_pct = 100
dbutils.jobs.taskValues.set(key="data_retention_percentage", value=float(data_retention_pct))

summary_lines = [
    "📋 SILVER LAYER SUMMARY",
    "=" * 40,
    f"📁 Storage Location: {silver_container_uri}",
    f"📊 Records Processed:",
    f"   Customers: {customers_silver_count:,}",
    f"   Products: {products_silver_count:,}",
    f"   Orders: {orders_silver_count:,}",
    f"   Order Items: {order_items_silver_count:,}",
    f"   Customer Summary: {customer_summary_count:,}",
    f"🎯 Data Quality Score: {silver_quality_score:.1f}%",
    f"💰 Revenue Preserved: {silver_revenue:,.2f} ({100-revenue_variance:.1f}%)",
    f"📈 Data Retention: {data_retention_pct:.1f}%",
    "\n✅ Silver layer processing complete!",
    "📁 Data stored in Azure Silver container",
    "📋 Task values set for Gold layer analytics",
    "🔜 Next: Run 04_gold_analytics.py",
]
for summary_line in summary_lines:
    print(summary_line)