In [0]:
# Assuming you are running this in a Databricks notebook
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, countDistinct, round, current_timestamp
from pyspark.sql.window import Window
import pyspark.sql.functions as F

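# Left disabled: this script uses an already-existing `spark` session (presumably the one
# supplied by the Databricks notebook runtime). Re-enable when running where none exists.
# spark = SparkSession.builder.appName("EcomDataSilverLayer").getOrCreate()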

# Base locations: cleaned input datasets are read from the first, silver outputs go to the second.
processed_data_path = "gs://batch-processing-de_stagging_data/"
silver_data_path = "gs://batch-processing-de_silver_data/"

# Load the four cleaned Parquet datasets (orders, order items, products, customers), in that order.
print("Reading cleaned data...")
df_orders, df_order_items, df_products, df_customers = [
    spark.read.parquet(f"{processed_data_path}{dataset}")
    for dataset in (
        "orders_cleaned",
        "order_items_cleaned",
        "products_cleaned",
        "customers_cleaned",
    )
]
print("Cleaned data read successfully.")

# --- Build line-item level sales details ---
# Each order item is matched to its order and its product. Both joins are inner,
# so items without a matching order or product are dropped. The per-line total
# is quantity multiplied by unit price.
print("Joining orders and order items...")
df_sales_details = (
    df_order_items
    .join(df_orders, on="order_id", how="inner")
    .join(df_products, on="product_id", how="inner")
    .withColumn("line_item_total", col("quantity") * col("unit_price"))
)

# --- Roll line items up to one row per order ---
# Grouping keys carry the order-level attributes through; the aggregates add the
# recomputed order total, the number of distinct products and the total quantity.
print("Aggregating sales by order and customer...")
df_orders_enriched = (
    df_sales_details
    .groupBy("order_id", "customer_id", "order_date", "order_status", "total_amount")
    .agg(
        F.sum("line_item_total").alias("calculated_order_total"),
        F.countDistinct("product_id").alias("distinct_products_in_order"),
        F.sum("quantity").alias("total_quantity_in_order"),
    )
    # Data quality flag: True when the recorded total and the recomputed total
    # differ by more than 0.01; every other row gets False.
    # NOTE(review): a NULL on either side presumably also lands in the False
    # branch (the comparison is not true) -- confirm that is the desired outcome.
    .withColumn(
        "total_amount_mismatch_flag",
        F.when(
            F.abs(F.col("total_amount") - F.col("calculated_order_total")) > 0.01,
            True,
        ).otherwise(False),
    )
)

# --- Attach customer attributes to each enriched order ---
# Inner join on customer_id: orders without a matching customer row are dropped.
print("Joining with customer data...")
df_silver_sales = (
    df_orders_enriched
    .join(df_customers, on="customer_id", how="inner")
    .select(
        "order_id",
        "customer_id",
        # Customer columns are renamed with a "customer_" prefix.
        *[
            col(source_name).alias(silver_name)
            for source_name, silver_name in (
                ("first_name", "customer_first_name"),
                ("last_name", "customer_last_name"),
                ("email", "customer_email"),
                ("country", "customer_country"),
                ("registration_date", "customer_registration_date"),
            )
        ],
        "order_date",
        "order_status",
        "total_amount",  # total as recorded on the order
        "calculated_order_total",  # total recomputed from the line items
        "distinct_products_in_order",
        "total_quantity_in_order",
        "total_amount_mismatch_flag",
    )
    # Stamp every row with the time of this silver-layer run.
    .withColumn("processed_timestamp", current_timestamp())
)

# Persist the sales table to the silver layer as Parquet, replacing any existing output.
df_silver_sales.write.parquet(f"{silver_data_path}sales_silver", mode="overwrite")
print(f"Silver layer sales data written to {silver_data_path}sales_silver")

# Companion silver table for products: a four-column projection of the cleaned
# products data plus the processing timestamp.
df_silver_products = (
    df_products
    .select("product_id", "product_name", "product_category", "product_price")
    .withColumn("processed_timestamp", current_timestamp())
)
# Written as Parquet, replacing any existing output at the target path.
df_silver_products.write.parquet(f"{silver_data_path}products_silver", mode="overwrite")
print(f"Silver layer products data written to {silver_data_path}products_silver")

print("Silver layer transformations complete.")