In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Superstore Sales Data
# MAGIC This notebook performs an ETL process on Superstore sales data using PySpark. It loads data from Unity Catalog tables, performs transformations, and saves the results back to the catalog.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType

# Module-level logger for this notebook; the root logger is configured to
# emit records at INFO severity and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# End-to-end ETL: load the regional order tables plus returns, standardize and
# union them, flag returned orders, derive shipping/year columns, filter on
# discount, aggregate by region and year, then write both results as Delta
# tables.
#
# The whole try/except is kept in a single notebook cell on purpose: a cell
# separator comment placed inside the `try` body would split the statement
# across cells when this file is imported as a Databricks notebook.
try:
    # ---- Extract ---------------------------------------------------------
    # Load Orders data from Unity Catalog tables
    orders_central_df = spark.table("catalog.db.orders_central")
    orders_west_df = spark.table("catalog.db.orders_west")
    orders_east_df = spark.table("catalog.db.orders_east")
    orders_south_df = spark.table("catalog.db.orders_south")

    # Load Quota data
    # NOTE(review): quota_df is loaded but not used anywhere in this block.
    quota_df = spark.table("catalog.db.quota")

    # Load Returns data
    returns_df = spark.table("catalog.db.returns")

    logger.info("Data loaded successfully from Unity Catalog tables.")

    # ---- Standardize the Central table -----------------------------------
    # Rename "Discounts" -> "Discount" and "Product" -> "Product Name".
    # Presumably this aligns Central with the other regional schemas -- confirm.
    orders_central_df = orders_central_df.withColumnRenamed("Discounts", "Discount").withColumnRenamed("Product", "Product Name")

    # Exclude rows with a null "Order ID".
    # NOTE(review): only the Central table is filtered this way.
    orders_central_df = orders_central_df.filter(orders_central_df["Order ID"].isNotNull())

    # ---- Union the regional datasets -------------------------------------
    # NOTE(review): union() resolves columns by position, not by name. Confirm
    # all four tables share the same column order, or switch to unionByName().
    all_orders_df = orders_central_df.union(orders_west_df).union(orders_east_df).union(orders_south_df).cache()

    # ---- Join with returns -----------------------------------------------
    # Drop "Order Date" and "Sub-Category" from returns before joining
    # (presumably to avoid duplicate column names after the join -- confirm).
    returns_df = returns_df.drop("Order Date", "Sub-Category")
    # Left join keeps every order; "Returned?" is True when the matched returns
    # row has a non-null "Return Reason". Returns is hinted for broadcast.
    enriched_orders_df = all_orders_df.join(F.broadcast(returns_df), ["Order ID", "Product ID"], "left").withColumn("Returned?", returns_df["Return Reason"].isNotNull()).cache()

    # ---- Derived columns -------------------------------------------------
    # Calculate Days to Ship: datediff("Ship Date", "Order Date")
    enriched_orders_df = enriched_orders_df.withColumn("Days to Ship", F.datediff(F.col("Ship Date"), F.col("Order Date")))

    # Extract Year of Sale from "Order Date"
    enriched_orders_df = enriched_orders_df.withColumn("Year of Sale", F.year(F.col("Order Date")))

    # ---- Discount filter -------------------------------------------------
    # Remove rows whose Discount lies in the inclusive range [17, 18].
    # NOTE(review): rows with a null Discount are also removed, because the
    # negated predicate evaluates to null for them -- confirm this is intended.
    filtered_orders_df = enriched_orders_df.filter(~((F.col("Discount") >= 17) & (F.col("Discount") <= 18)))

    # ---- Aggregate sales data --------------------------------------------
    # One row per (Region, Year of Sale) with summed profit/sales/quantity and
    # the mean discount.
    aggregated_sales_df = filtered_orders_df.groupBy("Region", "Year of Sale").agg(
        F.sum("Profit").alias("Total Profit"),
        F.sum("Sales").alias("Total Sales"),
        F.sum("Quantity").alias("Total Quantity"),
        F.avg("Discount").alias("Average Discount")
    )

    logger.info("Data transformation completed successfully.")

    # ---- Load: save to Databricks catalog --------------------------------
    # Both targets are overwritten on every run.
    aggregated_sales_df.write.format("delta").mode("overwrite").saveAsTable("catalog.db.annual_regional_performance")
    filtered_orders_df.write.format("delta").mode("overwrite").saveAsTable("catalog.db.superstore_sales")

    logger.info("Data saved successfully to Unity Catalog tables.")

except Exception:
    # Log the full traceback, then re-raise so a failed load/transform/write
    # surfaces as a failed run instead of being silently swallowed.
    logger.exception("An error occurred during the ETL process")
    raise
