# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Superstore Sales Data
# MAGIC This notebook performs an ETL process on Superstore sales data using PySpark.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# Module-level logger that every ETL step below reports through.
logger = logging.getLogger(__name__)
# Ask the root logger to emit INFO and above (basicConfig does nothing if
# the root logger already has handlers installed).
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# The whole pipeline is kept in ONE notebook cell on purpose: Databricks treats
# every "# COMMAND ----------" line as a cell boundary, and a try/except
# statement cannot be split across cells.
try:
    # Step 1: Data Loading
    logger.info("Loading data from Unity Catalog tables.")
    orders_central_df = spark.table("genai_demo.citi.orders_central")
    orders_east_df = spark.table("genai_demo.citi.orders_east")
    orders_south_2015_df = spark.table("genai_demo.citi.orders_south_2015")
    orders_south_2016_df = spark.table("genai_demo.citi.orders_south_2016")
    orders_south_2017_df = spark.table("genai_demo.citi.orders_south_2017")
    orders_south_2018_df = spark.table("genai_demo.citi.orders_south_2018")
    orders_west_df = spark.table("genai_demo.citi.orders_west")
    quota_df = spark.table("genai_demo.citi.quota")
    # NOTE(review): returns_df is loaded but never joined to the orders, yet
    # Step 5 reads a "Return Reason" column. Confirm whether the order tables
    # already carry that column or whether a join on the returns table is
    # missing here.
    returns_df = spark.table("genai_demo.citi.returns")

    # Step 2: Data Standardization
    logger.info("Standardizing data.")
    # The central region stores the order date as separate day/month/year
    # columns. Parse the assembled "d/M/yyyy" text into a real date: a plain
    # concatenated string in that layout cannot be cast by F.datediff/F.year
    # in Step 5 and would yield NULLs.
    # NOTE(review): assumes the three parts are integral values - confirm.
    orders_central_df = orders_central_df.withColumn(
        "Order Date",
        F.to_date(
            F.concat(
                F.col("Order Day"), F.lit("/"),
                F.col("Order Month"), F.lit("/"),
                F.col("Order Year"),
            ),
            "d/M/yyyy",
        ),
    ).withColumnRenamed("Discounts", "Discount")

    # Step 3: Data Cleaning
    # NOTE(review): cleaning is applied to the central region only; confirm
    # the other regional tables do not need the same treatment.
    logger.info("Cleaning data.")
    orders_central_df = orders_central_df.filter(orders_central_df["Order ID"].isNotNull())
    orders_central_df = orders_central_df.withColumn("Sales", F.col("Sales").cast(DoubleType()))

    # Step 4: Pivoting and Consolidation
    logger.info("Pivoting and consolidating data.")
    # Unpivot the per-year quota columns into (Region, Year, Quota) rows.
    # NOTE(review): quota_df is not used or written after this point.
    quota_df = quota_df.selectExpr("Region", "stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`) as (Year, Quota)")
    # Combine the regional tables by column NAME. A positional union would
    # misalign (or reject) the central table, whose column list was changed
    # in Step 2. Columns missing from a region are filled with NULL.
    all_orders_df = orders_central_df
    for regional_df in (
        orders_east_df,
        orders_south_2015_df,
        orders_south_2016_df,
        orders_south_2017_df,
        orders_south_2018_df,
        orders_west_df,
    ):
        all_orders_df = all_orders_df.unionByName(regional_df, allowMissingColumns=True)

    # Step 5: Calculated Fields and Enhancements
    logger.info("Adding calculated fields.")
    all_orders_df = all_orders_df.withColumn("Days to Ship", F.datediff(F.col("Ship Date"), F.col("Order Date")))
    all_orders_df = all_orders_df.withColumn("Returned?", F.when(F.col("Return Reason").isNotNull(), "Yes").otherwise("No"))
    # Yearly totals per region. This is derived BEFORE the Step 6 filter, so
    # it reflects all consolidated orders.
    aggregated_df = all_orders_df.groupBy("Region", F.year("Order Date").alias("Year of Sale")).agg(
        F.sum("Profit").alias("Total Profit"),
        F.sum("Sales").alias("Total Sales"),
        F.sum("Quantity").alias("Total Quantity"),
        F.avg("Discount").alias("Average Discount")
    )

    # Step 6: Business Rules
    logger.info("Applying business rules.")
    # Keep only rows whose Discount lies outside the closed range [17, 18].
    # Rows with a NULL Discount are dropped as well, because the comparison
    # evaluates to NULL for them.
    all_orders_df = all_orders_df.filter((F.col("Discount") < 17) | (F.col("Discount") > 18))

    # Step 7: Output Generation
    logger.info("Writing output to Unity Catalog tables.")
    aggregated_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.citi.annual_regional_performance")
    all_orders_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.citi.superstore_sales")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log the full traceback, then re-raise so the notebook/job run is marked
    # as failed instead of silently reporting success.
    logger.exception("An error occurred during the ETL process.")
    raise
