# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Superstore Sales Data
# MAGIC This notebook performs an ETL process on Superstore sales data using PySpark.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DateType

# Logging setup: a module-scoped logger for this notebook, with the root
# logger configured to emit records at INFO level and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

def cleanse_orders(df):
    """Standardize one regional orders DataFrame so the regions can be combined.

    Steps applied:
      * rename ``Discounts`` -> ``Discount`` and ``Product`` -> ``Product Name``
        (a no-op when the source column is absent);
      * drop rows whose ``Order ID`` is null;
      * cast ``Discount`` to string and strip every character other than
        digits and ``.`` from ``Sales`` before casting it to double;
      * when the split day/month/year columns are present, assemble
        ``Order Date`` / ``Ship Date`` from them and drop the parts.

    Args:
        df: Spark DataFrame holding the raw orders of a single region.

    Returns:
        A new Spark DataFrame with the standardized columns.
    """
    order_parts = ("Order Day", "Order Month", "Order Year")
    ship_parts = ("Ship Day", "Ship Month", "Ship Year")

    # F.col resolves names against the frame produced by the previous step, so
    # "Discount" is found even when it only exists after the rename above it
    # (df["Discount"] would be resolved against the original, un-renamed frame).
    cleansed = (
        df
        .withColumnRenamed("Discounts", "Discount")
        .withColumnRenamed("Product", "Product Name")
        .filter(F.col("Order ID").isNotNull())
        # NOTE(review): Discount is kept as string to preserve the existing
        # output schema; confirm whether a numeric type is actually intended.
        .withColumn("Discount", F.col("Discount").cast("string"))
        .withColumn("Sales", F.regexp_replace(F.col("Sales"), "[^0-9.]", "").cast("double"))
    )

    # Build the date columns only when the split parts exist. Frames without
    # them are left as they are -- presumably they already carry
    # "Order Date" / "Ship Date"; verify against the source table schemas.
    for target, parts in (("Order Date", order_parts), ("Ship Date", ship_parts)):
        if all(part in df.columns for part in parts):
            # Single-letter "d" / "M" accept both padded and non-padded values
            # ("5/3/2018" as well as "05/03/2018"); "dd/MM" requires two digits.
            cleansed = cleansed.withColumn(
                target,
                F.to_date(F.concat_ws("/", *(F.col(part) for part in parts)), "d/M/yyyy"),
            )

    # drop() ignores names that are not present, so this is safe for all regions.
    return cleansed.drop(*order_parts, *ship_parts)


# COMMAND ----------

# The whole pipeline lives in a single cell: a try/except statement cannot be
# split across "# COMMAND ----------" cell delimiters.
try:
    # Load data from Unity Catalog tables
    orders_central_df = spark.table("catalog.db.orders_central")
    orders_west_df = spark.table("catalog.db.orders_west")
    orders_east_df = spark.table("catalog.db.orders_east")
    orders_south_df = spark.table("catalog.db.orders_south")
    quota_df = spark.table("catalog.db.quota")
    returns_df = spark.table("catalog.db.returns")

    # Data Cleansing and Standardization
    orders_central_df = cleanse_orders(orders_central_df)
    orders_west_df = cleanse_orders(orders_west_df)
    orders_east_df = cleanse_orders(orders_east_df)
    orders_south_df = cleanse_orders(orders_south_df)

    # Data Integration
    # Align columns by name: cleansing appends the rebuilt date columns at the
    # end, so column positions are not guaranteed to match across regions.
    all_orders_df = (
        orders_central_df
        .unionByName(orders_west_df)
        .unionByName(orders_east_df)
        .unionByName(orders_south_df)
    )

    # Join with returns data
    # Orders are the driving side: every order is kept and return details are
    # attached where they exist. A right outer join would drop every order
    # that was never returned.
    orders_returns_df = all_orders_df.join(returns_df, ["Order ID", "Product ID"], "left_outer")

    # Add custom calculations
    orders_returns_df = (
        orders_returns_df
        .withColumn("Returned?", F.when(F.col("Return Reason").isNotNull(), "Yes").otherwise("No"))
        .withColumn("Days to Ship", F.datediff(F.col("Ship Date"), F.col("Order Date")))
        # Discount is a string column, so the default is a string as well.
        .withColumn("Discount", F.coalesce(F.col("Discount"), F.lit("0")))
        .withColumn("Year of Sale", F.year(F.col("Order Date")))
    )

    # Data Restructuring: Unpivot quota data
    # One row per (Region, Year) instead of one column per year.
    # NOTE(review): this frame is not consumed by the steps visible here.
    quota_unpivoted_df = quota_df.selectExpr(
        "Region",
        "stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`) as (Year, Quota)",
    )

    # Aggregations and Calculations
    aggregated_sales_df = (
        orders_returns_df
        .groupBy("Region", "Year of Sale")
        .agg(
            F.sum("Profit").alias("Total Profit"),
            F.sum("Sales").alias("Total Sales"),
            F.sum("Quantity").alias("Total Quantity"),
            F.avg("Discount").alias("Average Discount"),
        )
    )

    # Write to Unity Catalog target tables
    # NOTE(review): several output column names contain spaces; confirm the
    # target Delta tables are configured to accept such names.
    aggregated_sales_df.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.annual_regional_performance")
    orders_returns_df.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.superstore_sales")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log with the traceback, then re-raise so the notebook/job run is marked
    # as failed instead of reporting success after a broken load.
    logger.exception("An error occurred during the ETL process")
    raise
