In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Superstore Sales Data
# MAGIC This notebook performs an ETL process on Superstore sales data using PySpark. It includes data loading, cleansing, integration, restructuring, aggregation, and writing the results back to Unity Catalog tables.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, StringType

# Configure logging: request INFO level on the root logger (basicConfig does
# nothing if the root logger already has handlers), then create the
# module-level logger that the ETL steps in this notebook log through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# COMMAND ----------

def load_data():
    """Read the six source tables from Unity Catalog.

    Returns:
        A 6-tuple of DataFrames, in this order: central orders, west
        orders, east orders, south orders, quota, returns.
    """
    logger.info("Loading data from Unity Catalog tables")
    # Order matters: callers unpack the result positionally.
    source_tables = (
        "catalog.db.orders_central",
        "catalog.db.orders_west",
        "catalog.db.orders_east",
        "catalog.db.orders_south",
        "catalog.db.quota",
        "catalog.db.returns",
    )
    return tuple(spark.table(table_name) for table_name in source_tables)

# COMMAND ----------

def cleanse_and_standardize(orders_df):
    """Cleanse one regional orders DataFrame and standardize its columns.

    Steps, in order:
      1. Build ``Ship Date`` from ``Ship Day`` / ``Ship Month`` / ``Ship Year``
         when all three exist and there is no ``Ship Date`` column yet.
      2. Build ``Order Date`` from ``Order Day`` / ``Order Month`` /
         ``Order Year`` (these columns are required).
      3. Rename ``Discounts`` to ``Discount`` (a no-op when absent).
      4. Drop the six day/month/year part columns.
      5. Cast ``Discount`` to string.
      6. Strip letters from ``Sales`` and cast the remainder to double.

    Args:
        orders_df: Orders DataFrame for a single region.

    Returns:
        The transformed DataFrame.
    """
    logger.info("Performing data cleansing and standardization")

    def _date_from_parts(prefix):
        # "d/M/yyyy" accepts both one- and two-digit day/month values; the
        # fixed-width "dd/MM/yyyy" rejects parts such as "5/3/2018".
        return F.to_date(
            F.concat_ws(
                "/",
                F.col(f"{prefix} Day"),
                F.col(f"{prefix} Month"),
                F.col(f"{prefix} Year"),
            ),
            "d/M/yyyy",
        )

    # Keep the shipping date before its parts are dropped below; an existing
    # "Ship Date" column is left untouched.
    present = set(orders_df.columns)
    if "Ship Date" not in present and present.issuperset(("Ship Day", "Ship Month", "Ship Year")):
        orders_df = orders_df.withColumn("Ship Date", _date_from_parts("Ship"))

    orders_df = orders_df.withColumn(
        "Order Date", _date_from_parts("Order")
    ).withColumnRenamed("Discounts", "Discount")
    orders_df = orders_df.drop("Order Day", "Order Month", "Order Year", "Ship Day", "Ship Month", "Ship Year")
    orders_df = orders_df.withColumn("Discount", F.col("Discount").cast(StringType()))
    # \p{L} matches any Unicode letter. The previous "[:Letter:]" was a plain
    # bracket set that only removed the characters ':', 'L', 'e', 't', 'r'.
    orders_df = orders_df.withColumn("Sales", F.regexp_replace(F.col("Sales"), r'\p{L}', '').cast(DoubleType()))
    return orders_df

# COMMAND ----------

def integrate_data(orders_central_df, orders_west_df, orders_east_df, orders_south_df, returns_df):
    """Combine the regional orders and attach return information.

    Args:
        orders_central_df: Cleansed central-region orders.
        orders_west_df: Cleansed west-region orders.
        orders_east_df: Cleansed east-region orders.
        orders_south_df: Cleansed south-region orders.
        returns_df: Returns table, joined on ``Order ID`` and ``Product ID``.

    Returns:
        One row per order line from all regions, with the columns of
        ``returns_df`` populated where a matching return exists and NULL
        otherwise.
    """
    logger.info("Integrating data from multiple sources")
    # Match columns by name, not position, so a different column order in one
    # region cannot silently put values under the wrong column.
    all_orders_df = (
        orders_central_df
        .unionByName(orders_west_df)
        .unionByName(orders_east_df)
        .unionByName(orders_south_df)
    )
    all_orders_df.cache()
    # Left join keeps every order; a right join here would discard all orders
    # that were never returned. It also lets the broadcast hint on the
    # returns side take effect.
    orders_returns_df = all_orders_df.join(F.broadcast(returns_df), ["Order ID", "Product ID"], "left")
    return orders_returns_df

# COMMAND ----------

def restructure_data(quota_df, orders_returns_df):
    """Reshape the quota table and abbreviate selected state names.

    Args:
        quota_df: Quota table with a ``Region`` column and one column per
            year (``2015`` through ``2018``).
        orders_returns_df: Orders DataFrame containing a ``State`` column.

    Returns:
        A pair ``(quota_long, orders)``: the quota table unpivoted to
        ``Region`` / ``Year`` / ``Quota`` rows, and the orders DataFrame with
        the mapped ``State`` values replaced.
    """
    logger.info("Restructuring data")

    # Turn the four year columns into (Year, Quota) rows.
    unpivot_expr = "stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`) as (Year, Quota)"
    quota_long = quota_df.selectExpr("Region", unpivot_expr)

    # Full state name -> abbreviation, applied to the State column only.
    abbreviations = {"Arizona": "AZ", "California": "CA"}
    orders_with_states = orders_returns_df.replace(abbreviations, subset=["State"])

    return quota_long, orders_with_states

# COMMAND ----------

def perform_aggregations(orders_returns_df):
    """Compute yearly regional totals and add per-order derived columns.

    Args:
        orders_returns_df: Orders DataFrame with ``Region``, ``Order Date``,
            ``Ship Date``, ``Profit``, ``Sales``, ``Quantity``, ``Discount``
            and ``Return Reason`` columns.

    Returns:
        A pair ``(aggregated_sales_df, orders_returns_df)``:
          * totals of profit, sales and quantity plus the average discount
            per ``Region`` and ``Year of Sale``;
          * the input rows extended with ``Days to Ship`` and ``Returned?``.
    """
    logger.info("Performing aggregations and custom calculations")
    aggregated_sales_df = orders_returns_df.groupBy(
        "Region", F.year("Order Date").alias("Year of Sale")
    ).agg(
        F.sum("Profit").alias("Total Profit"),
        F.sum("Sales").alias("Total Sales"),
        F.sum("Quantity").alias("Total Quantity"),
        F.avg("Discount").alias("Average Discount"),
    )
    orders_returns_df = orders_returns_df.withColumn(
        "Days to Ship", F.datediff(F.col("Ship Date"), F.col("Order Date"))
    )
    # Column API instead of a SQL string: the unquoted name "Return Reason"
    # contains a space and does not parse inside F.expr.
    orders_returns_df = orders_returns_df.withColumn(
        "Returned?",
        F.when(F.col("Return Reason").isNull(), "No").otherwise("Yes"),
    )
    return aggregated_sales_df, orders_returns_df

# COMMAND ----------

def write_data(aggregated_sales_df, orders_returns_df):
    """Persist both result DataFrames as Delta tables in Unity Catalog.

    Each target table is written in overwrite mode.

    Args:
        aggregated_sales_df: Written to
            ``catalog.db.annual_regional_performance``.
        orders_returns_df: Written to ``catalog.db.superstore_sales``.
    """
    logger.info("Writing transformed data to Unity Catalog tables")
    outputs = (
        (aggregated_sales_df, "catalog.db.annual_regional_performance"),
        (orders_returns_df, "catalog.db.superstore_sales"),
    )
    for frame, target_table in outputs:
        frame.write.format("delta").mode("overwrite").saveAsTable(target_table)

# COMMAND ----------

# Run the ETL pipeline end to end: load -> cleanse -> integrate ->
# restructure -> aggregate -> write.
try:
    # Load data
    orders_central_df, orders_west_df, orders_east_df, orders_south_df, quota_df, returns_df = load_data()

    # Cleanse and standardize data
    orders_central_df = cleanse_and_standardize(orders_central_df)
    orders_west_df = cleanse_and_standardize(orders_west_df)
    orders_east_df = cleanse_and_standardize(orders_east_df)
    orders_south_df = cleanse_and_standardize(orders_south_df)

    # Integrate data
    orders_returns_df = integrate_data(orders_central_df, orders_west_df, orders_east_df, orders_south_df, returns_df)

    # Restructure data
    quota_unpivoted_df, orders_returns_df = restructure_data(quota_df, orders_returns_df)

    # Perform aggregations
    aggregated_sales_df, orders_returns_df = perform_aggregations(orders_returns_df)

    # Write data
    write_data(aggregated_sales_df, orders_returns_df)

    logger.info("ETL process completed successfully")

except Exception:
    # Log the full traceback, then re-raise so the run is reported as failed
    # instead of finishing "successfully" with missing or stale output.
    logger.exception("An error occurred during the ETL process")
    raise
