In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Superstore Sales Data
# MAGIC This notebook performs an ETL process on Superstore sales data, including data loading, standardization, cleaning, integration, and aggregation.

# COMMAND ----------

import logging
from pyspark.sql import functions as F

# Notebook-wide logger named after this module; the root logger is asked to
# emit INFO-and-above records so the step-by-step progress messages show up.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

def standardize_dates(df, date_cols, target_col="Order Date"):
    """Collapse separate day/month/year columns into a single date column.

    Args:
        df: DataFrame holding the date components as separate columns.
        date_cols: Column names in ``[day, month, year]`` order.
        target_col: Name of the date column to create.

    Returns:
        ``df`` with ``target_col`` added and every column in ``date_cols``
        dropped.
    """
    day_col, month_col, year_col = date_cols[:3]
    # F.concat (not concat_ws) so a NULL component yields a NULL date.
    date_text = F.concat(
        F.col(day_col), F.lit("/"), F.col(month_col), F.lit("/"), F.col(year_col)
    )
    # Single-letter "d"/"M" accept both "5" and "05". Spark 3's parser rejects
    # unpadded day/month values under "dd/MM", which is what raw numeric
    # components produce when concatenated.
    return df.withColumn(target_col, F.to_date(date_text, "d/M/yyyy")).drop(*date_cols)


# NOTE: the whole try/except lives in one notebook cell on purpose. Databricks
# splits source files into cells at "# COMMAND ----------" lines, so a try block
# that straddles those markers would leave each cell syntactically incomplete.
try:
    # Step 1: Data Loading
    logger.info("Loading data from Unity Catalog tables.")
    orders_central_df = spark.table("genai_demo.citi.orders_central")
    orders_east_df = spark.table("genai_demo.citi.orders_east")
    orders_south_2015_df = spark.table("genai_demo.citi.orders_south_2015")
    orders_south_2016_df = spark.table("genai_demo.citi.orders_south_2016")
    orders_south_2017_df = spark.table("genai_demo.citi.orders_south_2017")
    orders_south_2018_df = spark.table("genai_demo.citi.orders_south_2018")
    orders_west_df = spark.table("genai_demo.citi.orders_west")
    quota_df = spark.table("genai_demo.citi.quota")
    returns_df = spark.table("genai_demo.citi.returns")

    # Step 2: Data Standardization
    logger.info("Standardizing data formats and column names.")
    # Only the central dataset is rebuilt from separate date components here.
    orders_central_df = standardize_dates(
        orders_central_df, ["Order Day", "Order Month", "Order Year"]
    )
    # Repeat for other datasets if needed

    # Step 3: Data Cleaning
    logger.info("Cleaning data by filtering null Order IDs and filling missing values.")
    orders_central_df = orders_central_df.filter(F.col("Order ID").isNotNull())
    orders_central_df = orders_central_df.fillna({"Discount": 0})

    # Step 4: Data Integration and Consolidation
    logger.info("Integrating and consolidating data from different regions.")
    regional_frames = [
        orders_central_df,
        orders_east_df,
        orders_south_2015_df,
        orders_south_2016_df,
        orders_south_2017_df,
        orders_south_2018_df,
        orders_west_df,
    ]
    all_orders_df = regional_frames[0]
    for regional_df in regional_frames[1:]:
        # unionByName matches columns by name, so column order may differ.
        all_orders_df = all_orders_df.unionByName(regional_df)

    # Unpivot quota data: one (Region, Year, Quota) row per year column.
    # NOTE(review): not consumed later in this section; kept in case a later
    # cell reads it.
    quota_unpivoted_df = quota_df.selectExpr(
        "Region",
        "stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`) as (Year, Quota)",
    )

    # Step 5: Calculated Fields and Enhancements
    logger.info("Adding calculated fields such as Days to Ship and Returned?")
    all_orders_df = all_orders_df.withColumn(
        "Days to Ship", F.datediff(F.col("Ship Date"), F.col("Order Date"))
    )

    # Join with returns to add Returned? column.
    # Returns are reduced to one row per (Order ID, Product ID) first so the
    # left join cannot multiply order rows and inflate the aggregates below.
    # A marker column records the match itself, so a return whose
    # "Return Reason" is NULL is still flagged "Yes".
    return_keys = ["Order ID", "Product ID"]
    match_marker = "__returned_marker"
    unique_returns_df = returns_df.dropDuplicates(return_keys).withColumn(
        match_marker, F.lit(True)
    )
    all_orders_df = (
        all_orders_df.join(unique_returns_df, return_keys, "left")
        .withColumn(
            "Returned?",
            F.when(F.col(match_marker).isNotNull(), "Yes").otherwise("No"),
        )
        .drop(match_marker)
    )

    # Step 6: Business Rules Application
    logger.info("Applying business rules to filter data.")
    # Exclude discounts in [17, 18]. coalesce(..., False) keeps rows whose
    # Discount is NULL; a bare negated comparison evaluates to NULL for them
    # and filter() would silently drop those rows.
    in_excluded_range = F.col("Discount").between(17, 18)
    all_orders_df = all_orders_df.filter(~F.coalesce(in_excluded_range, F.lit(False)))

    # Step 7: Aggregation and Output
    logger.info("Aggregating sales metrics by region and year.")
    all_orders_df = all_orders_df.withColumn("Year of Sale", F.year(F.col("Order Date")))
    annual_performance_df = all_orders_df.groupBy("Region", "Year of Sale").agg(
        F.sum("Profit").alias("Total Profit"),
        F.sum("Sales").alias("Total Sales"),
        F.sum("Quantity").alias("Total Quantity"),
        F.avg("Discount").alias("Average Discount"),
    )

    # Write outputs to Unity Catalog tables
    logger.info("Writing aggregated data to Unity Catalog tables.")
    annual_performance_df.write.format("delta").mode("overwrite").saveAsTable(
        "genai_demo.citi.annual_regional_performance"
    )
    all_orders_df.write.format("delta").mode("overwrite").saveAsTable(
        "genai_demo.citi.superstore_sales"
    )

    logger.info("ETL process completed successfully.")

except Exception as e:
    # logger.exception records the traceback as well as the message; the error
    # is re-raised so the notebook run is still marked as failed.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
