In [None]:
# Databricks notebook source
# COMMAND ----------
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DateType
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

# COMMAND ----------
# Configure logging: request INFO-level output via the root logger and create
# a module-level logger named after this module for all pipeline messages.
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers attached -- confirm INFO records are actually emitted in the
# runtime this notebook executes in.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# COMMAND ----------
def load_data():
    """Read the six source tables from Unity Catalog.

    Relies on the ambient ``spark`` session (not defined in this file).

    Returns:
        A 6-tuple, in this order: central orders, west orders, east orders,
        south orders, quota, returns -- each the result of ``spark.table``.

    Raises:
        Exception: any error raised while reading a table is logged with its
            traceback and re-raised unchanged.
    """
    # Order matters: callers unpack the returned tuple positionally.
    source_tables = (
        "catalog.sales.orders_central",
        "catalog.sales.orders_west",
        "catalog.sales.orders_east",
        "catalog.sales.orders_south",
        "catalog.sales.quota",
        "catalog.sales.returns",
    )
    try:
        return tuple(spark.table(table_name) for table_name in source_tables)
    except Exception:
        logger.exception("Error loading data")
        raise

# COMMAND ----------
def standardize_dates(df, date_columns, date_format="dd/MM/yyyy"):
    """Convert the named columns to dates with ``F.to_date``.

    Each listed column is overwritten under its own name with the parsed
    value, one column at a time.

    Args:
        df: DataFrame holding the columns to convert.
        date_columns: Iterable of column names to parse.
        date_format: Datetime pattern passed to ``F.to_date``. Defaults to
            ``"dd/MM/yyyy"``, the pattern that used to be hard-coded here, so
            existing two-argument callers behave exactly as before.

    Returns:
        The DataFrame produced after every listed column has been replaced.
        When ``date_columns`` is empty, ``df`` is returned as is.
    """
    for date_col in date_columns:
        df = df.withColumn(date_col, F.to_date(F.col(date_col), date_format))
    return df

# COMMAND ----------
def transform_data(orders_central_df, orders_west_df, orders_east_df, orders_south_df, returns_df):
    """Combine the regional orders, attach returns, and add derived fields.

    Steps, in order:
      1. On the central frame, assemble "Order Date" and "Ship Date" as
         "day/month/year" strings from their component columns, then parse
         them with ``standardize_dates``.
      2. ``unionByName`` the central, west, east and south frames.
      3. Left-join the returns frame (broadcast) on "Order ID".
      4. Add "Days to Ship", "Returned?" and "Year of Sale".

    Args:
        orders_central_df: Central-region orders with separate day, month and
            year columns for the order and ship dates.
        orders_west_df: West-region orders.
        orders_east_df: East-region orders.
        orders_south_df: South-region orders.
        returns_df: Returns data, joined on "Order ID".

    Returns:
        The joined order-level DataFrame with the three derived columns.

    Raises:
        Exception: any failure is logged with its traceback and re-raised.
    """
    try:
        logger.info("Standardizing and cleaning data...")
        # (target column, day column, month column, year column)
        date_parts = (
            ("Order Date", "Order Day", "Order Month", "Order Year"),
            ("Ship Date", "Ship Day", "Ship Month", "Ship Year"),
        )
        central = orders_central_df
        for target, day_col, month_col, year_col in date_parts:
            joined_text = F.concat(
                F.col(day_col), F.lit("/"), F.col(month_col), F.lit("/"), F.col(year_col)
            )
            central = central.withColumn(target, joined_text)

        # Turn the assembled strings into real dates.
        central = standardize_dates(central, [target for target, *_ in date_parts])

        logger.info("Combining regional order datasets...")
        # NOTE(review): the central frame still carries its day/month/year
        # component columns at this point; unionByName needs every frame to
        # expose the same column names -- confirm against the table schemas.
        combined = central
        for regional in (orders_west_df, orders_east_df, orders_south_df):
            combined = combined.unionByName(regional)

        logger.info("Joining orders with returns data...")
        # These two columns are removed from returns before the join --
        # presumably to avoid duplicating columns the orders already carry.
        # NOTE(review): joining on "Order ID" alone will repeat order rows if
        # returns holds several rows per order -- verify.
        slim_returns = returns_df.drop("Order Date", "Product ID")
        enriched = combined.join(broadcast(slim_returns), ["Order ID"], "left")

        logger.info("Calculating custom fields...")
        shipping_lag = F.datediff(F.col("Ship Date"), F.col("Order Date"))
        returned_flag = F.when(F.col("Return Reason").isNotNull(), "Yes").otherwise("No")
        sale_year = F.year(F.col("Order Date"))
        return (
            enriched
            .withColumn("Days to Ship", shipping_lag)
            .withColumn("Returned?", returned_flag)
            .withColumn("Year of Sale", sale_year)
        )
    except Exception:
        logger.exception("Error during data transformation")
        raise

# COMMAND ----------
def aggregate_data(orders_returns_df):
    """Summarise sales metrics per "Region" and "Year of Sale".

    Args:
        orders_returns_df: Order-level DataFrame providing the "Region",
            "Year of Sale", "Sales", "Profit", "Quantity" and "Discount"
            columns.

    Returns:
        A DataFrame with one row per (Region, Year of Sale) group holding
        "Total Sales", "Total Profit", "Total Quantity" (sums) and
        "Average Discount" (mean).

    Raises:
        Exception: any failure is logged with its traceback and re-raised.
    """
    try:
        logger.info("Aggregating sales metrics...")
        metrics = [
            F.sum("Sales").alias("Total Sales"),
            F.sum("Profit").alias("Total Profit"),
            F.sum("Quantity").alias("Total Quantity"),
            F.avg("Discount").alias("Average Discount"),
        ]
        grouped = orders_returns_df.groupBy("Region", "Year of Sale")
        return grouped.agg(*metrics)
    except Exception:
        logger.exception("Error during data aggregation")
        raise

# COMMAND ----------
def write_data(aggregated_df, orders_returns_df):
    """Save both result sets as Delta tables, overwriting existing contents.

    Args:
        aggregated_df: Written to
            ``catalog.analytics.annual_regional_performance``.
        orders_returns_df: Written to ``catalog.analytics.superstore_sales``.

    Raises:
        Exception: any write failure is logged with its traceback and
            re-raised; if the first write fails, the second is not attempted.
    """
    try:
        logger.info("Writing data to Unity Catalog tables...")
        # Written in this order: the aggregate first, then the detail table.
        outputs = (
            (aggregated_df, "catalog.analytics.annual_regional_performance"),
            (orders_returns_df, "catalog.analytics.superstore_sales"),
        )
        for frame, table_name in outputs:
            frame.write.format("delta").mode("overwrite").saveAsTable(table_name)
    except Exception:
        logger.exception("Error writing data to Unity Catalog")
        raise

# COMMAND ----------
def main():
    """Run the ETL end to end: load, transform, aggregate, write.

    Raises:
        Exception: a failure in any stage is logged with its traceback and
            then re-raised, so the run ends in error instead of finishing as
            though it had succeeded.
    """
    try:
        # load_data() also returns the quota table, but no later stage in
        # this file consumes it; the underscore marks it as intentionally
        # unused.
        (
            orders_central_df,
            orders_west_df,
            orders_east_df,
            orders_south_df,
            _quota_df,
            returns_df,
        ) = load_data()
        orders_returns_df = transform_data(
            orders_central_df, orders_west_df, orders_east_df, orders_south_df, returns_df
        )
        aggregated_df = aggregate_data(orders_returns_df)
        write_data(aggregated_df, orders_returns_df)
        logger.info("ETL process completed successfully.")
    except Exception:
        logger.exception("An error occurred during the ETL process")
        # Propagate the failure, matching every other stage in this file;
        # swallowing it here would hide a broken run from whatever invoked
        # main().
        raise

# COMMAND ----------
# Entry point: run the pipeline only when this file is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()
