# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Unity Catalog Data
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables, including data loading, transformation, and writing the results back to Unity Catalog.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# Configure logging.
# basicConfig() does nothing when the root logger already has handlers (which
# hosted notebook runtimes commonly install before user code runs), so the
# level is also set on this module's logger; otherwise the INFO progress
# messages below could be filtered out by an inherited, stricter root level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# COMMAND ----------

# End-to-end ETL: load the source tables, unpivot the yearly quota columns,
# union the regional order tables, join orders to quotas, derive columns,
# filter, aggregate per (Region, Year_of_Sale) and overwrite the target table.
#
# The whole pipeline lives in a single try block, so it must stay in a single
# notebook cell: a "# COMMAND ----------" separator inside the block would
# split the try from its except when the file is imported as a notebook.
# Any failure is logged with its traceback and re-raised so the run fails.
try:
    # ---- Load -------------------------------------------------------------
    logger.info("Loading data from Unity Catalog tables...")
    response_df = spark.table("catalog.db.response").select("Region", "2015", "2016", "2017", "2018")
    orders_east_df = spark.table("catalog.db.orders_east")
    orders_south_df = spark.table("catalog.db.orders_south")
    orders_central_df = spark.table("catalog.db.orders_central")
    # Loaded but not referenced anywhere in this block; kept because later
    # code may rely on the name being defined.
    return_reasons_df = spark.table("catalog.db.return_reasons")
    orders_west_df = spark.table("catalog.db.orders_west")

    # ---- Unpivot quotas ---------------------------------------------------
    # stack() turns the four year columns into (Year, Quota) rows; Year is the
    # string literal of the column name.
    logger.info("Unpivoting response data...")
    quota_df = response_df.selectExpr(
        "Region",
        "stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`) as (Year, Quota)"
    )

    # ---- Union orders -----------------------------------------------------
    # union() matches columns by position, so this assumes the four regional
    # tables share the same column order -- TODO confirm.
    # NOTE(review): the cached frame is only consumed once in this block;
    # confirm the cache() is still wanted.
    logger.info("Union all orders data...")
    all_orders_df = orders_east_df.union(orders_south_df).union(orders_central_df).union(orders_west_df).cache()

    # ---- Join orders to quotas --------------------------------------------
    # Join on the key *names* rather than an explicit equality condition so
    # the result carries a single Region and a single Year column. With the
    # equality form both sides' Region/Year columns are kept, which makes the
    # later groupBy("Region", ...) an ambiguous reference.
    logger.info("Joining dataframes...")
    joined_df = all_orders_df.join(
        F.broadcast(quota_df),
        on=["Region", "Year"],
        how="inner"
    )

    # ---- Derived columns --------------------------------------------------
    logger.info("Performing custom calculations...")
    joined_df = joined_df.withColumn("Compensation", F.col("Base_Salary") + (F.col("Commission_Percentage") * F.col("Base_Salary")) + F.col("Bonus"))
    joined_df = joined_df.withColumn("Days_to_Ship", F.datediff(F.col("Ship_Date"), F.col("Order_Date")))
    joined_df = joined_df.withColumn("Returned", F.when(F.col("Return_Reason").isNotNull(), "Yes").otherwise("No"))
    joined_df = joined_df.withColumn("Year_of_Sale", F.year(F.col("Order_Date")))

    # ---- Type conversion --------------------------------------------------
    logger.info("Converting data types...")
    joined_df = joined_df.withColumn("Discount", F.col("Discount").cast(DoubleType()))
    # NOTE(review): the pattern keeps only digits and dots, so a leading minus
    # sign would be stripped as well -- confirm Sales is never negative.
    joined_df = joined_df.withColumn("Sales", F.regexp_replace(F.col("Sales"), "[^0-9.]", "").cast(DoubleType()))

    # ---- Filter and sort --------------------------------------------------
    # NOTE(review): the quota rows only cover 2015-2018 and the join matches
    # on Year; if the orders' Year column is the year of Order_Date, no row
    # can satisfy Year_of_Sale > 2023 and the output is empty -- confirm the
    # intended cutoff.
    logger.info("Filtering and sorting data...")
    filtered_df = joined_df.filter(F.col("Year_of_Sale") > 2023).orderBy("Year_of_Sale")

    # ---- Aggregate --------------------------------------------------------
    logger.info("Aggregating data...")
    aggregated_df = filtered_df.groupBy("Region", "Year_of_Sale").agg(
        F.sum("Profit").alias("Total_Profit"),
        F.sum("Sales").alias("Total_Sales"),
        F.sum("Quantity").alias("Total_Quantity"),
        F.avg("Discount").alias("Average_Discount")
    )

    # ---- Write ------------------------------------------------------------
    # Overwrites the target table on every run.
    logger.info("Writing processed data to Unity Catalog tables...")
    aggregated_df.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.annual_regional_performance")

    logger.info("ETL process completed successfully.")

except Exception as e:
    # logger.exception records the traceback alongside the message; the error
    # is re-raised so the notebook/job run is marked as failed.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
