In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Orders Data
# MAGIC This notebook performs an ETL process on orders data from Unity Catalog tables, including data loading, cleaning, transformation, and saving the results back to Unity Catalog.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DateType

# Configure logging
# basicConfig attaches a handler to the root logger at INFO level; per the
# standard-library docs it is a no-op when the root logger already has handlers.
# NOTE(review): confirm INFO records are actually emitted in this runtime.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every ETL step below.
logger = logging.getLogger(__name__)

# COMMAND ----------

try:
    # NOTE: the whole pipeline is a single `try` statement, so it has to live
    # in one notebook cell. Databricks source files use `# COMMAND ----------`
    # lines as cell delimiters, which would cut this statement apart; plain
    # section comments are used inside the body instead.

    # ---- Load -------------------------------------------------------------
    # Each regional orders table is tagged with a literal "Region" value
    # (withColumn adds the column, or replaces one of the same name).
    logger.info("Loading data from Unity Catalog tables...")
    orders_central = spark.table("genai_demo.citi.orders_central").withColumn("Region", F.lit("Central"))
    orders_east = spark.table("genai_demo.citi.orders_east").withColumn("Region", F.lit("East"))
    orders_south_2015 = spark.table("genai_demo.citi.orders_south_2015").withColumn("Region", F.lit("South"))
    orders_south_2016 = spark.table("genai_demo.citi.orders_south_2016").withColumn("Region", F.lit("South"))
    orders_south_2017 = spark.table("genai_demo.citi.orders_south_2017").withColumn("Region", F.lit("South"))
    orders_south_2018 = spark.table("genai_demo.citi.orders_south_2018").withColumn("Region", F.lit("South"))
    orders_west = spark.table("genai_demo.citi.orders_west").withColumn("Region", F.lit("West"))
    quota = spark.table("genai_demo.citi.quota")
    returns = spark.table("genai_demo.citi.returns")

    # Processing order is fixed: Central, East, South 2015-2018, West.
    regional_orders = [
        orders_central,
        orders_east,
        orders_south_2015,
        orders_south_2016,
        orders_south_2017,
        orders_south_2018,
        orders_west,
    ]

    # ---- Standardize date fields ------------------------------------------
    def standardize_dates(df):
        """Collapse the split year/month/day columns into date columns.

        Builds "Order Date" and "Ship Date" by joining the matching
        "<prefix> Year", "<prefix> Month" and "<prefix> Day" columns with "-"
        and casting the result to DateType, then drops the six component
        columns.

        Args:
            df: DataFrame that contains all six component columns.

        Returns:
            A new DataFrame with the two date columns and without the
            component columns.
        """
        for prefix in ("Order", "Ship"):
            parts = [F.col(f"{prefix} {unit}") for unit in ("Year", "Month", "Day")]
            # NOTE(review): concat_ws skips NULL inputs, so a row with a
            # missing component produces a shorter string before the cast --
            # confirm such rows cannot occur.
            df = df.withColumn(f"{prefix} Date", F.concat_ws("-", *parts).cast(DateType()))
        return df.drop("Order Year", "Order Month", "Order Day", "Ship Year", "Ship Month", "Ship Day")

    logger.info("Standardizing date fields...")
    regional_orders = [standardize_dates(df) for df in regional_orders]

    # ---- Data cleaning ----------------------------------------------------
    def clean_data(df):
        """Drop rows without an order id and normalize Discount and Sales.

        * Rows whose "Order ID" is NULL are removed.
        * NULL "Discount" values are replaced by 0.
        * "Sales" has every character other than digits and "." removed and
          is then cast to double.

        Args:
            df: DataFrame with "Order ID", "Discount" and "Sales" columns.

        Returns:
            The cleaned DataFrame.
        """
        # NOTE(review): the Sales pattern also strips "-", so a negative
        # amount would lose its sign -- confirm Sales is never negative.
        return (df
                .filter(F.col("Order ID").isNotNull())
                .withColumn("Discount", F.when(F.col("Discount").isNull(), 0).otherwise(F.col("Discount")))
                .withColumn("Sales", F.regexp_replace(F.col("Sales"), "[^0-9.]", "").cast("double")))

    logger.info("Cleaning data...")
    regional_orders = [clean_data(df) for df in regional_orders]

    # Rebind the per-region names to the cleaned frames so anything that
    # refers to them after this point sees the same objects as before.
    (orders_central,
     orders_east,
     orders_south_2015,
     orders_south_2016,
     orders_south_2017,
     orders_south_2018,
     orders_west) = regional_orders

    # ---- Combine regional datasets ----------------------------------------
    # unionByName matches columns by name. union() matches by position, and
    # the "Region" and date columns are added per table with withColumn, so
    # their positions are not guaranteed to line up across tables. A table
    # with a different column set fails here instead of being misaligned.
    logger.info("Combining regional datasets...")
    all_orders = regional_orders[0]
    for regional_df in regional_orders[1:]:
        all_orders = all_orders.unionByName(regional_df)

    # ---- Join orders with returns -----------------------------------------
    # Done before the calculated fields: "Returned?" below reads
    # "Return Reason", which is presumably supplied by the returns table
    # (TODO confirm against the table schema).
    logger.info("Joining orders with returns...")
    all_orders = all_orders.join(returns, ["Order ID", "Product ID"], "left")

    # ---- Calculated fields ------------------------------------------------
    logger.info("Adding calculated fields...")
    all_orders = (all_orders
                  .withColumn("Days to Ship", F.datediff(F.col("Ship Date"), F.col("Order Date")))
                  .withColumn("Returned?", F.when(F.col("Return Reason").isNotNull(), "Yes").otherwise("No"))
                  .withColumn("Year of Sale", F.year(F.col("Order Date"))))

    # ---- Exclude specific discounts ---------------------------------------
    # Drops rows whose Discount lies in the closed range [17, 18].
    logger.info("Excluding specific discounts...")
    all_orders = all_orders.filter(~((F.col("Discount") >= 17) & (F.col("Discount") <= 18)))

    # ---- Pivot quota data -------------------------------------------------
    # stack() turns the four year columns into (Year, Quota) rows per Region.
    # NOTE(review): quota_unpivoted is not consumed anywhere else in this
    # block -- confirm whether a join against annual_performance is missing.
    logger.info("Pivoting quota data...")
    quota_unpivoted = quota.selectExpr("Region", "stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`) as (Year, Quota)")

    # ---- Aggregate sales metrics ------------------------------------------
    logger.info("Aggregating sales metrics...")
    annual_performance = (all_orders
                          .groupBy("Region", "Year of Sale")
                          .agg(F.sum("Profit").alias("Total Profit"),
                               F.sum("Sales").alias("Total Sales"),
                               F.sum("Quantity").alias("Total Quantity"),
                               F.avg("Discount").alias("Average Discount")))

    # ---- Save -------------------------------------------------------------
    # Both outputs are written as Delta tables, replacing existing contents.
    logger.info("Saving transformed data to Unity Catalog...")
    all_orders.write.format("delta").mode("overwrite").saveAsTable("genai_demo.citi.all_orders_transformed")
    annual_performance.write.format("delta").mode("overwrite").saveAsTable("genai_demo.citi.annual_performance")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log the full traceback, then re-raise so the notebook run is reported
    # as failed instead of silently finishing after a broken ETL.
    logger.exception("An error occurred during the ETL process")
    raise
