# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Orders Data
# MAGIC This notebook performs an ETL process on orders data from various regions, cleans and transforms the data, and aggregates it for reporting.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, StringType

# Notebook-wide logger, named after the running module; the root logger is
# asked for INFO-level output (basicConfig does nothing if the root logger
# already has handlers installed).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Data Loading
# MAGIC Load data from Unity Catalog tables into DataFrames.

# COMMAND ----------

from contextlib import contextmanager


@contextmanager
def _etl_step(description):
    """Log the start of one ETL step and report any failure raised inside it.

    Notebook cells are compiled one at a time, so a single try/except cannot
    span several cells. Each cell wraps its own work in this context manager
    instead, which keeps every cell a complete statement while preserving the
    "log the error, then re-raise" behaviour for the whole pipeline.

    Args:
        description: Message logged at INFO level before the step body runs.

    Raises:
        Exception: Whatever the step body raised, re-raised unchanged after
            being logged together with its traceback.
    """
    logger.info(description)
    try:
        yield
    except Exception as exc:
        # logger.exception records the traceback in addition to the message.
        logger.exception("An error occurred during the ETL process: %s", exc)
        raise


with _etl_step("Loading data from Unity Catalog tables..."):
    orders_central_df = spark.table("genai_demo.citi.orders_central")
    orders_east_df = spark.table("genai_demo.citi.orders_east")
    orders_south_2015_df = spark.table("genai_demo.citi.orders_south_2015")
    orders_south_2016_df = spark.table("genai_demo.citi.orders_south_2016")
    orders_south_2017_df = spark.table("genai_demo.citi.orders_south_2017")
    orders_south_2018_df = spark.table("genai_demo.citi.orders_south_2018")
    orders_west_df = spark.table("genai_demo.citi.orders_west")
    # quota_df and returns_df are not referenced again in the cells below;
    # they are still loaded in case later cells depend on them.
    quota_df = spark.table("genai_demo.citi.quota")
    returns_df = spark.table("genai_demo.citi.returns")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Combine Orders Data
# MAGIC Combine all regional orders into a single DataFrame.

# COMMAND ----------

with _etl_step("Combining all regional orders into a single DataFrame..."):
    # union() matches columns by position and keeps the first DataFrame's
    # column names.
    # NOTE(review): this assumes every regional table has the same column
    # order as orders_central - confirm against the table schemas.
    orders_df = (
        orders_central_df
        .union(orders_east_df)
        .union(orders_south_2015_df)
        .union(orders_south_2016_df)
        .union(orders_south_2017_df)
        .union(orders_south_2018_df)
        .union(orders_west_df)
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Data Cleaning and Standardization
# MAGIC Perform data cleaning and standardization on the combined DataFrame.

# COMMAND ----------

with _etl_step("Performing data cleaning and standardization..."):
    # Drop rows without an order identifier, then normalise column names.
    orders_df = orders_df.filter(orders_df["Order ID"].isNotNull())
    orders_df = (
        orders_df
        .withColumnRenamed("Discounts", "Discount")
        .withColumnRenamed("Product", "Product Name")
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Transformation Logic
# MAGIC Apply transformation logic to prepare the data for analysis.

# COMMAND ----------

with _etl_step("Applying transformation logic..."):
    # The date strings are assembled from separate day/month/year columns
    # without zero padding (e.g. "5/3/2017"). A single-letter pattern accepts
    # both one- and two-digit fields, whereas "dd/MM/yyyy" requires exactly
    # two digits and fails on unpadded values with Spark 3's parser.
    date_pattern = "d/M/yyyy"

    orders_df = orders_df.withColumn(
        "Order Date",
        F.concat(
            F.col("Order Day").cast(StringType()), F.lit("/"),
            F.col("Order Month").cast(StringType()), F.lit("/"),
            F.col("Order Year").cast(StringType()),
        ),
    ).withColumn(
        "Ship Date",
        F.concat(
            F.col("Ship Day").cast(StringType()), F.lit("/"),
            F.col("Ship Month").cast(StringType()), F.lit("/"),
            F.col("Ship Year").cast(StringType()),
        ),
    )

    orders_df = orders_df.withColumn(
        "Sales",
        # Strip everything except digits and the decimal point before casting.
        # NOTE(review): this also removes a leading minus sign, so negative
        # amounts would become positive - confirm Sales is never negative.
        F.regexp_replace(F.col("Sales"), "[^0-9.]", "").cast(DoubleType()),
    ).withColumn(
        "Discount",
        # Cast to a numeric type so the average computed during aggregation
        # does not depend on implicit string-to-double coercion.
        F.col("Discount").cast(DoubleType()),
    )

    order_date = F.to_date(F.col("Order Date"), date_pattern)
    ship_date = F.to_date(F.col("Ship Date"), date_pattern)

    orders_df = orders_df.withColumn(
        "Days to Ship",
        F.datediff(ship_date, order_date),
    ).withColumn(
        "Returned?",
        # NOTE(review): "Return Reason" is not produced by any cell shown here
        # and returns_df is never joined - confirm the orders tables already
        # carry this column.
        F.when(F.isnull(F.col("Return Reason")), "No").otherwise("Yes"),
    ).withColumn(
        "Year of Sale",
        F.year(order_date),
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Data Aggregation
# MAGIC Aggregate the data for reporting purposes.

# COMMAND ----------

with _etl_step("Aggregating data for reporting..."):
    # One output row per (Region, Year of Sale) pair.
    aggregated_df = orders_df.groupBy("Region", "Year of Sale").agg(
        F.sum("Profit").alias("Total Profit"),
        F.sum("Sales").alias("Total Sales"),
        F.sum("Quantity").alias("Total Quantity"),
        F.avg("Discount").alias("Average Discount"),
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Output Data
# MAGIC Write the transformed data to a Unity Catalog table.

# COMMAND ----------

with _etl_step("Writing transformed data to Unity Catalog table..."):
    # mode("overwrite") replaces the existing contents of the target table.
    aggregated_df.write.format("delta").mode("overwrite").saveAsTable(
        "genai_demo.citi.annual_regional_performance"
    )

    logger.info("ETL process completed successfully.")
