In [None]:
# Databricks notebook source
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, StringType

# COMMAND ----------

# Logging setup: create this notebook's module-level logger, then set the
# root logger's level to INFO via basicConfig.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# Date layout of the "day/month/year" strings assembled by standardize_dates.
# Single-letter day and month fields parse both "5" and "05"; the two-letter
# "dd/MM" form does not accept unpadded values under Spark 3's strict
# datetime parser, which would turn such dates into nulls or errors.
_DATE_PATTERN = "d/M/yyyy"


def standardize_dates(df, order_day, order_month, order_year, ship_day, ship_month, ship_year):
    """Combine split day/month/year columns into "Order Date" and "Ship Date".

    Each date is built as the string "<day>/<month>/<year>" by concatenating
    the named columns with "/" separators. The six component columns are
    dropped from the result.

    Args:
        df: DataFrame holding the six component columns.
        order_day, order_month, order_year: Names of the order-date parts.
        ship_day, ship_month, ship_year: Names of the ship-date parts.

    Returns:
        DataFrame with the two combined date columns added and the six
        component columns removed.
    """
    def slash_joined(day, month, year):
        # F.concat yields null if any of the three parts is null.
        return F.concat(F.col(day), F.lit("/"), F.col(month), F.lit("/"), F.col(year))

    return (
        df.withColumn("Order Date", slash_joined(order_day, order_month, order_year))
        .withColumn("Ship Date", slash_joined(ship_day, ship_month, ship_year))
        .drop(order_day, order_month, order_year, ship_day, ship_month, ship_year)
    )


def transform_orders(df):
    """Filter, cast and enrich an orders DataFrame.

    Drops rows whose "Order ID" is null, casts "Discount" and "Sales" to
    double, adds "Days to Ship" (days from the parsed "Order Date" to the
    parsed "Ship Date") and adds "Returned?" ("Yes" when "Return Reason" is
    non-null, otherwise "No").

    Args:
        df: Orders DataFrame that already has "Order Date" and "Ship Date"
            strings in day/month/year layout.

    Returns:
        The transformed DataFrame.
    """
    order_date = F.to_date(F.col("Order Date"), _DATE_PATTERN)
    ship_date = F.to_date(F.col("Ship Date"), _DATE_PATTERN)

    # NOTE(review): "Return Reason" is read from the orders frame itself,
    # before any join with the returns table -- confirm the orders tables
    # actually carry that column.
    returned_flag = F.when(F.col("Return Reason").isNotNull(), "Yes").otherwise("No")

    return (
        df.filter(F.col("Order ID").isNotNull())
        .withColumn("Discount", F.col("Discount").cast(DoubleType()))
        .withColumn("Sales", F.col("Sales").cast(DoubleType()))
        .withColumn("Days to Ship", F.datediff(ship_date, order_date))
        .withColumn("Returned?", returned_flag)
    )


def process_orders(df):
    """Run standardize_dates (with the fixed column names) and transform_orders."""
    df = standardize_dates(df, "Order Day", "Order Month", "Order Year", "Ship Day", "Ship Month", "Ship Year")
    return transform_orders(df)


# COMMAND ----------

# The whole pipeline is kept in ONE cell: each "# COMMAND ----------" marker
# starts a separate notebook cell, so a try/except statement cannot span cells.
try:
    # Step 1: Data Loading
    logger.info("Loading data from Unity Catalog tables...")
    orders_central_df = spark.table("genai_demo.citi.orders_central")
    orders_east_df = spark.table("genai_demo.citi.orders_east")
    orders_south_2015_df = spark.table("genai_demo.citi.orders_south_2015")
    orders_south_2016_df = spark.table("genai_demo.citi.orders_south_2016")
    orders_south_2017_df = spark.table("genai_demo.citi.orders_south_2017")
    orders_south_2018_df = spark.table("genai_demo.citi.orders_south_2018")
    orders_west_df = spark.table("genai_demo.citi.orders_west")
    # quota_df is loaded but not referenced again in this cell.
    quota_df = spark.table("genai_demo.citi.quota")
    returns_df = spark.table("genai_demo.citi.returns")

    # Step 2: Data Cleaning and Standardization
    logger.info("Performing data cleaning and standardization...")
    orders_central_df = process_orders(orders_central_df)
    orders_east_df = process_orders(orders_east_df)
    orders_south_2015_df = process_orders(orders_south_2015_df)
    orders_south_2016_df = process_orders(orders_south_2016_df)
    orders_south_2017_df = process_orders(orders_south_2017_df)
    orders_south_2018_df = process_orders(orders_south_2018_df)
    orders_west_df = process_orders(orders_west_df)

    # Step 3: Data Integration
    logger.info("Integrating datasets...")
    # unionByName matches columns by name rather than position, so a table
    # whose columns are ordered differently cannot be silently misaligned;
    # a genuinely mismatched schema fails loudly instead.
    all_orders_df = (
        orders_central_df
        .unionByName(orders_east_df)
        .unionByName(orders_south_2015_df)
        .unionByName(orders_south_2016_df)
        .unionByName(orders_south_2017_df)
        .unionByName(orders_south_2018_df)
        .unionByName(orders_west_df)
    )

    # Ensure 'Year of Sale' is extracted from 'Order Date'
    all_orders_df = all_orders_df.withColumn(
        "Year of Sale", F.year(F.to_date(F.col("Order Date"), _DATE_PATTERN))
    )

    # Left join keeps every order row; this frame is not used by the
    # aggregation below (which reads all_orders_df).
    orders_with_returns_df = all_orders_df.join(returns_df, ["Order ID", "Product ID"], "left")

    # Step 4: Aggregation and Output
    logger.info("Aggregating data and writing outputs...")
    annual_performance_df = all_orders_df.groupBy("Region", "Year of Sale").agg(
        F.sum("Profit").alias("Total Profit"),
        F.sum("Sales").alias("Total Sales"),
        F.sum("Quantity").alias("Total Quantity"),
        F.avg("Discount").alias("Average Discount"),
    )

    # Write to Unity Catalog (replaces any existing table contents)
    annual_performance_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.citi.annual_regional_performance")

    logger.info("ETL process completed successfully.")

except Exception as e:
    # logger.exception records the traceback alongside the message; the
    # error is re-raised so the notebook run is still marked as failed.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
