# In [None]:
# COMMAND ----------
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType, StringType

# Logging setup: create this module's logger, then request INFO-level output
# from the root logger. Note that logging.basicConfig() is a no-op when the
# root logger already has handlers attached (the hosting runtime may have
# installed some), in which case the existing configuration stays in effect.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------
# Step 1: Data Source Configuration
# Assume data sources are registered in Unity Catalog and accessible via Spark SQL

# COMMAND ----------
# Step 2: Data Ingestion
# Load each source table into its own DataFrame. The tables are read in the
# order listed; any failure is logged and re-raised so later steps never run
# against missing inputs.
try:
    (
        orders_central_df,
        orders_west_df,
        orders_east_df,
        orders_south_df,
        quota_df,
        returns_df,
    ) = [
        spark.table(table_name)
        for table_name in (
            "catalog.db.orders_central",
            "catalog.db.orders_west",
            "catalog.db.orders_east",
            "catalog.db.orders_south",
            "catalog.db.quota",
            "catalog.db.returns",
        )
    ]
    logger.info("Data ingestion completed successfully.")
except Exception as ingest_error:
    logger.error(f"Error during data ingestion: {ingest_error}")
    raise

# COMMAND ----------
# Step 3: Data Cleansing and Standardization
# For the Central region only: collapse the separate "Order Day" /
# "Order Month" / "Order Year" columns into a single "Order Date" date column,
# and rename "Discounts" to "Discount".
try:
    # concat_ws renders numeric parts without zero padding (e.g. "3/1/2015").
    # The variable-width pattern "d/M/yyyy" parses both one- and two-digit
    # day/month values, whereas the fixed-width "dd/MM/yyyy" requires exactly
    # two digits under Spark 3's datetime parser and would reject such rows.
    order_date_text = F.concat_ws(
        "/", F.col("Order Day"), F.col("Order Month"), F.col("Order Year")
    )
    orders_central_df = orders_central_df.withColumn(
        "Order Date",
        F.to_date(order_date_text, "d/M/yyyy")
    ).drop("Order Day", "Order Month", "Order Year")

    orders_central_df = orders_central_df.withColumnRenamed("Discounts", "Discount")
    logger.info("Data cleansing and standardization completed successfully.")
except Exception as e:
    logger.error(f"Error during data cleansing and standardization: {e}")
    raise

# COMMAND ----------
# Step 4: Data Type Adjustments
# Cast the Central region's "Discount" column to string and its "Sales"
# column to double, in that order.
# NOTE(review): "Discount" is averaged in Step 7; confirm that a string type
# is really intended here rather than a numeric one.
try:
    for column_name, spark_type in (
        ("Discount", StringType()),
        ("Sales", DoubleType()),
    ):
        orders_central_df = orders_central_df.withColumn(
            column_name, F.col(column_name).cast(spark_type)
        )
    logger.info("Data type adjustments completed successfully.")
except Exception as cast_error:
    logger.error(f"Error during data type adjustments: {cast_error}")
    raise

# COMMAND ----------
# Step 5: Data Integration
# Stack the four regional order DataFrames into one and attach return details
# to each order line.
try:
    # unionByName aligns columns by name rather than by position, so the
    # regional DataFrames must expose the same column names at this point.
    all_orders_df = (
        orders_central_df
        .unionByName(orders_west_df)
        .unionByName(orders_east_df)
        .unionByName(orders_south_df)
    )

    # Left join: keep every order and add return columns where a matching
    # ("Order ID", "Product ID") row exists in returns_df. A right join would
    # keep only rows present in returns_df, dropping every non-returned order
    # from the aggregates and from the sales output.
    orders_returns_df = all_orders_df.join(returns_df, ["Order ID", "Product ID"], "left")
    logger.info("Data integration completed successfully.")
except Exception as e:
    logger.error(f"Error during data integration: {e}")
    raise

# COMMAND ----------
# Step 6: Data Restructuring
# Two independent reshaping operations: abbreviate selected state names in the
# order/returns data, and unpivot the quota table from one column per year
# into (Year, Quota) rows.
try:
    # Replace full state names with their two-letter codes; only the "State"
    # column is touched, and values not in the mapping are left unchanged.
    state_mapping = {"Arizona": "AZ", "California": "CA"}
    orders_returns_df = orders_returns_df.replace(state_mapping, subset=["State"])

    # stack(4, ...) emits four rows per input row, pairing each year label
    # with the value of the identically named column.
    # NOTE(review): quota_unpivoted_df is not referenced again in the visible
    # source — confirm whether it should be joined or written out.
    quota_stack_expr = (
        "stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`) as (Year, Quota)"
    )
    quota_unpivoted_df = quota_df.selectExpr("Region", quota_stack_expr)
    logger.info("Data restructuring completed successfully.")
except Exception as restructure_error:
    logger.error(f"Error during data restructuring: {restructure_error}")
    raise

# COMMAND ----------
# Step 7: Aggregations and Calculations
# Produce per-region, per-year totals and add two derived columns
# ("Days to Ship", "Returned?") to the row-level order/returns data.
try:
    # Totals per Region and calendar year of "Order Date".
    # NOTE(review): "Discount" was cast to string for the Central region in
    # Step 4, so this average relies on an implicit numeric cast — confirm.
    aggregated_sales_df = orders_returns_df.groupBy(
        "Region", F.year("Order Date").alias("Year of Sale")
    ).agg(
        F.sum("Profit").alias("Total Profit"),
        F.sum("Sales").alias("Total Sales"),
        F.sum("Quantity").alias("Total Quantity"),
        F.avg("Discount").alias("Average Discount")
    )

    # Number of days between ordering and shipping.
    orders_returns_df = orders_returns_df.withColumn(
        "Days to Ship", F.datediff(F.col("Ship Date"), F.col("Order Date"))
    )

    # 'Yes' when a return reason is present, otherwise 'No'. The Column API is
    # used because "Return Reason" contains a space: inside a SQL expression
    # string it must be backtick-quoted, and the unquoted form fails to parse.
    orders_returns_df = orders_returns_df.withColumn(
        "Returned?",
        F.when(F.col("Return Reason").isNull(), F.lit("No")).otherwise(F.lit("Yes")),
    )
    logger.info("Aggregations and calculations completed successfully.")
except Exception as e:
    logger.error(f"Error during aggregations and calculations: {e}")
    raise

# COMMAND ----------
# Step 8: Output Data Generation
# Persist both result DataFrames as Delta tables, replacing any existing
# contents. The aggregate table is written first, then the row-level table.
# NOTE(review): several output columns contain spaces or punctuation
# (e.g. "Total Profit", "Returned?"); Delta only accepts such names when
# column mapping is enabled on the table — confirm the target configuration.
try:
    for result_df, target_table in (
        (aggregated_sales_df, "catalog.db.annual_regional_performance"),
        (orders_returns_df, "catalog.db.superstore_sales"),
    ):
        result_df.write.saveAsTable(target_table, format="delta", mode="overwrite")
    logger.info("Output data generation completed successfully.")
except Exception as write_error:
    logger.error(f"Error during output data generation: {write_error}")
    raise
