In [None]:
"# Databricks notebook source\n# COMMAND ----------\n# MAGIC %md\n# MAGIC # ETL Process for Superstore Sales Data\n# MAGIC This notebook performs an ETL process on Superstore sales data, including data loading, transformation, and writing results back to Unity Catalog tables.\n\n# COMMAND ----------\n# MAGIC
\n# Import necessary libraries\nimport logging\nfrom pyspark.sql import functions as F\nfrom pyspark.sql.types import IntegerType, DecimalType\n\n# COMMAND ----------\n# MAGIC
# Configure logging
# Module-level logger used by the ETL cells; the root logger is set to INFO
# so informational progress messages are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------
# MAGIC
# Function to fix dates and standardize columns
def fix_dates(df, region_name):
    """Tag *df* with a region, rebuild its date columns and rename two columns.

    Args:
        df: Spark DataFrame carrying the split date columns "Order Day",
            "Order Month", "Order Year", "Ship Day", "Ship Month" and
            "Ship Year".
        region_name: Literal value written to the "Region" column of every row.

    Returns:
        A DataFrame with "Region", "Order Date" and "Ship Date" set, the six
        split date columns dropped, "Discounts" renamed to "Discount" and
        "Product" renamed to "Product Name".
    """

    def _assemble_date(prefix):
        # Day and month are zero-padded to two characters before being
        # concatenated: without padding, values such as 5/3/2015 become
        # "532015", which cannot be parsed with the fixed-width "ddMMyyyy"
        # pattern. Values that are already two characters wide are unchanged.
        day = F.lpad(F.col(f"{prefix} Day").cast("string"), 2, "0")
        month = F.lpad(F.col(f"{prefix} Month").cast("string"), 2, "0")
        year = F.col(f"{prefix} Year").cast("string")
        return F.to_date(F.concat(day, month, year), "ddMMyyyy")

    df = df.withColumn("Region", F.lit(region_name))
    df = df.withColumn("Order Date", _assemble_date("Order"))
    df = df.withColumn("Ship Date", _assemble_date("Ship"))
    df = df.drop(
        "Order Year", "Order Month", "Order Day",
        "Ship Year", "Ship Month", "Ship Day",
    )
    df = df.withColumnRenamed("Discounts", "Discount")
    df = df.withColumnRenamed("Product", "Product Name")
    return df

# COMMAND ----------
# MAGIC
# Function to load data from Unity Catalog tables
def load_data():
    """Read the source tables and return them as a 9-tuple of DataFrames.

    Returns:
        A tuple ordered as: orders_central, orders_east, orders_south_2015,
        orders_south_2016, orders_south_2017, orders_south_2018, orders_west,
        quota, returns.
    """
    # The tuple order is the return order; callers unpack positionally.
    source_tables = (
        "genai_demo.citi.orders_central",
        "genai_demo.citi.orders_east",
        "genai_demo.citi.orders_south_2015",
        "genai_demo.citi.orders_south_2016",
        "genai_demo.citi.orders_south_2017",
        "genai_demo.citi.orders_south_2018",
        "genai_demo.citi.orders_west",
        "genai_demo.citi.quota",
        "genai_demo.citi.returns",
    )
    return tuple(spark.table(table_name) for table_name in source_tables)

# COMMAND ----------
# MAGIC
# Function to perform data transformations
def transform_data(orders_central_df, orders_east_df, orders_south_2015_df, orders_south_2016_df,
                   orders_south_2017_df, orders_south_2018_df, orders_west_df, quota_df):
    """Clean the Central orders, combine all regional orders and roll them up.

    Only ``orders_central_df`` is passed through ``fix_dates``, null-filtered,
    re-typed and state-mapped; the other order frames are unioned as received.

    Args:
        orders_central_df: Central orders; cleaned before the union.
        orders_east_df, orders_south_2015_df, orders_south_2016_df,
        orders_south_2017_df, orders_south_2018_df, orders_west_df:
            Remaining order frames, appended in this order.
        quota_df: Quota frame with a "Region" column and one column per year
            ("2015" through "2018").

    Returns:
        A 2-tuple ``(all_orders_df, annual_performance_df)``: the enriched
        order rows, and Profit/Sales/Quantity sums plus average Discount per
        "Region" and "Year of Sale".
    """
    # Abbreviation -> full state name.
    state_names = {
        "CA": "California",
        "NY": "New York",
        # Add other state mappings here
    }

    # Central only: fix dates, drop rows without an order id, fix data types
    # (Sales keeps digits and dots only before the cast), rename states.
    central = (
        fix_dates(orders_central_df, "Central")
        .filter(F.col("Order ID").isNotNull())
        .withColumn("Discount", F.col("Discount").cast("string"))
        .withColumn("Sales", F.regexp_replace("Sales", "[^0-9.]", "").cast("double"))
        .replace(state_names, subset=["State"])
    )

    # Pivot quotas: one row per (Region, Year) instead of one column per year.
    year_stack = F.expr(
        "stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`)"
    ).alias("Year", "Quota")
    quotas_long = quota_df.select("Region", year_stack).withColumn(
        "Year", F.col("Year").cast(IntegerType())
    )

    # All orders.
    # NOTE(review): union() matches columns by position, not by name; confirm
    # the other frames already share the cleaned Central frame's column layout.
    combined = central
    for regional_orders in (
        orders_east_df,
        orders_south_2015_df,
        orders_south_2016_df,
        orders_south_2017_df,
        orders_south_2018_df,
        orders_west_df,
    ):
        combined = combined.union(regional_orders)

    # Orders and returns.
    # NOTE(review): "Return Reason" and "Notes" are read below, but no returns
    # frame is a parameter of this function; presumably the order frames are
    # expected to carry those columns already -- verify against the caller.
    discount = F.col("Discount")
    orders = (
        combined
        .withColumn("Discount", discount.cast(DecimalType(10, 2)))
        .withColumn("Days to Ship", F.datediff("Ship Date", "Order Date"))
        .withColumn(
            "Returned",
            F.when(F.col("Return Reason").isNotNull(), "Yes").otherwise("No"),
        )
        .drop("Table Names", "File Paths")
        .withColumn("Discount", F.when(discount.isNull(), 0).otherwise(discount))
        .withColumn("Year of Sale", F.year("Order Date"))
    )

    # Exclude discounts strictly between 17 and 18.
    in_excluded_band = (discount > 17) & (discount < 18)
    orders = orders.filter(~in_excluded_band)

    # Clean notes and approver: "Notes" is split on " - "; the first piece
    # becomes "Return Notes", the second (trimmed) becomes "Approver".
    note_parts = F.split("Notes", " - ")
    orders = (
        orders
        .withColumn("Return Notes", note_parts[0])
        .withColumn("Approver", note_parts[1])
        .drop("Notes")
        .withColumn("Approver", F.trim("Approver"))
    )

    # Roll up sales per region and year.
    annual = orders.groupBy("Region", "Year of Sale").agg(
        F.sum("Profit").alias("Profit"),
        F.sum("Sales").alias("Sales"),
        F.sum("Quantity").alias("Quantity"),
        F.avg("Discount").alias("Discount"),
    )

    # Quota and orders.
    # NOTE(review): this join result is built but never returned or written;
    # confirm whether it was meant to be part of the output.
    same_region = annual["Region"] == quotas_long["Region"]
    same_year = annual["Year of Sale"] == quotas_long["Year"]
    quota_orders = annual.join(quotas_long, same_region & same_year, "inner")

    return orders, annual

# COMMAND ----------
# MAGIC
# Function to write data to Unity Catalog tables
def write_data(all_orders_df, annual_performance_df):
    """Replace the two output tables with the given DataFrames.

    Args:
        all_orders_df: Written to ``genai_demo.citi.superstore_sales``.
        annual_performance_df: Written to
            ``genai_demo.citi.annual_regional_performance``.

    Returns:
        None. The tables are written as Delta tables in overwrite mode.
    """
    # The tables are overwritten in place rather than dropped and recreated:
    # a DROP followed by a write is not atomic (a failed write would leave no
    # table at all) and discards the existing table's history.
    # "overwriteSchema" lets the overwrite also replace a changed schema,
    # which is what the earlier DROP was covering for.
    targets = (
        (annual_performance_df, "genai_demo.citi.annual_regional_performance"),
        (all_orders_df, "genai_demo.citi.superstore_sales"),
    )
    for frame, table_name in targets:
        (
            frame.write.format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .saveAsTable(table_name)
        )

# COMMAND ----------
# MAGIC
# Main ETL process
# Runs load -> transform -> write. Any failure is logged and re-raised so the
# notebook run is still marked as failed.
try:
    # Load data
    # returns_df is unpacked because load_data() returns nine frames; it is
    # not passed to transform_data().
    (orders_central_df, orders_east_df, orders_south_2015_df, orders_south_2016_df,
     orders_south_2017_df, orders_south_2018_df, orders_west_df, quota_df, returns_df) = load_data()

    # Transform data
    all_orders_df, annual_performance_df = transform_data(
        orders_central_df, orders_east_df, orders_south_2015_df,
        orders_south_2016_df, orders_south_2017_df, orders_south_2018_df,
        orders_west_df, quota_df,
    )

    # Write data
    write_data(all_orders_df, annual_performance_df)

    logger.info("ETL process completed successfully.")

except Exception as e:
    # logger.exception logs at ERROR level and attaches the traceback, which
    # a plain logger.error(message) call would leave out of the log record.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
