In [None]:
# Databricks notebook source
# COMMAND ----------
# %md
# # ETL Process for Orders Data
# This notebook performs an ETL process on orders data from various regions, standardizes and cleans the data, applies business rules, and aggregates metrics by region and time.

# COMMAND ----------
# Import necessary libraries
import logging

from pyspark.sql.functions import (
    col,
    datediff,
    lit,
    sum as _sum,
    to_date,
    when,
    year,
)

# COMMAND ----------
# Set up logging: a module-level logger plus INFO-level root configuration
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------
# Function to load data from Unity Catalog tables
def load_data():
    """Read every source table and return the DataFrames as one tuple.

    Relies on the notebook-provided ``spark`` session.

    Returns:
        A 9-tuple of DataFrames in this fixed order: orders_central,
        orders_east, orders_south_2015, orders_south_2016,
        orders_south_2017, orders_south_2018, orders_west, quota, returns.
    """
    logger.info("Loading data from Unity Catalog tables...")
    # Order matters: callers split the result by position.
    source_tables = (
        "genai_demo.citi.orders_central",
        "genai_demo.citi.orders_east",
        "genai_demo.citi.orders_south_2015",
        "genai_demo.citi.orders_south_2016",
        "genai_demo.citi.orders_south_2017",
        "genai_demo.citi.orders_south_2018",
        "genai_demo.citi.orders_west",
        "genai_demo.citi.quota",
        "genai_demo.citi.returns",
    )
    return tuple(spark.table(table_name) for table_name in source_tables)


# COMMAND ----------
# Function to standardize date fields and column names
def standardize_data(dfs):
    """Parse the "Order Date" column of every DataFrame into a date.

    ``withColumn`` returns a new DataFrame rather than modifying the one it
    is called on, so the converted frames are collected and returned. The
    previous loop rebound its loop variable and handed back the untouched
    inputs. Only the date column is converted; no columns are renamed.

    Args:
        dfs: Iterable of DataFrames, each with an "Order Date" column that
            is parsed using the "yyyy-MM-dd" pattern.

    Returns:
        A list with one converted DataFrame per input, in input order.
    """
    logger.info("Standardizing date fields and column names...")
    return [
        df.withColumn("Order Date", to_date(col("Order Date"), "yyyy-MM-dd"))
        for df in dfs
    ]


# COMMAND ----------
# Function to clean data by removing invalid entries and handling missing values
def clean_data(dfs):
    """Drop rows that lack an "Order ID" or a "Customer Name".

    ``dropna`` returns a new DataFrame, so the filtered frames are collected
    and returned. The previous loop discarded each result and handed back
    the uncleaned inputs.

    Args:
        dfs: Iterable of DataFrames, each with "Order ID" and
            "Customer Name" columns.

    Returns:
        A list with one cleaned DataFrame per input, in input order.
    """
    logger.info("Cleaning data by removing invalid entries and handling missing values...")
    return [df.dropna(subset=["Order ID", "Customer Name"]) for df in dfs]


# COMMAND ----------
# Function to unpivot quota data and combine sales datasets
def pivot_and_consolidate(orders_dfs, quota_df):
    """Unpivot the quota table and stack all order DataFrames into one.

    Args:
        orders_dfs: Non-empty sequence of order DataFrames. They are
            combined with ``union``, which matches columns by position, so
            every frame must share the same column order.
        quota_df: DataFrame with a "Region" column and one column each
            named 2015, 2016, 2017 and 2018.

    Returns:
        Tuple ``(combined_orders_df, quota_df)``: the stacked orders and the
        quota data reshaped to one row per (Region, Year) with a "Quota"
        column.
    """
    logger.info("Unpivoting quota data and combining sales datasets...")

    # Turn the four per-year columns into (Year, Quota) rows.
    unpivot_expr = "stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`) as (Year, Quota)"
    long_quota = quota_df.selectExpr("Region", unpivot_expr)

    stacked = orders_dfs[0]
    for extra in orders_dfs[1:]:
        stacked = stacked.union(extra)

    return stacked, long_quota


# COMMAND ----------
# Function to add custom fields such as Days to Ship and Returned
def add_custom_fields(combined_orders_df):
    """Append the derived "Days to Ship" and "Returned" columns.

    Args:
        combined_orders_df: Orders DataFrame with "Ship Date", "Order Date"
            and "Return Reason" columns.

    Returns:
        The DataFrame with two extra columns: "Days to Ship" (days from
        "Order Date" to "Ship Date") and "Returned" ("No" when
        "Return Reason" is null, otherwise "Yes").
    """
    logger.info("Adding custom fields such as Days to Ship and Returned...")

    shipping_days = datediff(col("Ship Date"), col("Order Date"))
    returned_flag = when(col("Return Reason").isNull(), lit("No")).otherwise(lit("Yes"))

    return (
        combined_orders_df
        .withColumn("Days to Ship", shipping_days)
        .withColumn("Returned", returned_flag)
    )


# COMMAND ----------
# Function to apply business rules and filters
def apply_business_rules(combined_orders_df):
    """Keep only the orders whose "Discount" is strictly below 0.2.

    Args:
        combined_orders_df: Orders DataFrame with a "Discount" column.

    Returns:
        The filtered DataFrame; rows with a discount of exactly 0.2 are
        excluded.
    """
    logger.info("Applying business rules and filters...")
    discount_below_limit = col("Discount") < 0.2
    return combined_orders_df.filter(discount_below_limit)


# COMMAND ----------
# Function to aggregate metrics by region and time
def aggregate_metrics(combined_orders_df):
    """Sum sales and profit per region and year.

    The grouping needs a "Year" column, but none of the earlier steps in
    this notebook adds one to the orders data (only the quota table is
    given a Year). When the input has no such column it is derived from
    "Order Date"; an existing column is used as-is.

    Args:
        combined_orders_df: Orders DataFrame with "Region", "Sales" and
            "Profit" columns, plus either "Year" or "Order Date".

    Returns:
        DataFrame with one row per (Region, Year) and the columns
        "Total Sales" and "Total Profit".
    """
    logger.info("Aggregating metrics by region and time...")

    # Compare case-insensitively so an existing "year" column is never
    # overwritten by withColumn.
    existing_columns = {name.lower() for name in combined_orders_df.columns}
    if "year" not in existing_columns:
        combined_orders_df = combined_orders_df.withColumn(
            "Year", year(col("Order Date"))
        )

    aggregated_df = combined_orders_df.groupBy("Region", "Year").agg(
        _sum("Sales").alias("Total Sales"),
        _sum("Profit").alias("Total Profit"),
    )
    return aggregated_df


# COMMAND ----------
# Function to write the final DataFrame to Unity Catalog table
def write_output(aggregated_df):
    """Replace the target Delta table with the aggregated result.

    The table is replaced with a single overwrite instead of DROP TABLE
    followed by a write. Dropping first leaves a window with no table at
    all, and a failed write would leave it missing. ``overwriteSchema``
    keeps the ability to change the table's schema that the drop provided.

    Args:
        aggregated_df: DataFrame to store in
            ``genai_demo.guardian.customer_360``.
    """
    logger.info("Writing the final DataFrame to Unity Catalog table...")
    # NOTE(review): Delta rejects column names containing spaces unless
    # column mapping is enabled on the table - confirm for the
    # "Total Sales" / "Total Profit" style names written here.
    (
        aggregated_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("genai_demo.guardian.customer_360")
    )


# COMMAND ----------
# Main ETL process
try:
    # Load data once: the first seven frames are the regional order tables,
    # followed by the quota and returns tables.
    *orders_dfs, quota_df, returns_df = load_data()
    # NOTE(review): returns_df is loaded but never used below, while
    # add_custom_fields reads a "Return Reason" column - confirm the order
    # tables already carry it or join the returns data here.

    # Standardize data
    orders_dfs = standardize_data(orders_dfs)

    # Clean data
    orders_dfs = clean_data(orders_dfs)

    # Pivot and consolidate
    combined_orders_df, quota_df = pivot_and_consolidate(orders_dfs, quota_df)

    # Add custom fields
    combined_orders_df = add_custom_fields(combined_orders_df)

    # Apply business rules
    combined_orders_df = apply_business_rules(combined_orders_df)

    # Aggregate metrics
    aggregated_df = aggregate_metrics(combined_orders_df)

    # Write output
    write_output(aggregated_df)

    logger.info("ETL process completed successfully.")

except Exception:
    # Log the full traceback, then re-raise so the notebook or job run is
    # reported as failed instead of appearing to succeed.
    logger.exception("An error occurred during the ETL process")
    raise
