# Databricks notebook source
# MAGIC %md
# MAGIC # Data Processing with PySpark
# MAGIC This notebook demonstrates data loading, transformation, and output using PySpark in a Databricks environment.

# COMMAND ----------

import logging
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import concat, col, lit, to_date, expr, datediff, year, sum, avg, broadcast
from pyspark.sql.types import IntegerType

# NOTE: `sum` imported above is the Spark aggregate function; it shadows the
# Python builtin `sum` for the rest of this notebook.

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Data source name used for every .xlsx input.
EXCEL_FORMAT = "com.crealytics.spark.excel"

# Example state-name -> abbreviation mapping applied to the Central orders.
DEFAULT_STATE_ABBREVIATIONS = {"California": "CA", "New York": "NY"}

# Year columns of the quota sheet that get unpivoted into rows.
DEFAULT_QUOTA_YEARS = ("2015", "2016", "2017", "2018")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Load Data
# MAGIC Load data from CSV and Excel files into DataFrames.

# COMMAND ----------

# `spark` is not defined in this file; the functions below expect the session
# object to already exist in the notebook's global scope.


def _read_csv(path):
    """Read a CSV file that has a header row, inferring column types."""
    return spark.read.csv(path, header=True, inferSchema=True)


def _read_excel(path):
    """Read an Excel workbook that has a header row via the Excel data source."""
    return spark.read.format(EXCEL_FORMAT).option("header", "true").load(path)


def load_data(base_path="/mnt/datalake"):
    """Load the regional order files, the returns file and the quota file.

    Args:
        base_path: Directory that contains the input files. The default
            reproduces the original hard-coded locations.

    Returns:
        A 6-tuple ``(orders_central_df, orders_west_df, orders_south_df,
        orders_east_df, returns_df, quota_df)``.

    Raises:
        Exception: Any error raised while reading is logged with its
            traceback and re-raised unchanged.
    """
    root = base_path.rstrip("/")
    try:
        orders_central_df = _read_csv(f"{root}/Orders_Central.csv")
        orders_west_df = _read_csv(f"{root}/Orders_West.csv")
        orders_south_df = _read_csv(f"{root}/orders_south_2015.csv")
        orders_east_df = _read_excel(f"{root}/Orders_East.xlsx")
        returns_df = _read_excel(f"{root}/return_reasons_new.xlsx")
        quota_df = _read_excel(f"{root}/Quota.xlsx")
        logger.info("Data loaded successfully from CSV and Excel files.")
        return orders_central_df, orders_west_df, orders_south_df, orders_east_df, returns_df, quota_df
    except Exception as e:
        # logger.exception keeps the traceback that logger.error would drop.
        logger.exception("Error loading data: %s", e)
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Standardize and Clean Data
# MAGIC Standardize and clean the Orders Central data.

# COMMAND ----------

def standardize_and_clean_data(orders_central_df: DataFrame, state_abbr=None) -> DataFrame:
    """Standardize and clean the Orders Central data.

    Renames ``Discounts``/``Product``, builds ``Order Date`` and ``Ship Date``
    from their day/month/year parts, drops those part columns, removes rows
    without an ``Order ID``, fills missing ``Discount`` with 0, casts ``Sales``
    to double and replaces state names using ``state_abbr``.

    Args:
        orders_central_df: Raw Orders Central DataFrame.
        state_abbr: Optional ``{state name: abbreviation}`` mapping. Defaults
            to the two-entry example mapping used originally.

    Returns:
        The cleaned DataFrame.

    Raises:
        Exception: Any error is logged with its traceback and re-raised.
    """
    if state_abbr is None:
        state_abbr = DEFAULT_STATE_ABBREVIATIONS
    try:
        # Standardize column names
        orders_central_df = (
            orders_central_df
            .withColumnRenamed("Discounts", "Discount")
            .withColumnRenamed("Product", "Product Name")
        )

        # Concatenate date fields to form Order Date and Ship Date.
        # The pattern is "d/M/yyyy" rather than "dd/MM/yyyy": the parts are
        # concatenated without zero padding (e.g. "5/3/2015"), and the
        # two-letter forms only match two-digit day/month values.
        date_pattern = "d/M/yyyy"
        for prefix in ("Order", "Ship"):
            joined_parts = concat(
                col(f"{prefix} Day"), lit("/"),
                col(f"{prefix} Month"), lit("/"),
                col(f"{prefix} Year"),
            )
            orders_central_df = orders_central_df.withColumn(
                f"{prefix} Date", to_date(joined_parts, date_pattern)
            )

        # Remove unnecessary columns
        orders_central_df = orders_central_df.drop(
            "Order Day", "Order Month", "Order Year", "Ship Day", "Ship Month", "Ship Year"
        )

        # Remove rows with null Order ID
        orders_central_df = orders_central_df.filter(orders_central_df["Order ID"].isNotNull())

        # Replace null discounts with 0
        orders_central_df = orders_central_df.fillna({"Discount": 0})

        # Convert Sales to a real number
        orders_central_df = orders_central_df.withColumn("Sales", col("Sales").cast("double"))

        # Replace state names with abbreviations (skipped for an empty mapping)
        if state_abbr:
            orders_central_df = orders_central_df.replace(state_abbr, subset=["State"])

        logger.info("Data standardization and cleaning completed.")
        return orders_central_df
    except Exception as e:
        logger.exception("Error during data standardization and cleaning: %s", e)
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Pivot and Consolidate Data
# MAGIC Pivot and consolidate data from multiple regions.

# COMMAND ----------

def pivot_and_consolidate_data(orders_central_df, orders_west_df, orders_east_df, orders_south_df, quota_df,
                               quota_years=DEFAULT_QUOTA_YEARS):
    """Unpivot the quota sheet and combine the four regional order sets.

    Args:
        orders_central_df: Cleaned Orders Central DataFrame.
        orders_west_df: Orders West DataFrame.
        orders_east_df: Orders East DataFrame.
        orders_south_df: Orders South DataFrame.
        quota_df: Quota DataFrame with a ``Region`` column and one column per
            year listed in ``quota_years``.
        quota_years: Names of the year columns to unpivot. Defaults to
            2015-2018, matching the original hard-coded expression.

    Returns:
        A 2-tuple ``(combined_orders_df, quota_df)`` where ``quota_df`` has the
        columns ``Region``, ``Year`` (integer) and ``Quota``.

    Raises:
        ValueError: If ``quota_years`` is empty.
        Exception: Any error is logged with its traceback and re-raised.
    """
    try:
        year_labels = [str(label) for label in quota_years]
        if not year_labels:
            raise ValueError("quota_years must contain at least one year column")

        # Unpivot Quota Data: one (Year, Quota) row per region and year column
        stack_args = ", ".join(f"'{label}', `{label}`" for label in year_labels)
        quota_df = quota_df.selectExpr(
            "Region", f"stack({len(year_labels)}, {stack_args}) as (Year, Quota)"
        )
        quota_df = quota_df.withColumn("Year", col("Year").cast(IntegerType()))

        # Combine all order datasets.
        # union() matches columns by position, but the Central frame has had
        # columns dropped and its date columns appended, so its layout differs
        # from the other regions. Match by name instead; columns missing from
        # one region are filled with nulls.
        combined_orders_df = orders_central_df
        for regional_df in (orders_west_df, orders_east_df, orders_south_df):
            combined_orders_df = combined_orders_df.unionByName(regional_df, allowMissingColumns=True)

        logger.info("Pivoting and consolidation completed.")
        return combined_orders_df, quota_df
    except Exception as e:
        logger.exception("Error during pivoting and consolidation: %s", e)
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Calculate Fields and Enhancements
# MAGIC Add calculated fields and enhancements to the data.

# COMMAND ----------

def calculate_fields_and_enhancements(combined_orders_df: DataFrame) -> DataFrame:
    """Add calculated fields and drop the excluded discount band.

    Adds ``Days to Ship`` (Ship Date minus Order Date, in days),
    ``Year of Sale`` and ``Returned?`` ('Yes' when ``Return Reason`` is not
    null, otherwise 'No'), then removes rows whose ``Discount`` is in
    ``[17, 18)``.

    Args:
        combined_orders_df: Combined orders from all regions.

    Returns:
        The enhanced DataFrame.

    Raises:
        Exception: Any error is logged with its traceback and re-raised.
    """
    try:
        combined_orders_df = combined_orders_df.withColumn(
            "Days to Ship", datediff(col("Ship Date"), col("Order Date"))
        )
        combined_orders_df = combined_orders_df.withColumn("Year of Sale", year(col("Order Date")))

        # NOTE(review): nothing in this notebook joins the returns data onto the
        # orders, so `Return Reason` must already be a column of the order
        # files for this expression to resolve - confirm, or join returns_df
        # upstream once the join keys are known.
        combined_orders_df = combined_orders_df.withColumn(
            "Returned?", expr("CASE WHEN `Return Reason` IS NOT NULL THEN 'Yes' ELSE 'No' END")
        )

        # Exclude transactions with discounts between 17 and 18.
        # A null Discount makes the range test evaluate to NULL, which filter()
        # treats as false; keep those rows explicitly, since only the Central
        # data has its null discounts filled.
        discount = col("Discount")
        in_excluded_band = (discount >= 17) & (discount < 18)
        combined_orders_df = combined_orders_df.filter(discount.isNull() | ~in_excluded_band)

        logger.info("Calculated fields and enhancements completed.")
        return combined_orders_df
    except Exception as e:
        logger.exception("Error during calculated fields and enhancements: %s", e)
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Write Output Data
# MAGIC Write the processed data to Unity Catalog tables.

# COMMAND ----------

def write_output_data(combined_orders_df: DataFrame, target_schema="catalog.target_db") -> None:
    """Write the annual regional report and the full sales dataset as Delta tables.

    Both tables are written with ``mode("overwrite")``.

    Args:
        combined_orders_df: Enhanced combined orders; must contain ``Region``,
            ``Year of Sale``, ``Profit``, ``Sales``, ``Quantity`` and
            ``Discount``.
        target_schema: ``<catalog>.<schema>`` prefix for the output tables.
            The default reproduces the original table names.

    Raises:
        Exception: Any error is logged with its traceback and re-raised.
    """
    try:
        # NOTE(review): the output column names contain spaces and '?'. Delta
        # may reject such names unless column mapping is enabled on the target
        # tables - confirm against the workspace configuration.

        # Annual Regional Performance Report
        report_df = combined_orders_df.groupBy("Region", "Year of Sale").agg(
            sum("Profit").alias("Total Profit"),
            sum("Sales").alias("Total Sales"),
            sum("Quantity").alias("Total Quantity"),
            avg("Discount").alias("Average Discount"),
        )
        report_df.write.format("delta").mode("overwrite").saveAsTable(
            f"{target_schema}.annual_regional_performance"
        )

        # Superstore Sales Dataset
        combined_orders_df.write.format("delta").mode("overwrite").saveAsTable(
            f"{target_schema}.superstore_sales"
        )

        logger.info("Output configuration and data writing completed.")
    except Exception as e:
        logger.exception("Error during output configuration and data writing: %s", e)
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Main Execution
# MAGIC Execute the main function to process the data.

# COMMAND ----------

def main():
    """Run the pipeline: load, clean, consolidate, enhance, write."""
    # NOTE(review): returns_df and the unpivoted quota_df are produced but not
    # consumed by any later step in this notebook.
    orders_central_df, orders_west_df, orders_south_df, orders_east_df, returns_df, quota_df = load_data()
    orders_central_df = standardize_and_clean_data(orders_central_df)
    combined_orders_df, quota_df = pivot_and_consolidate_data(
        orders_central_df, orders_west_df, orders_east_df, orders_south_df, quota_df
    )
    combined_orders_df = calculate_fields_and_enhancements(combined_orders_df)
    write_output_data(combined_orders_df)


if __name__ == "__main__":
    main()
