# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Cardinal Health Data
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables, including data loading, transformations, and writing the final DataFrame back to Unity Catalog.

# COMMAND ----------

import logging
from pyspark.sql.functions import col, expr, when
from pyspark.sql.types import DoubleType

# COMMAND ----------

# MAGIC %md
# MAGIC ## Configure Logging
# MAGIC Set up logging to capture information about the ETL process.

# COMMAND ----------

# basicConfig() does nothing when the root logger already has handlers (unless
# force=True is passed), which a notebook runtime may have installed before
# this cell runs. Setting the level on this notebook's own logger as well
# keeps the INFO progress messages from being filtered out at the default
# WARNING level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Load Data from Unity Catalog
# MAGIC Load data from various Unity Catalog tables into DataFrames.

# COMMAND ----------

from contextlib import contextmanager


@contextmanager
def _etl_step(description):
    """Run one ETL step with the notebook's shared logging and error handling.

    Logs ``description`` at INFO level before the step starts. If the step
    raises, the error is logged together with its traceback and then re-raised
    so the run still fails.

    Every ``# COMMAND ----------`` cell is compiled separately, so a single
    ``try``/``except`` cannot span several cells. Wrapping each cell's code in
    ``with _etl_step(...)`` keeps every cell a complete statement while
    preserving the log-and-re-raise behaviour.

    Args:
        description: Progress message to log before the step runs.

    Raises:
        Exception: Whatever the wrapped step raised, unchanged.
    """
    logger.info(description)
    try:
        yield
    except Exception as e:
        # logger.exception keeps the traceback, which a plain error log drops.
        logger.exception("An error occurred during the ETL process: %s", e)
        raise


with _etl_step("Loading data from Unity Catalog tables..."):
    associates_df = spark.table("genai_demo.cardinal_health.associates_employment")
    compensation_df = spark.table("genai_demo.cardinal_health.compensation_guidelines")
    hospitals_df = spark.table("genai_demo.cardinal_health.hospitals_stats")
    assignments_df = spark.table("genai_demo.cardinal_health.hospital_assignments")
    logistics_df = spark.table("genai_demo.cardinal_health.logistics_channels")
    growth_df = spark.table("genai_demo.cardinal_health.growth_opportunities")
    historical_sales_df = spark.table("genai_demo.cardinal_health.historical_sales_trending")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Perform Join Operations
# MAGIC Join the loaded DataFrames to create a unified dataset.

# COMMAND ----------

with _etl_step("Performing join operations..."):
    # Associates with their compensation guidelines.
    joined_df_4 = associates_df.join(compensation_df, "Associate_ID", "inner")

    # Hospital statistics with their assignments.
    joined_df_6 = hospitals_df.join(
        assignments_df, ["Hospital_ID", "Hospital_Name"], "inner"
    )

    # Associates joined to hospitals; the join keys are dropped afterwards.
    # NOTE(review): joined_df_7 is not referenced again in this section.
    joined_df_7 = joined_df_4.join(
        joined_df_6, ["Associate_ID", "Associate_Name"], "inner"
    ).drop("Associate_ID", "Associate_Name")

    # Logistics channels with growth opportunities, then hospital statistics.
    joined_df_12 = logistics_df.join(
        growth_df, ["Channel_ID", "Channel_Type"], "inner"
    )
    joined_df_13 = joined_df_12.join(
        hospitals_df, "Hospital_ID", "inner"
    ).drop("Hospital_ID")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Perform Custom Calculations
# MAGIC Add calculated columns to the DataFrames for further analysis.

# COMMAND ----------

# NOTE(review): the columns added in this cell are attached to joined_df_4,
# historical_sales_df and growth_df AFTER the joins above were built, and the
# table written below comes from joined_df_13 only, so none of these columns
# reach the output table. Confirm whether they are meant to be persisted.
with _etl_step("Performing custom calculations..."):
    # Total compensation = base salary + commission (a percentage of the base
    # salary) + bonus.
    joined_df_4 = joined_df_4.withColumn(
        "Compensation",
        col("Base_Salary")
        + (col("Commission_Percentage") / 100) * col("Base_Salary")
        + col("Bonus"),
    )

    # Compound the percentage growth rate once per target year from 2024;
    # any other year keeps the unadjusted revenue.
    # NOTE(review): assumes historical_sales_trending already carries a
    # Projected_Sales_Growth_Rate column; here it is only created on
    # growth_df, and only further down -- confirm.
    revenue_growth_factor = 1 + col("Projected_Sales_Growth_Rate") / 100
    historical_sales_df = historical_sales_df.withColumn(
        "Projected_Revenue",
        when(col("Target_Year") == 2024, col("Sales_Revenue") * revenue_growth_factor)
        .when(col("Target_Year") == 2025, col("Sales_Revenue") * revenue_growth_factor ** 2)
        .when(col("Target_Year") == 2026, col("Sales_Revenue") * revenue_growth_factor ** 3)
        .otherwise(col("Sales_Revenue")),
    )

    # NOTE(review): this raises the rate itself to a power, unlike the
    # (1 + rate / 100) ** n compounding used for Projected_Revenue above --
    # confirm which formula is intended.
    growth_df = growth_df.withColumn(
        "Projected_Sales_Growth_Rate",
        when(col("Target_Year") == 2024, col("Projected_Growth_Rate"))
        .when(col("Target_Year") == 2025, col("Projected_Growth_Rate") ** 2)
        .when(col("Target_Year") == 2026, col("Projected_Growth_Rate") ** 3)
        .otherwise(col("Projected_Growth_Rate")),
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Write Final DataFrame to Unity Catalog
# MAGIC Save the final processed DataFrame back to Unity Catalog.

# COMMAND ----------

with _etl_step("Writing the final DataFrame to Unity Catalog table..."):
    final_df = joined_df_13  # Assuming joined_df_13 is the final DataFrame after all transformations
    # mode("overwrite") replaces the existing contents of the target table.
    final_df.write.format("delta").mode("overwrite").saveAsTable(
        "genai_demo.cardinal_health.final_processed_data"
    )

    logger.info("ETL process completed successfully.")
