In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Cardinal Health Data
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables, including data integration, custom calculations, filtering, aggregation, and writing the output back to Unity Catalog.

# COMMAND ----------

import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, broadcast

# Initialize logging.
# basicConfig attaches a default handler to the root logger at INFO level; per
# the Python logging docs it does nothing if the root logger already has
# handlers. NOTE(review): confirm the INFO messages below actually appear on
# the target runtime.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by main() for progress and error messages.
logger = logging.getLogger(__name__)

# Assume the Spark session is already available as 'spark'

# COMMAND ----------

def main() -> None:
    """Run the ETL pipeline and write per-hospital totals to Unity Catalog.

    Relies on two module-level names: ``spark`` (the active Spark session) and
    ``logger``.

    Steps performed:
      1. Read six source tables from ``genai_demo.cardinal_health``.
      2. Inner-join them into one DataFrame (associates + compensation,
         hospital assignments + hospital stats, logistics channels + growth
         opportunities, then all three together).
      3. Derive ``Total_Compensation`` and ``Projected_Revenue``.
      4. Keep rows with ``Target_Year`` greater than 2023.
      5. Sum both derived columns per ``Hospital_ID``.
      6. Overwrite the Delta table
         ``genai_demo.cardinal_health.final_output`` with the result.

    The output columns are ``Hospital_ID``, ``Total_Compensation_Sum`` and
    ``Projected_Revenue_Sum``.

    Raises:
        Exception: Any error raised while reading, transforming or writing is
            logged with its traceback and then re-raised, so the calling
            job or notebook run is reported as failed instead of silently
            succeeding.
    """
    try:
        # Load data from Unity Catalog tables
        logger.info("Loading data from Unity Catalog tables...")
        associates_df = spark.table("genai_demo.cardinal_health.associates_employment")
        compensation_df = spark.table("genai_demo.cardinal_health.compensation_guidelines")
        growth_df = spark.table("genai_demo.cardinal_health.growth_opportunities")
        hospital_assignments_df = spark.table("genai_demo.cardinal_health.hospital_assignments")
        hospitals_stats_df = spark.table("genai_demo.cardinal_health.hospitals_stats")
        logistics_channels_df = spark.table("genai_demo.cardinal_health.logistics_channels")

        # Data Integration: Join operations
        logger.info("Performing join operations...")
        # Join associates with compensation; the compensation side is hinted
        # for broadcast (presumably a small lookup table -- verify its size).
        joined_df = associates_df.join(broadcast(compensation_df), "Associate_ID", "inner")

        # Join hospital assignments with hospitals stats
        hospital_joined_df = hospital_assignments_df.join(hospitals_stats_df, "Hospital_ID", "inner")

        # Join logistics channels with growth opportunities on the composite key
        logistics_joined_df = logistics_channels_df.join(
            broadcast(growth_df), ["Channel_ID", "Channel_Type"], "inner"
        )

        # Join all together: associates -> hospitals -> logistics
        final_joined_df = (
            joined_df
            .join(hospital_joined_df, "Associate_ID", "inner")
            .join(logistics_joined_df, "Hospital_ID", "inner")
        )

        # Custom Calculations
        logger.info("Calculating total compensation and projected revenue...")
        # Total = base + commission (a percentage of base) + bonus
        final_joined_df = final_joined_df.withColumn(
            "Total_Compensation",
            col("Base_Salary")
            + (col("Commission_Percentage") / 100) * col("Base_Salary")
            + col("Bonus"),
        )

        # Growth rate is expressed as a percentage, hence the division by 100
        final_joined_df = final_joined_df.withColumn(
            "Projected_Revenue",
            expr("Sales_Revenue * (1 + Projected_Sales_Growth_Rate / 100)"),
        )

        # Data Filtering and Selection
        logger.info("Filtering data for target years beyond 2023...")
        filtered_df = (
            final_joined_df
            .filter(col("Target_Year") > 2023)
            .select("Hospital_ID", "Associate_ID", "Total_Compensation", "Projected_Revenue")
        )

        # Aggregation and Business Logic
        logger.info("Aggregating data...")
        # Alias the aggregates explicitly: the dict form of agg() names the
        # outputs "sum(Total_Compensation)" / "sum(Projected_Revenue)", and
        # Delta rejects parentheses in column names unless column mapping is
        # enabled, which would make the write below fail.
        aggregated_df = filtered_df.groupBy("Hospital_ID").agg(
            expr("sum(Total_Compensation)").alias("Total_Compensation_Sum"),
            expr("sum(Projected_Revenue)").alias("Projected_Revenue_Sum"),
        )

        # Output Data Configuration: Write to Unity Catalog
        logger.info("Writing the processed data to Unity Catalog...")
        aggregated_df.write.format("delta").mode("overwrite").saveAsTable(
            "genai_demo.cardinal_health.final_output"
        )

        logger.info("ETL process completed successfully.")

    except Exception:
        # Log with traceback, then propagate so the failure is visible to the
        # caller or scheduler instead of being swallowed.
        logger.exception("An error occurred during the ETL process")
        raise

# Entry point: run the ETL only when this file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
