In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Cardinal Health Data
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables, including data loading, integration, calculations, filtering, sorting, and output.

# COMMAND ----------

import logging
from pyspark.sql import functions as F

# Logging setup: obtain this notebook's module-level logger, then ask the root
# logger for INFO-level output (basicConfig does nothing if the root logger
# already has handlers installed).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# End-to-end ETL pipeline: load the source tables, join them, derive the
# calculated columns, filter/sort/project the sales trend data, and overwrite
# the output Delta table.
#
# NOTE: the whole pipeline is kept in ONE notebook cell on purpose. A
# `# COMMAND ----------` marker starts a new cell, and a try/except statement
# cannot span cells, so no marker may appear between `try:` and `except`.
try:
    # Step 1: Data Loading
    logger.info("Loading data from Unity Catalog tables.")
    associates_employment_df = spark.table("genai_demo.cardinal_health.associates_employment")
    compensation_guidelines_df = spark.table("genai_demo.cardinal_health.compensation_guidelines")
    hospital_assignments_df = spark.table("genai_demo.cardinal_health.hospital_assignments")
    hospitals_stats_df = spark.table("genai_demo.cardinal_health.hospitals_stats")
    logistics_channels_df = spark.table("genai_demo.cardinal_health.logistics_channels")
    growth_opportunities_df = spark.table("genai_demo.cardinal_health.growth_opportunities")
    historical_sales_trending_df = spark.table("genai_demo.cardinal_health.historical_sales_trending")
    company_goals_df = spark.table("genai_demo.cardinal_health.company_goals_1")
    third_party_trends_df = spark.table("genai_demo.cardinal_health.third_party_trends")

    # Step 2: Data Integration
    # NOTE(review): neither joined frame below feeds the table written in
    # Step 6 within this block -- confirm whether a later cell consumes them
    # or whether joins into the sales data are missing.
    logger.info("Performing data integration through joins.")
    employment_compensation_df = associates_employment_df.join(
        compensation_guidelines_df,
        on="Associate_ID",
        how="inner"
    )

    hospital_assignments_df = hospital_assignments_df.join(
        hospitals_stats_df,
        on=["Hospital_ID", "Hospital_Name"],
        how="inner"
    )

    # Step 3: Custom Calculations
    logger.info("Calculating total compensation and projected revenue.")
    # NOTE(review): Commission_Percentage is multiplied in as-is (no / 100),
    # unlike the growth rate below -- confirm it is stored as a fraction.
    employment_compensation_df = employment_compensation_df.withColumn(
        "Total_Compensation",
        F.col("Base_Salary") + (F.col("Commission_Percentage") * F.col("Base_Salary")) + F.col("Bonus")
    )

    # Projected revenue = current revenue grown by the projected rate (given
    # in percent). All three target years share the same formula; previously
    # the 2024 branch omitted the "1 +" and therefore produced only the growth
    # delta instead of the projected total.
    historical_sales_trending_df = historical_sales_trending_df.withColumn(
        "Projected_Revenue",
        F.expr("""
            CASE
                WHEN Target_Year IN (2024, 2025, 2026)
                    THEN Sales_Revenue * (1 + Projected_Sales_Growth_Rate / 100)
                ELSE Sales_Revenue
            END
        """)
    )

    # Step 4: Data Filtering
    logger.info("Filtering and selecting relevant data.")
    filtered_df = historical_sales_trending_df.filter(F.col("Target_Year") > 2023)

    # Step 5: Data Sorting
    # Sort while Target_Year is still present: it is not one of the output
    # columns, so ordering must happen before the projection.
    logger.info("Sorting data by target year.")
    sorted_df = filtered_df.orderBy("Target_Year")

    # Output projection.
    # NOTE(review): Projected_Revenue is computed above but is not part of
    # this column list -- confirm whether it should be written out.
    selected_df = sorted_df.select(
        "Hospital_ID", "Channel_Type", "Growth_Opportunities", "Projected_Growth_Rate", "Market_Potential", "Expected_ROI"
    )

    # Step 6: Data Output
    logger.info("Writing the final output to Unity Catalog table.")
    selected_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.cardinal_health.hospitals_output")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log the full traceback, then re-raise so the notebook/job run is marked
    # as failed instead of silently reporting success.
    logger.exception("An error occurred during the ETL process.")
    raise
