# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Cardinal Health Data
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables related to Cardinal Health.

# COMMAND ----------

import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, broadcast

# Initialize logging.
# basicConfig installs a root handler at INFO level; per the standard library
# docs it does nothing if the root logger already has handlers configured.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by every step of the ETL below.
logger = logging.getLogger(__name__)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Load Data
# MAGIC Load data from Unity Catalog tables.

# COMMAND ----------

def load_data():
    """Read the seven Cardinal Health source tables from Unity Catalog.

    Uses the ambient ``spark`` session, which this function does not create;
    it must already exist in the enclosing scope.

    Returns:
        A 7-tuple of DataFrames, in this order: associates employment,
        compensation guidelines, hospital assignments, logistics channels,
        growth opportunities, historical sales trending, hospital stats.

    Raises:
        Exception: whatever ``spark.table`` raises; it is logged and then
            re-raised unchanged.
    """
    # Order matters: callers unpack the returned tuple positionally.
    source_tables = (
        "genai_demo.cardinal_health.associates_employment",
        "genai_demo.cardinal_health.compensation_guidelines",
        "genai_demo.cardinal_health.hospital_assignments",
        "genai_demo.cardinal_health.logistics_channels",
        "genai_demo.cardinal_health.growth_opportunities",
        "genai_demo.cardinal_health.historical_sales_trending",
        "genai_demo.cardinal_health.hospitals_stats",
    )
    try:
        logger.info("Loading data from Unity Catalog tables...")
        return tuple(spark.table(table_name) for table_name in source_tables)
    except Exception as exc:
        logger.error(f"Error loading data: {exc}")
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Integrate Data
# MAGIC Integrate data from different sources.

# COMMAND ----------

def integrate_data(associates_df, compensation_df, hospital_data_df, hospital_assignments_df):
    """Combine associate, compensation and hospital data into one DataFrame.

    Three inner joins are performed:
      1. associates with compensation (broadcast hint) on ``Associate_ID``;
      2. hospital data with hospital assignments on ``Hospital_ID`` and
         ``Hospital_Name``;
      3. the two results above on ``Associate_ID``.

    The last join needs ``Associate_ID`` on the hospital side as well.
    NOTE(review): presumably it comes from ``hospital_assignments_df`` -
    confirm against the table schema.

    Args:
        associates_df: associates employment DataFrame.
        compensation_df: compensation guidelines DataFrame (broadcast).
        hospital_data_df: hospital statistics DataFrame.
        hospital_assignments_df: hospital assignments DataFrame.

    Returns:
        The joined DataFrame, marked for caching via ``.cache()``.

    Raises:
        Exception: any error from building the joins is logged and re-raised.
    """
    try:
        logger.info("Integrating data...")

        # Associate side: employment rows enriched with compensation rules.
        associate_side = associates_df.join(
            broadcast(compensation_df), on="Associate_ID", how="inner"
        )

        # Hospital side: hospital stats matched to their assignments.
        hospital_side = hospital_data_df.join(
            hospital_assignments_df, on=["Hospital_ID", "Hospital_Name"], how="inner"
        )

        # Tie both sides together per associate and mark the result for caching.
        return associate_side.join(hospital_side, on="Associate_ID", how="inner").cache()
    except Exception as exc:
        logger.error(f"Error integrating data: {exc}")
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Calculate Compensation
# MAGIC Calculate the compensation for each associate.

# COMMAND ----------

def calculate_compensation(final_joined_df):
    """Add a ``Compensation`` column to the integrated DataFrame.

    Compensation is computed per row as::

        Base_Salary + (Commission_Percentage / 100) * Base_Salary + Bonus

    Args:
        final_joined_df: DataFrame exposing ``Base_Salary``,
            ``Commission_Percentage`` and ``Bonus`` columns.

    Returns:
        A new DataFrame with the extra ``Compensation`` column.

    Raises:
        Exception: any error from building the column is logged and re-raised.
    """
    try:
        logger.info("Calculating compensation...")
        base_salary = col("Base_Salary")
        # Commission is expressed as a percentage of the base salary.
        commission = (col("Commission_Percentage") / 100) * base_salary
        total_compensation = base_salary + commission + col("Bonus")
        return final_joined_df.withColumn("Compensation", total_compensation)
    except Exception as exc:
        logger.error(f"Error calculating compensation: {exc}")
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Calculate Projected Revenue
# MAGIC Calculate the projected revenue based on historical sales data.

# COMMAND ----------

def calculate_projected_revenue(historical_sales_df, projection_years=(2024, 2025, 2026)):
    """Add a ``Projected_Revenue`` column to the historical sales data.

    For rows whose ``Year`` is one of ``projection_years`` the projected
    revenue is ``Sales_Revenue * (1 + Projected_Sales_Growth_Rate / 100)``;
    for every other row (including rows with a null ``Year``) it is simply
    ``Sales_Revenue``.

    The original implementation repeated the same formula in three separate
    ``when`` branches (2024, 2025, 2026); they are collapsed into a single
    membership test so the formula lives in one place.

    Args:
        historical_sales_df: DataFrame exposing ``Year``, ``Sales_Revenue``
            and ``Projected_Sales_Growth_Rate`` columns.
        projection_years: years to which the growth rate is applied.
            Defaults to the previously hard-coded ``(2024, 2025, 2026)``.

    Returns:
        A new DataFrame with the extra ``Projected_Revenue`` column.

    Raises:
        Exception: any error from building the column is logged and re-raised.
    """
    try:
        logger.info("Calculating projected revenue...")
        sales_revenue = col("Sales_Revenue")
        # Growth rate is stored as a percentage, hence the division by 100.
        grown_revenue = sales_revenue * (1 + col("Projected_Sales_Growth_Rate") / 100)
        in_projection_years = col("Year").isin(*projection_years)
        return historical_sales_df.withColumn(
            "Projected_Revenue",
            when(in_projection_years, grown_revenue).otherwise(sales_revenue),
        )
    except Exception as e:
        logger.error(f"Error calculating projected revenue: {e}")
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Filter and Sort Data
# MAGIC Filter the data for target years greater than 2023 and sort by year.

# COMMAND ----------

def filter_and_sort_data(projected_revenue_df):
    """Keep rows with ``Year`` after 2023 and order them by ``Year``.

    Args:
        projected_revenue_df: DataFrame exposing a ``Year`` column.

    Returns:
        A DataFrame restricted to ``Year > 2023``, sorted ascending by ``Year``.

    Raises:
        Exception: any error from building the plan is logged and re-raised.
    """
    try:
        logger.info("Filtering and sorting data...")
        is_after_2023 = col("Year") > 2023
        return projected_revenue_df.filter(is_after_2023).orderBy("Year")
    except Exception as exc:
        logger.error(f"Error filtering and sorting data: {exc}")
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Write Output
# MAGIC Write the final sorted data to a Unity Catalog table.

# COMMAND ----------

def write_output(sorted_df):
    """Persist the final DataFrame as a Delta table in Unity Catalog.

    The table ``genai_demo.cardinal_health.hospitals_output`` is written in
    ``overwrite`` mode.

    Args:
        sorted_df: the DataFrame to save.

    Raises:
        Exception: any error raised by the write is logged and re-raised.
    """
    target_table = "genai_demo.cardinal_health.hospitals_output"
    try:
        logger.info("Writing output data to Unity Catalog table...")
        delta_writer = sorted_df.write.format("delta").mode("overwrite")
        delta_writer.saveAsTable(target_table)
    except Exception as exc:
        logger.error(f"Error writing output data: {exc}")
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Main Execution
# MAGIC Execute the ETL process.

# COMMAND ----------

def main():
    """Run the ETL pipeline end to end.

    Steps: load the source tables, integrate associate/hospital data,
    compute compensation, compute projected revenue, filter and sort it,
    then write the result to the output table.

    Raises:
        Exception: any failure in a step is logged and re-raised so that the
            caller (for example a scheduled job) sees the run as failed
            instead of silently completing.
    """
    try:
        # Load data. The logistics and growth-opportunity tables are loaded
        # by load_data() but are not used by any later step here.
        (associates_df, compensation_df, hospital_assignments_df, _logistics_channels_df,
         _growth_opportunities_df, historical_sales_df, hospital_data_df) = load_data()

        # Integrate data
        final_joined_df = integrate_data(associates_df, compensation_df, hospital_data_df, hospital_assignments_df)

        # Calculate compensation
        # NOTE(review): this result is never written or otherwise consumed
        # below; only the projected-revenue branch reaches write_output().
        # Confirm whether compensation should be part of the output.
        compensation_calculated_df = calculate_compensation(final_joined_df)

        # Calculate projected revenue
        projected_revenue_df = calculate_projected_revenue(historical_sales_df)

        # Filter and sort data
        sorted_df = filter_and_sort_data(projected_revenue_df)

        # Write output
        write_output(sorted_df)

        logger.info("ETL process completed successfully.")
    except Exception as e:
        logger.error(f"ETL process failed: {e}")
        # Re-raise, matching every other step in this file: swallowing the
        # error here would make a failed run look successful.
        raise

# Run the pipeline only when this module is executed directly, not when it is
# imported by another module.
if __name__ == "__main__":
    main()
