In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Cardinal Health Data
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables, integrating and transforming the data to produce a final output table.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import broadcast

# Module-level logger for this notebook; the root logger is configured so that
# INFO-and-above records are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# ETL pipeline: load the source tables, join them, derive Compensation and
# Projected_Revenue, keep rows with Year > 2023 and display the result.
#
# NOTE: this whole try/except/finally must live in ONE notebook cell. Databricks
# splits a notebook source file into cells at every "# COMMAND ----------" line,
# and a `try:` whose `except` ends up in a later cell is a SyntaxError. The cell
# separators that used to sit inside this block are therefore plain section
# comments now.
_df_joined_cached = None  # handle to the cached join so `finally` can release it
try:
    # ---- Extract: load only the needed columns from each Unity Catalog table
    logger.info("Loading data from Unity Catalog tables...")
    df_employment = spark.table(
        "genai_demo.cardinal_health.sales_associates_employment_details"
    ).select(
        "Associate_ID", "Associate_Name", "Region", "Employment_Type",
        "Years_of_Experience",
    )
    df_goals = spark.table(
        "genai_demo.cardinal_health.Company_Goals"
    ).select("Hospital_ID", "Channel_Type", "Growth_Target", "Investment_Planned")
    df_compensation = spark.table(
        "genai_demo.cardinal_health.Compensation_Guidelines"
    ).select("Associate_ID", "Base_Salary", "Commission_Percentage", "Bonus")
    df_growth = spark.table(
        "genai_demo.cardinal_health.Growth_Opportunities"
    ).select(
        "Channel_ID", "Channel_Type", "Market_Potential", "Projected_Growth_Rate",
        "Investment_Required", "Expected_ROI",
    )
    df_sales = spark.table(
        "genai_demo.cardinal_health.Historical_Sales"
    ).select("Hospital_ID", "Channel_Type", "Sales_Revenue", "Year")
    df_assignments = spark.table(
        "genai_demo.cardinal_health.hospital_sales_assignments"
    ).select("Associate_ID", "Hospital_ID")
    df_hospital_stats = spark.table(
        "genai_demo.cardinal_health.hospital_stats_north_america"
    ).select(
        "Hospital_ID", "Hospital_Name", "City", "State", "Number_of_Beds",
        "Annual_Revenue", "Patient_Satisfaction_Score",
    )
    df_logistics = spark.table(
        "genai_demo.cardinal_health.Logistics_Channels"
    ).select("Channel_ID", "Channel_Type", "Hospital_ID", "Growth_Opportunities")
    df_trends = spark.table(
        "genai_demo.cardinal_health.third_party_sales_trends"
    ).select("Channel_Type", "Market_Trend", "Political_Impact", "Economic_Impact")

    # ---- Integrate: inner-join everything on the shared identifiers.
    # Joining on column names (not expressions) keeps a single copy of each key.
    logger.info("Joining data tables...")
    df_joined = (
        df_employment
        # Broadcast hint on the compensation table.
        # NOTE(review): presumably this table is small enough to broadcast - confirm.
        .join(broadcast(df_compensation), "Associate_ID", "inner")
        .join(df_assignments, "Associate_ID", "inner")
        .join(df_hospital_stats, "Hospital_ID", "inner")
        .join(df_logistics, "Hospital_ID", "inner")
        .join(df_growth, ["Channel_ID", "Channel_Type"], "inner")
        .join(df_sales, ["Hospital_ID", "Channel_Type"], "inner")
        .join(df_goals, ["Hospital_ID", "Channel_Type"], "inner")
        .join(df_trends, "Channel_Type", "inner")
    )

    # The join feeds two actions below (the preview and the final show), so it
    # is cached. Keep the handle: `df_joined` is rebound by the withColumn calls
    # and the cached DataFrame could otherwise never be unpersisted.
    _df_joined_cached = df_joined.cache()

    # ---- Transform: total compensation = base + commission + bonus
    # NOTE(review): Commission_Percentage is used as a fraction here, whereas
    # Projected_Growth_Rate below is divided by 100 - confirm the column's units.
    logger.info("Calculating total compensation...")
    df_joined = df_joined.withColumn(
        "Compensation",
        F.col("Base_Salary")
        + (F.col("Commission_Percentage") * F.col("Base_Salary"))
        + F.col("Bonus"),
    )

    # ---- Transform: projected revenue per year
    logger.info("Calculating projected revenue...")
    growth_fraction = F.col("Projected_Growth_Rate") / 100
    df_joined = df_joined.withColumn(
        "Projected_Revenue",
        # NOTE(review): the 2024 branch yields only the growth increment
        # (no "1 +"), unlike 2025/2026 - confirm this is the intended rule.
        F.when(F.col("Year") == 2024, F.col("Sales_Revenue") * growth_fraction)
        # 2025 and 2026 use the same formula, so they share one branch.
        .when(
            F.col("Year").isin(2025, 2026),
            F.col("Sales_Revenue") * (1 + growth_fraction),
        )
        # Any other year keeps the historical revenue unchanged.
        .otherwise(F.col("Sales_Revenue")),
    )
    # Preview of the full joined/derived data set (runs a Spark action).
    df_joined.show()

    # ---- Filter and sort: keep years after 2023, ordered by year
    logger.info("Filtering and sorting data...")
    df_filtered = df_joined.filter(F.col("Year") > 2023).orderBy("Year")

    # ---- Prepare output: keep only the columns of the final table
    logger.info("Selecting relevant fields for output...")
    df_output = df_filtered.select(
        "Hospital_ID", "Compensation", "Projected_Revenue", "Year"
    )

    # ---- Load: the Delta write is currently disabled; the result is only displayed.
    logger.info("Writing output data to Unity Catalog...")
    # df_output.write.format("delta").mode("overwrite").saveAsTable("genai_demo.cardinal_health.Hospitals_Output")
    df_output.show()
    logger.info("ETL process completed successfully.")

except Exception as e:
    # logger.exception records the full traceback; re-raise so the run fails.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise

finally:
    # Release the cached join whether the pipeline succeeded or failed.
    # The DataFrames stay usable afterwards; they are simply recomputed on demand.
    if _df_joined_cached is not None:
        _df_joined_cached.unpersist()
