# In [0]:
# COMMAND ----------
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, broadcast

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Assume the Spark session is already available as 'spark'

# COMMAND ----------
# Step 1: Load Data from Unity Catalog Tables
# Unity Catalog table identifiers are three-level: <catalog>.<schema>.<table>.
# All source tables are read from the same catalog/schema so the names cannot
# drift apart. NOTE(review): six of these tables were previously addressed with
# an extra ".db" segment (four-part names); that segment was dropped to match
# hospital_stats / employment_details - confirm against the actual catalog.
SOURCE_SCHEMA = "genai_demo.cardinal_health"

try:
    logger.info("Loading data from Unity Catalog tables...")
    hospital_stats_df = spark.table(f"{SOURCE_SCHEMA}.hospital_stats")
    employment_details_df = spark.table(f"{SOURCE_SCHEMA}.employment_details")
    compensation_guidelines_df = spark.table(f"{SOURCE_SCHEMA}.compensation_guidelines")
    hospital_sales_assignments_df = spark.table(f"{SOURCE_SCHEMA}.hospital_sales_assignments")
    logistics_channels_df = spark.table(f"{SOURCE_SCHEMA}.logistics_channels")
    growth_opportunities_df = spark.table(f"{SOURCE_SCHEMA}.growth_opportunities")
    historical_sales_df = spark.table(f"{SOURCE_SCHEMA}.historical_sales")
    company_goals_df = spark.table(f"{SOURCE_SCHEMA}.company_goals")
except Exception as e:
    # logger.exception records the traceback alongside the message before the
    # error is re-raised to stop the notebook run.
    logger.exception("Error loading data from Unity Catalog: %s", e)
    raise

# COMMAND ----------
# Step 2: Data Joining
try:
    logger.info("Performing data joins...")

    # Narrow both associate-level tables to the columns used in the joins and
    # in the later compensation calculation.
    employment_details_df = employment_details_df.select(
        "Associate_ID", "Associate_Name", "Years_of_Experience"
    )
    compensation_guidelines_df = compensation_guidelines_df.select(
        "Associate_ID", "Base_Salary", "Commission_Percentage", "Bonus"
    )

    # Inner join on Associate_ID with a broadcast hint on the compensation side.
    # NOTE(review): the hint assumes compensation_guidelines is small enough to
    # ship to every executor - confirm.
    #
    # No .cache() on the intermediates: each one is consumed by exactly one
    # join below, so caching would only hold executor memory that is never
    # released with unpersist().
    employment_compensation_df = employment_details_df.join(
        broadcast(compensation_guidelines_df), "Associate_ID", "inner"
    )

    # Narrow the hospital-level tables, then inner join them on the
    # (Hospital_ID, Hospital_Name) pair.
    hospital_stats_df = hospital_stats_df.select(
        "Hospital_ID",
        "Hospital_Name",
        "Number_of_Beds",
        "Annual_Revenue",
        "Patient_Satisfaction_Score",
    )
    hospital_sales_assignments_df = hospital_sales_assignments_df.select(
        "Hospital_ID", "Hospital_Name", "Associate_ID", "Associate_Name"
    )
    hospital_sales_df = hospital_stats_df.join(
        hospital_sales_assignments_df, ["Hospital_ID", "Hospital_Name"], "inner"
    )

    # Attach hospital data to each associate via (Associate_ID, Associate_Name).
    combined_df = employment_compensation_df.join(
        hospital_sales_df, ["Associate_ID", "Associate_Name"], "inner"
    )
except Exception as e:
    # Log with traceback, then re-raise so the run stops at this step.
    logger.exception("Error during data joining: %s", e)
    raise

# COMMAND ----------
# Step 3: Custom Calculations
try:
    logger.info("Performing custom calculations...")

    # Total compensation = base salary
    #                      + commission (Commission_Percentage % of base salary)
    #                      + bonus.
    combined_df = combined_df.withColumn(
        "Total_Compensation",
        col("Base_Salary")
        + (col("Commission_Percentage") / 100) * col("Base_Salary")
        + col("Bonus"),
    )

    # Narrow the channel-level tables, then inner join them on the
    # (Channel_ID, Channel_Type) pair. The result is used once below, so it is
    # not cached (a cache here would never be unpersisted).
    logistics_channels_df = logistics_channels_df.select(
        "Channel_ID", "Channel_Type", "Hospital_ID"
    )
    growth_opportunities_df = growth_opportunities_df.select(
        "Channel_ID", "Channel_Type", "Projected_Growth_Rate"
    )
    logistics_growth_df = logistics_channels_df.join(
        growth_opportunities_df, ["Channel_ID", "Channel_Type"], "inner"
    )

    # Attach channel growth data to the associate/hospital rows via Hospital_ID.
    final_df = combined_df.join(logistics_growth_df, "Hospital_ID", "inner")

    # NOTE(review): Target_Year and Sales_Revenue are not among the columns
    # selected in Step 2 or above, and historical_sales_df / company_goals_df
    # (loaded in Step 1) are never joined in. Presumably those two tables
    # supply these columns and should be joined before this point - confirm
    # the join keys; as written the expression below cannot resolve them.
    #
    # Projected revenue applies the growth rate (a percentage) for target years
    # 2024-2026 and leaves Sales_Revenue unchanged for any other year. The three
    # year branches were identical, so they are expressed as a single IN test.
    final_df = final_df.withColumn(
        "Projected_Revenue",
        expr("""
            CASE
                WHEN Target_Year IN (2024, 2025, 2026)
                    THEN Sales_Revenue * (1 + Projected_Growth_Rate / 100)
                ELSE Sales_Revenue
            END
        """),
    )
except Exception as e:
    # Log with traceback, then re-raise so the run stops at this step.
    logger.exception("Error during custom calculations: %s", e)
    raise

# COMMAND ----------
# Step 4: Filtering and Sorting
try:
    logger.info("Filtering and sorting data...")

    # Keep only the rows whose Target_Year is later than 2023.
    filtered_df = final_df.where(col("Target_Year") > 2023)

    # Order the remaining rows by Target_Year, lowest year first.
    sorted_df = filtered_df.sort("Target_Year", ascending=True)
except Exception as e:
    # Report the failure and propagate it so the run stops here.
    logger.error(f"Error during filtering and sorting: {e}")
    raise

# COMMAND ----------
# Step 5: Write Output to Unity Catalog
try:
    logger.info("Writing output to Unity Catalog...")

    # Create the destination database up front if it is not there yet.
    # NOTE(review): "catalog" looks like a placeholder catalog name - confirm
    # the real target before running.
    spark.sql("CREATE DATABASE IF NOT EXISTS catalog.target_db")

    # Replace the contents of the target table with the sorted result,
    # stored in Delta format.
    delta_writer = sorted_df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable("catalog.target_db.target_sales")
except Exception as e:
    # Report the failure and propagate it so the run stops here.
    logger.error(f"Error writing output to Unity Catalog: {e}")
    raise

logger.info("ETL process completed successfully.")
