# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Cardinal Health Data
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables, integrating various datasets and calculating projected revenue.

# COMMAND ----------

import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, broadcast

# Initialize logging
# basicConfig installs a default handler on the root logger at INFO level; per the
# standard library, it does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the ETL steps in this file.
logger = logging.getLogger(__name__)

# COMMAND ----------

# Assume the Spark session is already initialized as 'spark'

# Catalog and schema shared by every table this ETL reads or writes.
_SCHEMA = "genai_demo.cardinal_health"

# Columns written to the final output table, in output order.
_OUTPUT_COLUMNS = (
    "Hospital_ID",
    "Channel_Type",
    "Investment_Planned",
    "Sales_Revenue",
    "Market_Trend",
    "Political_Impact",
    "Economic_Impact",
    "Target_Year",
    "Projected_Sales_Growth_Rate",
    "Projected_Revenue",
)


def _with_projected_revenue(df):
    """Return *df* with an added ``Projected_Revenue`` column.

    ``Sales_Revenue`` is compounded by ``Projected_Sales_Growth_Rate`` (a
    percentage, hence the division by 100) once for Target_Year 2024, twice
    for 2025 and three times for 2026. Rows with any other Target_Year keep
    ``Sales_Revenue`` unchanged.
    """
    return df.withColumn("Projected_Revenue", expr("""
        CASE
            WHEN Target_Year = 2024 THEN Sales_Revenue * (1 + Projected_Sales_Growth_Rate / 100)
            WHEN Target_Year = 2025 THEN Sales_Revenue * (1 + Projected_Sales_Growth_Rate / 100) * (1 + Projected_Sales_Growth_Rate / 100)
            WHEN Target_Year = 2026 THEN Sales_Revenue * (1 + Projected_Sales_Growth_Rate / 100) * (1 + Projected_Sales_Growth_Rate / 100) * (1 + Projected_Sales_Growth_Rate / 100)
            ELSE Sales_Revenue
        END
    """))


def main():
    """Run the ETL: load source tables, derive projected revenue, write the result.

    Steps:
      1. Load eight source tables from the ``genai_demo.cardinal_health`` schema.
      2. Build the intermediate joins (compensation, hospital, logistics).
      3. Join historical sales with third-party trends on ``Channel_Type`` and
         add ``Projected_Revenue``.
      4. Keep rows with ``Target_Year`` > 2023, select the output columns, sort
         by ``Target_Year`` and overwrite the Delta table
         ``genai_demo.cardinal_health.final_output``.

    Relies on the module-level ``spark`` session and ``logger``.

    Raises:
        Exception: any failure is logged with its traceback and then re-raised,
            so the calling job is marked as failed instead of silently
            finishing without output.
    """
    # The body is intentionally free of "# COMMAND ----------" markers: Databricks
    # treats them as cell boundaries, which would split this function (and its
    # try/except) across cells.
    def load(table_name):
        return spark.table(f"{_SCHEMA}.{table_name}")

    try:
        # Load data from Unity Catalog tables
        logger.info("Loading data from Unity Catalog tables...")
        associates_df = load("associates_employment")
        compensation_df = load("compensation_guidelines")
        hospital_assignments_df = load("hospital_assignments")
        logistics_channels_df = load("logistics_channels")
        growth_opportunities_df = load("growth_opportunities")
        historical_sales_df = load("historical_sales_trending")
        third_party_trends_df = load("third_party_trends")
        # NOTE(review): company_goals_df is loaded but never used below.
        company_goals_df = load("company_goals_1")

        # Data Integration: Join associates with compensation
        logger.info("Joining associates with compensation data...")
        joined_df = associates_df.join(compensation_df, "Associate_ID", "inner")

        # Calculate total compensation
        # NOTE(review): joined_df is not consumed by the final output.
        logger.info("Calculating total compensation...")
        joined_df = joined_df.withColumn(
            "Compensation",
            expr("Base_Salary + (Commission_Percentage * Base_Salary) + Bonus"),
        )

        # Join hospital data with hospital assignments
        # NOTE(review): this joins hospital_assignments_df with ITSELF; the log
        # message implies a separate hospital dataset, but none is loaded here.
        # The result is also unused. Left as-is until the intended source is
        # confirmed.
        logger.info("Joining hospital data with hospital assignments...")
        hospital_joined_df = hospital_assignments_df.join(
            hospital_assignments_df, ["Hospital_ID", "Hospital_Name"], "inner"
        )

        # Join logistics channels with growth opportunities
        logger.info("Joining logistics channels with growth opportunities...")
        logistics_joined_df = logistics_channels_df.join(
            growth_opportunities_df, ["Channel_ID", "Channel_Type"], "inner"
        )

        # Ensure unique records
        # NOTE(review): unique_df is not consumed by the final output either.
        logger.info("Ensuring unique records...")
        unique_df = logistics_joined_df.dropDuplicates(
            ["Channel_ID", "Channel_Type", "Hospital_ID"]
        )

        # Join historical sales with third-party trends
        logger.info("Joining historical sales with third-party trends...")
        sales_trends_joined_df = historical_sales_df.join(
            third_party_trends_df, "Channel_Type", "inner"
        )

        # Calculate projected revenue
        logger.info("Calculating projected revenue...")
        final_df = _with_projected_revenue(sales_trends_joined_df)

        # Filter data for target years greater than 2023
        logger.info("Filtering data for target years greater than 2023...")
        filtered_df = final_df.filter(col("Target_Year") > 2023)

        # Select specific fields for final output
        # NOTE(review): every selected column must exist in the sales/trends
        # join; presumably some (e.g. Hospital_ID, Investment_Planned,
        # Target_Year) were meant to come from the unused joins above - verify
        # against the table schemas.
        logger.info("Selecting specific fields for final output...")
        selected_df = filtered_df.select(*_OUTPUT_COLUMNS)

        # Sort data by target year
        logger.info("Sorting data by target year...")
        sorted_df = selected_df.orderBy("Target_Year")

        # Write the final processed data to Unity Catalog table
        logger.info("Writing the final processed data to Unity Catalog table...")
        sorted_df.write.format("delta").mode("overwrite").saveAsTable(
            f"{_SCHEMA}.final_output"
        )

        logger.info("ETL process completed successfully.")

    except Exception:
        # Log with traceback, then propagate so the job run is reported as failed.
        logger.exception("An error occurred during the ETL process")
        raise

# Script entry point: run the ETL only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
