# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Cardinal Health Data
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables, integrating and transforming the data to produce a final dataset.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# Module-level logger used for the ETL progress messages; the root logger is
# configured to emit records at INFO level and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# NOTE: everything from here to the end of the `except` clause has to live in
# ONE notebook cell. A try/except statement cannot span Databricks cell
# separators (the "COMMAND" marker lines), because each cell is parsed on its
# own; the steps are therefore delimited with plain section comments instead.


def calculate_projected_revenue(year, sales_revenue, growth_rate):
    """Return the projected revenue for a single row as a float.

    Args:
        year: Target year of the row; may be None.
        sales_revenue: Revenue figure the projection starts from; may be None.
        growth_rate: Growth rate in percent (it is divided by 100 before
            use); may be None.

    Returns:
        None when ``sales_revenue`` is None, or when ``growth_rate`` is None
        for a year whose formula needs it. Otherwise a float:

        * 2024          -> sales_revenue * growth_rate / 100
        * 2025 or 2026  -> sales_revenue * (1 + growth_rate / 100)
        * any other year (including None) -> sales_revenue unchanged
    """
    # SQL NULLs reach a Python UDF as None; propagate them instead of letting
    # the arithmetic below raise TypeError and fail the whole job.
    if sales_revenue is None:
        return None

    # The UDF is registered with DoubleType, so always hand back a Python
    # float: an int or Decimal result would come back from Spark as NULL.
    revenue = float(sales_revenue)

    if year not in (2024, 2025, 2026):
        return revenue
    if growth_rate is None:
        return None

    rate = float(growth_rate) / 100
    if year == 2024:
        # NOTE(review): for 2024 this yields only the growth portion, not
        # revenue plus growth as in 2025/2026 — confirm this is intended.
        return revenue * rate
    return revenue * (1 + rate)


try:
    # ---- Extract: load data from Unity Catalog tables ----
    logger.info("Loading data from Unity Catalog tables...")
    sales_associates_df = spark.table("genai_demo.cardinal_health.SalesAssociates_EmploymentDetails")
    company_goals_df = spark.table("genai_demo.cardinal_health.Company_Goals")
    compensation_guidelines_df = spark.table("genai_demo.cardinal_health.Compensation_Guidelines")
    growth_opportunities_df = spark.table("genai_demo.cardinal_health.Growth_Opportunities")
    historical_sales_df = spark.table("genai_demo.cardinal_health.Historical_Sales")
    hospital_sales_assignments_df = spark.table("genai_demo.cardinal_health.HospitalSales_Assignments")
    hospital_stats_df = spark.table("genai_demo.cardinal_health.hospital_stats_north_america")
    logistics_channels_df = spark.table("genai_demo.cardinal_health.Logistics_Channels")
    third_party_sales_trends_df = spark.table("genai_demo.cardinal_health.ThirdParty_SalesTrends")
    # NOTE(review): company_goals_df, growth_opportunities_df,
    # historical_sales_df, logistics_channels_df and
    # third_party_sales_trends_df are loaded but never joined below, while
    # later steps reference columns (e.g. "Target Year", "Channel_Type",
    # "Market_Potential") that presumably live in some of them — confirm
    # whether joins are missing.

    # ---- Integrate: perform the joins ----
    logger.info("Performing data integration through joins...")
    # Sales associates with their compensation guidelines.
    associates_compensation_df = sales_associates_df.join(
        compensation_guidelines_df, "Associate_ID", "inner"
    )

    # Hospital statistics with the hospital sales assignments.
    hospital_sales_df = hospital_stats_df.join(
        hospital_sales_assignments_df, "Hospital_ID", "inner"
    )

    # Combine both intermediate results.
    # NOTE(review): this needs Hospital_ID on the associates/compensation
    # side as well — verify the join key against the table schemas.
    combined_df = associates_compensation_df.join(hospital_sales_df, "Hospital_ID", "inner")

    # ---- Transform: custom calculations ----
    logger.info("Applying custom calculations...")
    # Total compensation = base salary + commission on base salary + bonus.
    # NOTE(review): Commission_Percentage is applied as a plain multiplier
    # here, whereas the growth rate is divided by 100 in
    # calculate_projected_revenue — confirm the units of this column.
    combined_df = combined_df.withColumn(
        "Compensation",
        combined_df["Base_Salary"]
        + (combined_df["Commission_Percentage"] * combined_df["Base_Salary"])
        + combined_df["Bonus"],
    )

    # Wrap the row-level projection as a Spark UDF returning a double.
    projected_revenue_udf = F.udf(calculate_projected_revenue, DoubleType())

    combined_df = combined_df.withColumn(
        "Projected_Revenue",
        projected_revenue_udf(
            combined_df["Target Year"],
            combined_df["Sales_Revenue"],
            combined_df["Projected_Sales_Growth_Rate"],
        ),
    )

    # ---- Select: fields written to the target table ----
    logger.info("Selecting and standardizing data fields...")
    # NOTE(review): the Compensation and Projected_Revenue columns computed
    # above are not part of this selection, so they never reach the output
    # table — confirm whether they should be included.
    final_df = combined_df.select(
        "Hospital_ID",
        "Channel_Type",
        "Growth_Opportunities",
        "Projected_Growth_Rate",
        "Market_Potential",
        "Expected_ROI",
    )

    # ---- Load: overwrite the Unity Catalog target table ----
    logger.info("Writing the final processed data to Unity Catalog table...")
    final_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.cardinal_health.target_sales")

    logger.info("ETL process completed successfully.")

except Exception as e:
    # Log the message together with the traceback, then re-raise so the
    # notebook/job run is still marked as failed.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
