# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Sales Data
# MAGIC This notebook performs an ETL process on sales data using PySpark in Databricks.

# COMMAND ----------

# MAGIC
import logging
from pyspark.sql import functions as F
from pyspark.sql import DataFrame

# Configure logging: request INFO-and-above on the root logger. Note that
# logging.basicConfig does nothing if the root logger already has handlers
# installed (e.g. by the hosting runtime).
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every function defined below.
logger = logging.getLogger(__name__)

# COMMAND ----------

# MAGIC
def load_data_from_unity_catalog(table_name: str) -> DataFrame:
    """Read a table by name and return it as a DataFrame.

    Relies on a global ``spark`` object that is not defined in this file
    (presumably the SparkSession injected by the Databricks runtime).

    Args:
        table_name: Name of the table handed to ``spark.table``.

    Returns:
        The DataFrame produced by ``spark.table(table_name)``.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        logger.info(f"Loading data from Unity Catalog table: {table_name}")
        loaded_table = spark.table(table_name)
    except Exception as e:
        # Record which table failed before letting the error propagate.
        logger.error(f"Error loading data from {table_name}: {e}")
        raise
    else:
        return loaded_table

# COMMAND ----------

# MAGIC
def join_dataframes(df1: DataFrame, df2: DataFrame, join_column: str, join_type: str = "inner") -> DataFrame:
    """Join two DataFrames on a single column that exists in both.

    The join is expressed by column *name*, so the result carries a single
    copy of ``join_column``. All other columns of both inputs are kept; no
    column pruning is performed.

    Args:
        df1: Left side of the join.
        df2: Right side of the join.
        join_column: Name of the column to join on; must exist in both inputs.
        join_type: Join type forwarded to ``DataFrame.join`` (default
            ``"inner"``).

    Returns:
        The joined DataFrame.

    Raises:
        Exception: Any failure while building the join is logged and then
            re-raised unchanged.
    """
    try:
        logger.info(f"Joining DataFrames on column: {join_column}")

        # Non-key columns present on both sides end up duplicated in the
        # result and become ambiguous when referenced by name later, so make
        # that visible. Names are compared case-insensitively, which may
        # over-report if the session is configured to be case-sensitive.
        key = join_column.lower()
        left_names = {name.lower() for name in df1.columns if name.lower() != key}
        overlapping = sorted(name for name in df2.columns if name.lower() in left_names)
        if overlapping:
            logger.warning(
                f"Columns present in both DataFrames will be duplicated after the join: {overlapping}"
            )

        # Join directly by name: re-selecting every column beforehand narrows
        # nothing and, when join_column differs only in case from the real
        # column name, selects the key twice and makes the join ambiguous.
        return df1.join(df2, join_column, join_type)
    except Exception as e:
        logger.error(f"Error joining DataFrames on {join_column}: {e}")
        raise

# COMMAND ----------

# MAGIC
def calculate_compensation(df: DataFrame) -> DataFrame:
    """Add a ``Compensation`` column: base salary plus commission plus bonus.

    Commission is ``Commission_Percentage`` percent of ``Base_Salary``.

    Args:
        df: DataFrame providing the ``Base_Salary``, ``Commission_Percentage``
            and ``Bonus`` columns.

    Returns:
        ``df`` with the ``Compensation`` column added (or replaced).

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        logger.info("Calculating total compensation for each associate")
        base_salary = F.col("Base_Salary")
        commission = F.col("Commission_Percentage") / 100 * base_salary
        total_compensation = base_salary + commission + F.col("Bonus")
        return df.withColumn("Compensation", total_compensation)
    except Exception as e:
        logger.error(f"Error calculating compensation: {e}")
        raise

# COMMAND ----------

# MAGIC
def calculate_projected_revenue(df: DataFrame) -> DataFrame:
    """Add a ``Projected_Revenue`` column derived from revenue and growth rate.

    For target years 2024, 2025 and 2026 the projection is
    ``Sales_Revenue * (1 + Projected_Sales_Growth_Rate / 100)``. Rows with any
    other target year carry ``Sales_Revenue`` over unchanged.

    Args:
        df: DataFrame providing the ``Target Year``, ``Sales_Revenue`` and
            ``Projected_Sales_Growth_Rate`` columns.

    Returns:
        ``df`` with the ``Projected_Revenue`` column added (or replaced).

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        logger.info("Calculating projected revenue")
        projected_years = (2024, 2025, 2026)
        # Every projected year applies the same growth formula. The "1 +" term
        # is required for all of them: without it the result is only the
        # growth increment, not the projected revenue.
        grown_revenue = F.col("Sales_Revenue") * (1 + F.col("Projected_Sales_Growth_Rate") / 100)
        df = df.withColumn(
            "Projected_Revenue",
            F.when(F.col("Target Year").isin(*projected_years), grown_revenue)
            .otherwise(F.col("Sales_Revenue"))
        )
        return df
    except Exception as e:
        logger.error(f"Error calculating projected revenue: {e}")
        raise

# COMMAND ----------

# MAGIC
def calculate_projected_sales_growth_rate(df: DataFrame) -> DataFrame:
    """Derive ``Projected_Sales_Growth_Rate`` from ``Projected_Growth_Rate``.

    The base rate is increased by one, two or three hundredths of itself for
    target years 2024, 2025 and 2026 respectively. Rows with any other target
    year keep the base rate unchanged.

    Args:
        df: DataFrame providing the ``Target Year`` and
            ``Projected_Growth_Rate`` columns.

    Returns:
        ``df`` with the ``Projected_Sales_Growth_Rate`` column added (or
        replaced).

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        logger.info("Calculating projected sales growth rate")
        target_year = F.col("Target Year")
        base_rate = F.col("Projected_Growth_Rate")
        # One hundredth of the base rate; scaled per target year below.
        increment = base_rate / 100

        adjusted_rate = (
            F.when(target_year == 2024, base_rate + increment)
            .when(target_year == 2025, base_rate + increment * 2)
            .when(target_year == 2026, base_rate + increment * 3)
            .otherwise(base_rate)
        )
        return df.withColumn("Projected_Sales_Growth_Rate", adjusted_rate)
    except Exception as e:
        logger.error(f"Error calculating projected sales growth rate: {e}")
        raise

# COMMAND ----------

# MAGIC
def main() -> None:
    """Run the sales ETL: load source tables, derive metrics, write the output.

    Loads five source tables, joins the hospital/assignment and
    employment/compensation data, computes compensation, derives the projected
    growth rate and projected revenue from the historical sales table, and
    overwrites the ``genai_demo.cardinal_health.Target_sales`` Delta table
    with the projected sales data.

    Raises:
        Exception: Any failure in a step is logged and then re-raised
            unchanged.
    """
    try:
        # Load data from Unity Catalog tables
        hospital_stats_df = load_data_from_unity_catalog("genai_demo.cardinal_health.hospital_stats_north_america")
        sales_assignments_df = load_data_from_unity_catalog("genai_demo.cardinal_health.hospital_sales_assignments")
        employment_details_df = load_data_from_unity_catalog("genai_demo.cardinal_health.sales_associates_employment_details")
        compensation_guidelines_df = load_data_from_unity_catalog("genai_demo.cardinal_health.compensation_guidelines")
        historical_sales_df = load_data_from_unity_catalog("genai_demo.cardinal_health.historical_sales")

        # Join operations
        hospital_sales_df = join_dataframes(hospital_stats_df, sales_assignments_df, "Hospital_ID")
        employment_compensation_df = join_dataframes(employment_details_df, compensation_guidelines_df, "Associate_ID")
        combined_df = join_dataframes(employment_compensation_df, hospital_sales_df, "Associate_ID")

        # Calculate compensation
        # NOTE(review): compensation_df is built but never written or merged
        # into the output below -- confirm whether it should be persisted or
        # joined into the target table.
        compensation_df = calculate_compensation(combined_df)

        # Derive the growth rate first: calculate_projected_revenue reads the
        # Projected_Sales_Growth_Rate column, which is produced by
        # calculate_projected_sales_growth_rate.
        growth_rate_df = calculate_projected_sales_growth_rate(historical_sales_df)
        revenue_df = calculate_projected_revenue(growth_rate_df)

        # Write the final DataFrame to Unity Catalog table
        logger.info("Writing the final DataFrame to Unity Catalog table")
        revenue_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.cardinal_health.Target_sales")

    except Exception as e:
        logger.error(f"Error in ETL process: {e}")
        raise

# Run the ETL only when this file is executed as the main module, not when
# it is imported.
if __name__ == "__main__":
    main()
