In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Customer 360 View
# MAGIC This notebook performs an ETL process to create a comprehensive Customer 360 view using data from Unity Catalog tables.

# COMMAND ----------
import logging

from pyspark.sql import DataFrame
from pyspark.sql.functions import count, avg, max, datediff, current_date, when, lit, col
from pyspark.sql.functions import broadcast, countDistinct

# Initialize logging: configure the root logger at INFO level and create the
# module-level logger that the rest of this notebook logs through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def cache_dataframe(df: DataFrame, name: str) -> DataFrame:
    """Log that a DataFrame is being cached, then cache it.

    Args:
        df: The DataFrame on which ``cache()`` is called.
        name: Human-readable label; used only in the log message.

    Returns:
        Whatever ``df.cache()`` returns.
    """
    message = f"Caching DataFrame: {name}"
    logger.info(message)
    cached = df.cache()
    return cached

# COMMAND ----------
# Customer 360 ETL: load -> select -> join -> aggregate -> derive -> write.
#
# The whole pipeline lives in ONE notebook cell. A try/except statement cannot
# span "# COMMAND ----------" separators, because each separator starts a new
# cell and a cell must be a complete piece of Python on its own.
#
# Any failure is logged with its traceback and then re-raised so the run is
# reported as failed instead of silently "succeeding" with no output table.
cached_frames = []  # DataFrames cached during the run; released in `finally`.
try:
    # --- Load data from Unity Catalog tables ---
    logger.info("Loading data from Unity Catalog tables...")
    policy_df = spark.table("genai_demo.jnj.policy")
    claims_df = spark.table("genai_demo.jnj.claims")
    demographics_df = spark.table("genai_demo.jnj.demographics")
    scores_df = spark.table("genai_demo.jnj.scores")
    aiml_insights_df = spark.table("genai_demo.jnj.aiml_insights")

    # --- Data Selection and Filtering ---
    logger.info("Selecting relevant fields from datasets...")
    selected_demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address",
        "City", "State", "Postal_Code", "Date_of_Birth", "Gender",
        "Marital_Status", "Occupation", "Income_Level", "Customer_Segment",
    )

    selected_claims_df = claims_df.select(
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type",
        "Claim_Status", "Claim_Amount", "Claim_Payout",
    )

    # --- Data Integration ---
    logger.info("Integrating datasets...")
    demographics_policy_df = selected_demographics_df.join(
        policy_df, "Customer_ID", "inner"
    )
    # Reused twice below (claims join and final join), hence the cache.
    demographics_policy_df = cache_dataframe(demographics_policy_df, "demographics_policy_df")
    cached_frames.append(demographics_policy_df)

    # Joining on the column *name* keeps a single Policy_ID column in the
    # result, so no explicit drop is needed. Policy_ID must stay available
    # because the aggregation below counts it.
    integrated_df = demographics_policy_df.join(
        selected_claims_df, "Policy_ID", "inner"
    )

    # --- Data Aggregation (one row per customer) ---
    logger.info("Aggregating data...")
    aggregated_df = integrated_df.groupBy("Customer_ID").agg(
        count("Claim_ID").alias("Total_Claims"),
        # Each joined row is one claim, so a plain count of Policy_ID would
        # count claim rows, not policies. Count distinct policies instead.
        countDistinct("Policy_ID").alias("Policy_Count"),
        max("Claim_Date").alias("Recent_Claim_Date"),
        avg("Claim_Amount").alias("Average_Claim_Amount"),
    )
    aggregated_df = cache_dataframe(aggregated_df, "aggregated_df")
    cached_frames.append(aggregated_df)

    # --- Custom Calculations ---
    logger.info("Performing custom calculations...")
    # NOTE(review): this join yields one row per (customer, policy) pair, not
    # one row per customer - confirm that is the intended grain of the view.
    final_df = aggregated_df.join(demographics_policy_df, "Customer_ID", "inner").withColumn(
        # Approximate age in years (day difference / 365, leap days ignored).
        "Age", datediff(current_date(), "Date_of_Birth") / 365
    ).withColumn(
        # The raw Claim_Amount column does not survive the aggregation above,
        # so the ratio uses the per-customer Average_Claim_Amount.
        # NOTE(review): assumes Total_Premium_Paid comes from the policy table
        # and that the average claim is the intended numerator - confirm.
        "Claim_To_Premium_Ratio", when(
            col("Total_Premium_Paid") != 0,
            col("Average_Claim_Amount") / col("Total_Premium_Paid")
        ).otherwise(0)
    ).withColumn(
        "Claims_Per_Policy", when(
            col("Policy_Count") != 0,
            col("Total_Claims") / col("Policy_Count")
        ).otherwise(0)
    ).withColumn(
        # The three columns below are hard-coded placeholder values.
        "Retention_Rate", lit(0.85)
    ).withColumn(
        "Cross_Sell_Opportunities", lit("Multi-Policy Discount, Home Coverage Add-on")
    ).withColumn(
        "Upsell_Potential", lit("Premium Vehicle Coverage")
    )

    # --- Comprehensive Data Consolidation ---
    logger.info("Consolidating data with AI/ML insights and scores...")
    # Broadcast hints presume both lookup tables are small - TODO confirm.
    customer_360_df = final_df.join(
        broadcast(scores_df), "Customer_ID", "inner"
    ).join(
        broadcast(aiml_insights_df), "Customer_ID", "inner"
    )

    # --- Output to Customer 360 View ---
    logger.info("Writing final DataFrame to Unity Catalog table...")
    customer_360_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.jnj.customer_360_view")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log with traceback, then propagate so the job/notebook run fails visibly.
    logger.exception("An error occurred during the ETL process")
    raise
finally:
    # Release whatever was cached, whether the run succeeded or failed.
    for cached_df in cached_frames:
        cached_df.unpersist()
