# In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Customer 360 View
# MAGIC This notebook performs an ETL process to create a comprehensive customer 360 view by integrating data from various sources.

# COMMAND ----------
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql import DataFrame

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def cache_df(df: DataFrame, name: str) -> DataFrame:
    """Log that the DataFrame labelled ``name`` is being cached, then cache it.

    Args:
        df: DataFrame to cache.
        name: Label used only in the log message.

    Returns:
        Whatever ``df.cache()`` returns.
    """
    message = f"Caching DataFrame: {name}"
    logger.info(message)
    cached = df.cache()
    return cached

# COMMAND ----------
# Customer 360 ETL: load the source tables, join demographics -> policy ->
# claims, aggregate per customer, derive metrics, enrich with scores and
# AI/ML insights, and overwrite genai_demo.jnj.customer_360_view.
#
# The whole pipeline is kept in ONE cell: Databricks splits a source-format
# notebook at every "# COMMAND ----------" line, and a try statement (or its
# indented body) cannot be spread over several cells. Steps are therefore
# separated by plain comments.
joined_df = None  # assigned once cached, so the finally clause can release it
try:
    # Step 1: Data Loading
    logger.info("Loading data from Unity Catalog tables.")
    policy_df = spark.table("genai_demo.jnj.policy")
    claims_df = spark.table("genai_demo.jnj.claims")
    demographics_df = spark.table("genai_demo.jnj.demographics")
    scores_df = spark.table("genai_demo.jnj.scores")
    aiml_insights_df = spark.table("genai_demo.jnj.aiml_insights")

    # Step 2: Data Selection
    logger.info("Selecting relevant fields from demographics and claims data.")
    selected_demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City",
        "State", "Postal_Code", "Date_of_Birth", "Gender", "Marital_Status",
        "Occupation", "Income_Level", "Customer_Segment"
    )

    selected_claims_df = claims_df.select(
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type", "Claim_Status",
        "Claim_Amount", "Claim_Payout"
    )

    # Step 3: Data Integration
    # Inner joins: the result has one row per claim, so customers without a
    # policy or without any claim do not appear downstream.
    logger.info("Joining datasets based on common identifiers.")
    joined_df = selected_demographics_df.join(policy_df, "Customer_ID") \
        .join(selected_claims_df, "Policy_ID")
    joined_df = cache_df(joined_df, "joined_df")

    # NOTE(review): Total_Premium_Paid is assumed to be a policy-level column
    # of genai_demo.jnj.policy -- confirm against the table schema. Fail with
    # a clear message instead of an opaque analysis error if it is absent.
    if "Total_Premium_Paid" not in joined_df.columns:
        raise ValueError(
            "Column 'Total_Premium_Paid' is required to compute "
            "Claim_To_Premium_Ratio but is not present in the joined data."
        )

    # Step 4: Data Aggregation
    logger.info("Computing aggregate metrics.")
    claim_metrics_df = joined_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        # Rows are claim-level, so a plain count of Policy_ID would just
        # repeat Total_Claims; count each policy once.
        F.countDistinct("Policy_ID").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
        F.sum("Claim_Amount").alias("Total_Claim_Amount"),
        # Carried through for the Age calculation; presumably identical on
        # every row of a customer because it comes from demographics.
        F.first("Date_of_Birth", ignorenulls=True).alias("Date_of_Birth"),
    )

    # Premium is repeated on every claim row of a policy; reduce to one row
    # per policy before summing so it is not multiplied by the claim count.
    premium_df = joined_df \
        .select("Customer_ID", "Policy_ID", "Total_Premium_Paid") \
        .dropDuplicates(["Customer_ID", "Policy_ID"]) \
        .groupBy("Customer_ID") \
        .agg(F.sum("Total_Premium_Paid").alias("Total_Premium_Paid"))

    aggregated_df = claim_metrics_df.join(premium_df, "Customer_ID", "left")

    # Step 5: Custom Calculations
    logger.info("Implementing custom calculations for additional metrics.")
    final_df = aggregated_df.withColumn(
        # Completed years; months_between avoids the leap-year drift of
        # dividing a day count by 365.
        "Age",
        F.floor(
            F.months_between(F.current_date(), F.col("Date_of_Birth")) / 12
        ).cast("int")
    ).withColumn(
        # A null or zero premium falls through to 0.0 rather than dividing.
        "Claim_To_Premium_Ratio",
        F.when(
            F.col("Total_Premium_Paid") != 0,
            F.col("Total_Claim_Amount") / F.col("Total_Premium_Paid")
        ).otherwise(F.lit(0.0))
    ).withColumn(
        "Claims_Per_Policy",
        F.when(
            F.col("Policy_Count") != 0,
            F.col("Total_Claims") / F.col("Policy_Count")
        ).otherwise(F.lit(0.0))
    ).withColumn(
        # The next three columns are fixed literal values for every customer.
        "Retention_Rate", F.lit(0.85).cast(DoubleType())
    ).withColumn(
        "Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on").cast(StringType())
    ).withColumn(
        "Upsell_Potential", F.lit("Premium Vehicle Coverage").cast(StringType())
    )

    # Step 6: Comprehensive Data Joining
    logger.info("Integrating additional insights from AI/ML and scores data.")
    customer_360_df = final_df.join(scores_df, "Customer_ID") \
        .join(aiml_insights_df, "Customer_ID")

    # Step 7: Output Data
    logger.info("Writing the final customer 360 data to a Unity Catalog table.")
    customer_360_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.jnj.customer_360_view")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log with traceback, then re-raise so the notebook/job run is marked as
    # failed instead of silently reporting success.
    logger.exception("An error occurred during the ETL process.")
    raise
finally:
    # Release the cached join whether the run succeeded or failed.
    if joined_df is not None:
        joined_df.unpersist()
