In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Customer 360 Profile
# MAGIC This notebook performs an ETL process to create a comprehensive customer profile by integrating data from various sources.

# COMMAND ----------

import logging
from pyspark.sql import functions as F

# Configure logging
# INFO level so the per-step progress messages logged by the ETL below are emitted.
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers; confirm the notebook runtime does not pre-configure logging,
# otherwise the level requested here is not applied.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the rest of this notebook.
logger = logging.getLogger(__name__)

# COMMAND ----------

def build_customer_360_profile(demographics_df, policy_df, claims_df, scores_df, aiml_insights_df):
    """Build the Customer 360 profile DataFrame from the five source tables.

    The result has one row per customer that has at least one policy, at least
    one claim, and a matching row in both ``aiml_insights_df`` and
    ``scores_df`` (all joins are inner joins, as in the original pipeline).

    Args:
        demographics_df: Customer master data; must provide ``Customer_ID``,
            ``Customer_Name``, ``Email`` and ``Date_of_Birth``.
        policy_df: Policies; must provide ``policy_id``, ``customer_id`` and
            ``total_premium_paid``.
        claims_df: Claims; must provide ``Claim_ID``, ``Policy_ID``,
            ``Claim_Date`` and ``Claim_Amount``.
        scores_df: Per-customer scores keyed by ``Customer_ID``.
        aiml_insights_df: Per-customer insights keyed by ``Customer_ID``.

    Returns:
        A DataFrame with the demographic columns, the per-customer metrics
        (``Policy_Count``, ``total_premium_paid``, ``Total_Claims``,
        ``Recent_Claim_Date``, ``Average_Claim_Amount``), the derived columns
        (``Age``, ``Claim_To_Premium_Ratio``, ``Claims_Per_Policy``) and every
        column of ``aiml_insights_df`` and ``scores_df``.
    """
    # Step 1: Data Selection and Filtering
    logger.info("Selecting relevant fields from each dataset...")
    customers = demographics_df.select("Customer_ID", "Customer_Name", "Email", "Date_of_Birth")
    # Give the join keys one spelling per key. Spark resolves column names
    # case-insensitively by default, so keeping both `Customer_ID` and
    # `customer_id` (or `policy_id` and `Policy_ID`) after a join makes every
    # later reference to them ambiguous.
    policies = policy_df.select(
        F.col("customer_id").alias("Customer_ID"),
        F.col("policy_id").alias("policy_id"),
        F.col("total_premium_paid"),
    )
    claims = claims_df.select(
        F.col("Policy_ID").alias("policy_id"),
        F.col("Claim_ID"),
        F.col("Claim_Date"),
        F.col("Claim_Amount"),
    )

    # Step 2: Data Integration
    logger.info("Joining datasets based on common identifiers...")
    # Joining on a column *name* keeps a single copy of the key in the result.
    customer_claims = policies.select("Customer_ID", "policy_id").join(claims, "policy_id", "inner")

    # Step 3: Data Aggregation
    logger.info("Aggregating data to compute metrics...")
    # Policy metrics are computed on the policy table itself, not on the
    # policy x claim join: that join repeats each policy once per claim, which
    # would make the policy count equal the claim count and inflate premiums.
    # NOTE(review): assumes one row per policy in `policy_df`; confirm, since
    # duplicated policy rows would over-count `total_premium_paid`.
    policy_metrics = policies.groupBy("Customer_ID").agg(
        F.countDistinct("policy_id").alias("Policy_Count"),
        F.sum("total_premium_paid").alias("total_premium_paid"),
    )
    claim_metrics = customer_claims.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
    )
    # Demographics are joined back in so Customer_Name, Email and
    # Date_of_Birth are available to the profile and to the Age calculation.
    # NOTE(review): inner joins drop customers without any claim; switch the
    # claim_metrics join to "left" if the profile should include them.
    metrics_df = (
        customers
        .join(policy_metrics, "Customer_ID", "inner")
        .join(claim_metrics, "Customer_ID", "inner")
    )

    # Step 4: Custom Calculations
    logger.info("Performing custom calculations...")
    birth_date = F.to_date(F.col("Date_of_Birth"), "yyyy-MM-dd")
    # Whole completed years; month arithmetic avoids the leap-year drift of
    # dividing a day count by 365.
    age = F.floor(F.months_between(F.current_date(), birth_date) / 12).cast("int")
    premium = F.col("total_premium_paid")
    policy_count = F.col("Policy_Count")
    enriched_df = (
        metrics_df
        .withColumn("Age", age)
        # A NULL or zero denominator falls through to 0.0 instead of NULL.
        .withColumn(
            "Claim_To_Premium_Ratio",
            F.when(premium != 0, F.col("Average_Claim_Amount") / premium).otherwise(F.lit(0.0)),
        )
        .withColumn(
            "Claims_Per_Policy",
            F.when(policy_count != 0, F.col("Total_Claims") / policy_count).otherwise(F.lit(0.0)),
        )
    )

    # Step 5: Comprehensive Data Joining
    logger.info("Integrating all data sources into a comprehensive customer profile...")
    # NOTE(review): any non-key column name shared by aiml_insights and scores
    # would be duplicated in the result; verify the two schemas.
    return (
        enriched_df
        .join(aiml_insights_df, "Customer_ID", "inner")
        .join(scores_df, "Customer_ID", "inner")
    )


# COMMAND ----------

# Load, transform and write in ONE cell: a try/except cannot span
# `# COMMAND ----------` delimiters, because each cell must be a complete
# statement on its own.
try:
    # Load data from Unity Catalog tables
    logger.info("Loading data from Unity Catalog tables...")
    demographics_df = spark.table("genai_demo.jnj.demographics")
    claims_df = spark.table("genai_demo.jnj.claims")
    policy_df = spark.table("genai_demo.jnj.policy")
    scores_df = spark.table("genai_demo.jnj.scores")
    aiml_insights_df = spark.table("genai_demo.jnj.aiml_insights")

    comprehensive_profile_df = build_customer_360_profile(
        demographics_df, policy_df, claims_df, scores_df, aiml_insights_df
    )

    # Output Configuration
    logger.info("Writing the comprehensive customer profile to Unity Catalog...")
    comprehensive_profile_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.jnj.customer_360_profile")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log with traceback, then re-raise so the notebook/job run is marked as
    # failed instead of silently reporting success.
    logger.exception("An error occurred during the ETL process")
    raise
