# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Customer 360 Data
# MAGIC This notebook performs an ETL process to consolidate customer data from various sources into a comprehensive Customer 360 view.

# COMMAND ----------

import logging
from contextlib import contextmanager

from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# Configure logging
# basicConfig() does nothing when the root logger already has handlers (for
# example when this cell is re-run), so the level is also set on this logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


@contextmanager
def etl_step(description):
    """Log the start of one ETL step and log-and-re-raise any failure inside it.

    Every ``# COMMAND ----------`` marker starts a new notebook cell, so a
    single ``try``/``except`` cannot wrap the whole pipeline. Each cell wraps
    its own work in this context manager instead.

    Args:
        description: Message logged at INFO level when the step starts.

    Raises:
        Exception: Whatever the wrapped step raised, re-raised unchanged after
            it has been logged together with its traceback.
    """
    logger.info(description)
    try:
        yield
    except Exception as exc:
        logger.exception("An error occurred during the ETL process: %s", exc)
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Data Selection and Filtering
# MAGIC Load and filter data from Unity Catalog tables.

# COMMAND ----------

with etl_step("Loading and filtering data from Unity Catalog tables."):
    # Load demographics data. Cached because it is read twice below (claim-level
    # integration and the final per-customer consolidation).
    demographics_df = spark.table("genai_demo.jnj.demographics").select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City", "State", "Postal_Code",
        F.to_date("Date_of_Birth").alias("Date_of_Birth"), "Gender", "Marital_Status", "Occupation", "Income_Level", "Customer_Segment"
    ).cache()

    # Load claims data. The key is aliased to the policy table's spelling so the
    # join below can be a name-based join that keeps a single `policy_id` column.
    claims_df = spark.table("genai_demo.jnj.claims").select(
        "Claim_ID", F.col("Policy_ID").alias("policy_id"), F.to_date("Claim_Date").alias("Claim_Date"), "Claim_Type", "Claim_Status",
        F.col("Claim_Amount").cast(DoubleType()).alias("Claim_Amount"), F.col("Claim_Payout").cast(DoubleType()).alias("Claim_Payout")
    )

    # Load policy data. The key is aliased to the demographics spelling for the
    # same reason; cached because it feeds both the integration join and the
    # policy-level aggregates.
    policy_df = spark.table("genai_demo.jnj.policy").select(
        "policy_id", F.col("customer_id").alias("Customer_ID"), "policy_type", "policy_status", "policy_start_date", "policy_end_date",
        "policy_term", "policy_premium", "total_premium_paid", "renewal_status", "policy_addons"
    ).cache()

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Data Integration
# MAGIC Integrate data from multiple sources.

# COMMAND ----------

with etl_step("Integrating data from multiple sources."):
    # One row per claim, carrying the owning policy and customer attributes.
    # Joining on column names (rather than on an equality expression) leaves
    # exactly one `Customer_ID` and one `policy_id` column in the result, so
    # later references to them are unambiguous.
    integrated_df = (
        demographics_df
        .join(policy_df, "Customer_ID", "inner")
        .join(claims_df, "policy_id", "inner")
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Data Aggregation
# MAGIC Aggregate data to compute metrics.

# COMMAND ----------

with etl_step("Aggregating data to compute metrics."):
    # Claim-level metrics per customer.
    claim_metrics_df = integrated_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
        F.sum("Claim_Amount").alias("Total_Claim_Amount"),
    )

    # Policy-level metrics are computed from the policy table itself: in
    # `integrated_df` each policy row is repeated once per claim, which would
    # inflate both the policy count and the premium total.
    policy_metrics_df = policy_df.groupBy("Customer_ID").agg(
        F.countDistinct("policy_id").alias("Policy_Count"),
        F.sum(F.col("total_premium_paid").cast(DoubleType())).alias("Total_Premium_Paid"),
    )

    # Re-attach the demographic attributes (Date_of_Birth is needed for Age in
    # the next step). Inner joins keep the original population: customers that
    # have at least one policy with at least one claim.
    aggregated_df = (
        demographics_df
        .join(claim_metrics_df, "Customer_ID", "inner")
        .join(policy_metrics_df, "Customer_ID", "inner")
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Custom Calculations
# MAGIC Perform custom calculations.

# COMMAND ----------

with etl_step("Performing custom calculations."):
    # Completed years between the date of birth and today; months_between
    # accounts for leap years, unlike dividing a day count by 365.
    age_years = F.floor(F.months_between(F.current_date(), F.col("Date_of_Birth")) / 12).cast("int")

    # Both ratios fall back to 0 when the denominator is zero or NULL.
    claim_to_premium = F.when(
        F.col("Total_Premium_Paid") != 0,
        F.col("Total_Claim_Amount") / F.col("Total_Premium_Paid"),
    ).otherwise(0)
    claims_per_policy = F.when(
        F.col("Policy_Count") != 0,
        F.col("Total_Claims") / F.col("Policy_Count"),
    ).otherwise(0)

    # NOTE(review): the last three columns are constants, so every customer
    # receives the same value — presumably placeholders; confirm with the owner.
    final_df = (
        aggregated_df
        .withColumn("Age", age_years)
        .withColumn("Claim_To_Premium_Ratio", claim_to_premium)
        .withColumn("Claims_Per_Policy", claims_per_policy)
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn("Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on"))
        .withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Comprehensive Data Consolidation
# MAGIC Consolidate data with AI/ML insights and scores.

# COMMAND ----------

with etl_step("Consolidating data with AI/ML insights and scores."):
    # Each table is read once, so it is not cached, and no broadcast hint is
    # forced: the sizes of these tables are not known here, so the join
    # strategy is left to the optimizer.
    aiml_insights_df = spark.table("genai_demo.jnj.aiml_insights")
    scores_df = spark.table("genai_demo.jnj.scores")

    customer_360_df = (
        final_df
        .join(aiml_insights_df, "Customer_ID", "inner")
        .join(scores_df, "Customer_ID", "inner")
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Output Handling
# MAGIC Write the final consolidated data to a Unity Catalog table.

# COMMAND ----------

with etl_step("Writing the final consolidated data to Unity Catalog table."):
    try:
        customer_360_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.jnj.customer_360")
    finally:
        # Release the cached inputs whether or not the write succeeded.
        demographics_df.unpersist()
        policy_df.unpersist()
