# Databricks notebook source
# MAGIC %md
# MAGIC # Customer 360 View ETL Process
# MAGIC This notebook performs an ETL process to create a comprehensive Customer 360 View using data from Unity Catalog tables.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, DateType

# Module-level logger used by every ETL step below; the root logger is
# configured to report INFO-level records and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Data Selection and Filtering
# MAGIC Load and select data from Unity Catalog tables.

# COMMAND ----------

# Customer 360 ETL: load the source tables, derive per-customer claim and
# policy metrics, enrich them with scores / AI-ML insights, and write the
# result to a Delta table. Any failure is logged with its traceback and
# re-raised so the job run is marked as failed.
#
# NOTE(review): this single try/except spans several `# COMMAND ----------`
# separators. Databricks appears to treat each separator as a cell boundary,
# so confirm this file is executed as a plain script rather than imported as
# a multi-cell notebook.
try:
    logger.info("Loading and selecting data from Unity Catalog tables.")

    # Customer master data; Date_of_Birth is cast to a date so Age can be
    # derived from it in Step 4.
    demographics_df = (
        spark.table("genai_demo.jnj.demographics")
        .select(
            "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City",
            "State", "Postal_Code", "Date_of_Birth", "Gender", "Marital_Status",
            "Occupation", "Income_Level", "Customer_Segment",
        )
        .withColumn("Date_of_Birth", F.col("Date_of_Birth").cast(DateType()))
    )

    # Claims are keyed by Policy_ID only; the owning customer is resolved
    # through the policy table in Step 2.
    claims_df = (
        spark.table("genai_demo.jnj.claims")
        .select(
            "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type", "Claim_Status",
            "Claim_Amount", "Claim_Payout",
        )
        .withColumn("Claim_Date", F.col("Claim_Date").cast(DateType()))
        .withColumn("Claim_Amount", F.col("Claim_Amount").cast(DoubleType()))
        .withColumn("Claim_Payout", F.col("Claim_Payout").cast(DoubleType()))
    )

    policy_df = (
        spark.table("genai_demo.jnj.policy")
        .select(
            "Policy_ID", "Customer_ID", "Policy_Type", "Policy_Status", "Policy_Start_Date",
            "Policy_End_Date", "Policy_Term", "Policy_Premium", "Total_Premium_Paid",
            "Renewal_Status", "Policy_Addons",
        )
        .withColumn("Policy_Start_Date", F.col("Policy_Start_Date").cast(DateType()))
        .withColumn("Policy_End_Date", F.col("Policy_End_Date").cast(DateType()))
        .withColumn("Policy_Premium", F.col("Policy_Premium").cast(DoubleType()))
        .withColumn("Total_Premium_Paid", F.col("Total_Premium_Paid").cast(DoubleType()))
    )

    scores_df = spark.table("genai_demo.jnj.scores").select(
        "Customer_ID", "Credit_Score", "Fraud_Score", "Customer_Risk_Score"
    )

    aiml_insights_df = spark.table("genai_demo.jnj.aiml_insights").select(
        "Customer_ID", "Churn_Probability", "Next_Best_Offer", "Claims_Fraud_Probability", "Revenue_Potential"
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Data Integration
# MAGIC Integrate datasets based on common identifiers.

# COMMAND ----------

    logger.info("Integrating datasets based on common identifiers.")

    # Attach the owning Customer_ID to every claim via its policy. Only the
    # two key columns are taken from the policy table so that policy-level
    # amounts are not repeated once per claim row.
    # The result feeds a single aggregation, so it is not cached.
    integrated_df = policy_df.select("Policy_ID", "Customer_ID").join(
        claims_df, "Policy_ID", "inner"
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Data Aggregation
# MAGIC Aggregate data to compute metrics.

# COMMAND ----------

    logger.info("Aggregating data to compute metrics.")

    # Claim-level metrics: one row per customer that has at least one claim.
    claim_metrics_df = integrated_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
    )

    # Policy-level metrics are computed from the policy table itself, so each
    # policy is counted once and its premium is summed once, regardless of
    # how many claims it has.
    policy_metrics_df = policy_df.groupBy("Customer_ID").agg(
        F.countDistinct("Policy_ID").alias("Policy_Count"),
        F.sum("Total_Premium_Paid").alias("Total_Premium_Paid"),
    )

    # Join the metrics back onto demographics so Date_of_Birth and
    # Total_Premium_Paid (needed in Step 4) and the customer attributes are
    # present in the result. Inner joins keep only customers that have both
    # a policy and a claim.
    aggregated_df = demographics_df.join(
        policy_metrics_df, "Customer_ID", "inner"
    ).join(claim_metrics_df, "Customer_ID", "inner")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Custom Calculations
# MAGIC Perform custom calculations on the aggregated data.

# COMMAND ----------

    logger.info("Performing custom calculations.")

    aggregated_df = (
        aggregated_df
        # Whole years between the birth date and today.
        .withColumn(
            "Age",
            F.floor(
                F.months_between(F.current_date(), F.col("Date_of_Birth")) / 12
            ).cast("int"),
        )
        # A zero or null premium yields 0 instead of a division error/null.
        .withColumn(
            "Claim_To_Premium_Ratio",
            F.when(
                F.col("Total_Premium_Paid") != 0,
                F.col("Average_Claim_Amount") / F.col("Total_Premium_Paid"),
            ).otherwise(0),
        )
        .withColumn(
            "Claims_Per_Policy",
            F.when(
                F.col("Policy_Count") != 0,
                F.col("Total_Claims") / F.col("Policy_Count"),
            ).otherwise(0),
        )
        # The next three columns are the same literal for every customer.
        # NOTE(review): presumably placeholders until real values are
        # available — confirm with the data owners.
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn(
            "Cross_Sell_Opportunities",
            F.lit("Multi-Policy Discount, Home Coverage Add-on"),
        )
        .withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Comprehensive Data Joining
# MAGIC Join with AI/ML insights and scores.

# COMMAND ----------

    logger.info("Joining with AI/ML insights and scores.")

    # Inner joins: customers missing from either table are dropped.
    final_df = aggregated_df.join(aiml_insights_df, "Customer_ID", "inner")
    final_df = final_df.join(scores_df, "Customer_ID", "inner")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 6: Output Generation
# MAGIC Write the final Customer 360 View to Unity Catalog.

# COMMAND ----------

    logger.info("Writing the final Customer 360 View to Unity Catalog.")

    # Overwrite mode replaces the table contents on every run.
    final_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.guardian.customer_360_view")

    logger.info("ETL process completed successfully.")

except Exception as e:
    # logger.exception records the traceback along with the message; the
    # error is re-raised so the caller / job scheduler sees the failure.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
