In [None]:
# Databricks notebook source
import logging
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, datediff, current_date, broadcast

# COMMAND ----------

# Module-level logger named after this module; root logging is configured
# at INFO so the progress messages emitted by the ETL steps are shown.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

try:
    # End-to-end ETL: read the five source tables, join demographics ->
    # policy -> claims, aggregate claims per customer, derive metrics, enrich
    # with AI/ML insights and scores, then overwrite the customer_360 table.
    # Any failure is logged with its traceback and re-raised.

    # Load data from Unity Catalog tables
    logger.info("Loading data from Unity Catalog tables...")
    policy_df = spark.table("genai_demo.guardian.policy")
    claims_df = spark.table("genai_demo.guardian.claims")
    demographics_df = spark.table("genai_demo.guardian.demographics")
    scores_df = spark.table("genai_demo.guardian.scores")
    aiml_insights_df = spark.table("genai_demo.guardian.aiml_insights")

    # COMMAND ----------

    # Select relevant fields from demographics data
    logger.info("Selecting relevant fields from demographics data...")
    selected_demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address",
        "City", "State", "Postal_Code", "Date_of_Birth", "Gender",
        "Marital_Status", "Occupation", "Income_Level", "Customer_Segment"
    )

    # COMMAND ----------

    # Join demographics with policy data on Customer_ID
    logger.info("Joining demographics with policy data on Customer_ID...")
    joined_df = selected_demographics_df.join(
        policy_df, "Customer_ID", "inner"
    )

    # COMMAND ----------

    # Drop from claims_df every column that already exists in joined_df so the
    # next join does not produce duplicate column names. The join key
    # (Policy_ID) is deliberately kept: it has to be present on both sides of
    # the join below, and dropping it would make that join unresolvable.
    joined_columns = set(joined_df.columns)
    duplicate_columns = [
        name
        for name in claims_df.columns
        if name in joined_columns and name != "Policy_ID"
    ]
    if duplicate_columns:
        claims_df = claims_df.drop(*duplicate_columns)

    # COMMAND ----------

    # Join with claims data on Policy_ID
    logger.info("Joining with claims data on Policy_ID...")
    joined_df = joined_df.join(
        claims_df, "Policy_ID", "inner"
    )

    # COMMAND ----------

    # Aggregate data to calculate total claims, policy count, and average claim amount
    # (plus the most recent claim date), one row per Customer_ID.
    logger.info("Aggregating data for total claims, policy count, and average claim amount...")
    aggregated_df = joined_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.countDistinct("Policy_ID").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount")
    )

    # COMMAND ----------

    # Implement custom calculations
    # NOTE(review): joining the per-customer aggregates back onto joined_df
    # keeps joined_df's row granularity, so the aggregate values repeat on
    # every joined row of a customer. If one row per customer is intended for
    # the target table, this join needs revisiting -- confirm with the owner.
    logger.info("Implementing custom calculations...")
    final_df = aggregated_df.join(joined_df, "Customer_ID").withColumn(
        # Approximate, fractional age in years: day difference divided by 365
        # (leap days are not accounted for).
        "Age", datediff(current_date(), col("Date_of_Birth")) / 365
    ).withColumn(
        # Guard against division by zero; the ratio falls back to 0.
        "Claim_To_Premium_Ratio", when(col("Total_Premium_Paid") != 0, col("Average_Claim_Amount") / col("Total_Premium_Paid")).otherwise(0)
    ).withColumn(
        "Claims_Per_Policy", when(col("Policy_Count") != 0, col("Total_Claims") / col("Policy_Count")).otherwise(0)
    ).withColumn(
        # The next three columns are literal constants, identical on every row.
        "Retention_Rate", F.lit(0.85)
    ).withColumn(
        "Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on")
    ).withColumn(
        "Upsell_Potential", F.lit("Premium Vehicle Coverage")
    )

    # COMMAND ----------

    # Join with AI/ML insights and scores
    # Both lookup tables are given a broadcast hint -- presumably they are
    # small enough for that; verify their size.
    logger.info("Joining with AI/ML insights and scores...")
    enriched_df = final_df.join(
        broadcast(aiml_insights_df), "Customer_ID", "inner"
    ).join(
        broadcast(scores_df), "Customer_ID", "inner"
    )

    # COMMAND ----------

    # Write the final DataFrame to a Unity Catalog target table
    # (Delta format, replacing any existing contents).
    logger.info("Writing the final DataFrame to a Unity Catalog target table...")
    enriched_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.guardian.customer_360")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log the failure with its traceback, then re-raise so the error reaches
    # the caller instead of the run ending as if it had succeeded.
    logger.exception("An error occurred during the ETL process")
    raise
