In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Customer 360 Data
# MAGIC This notebook performs an ETL process to integrate and transform data from various Unity Catalog tables into a comprehensive Customer 360 view.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Notebook-wide logging: grab this module's logger, then ask the root logger
# to emit records at INFO level and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# Customer 360 ETL: load source tables, integrate them, derive per-customer and
# per-row metrics, enrich with AI/ML insights and scores, and overwrite the
# target Delta table.
#
# The whole pipeline is deliberately kept in ONE notebook cell. In a Databricks
# source-format notebook "# COMMAND ----------" marks a cell boundary, and a
# try/except statement cannot be split across cells, so no cell separators may
# appear between `try:` and the end of the `finally:` clause.

# Handle to the cached intermediate DataFrame so `finally` can always release it.
cached_integrated_df = None

try:
    # ------------------------------------------------------------------
    # Extract: load data from Unity Catalog tables
    # ------------------------------------------------------------------
    logger.info("Loading data from Unity Catalog tables.")
    demographics_df = spark.table("genai_demo.guardian.demographics")
    claims_df = spark.table("genai_demo.guardian.claims")
    policy_df = spark.table("genai_demo.guardian.policy")
    aiml_insights_df = spark.table("genai_demo.guardian.aiml_insights")
    scores_df = spark.table("genai_demo.guardian.scores")

    # ------------------------------------------------------------------
    # Select relevant fields
    # ------------------------------------------------------------------
    logger.info("Selecting relevant fields from demographics and claims data.")
    selected_demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City", "State", "Postal_Code",
        "Date_of_Birth", "Gender", "Marital_Status", "Occupation", "Income_Level", "Customer_Segment"
    )
    selected_claims_df = claims_df.select(
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type", "Claim_Status", "Claim_Amount", "Claim_Payout"
    )

    # ------------------------------------------------------------------
    # Data integration
    # ------------------------------------------------------------------
    logger.info("Joining demographics and policy data on Customer_ID.")
    demographics_policy_df = selected_demographics_df.join(policy_df, "Customer_ID", "inner")

    # Joining on a column *name* keeps a single copy of the join key in the
    # result; any other column names shared by both sides are NOT deduplicated.
    logger.info("Joining the result with claims data on Policy_ID.")
    integrated_df = demographics_policy_df.join(selected_claims_df, "Policy_ID", "inner")

    # The integrated data feeds both the aggregation and the row-level
    # enrichment below, so cache it; it is released in the `finally` clause.
    cached_integrated_df = integrated_df.cache()

    # ------------------------------------------------------------------
    # Data aggregation: one row of metrics per customer
    # ------------------------------------------------------------------
    logger.info("Aggregating data to calculate metrics.")
    aggregated_df = integrated_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.countDistinct("Policy_ID").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount")
    )

    # ------------------------------------------------------------------
    # Custom calculations
    # ------------------------------------------------------------------
    logger.info("Performing custom calculations.")
    enriched_df = (
        integrated_df
        # Columns of `aggregated_df` can only be referenced after they have been
        # joined onto the rows; a left join so this step never drops rows itself.
        # NOTE(review): assumes the source tables do not already carry columns
        # named Total_Claims / Policy_Count / Recent_Claim_Date /
        # Average_Claim_Amount - confirm against the policy table schema.
        .join(aggregated_df, "Customer_ID", "left")
        # Whole years elapsed since birth. months_between / 12 is used instead
        # of days / 365 so that leap years do not skew the result.
        .withColumn(
            "Age",
            F.floor(F.months_between(F.current_date(), F.col("Date_of_Birth")) / 12).cast("int")
        )
        # Total_Premium_Paid is not among the explicitly selected columns;
        # presumably it comes from the policy table - TODO confirm.
        # A zero or NULL premium yields a ratio of 0 rather than a division error.
        .withColumn(
            "Claim_To_Premium_Ratio",
            F.when(
                F.col("Total_Premium_Paid") != 0,
                F.col("Claim_Amount") / F.col("Total_Premium_Paid")
            ).otherwise(0)
        )
        .withColumn(
            "Claims_Per_Policy",
            F.when(
                F.col("Policy_Count") != 0,
                F.col("Total_Claims") / F.col("Policy_Count")
            ).otherwise(0)
        )
        # The three columns below are hard-coded constants applied to every row.
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn("Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on"))
        .withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))
    )

    # ------------------------------------------------------------------
    # Final data integration
    # ------------------------------------------------------------------
    logger.info("Combining all data with AI/ML insights and scores.")
    final_df = (
        enriched_df
        .join(F.broadcast(aiml_insights_df), "Customer_ID", "inner")
        .join(F.broadcast(scores_df), "Customer_ID", "inner")
    )

    # ------------------------------------------------------------------
    # Load: write to the Unity Catalog target table
    # ------------------------------------------------------------------
    logger.info("Writing final output to Unity Catalog target table.")
    final_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.guardian.customer_360")

except Exception as e:
    # logger.exception records the traceback alongside the message before the
    # error is re-raised to fail the notebook run.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
finally:
    # Release the cached intermediate data whether the run succeeded or failed.
    if cached_integrated_df is not None:
        cached_integrated_df.unpersist()
