In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Insurance Data
# MAGIC This notebook performs an ETL process on insurance data using PySpark. It loads data from Unity Catalog tables, performs data transformations, and writes the final output back to Unity Catalog.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, DateType

# Configure logging
# basicConfig() is a no-op when the root logger already has handlers attached
# (which hosted notebook runtimes may arrange before user code runs), so it
# cannot be relied on to enable INFO output by itself.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Set the level on this logger explicitly so the INFO progress messages below
# are emitted regardless of how the root logger was pre-configured.
logger.setLevel(logging.INFO)

# COMMAND ----------

# Customer 360 ETL: load the insurance tables, join demographics with policy
# data and per-customer claim aggregates, derive the custom metrics, enrich
# with scores / AI-ML insights and overwrite the customer_360 Delta table.
#
# NOTE: the whole pipeline is kept in ONE notebook cell. Databricks splits a
# source file into cells at every "# COMMAND ----------" marker, so a
# try/except block must not contain such markers.
try:
    # Load data from Unity Catalog tables
    logger.info("Loading data from Unity Catalog tables...")
    policy_df = spark.table("catalog.insurance_db.policy_data")
    claims_df = spark.table("catalog.insurance_db.claims_data")
    demographics_df = spark.table("catalog.insurance_db.demographics_data")
    scores_df = spark.table("catalog.insurance_db.scores_data")
    aiml_insights_df = spark.table("catalog.insurance_db.aiml_insights_data")

    # Data Selection and Filtering
    logger.info("Selecting relevant fields from demographics data...")
    demographics_selected = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City", "State", "Postal_Code",
        # Explicit alias so the cast column is guaranteed to keep its name.
        F.col("Date_of_Birth").cast(DateType()).alias("Date_of_Birth"),
        "Gender", "Marital_Status", "Occupation", "Income_Level", "Customer_Segment"
    )

    # Data Integration
    logger.info("Joining demographics and policy data...")
    # An expression join keeps BOTH key columns. Drop the policy-side key so
    # the later joins on "Customer_ID" are unambiguous (Spark resolves column
    # names case-insensitively by default) and the Delta write does not see
    # duplicate column names.
    # No .cache(): this lineage is consumed by a single write action, so
    # caching would only pin executor memory that is never released.
    joined_df = demographics_selected.join(
        policy_df, demographics_selected.Customer_ID == policy_df.customer_id, "inner"
    ).drop(policy_df.customer_id)

    # Data Aggregation
    logger.info("Aggregating claims data...")
    # Helper column used only to compute Claim_To_Premium_Ratio; it is dropped
    # again below so the output schema is not extended.
    total_claim_amount_col = "_Total_Claim_Amount"
    aggregated_df = claims_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
        F.sum("Claim_Amount").alias(total_claim_amount_col),
    )

    # The claim aggregates must be joined in BEFORE the custom calculations,
    # because Total_Claims (and the claim amount) are produced by the
    # aggregation above and do not exist on the demographics/policy join.
    # NOTE(review): the inner joins here and below drop customers that have no
    # claims / scores / insights rows — confirm this is intended.
    enriched_df = joined_df.join(aggregated_df, "Customer_ID", "inner")

    # Custom Calculations
    logger.info("Calculating custom fields...")
    # NOTE(review): total_premium_paid and Policy_Count are presumably columns
    # of policy_data, and Claim_Amount is assumed to exist only in claims_data,
    # so the ratio is taken as (sum of claim amounts) / (total premium paid)
    # — confirm against the table schemas.
    final_df = enriched_df.withColumn(
        # Approximate age in (fractional) years; 365-day years ignore leap days.
        "Age", F.datediff(F.current_date(), "Date_of_Birth") / 365
    ).withColumn(
        "Claim_To_Premium_Ratio",
        F.when(
            F.col("total_premium_paid") != 0,
            F.col(total_claim_amount_col) / F.col("total_premium_paid"),
        ).otherwise(0)
    ).withColumn(
        "Claims_Per_Policy", F.when(F.col("Policy_Count") != 0, F.col("Total_Claims") / F.col("Policy_Count")).otherwise(0)
    ).withColumn(
        "Retention_Rate", F.lit(0.85)
    ).withColumn(
        "Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on")
    ).withColumn(
        "Upsell_Potential", F.lit("Premium Vehicle Coverage")
    ).drop(total_claim_amount_col)

    # Comprehensive Data Consolidation
    logger.info("Creating Customer 360 view...")
    customer_360_df = final_df.join(
        scores_df, "Customer_ID", "inner"
    ).join(
        aiml_insights_df, "Customer_ID", "inner"
    )

    # Output Handling
    logger.info("Writing the final output to Unity Catalog table...")
    customer_360_df.write.format("delta").mode("overwrite").saveAsTable("catalog.insurance_db.customer_360")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log with traceback, then re-raise so the notebook/job run is marked as
    # failed instead of silently reporting success.
    logger.exception("An error occurred during the ETL process")
    raise
