In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Customer 360 View
# MAGIC This notebook performs an ETL process to consolidate data from various sources into a comprehensive customer 360 view.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.functions import current_date, datediff, col

# Logging setup: a module-named logger for this notebook, with the root
# logger configured to emit INFO and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# Customer 360 ETL: load source tables, join and aggregate claims per customer,
# attach scores / AI-ML insights, and overwrite the target Delta table.
#
# The whole pipeline is a single try/except statement, so it has to live in one
# notebook cell. Stages are therefore separated with plain comments rather than
# `# COMMAND ----------` markers, which would split the statement across cells.
try:
    # ---- Load -------------------------------------------------------------
    logger.info("Loading data from Unity Catalog tables...")
    policy_df = spark.table("catalog.db.policy_data")
    claims_df = spark.table("catalog.db.claims_data")
    demographics_df = spark.table("catalog.db.demographics_data")
    scores_df = spark.table("catalog.db.scores_data")
    aiml_insights_df = spark.table("catalog.db.aiml_insights_data")

    # ---- Select -----------------------------------------------------------
    # Keep only the demographic attributes listed below.
    logger.info("Selecting relevant fields from demographics data...")
    demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address",
        "City", "State", "Postal_Code", "Date_of_Birth", "Gender",
        "Marital_Status", "Occupation", "Income_Level", "Customer_Segment",
    )

    # ---- Join -------------------------------------------------------------
    # Inner joins: only customers with at least one policy that has at least
    # one claim survive into joined_df.
    logger.info("Joining datasets based on common identifiers...")
    joined_df = (
        demographics_df
        .join(
            policy_df,
            demographics_df["Customer_ID"] == policy_df["customer_id"],
            "inner",
        )
        .join(
            claims_df,
            policy_df["policy_id"] == claims_df["Policy_ID"],
            "inner",
        )
    )

    # ---- Aggregate --------------------------------------------------------
    # Both sides of each join key are still present in joined_df
    # (Customer_ID/customer_id and policy_id/Policy_ID). Spark resolves column
    # names case-insensitively by default, so a bare string such as
    # "Customer_ID" is ambiguous here. Reference the columns through the
    # DataFrame they came from instead.
    logger.info("Aggregating data to compute metrics...")
    aggregated_df = joined_df.groupBy(demographics_df["Customer_ID"]).agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.countDistinct(policy_df["policy_id"]).alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
    )

    # ---- Custom calculations ----------------------------------------------
    logger.info("Performing custom calculations...")

    # NOTE(review): Age is added to demographics_df after the join above was
    # built, and demographics_df is not used again below, so Age never reaches
    # final_df. Confirm whether demographics (with Age) should be joined into
    # the final view.
    demographics_df = demographics_df.withColumn(
        "Age",
        (datediff(current_date(), col("Date_of_Birth")) / 365.25).cast("int"),
    )

    # NOTE(review): Claim_To_Premium_Ratio is added to joined_df after
    # aggregated_df was derived from it, and joined_df is not used again below,
    # so this per-row ratio never reaches final_df either. Confirm intent.
    joined_df = joined_df.withColumn(
        "Claim_To_Premium_Ratio",
        F.when(
            joined_df.total_premium_paid != 0,
            joined_df.Claim_Amount / joined_df.total_premium_paid,
        ).otherwise(0),
    )

    # Claims per distinct policy, guarded against division by zero.
    aggregated_df = aggregated_df.withColumn(
        "Claims_Per_Policy",
        F.when(
            aggregated_df.Policy_Count != 0,
            aggregated_df.Total_Claims / aggregated_df.Policy_Count,
        ).otherwise(0),
    )

    # The next three columns are literal constants, identical for every row.
    aggregated_df = aggregated_df.withColumn("Retention_Rate", F.lit(0.85))
    aggregated_df = aggregated_df.withColumn(
        "Cross_Sell_Opportunities",
        F.lit("Multi-Policy Discount, Home Coverage Add-on"),
    )
    aggregated_df = aggregated_df.withColumn(
        "Upsell_Potential", F.lit("Premium Vehicle Coverage")
    )

    # ---- Consolidate ------------------------------------------------------
    # Inner joins on Customer_ID: customers missing from either scores_df or
    # aiml_insights_df are dropped from the final view.
    logger.info("Consolidating all data sources into a single view...")
    final_df = (
        aggregated_df
        .join(scores_df, "Customer_ID", "inner")
        .join(aiml_insights_df, "Customer_ID", "inner")
    )

    # ---- Write ------------------------------------------------------------
    # mode("overwrite") replaces the existing contents of the target table.
    logger.info("Writing the final DataFrame to Unity Catalog as a Delta table...")
    final_df.write.format("delta").mode("overwrite").saveAsTable("catalog.db.customer_360_view")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log the full traceback, then re-raise so the notebook/job run is marked
    # as failed instead of finishing "successfully" without writing the table.
    logger.exception("An error occurred during the ETL process")
    raise
