In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Customer 360 Data
# MAGIC This notebook performs an ETL process to create a Customer 360 dataset using data from Unity Catalog tables.

# COMMAND ----------

import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, avg, max, when, col, datediff, current_date, lit, broadcast

# Create this notebook's named logger, then make sure INFO-level records
# (the per-step progress messages) are emitted via the root configuration.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Data Ingestion
# MAGIC Load data from Unity Catalog tables.

# COMMAND ----------

# Module alias for Spark SQL functions; imported here so this pipeline does not
# depend on (or further shadow builtins through) the file-level name imports.
from pyspark.sql import functions as F


def load_source_tables():
    """Read the five source tables from Unity Catalog.

    Returns:
        tuple: ``(policy_df, claims_df, demographics_df, scores_df,
        aiml_insights_df)``.
    """
    logger.info("Loading data from Unity Catalog tables")
    return (
        spark.table("catalog.source_db.policy"),
        spark.table("catalog.source_db.claims"),
        spark.table("catalog.source_db.demographics"),
        spark.table("catalog.source_db.scores"),
        spark.table("catalog.source_db.aiml_insights"),
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Data Selection and Filtering
# MAGIC Select relevant fields from each dataset.

# COMMAND ----------

def select_relevant_fields(demographics_df, claims_df, policy_df):
    """Project each source DataFrame down to the columns the pipeline uses.

    Args:
        demographics_df: Customer demographics source table.
        claims_df: Claims source table.
        policy_df: Policy source table.

    Returns:
        tuple: ``(selected_demographics_df, selected_claims_df,
        selected_policy_df)``.
    """
    logger.info("Selecting relevant fields from each dataset")
    selected_demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City", "State", "Postal_Code",
        "Date_of_Birth", "Gender", "Marital_Status", "Occupation", "Income_Level", "Customer_Segment"
    )
    selected_claims_df = claims_df.select(
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type", "Claim_Status", "Claim_Amount", "Claim_Payout"
    )
    selected_policy_df = policy_df.select(
        "policy_id", "customer_id", "policy_type", "policy_status", "policy_start_date", "policy_end_date",
        "policy_term", "policy_premium", "total_premium_paid", "renewal_status", "policy_addons"
    )
    return selected_demographics_df, selected_claims_df, selected_policy_df

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Data Integration
# MAGIC Join datasets based on key identifiers.

# COMMAND ----------

def join_datasets(selected_demographics_df, selected_policy_df, selected_claims_df):
    """Inner-join demographics to policies to claims (one row per claim).

    The right-hand join keys are renamed before joining and dropped afterwards.
    Without this the result carries both ``Customer_ID``/``customer_id`` and
    ``policy_id``/``Policy_ID``; Spark resolves column names case-insensitively
    by default, so any later by-name reference to them is ambiguous.

    Args:
        selected_demographics_df: Demographics projection (``Customer_ID`` key).
        selected_policy_df: Policy projection (``customer_id``, ``policy_id``).
        selected_claims_df: Claims projection (``Policy_ID`` key).

    Returns:
        DataFrame with a single ``Customer_ID`` and a single ``policy_id``
        column plus the remaining selected columns. Customers without a
        policy, and policies without a claim, are excluded (inner joins).
    """
    logger.info("Joining datasets based on key identifiers")
    policies = selected_policy_df.withColumnRenamed("customer_id", "policy_customer_id")
    claims = selected_claims_df.withColumnRenamed("Policy_ID", "claim_policy_id")
    return (
        selected_demographics_df
        .join(policies, selected_demographics_df["Customer_ID"] == policies["policy_customer_id"], "inner")
        .join(claims, policies["policy_id"] == claims["claim_policy_id"], "inner")
        .drop("policy_customer_id", "claim_policy_id")
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Data Aggregation
# MAGIC Compute aggregate metrics.

# COMMAND ----------

def aggregate_customer_metrics(joined_df):
    """Roll the claim-level join up to one row per customer.

    Args:
        joined_df: Output of ``join_datasets`` (one row per claim).

    Returns:
        DataFrame keyed by ``Customer_ID`` with ``Total_Claims``,
        ``Policy_Count``, ``Recent_Claim_Date``, ``Average_Claim_Amount``,
        ``Total_Claim_Amount`` and ``Total_Premium_Paid``.
    """
    logger.info("Computing aggregate metrics")
    claim_metrics_df = joined_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        # Distinct count: a plain count() here would count claim rows, making
        # Policy_Count equal to Total_Claims. Only policies that have at least
        # one claim are present in joined_df.
        F.countDistinct("policy_id").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
        F.sum("Claim_Amount").alias("Total_Claim_Amount"),
    )
    # total_premium_paid is a policy-level value repeated on every claim row,
    # so reduce to one row per policy before summing to avoid double counting.
    premium_df = (
        joined_df.select("Customer_ID", "policy_id", "total_premium_paid")
        .dropDuplicates(["Customer_ID", "policy_id"])
        .groupBy("Customer_ID")
        .agg(F.sum("total_premium_paid").alias("Total_Premium_Paid"))
    )
    return claim_metrics_df.join(premium_df, "Customer_ID", "inner")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Custom Calculations
# MAGIC Derive additional insights.

# COMMAND ----------

def derive_insights(aggregated_df, selected_demographics_df):
    """Attach demographics to the customer metrics and add derived columns.

    The aggregation output only contains ``Customer_ID`` and the metrics, so
    the demographics are joined back in here to make ``Date_of_Birth`` (and the
    other customer attributes) available.

    Args:
        aggregated_df: Output of ``aggregate_customer_metrics``.
        selected_demographics_df: Demographics projection keyed by
            ``Customer_ID``.

    Returns:
        DataFrame with the demographics, the aggregate metrics, and ``Age``,
        ``Claim_To_Premium_Ratio``, ``Claims_Per_Policy``, ``Retention_Rate``,
        ``Cross_Sell_Opportunities`` and ``Upsell_Potential``.
    """
    logger.info("Deriving additional insights")
    customer_df = selected_demographics_df.join(aggregated_df, "Customer_ID", "inner")
    return (
        customer_df
        # Approximate age in (fractional) years: day difference over 365.
        .withColumn("Age", F.datediff(F.current_date(), F.col("Date_of_Birth")) / 365)
        # Customer-level ratio; a null or zero premium falls through to 0.
        .withColumn(
            "Claim_To_Premium_Ratio",
            F.when(
                F.col("Total_Premium_Paid") != 0,
                F.col("Total_Claim_Amount") / F.col("Total_Premium_Paid"),
            ).otherwise(0),
        )
        .withColumn(
            "Claims_Per_Policy",
            F.when(F.col("Policy_Count") != 0, F.col("Total_Claims") / F.col("Policy_Count")).otherwise(0),
        )
        # The three columns below are fixed literals, identical for every row.
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn("Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on"))
        .withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 6: Final Data Assembly
# MAGIC Combine all transformed data into a single dataset.

# COMMAND ----------

def assemble_customer_360(final_df, scores_df, aiml_insights_df):
    """Join the scores and AI/ML insights onto the customer dataset.

    Args:
        final_df: Output of ``derive_insights``.
        scores_df: Scores source table, joined on ``Customer_ID``.
        aiml_insights_df: AI/ML insights source table, joined on
            ``Customer_ID``.

    Returns:
        DataFrame containing only customers present in all three inputs
        (inner joins).
    """
    logger.info("Combining all transformed data into a single dataset")
    # NOTE(review): the broadcast hints assume scores_df and aiml_insights_df
    # are small enough to ship to every executor - confirm their sizes.
    return (
        final_df
        .join(F.broadcast(scores_df), "Customer_ID", "inner")
        .join(F.broadcast(aiml_insights_df), "Customer_ID", "inner")
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 7: Output Data
# MAGIC Write the final dataset to Unity Catalog table.

# COMMAND ----------

def write_customer_360(final_customer_360_df):
    """Overwrite the Customer 360 Delta table with the given DataFrame.

    Args:
        final_customer_360_df: Fully assembled dataset to persist.
    """
    logger.info("Writing the final dataset to Unity Catalog table")
    final_customer_360_df.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.customer_360")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Run the Pipeline
# MAGIC Execute the steps defined above in order.

# COMMAND ----------

# The whole run lives in one cell so the try/except/finally is syntactically
# complete when the file is split into notebook cells.
joined_df = None
try:
    policy_df, claims_df, demographics_df, scores_df, aiml_insights_df = load_source_tables()
    selected_demographics_df, selected_claims_df, selected_policy_df = select_relevant_fields(
        demographics_df, claims_df, policy_df
    )
    joined_df = join_datasets(selected_demographics_df, selected_policy_df, selected_claims_df)

    # The aggregation step reads joined_df twice, so cache it.
    joined_df.cache()

    aggregated_df = aggregate_customer_metrics(joined_df)
    final_df = derive_insights(aggregated_df, selected_demographics_df)
    final_customer_360_df = assemble_customer_360(final_df, scores_df, aiml_insights_df)
    write_customer_360(final_customer_360_df)

    logger.info("ETL process completed successfully")
except Exception:
    logger.exception("An error occurred during the ETL process")
    # Re-raise so the notebook/job run is marked as failed instead of
    # silently reporting success.
    raise
finally:
    # Release the cached join whether or not the run succeeded.
    if joined_df is not None:
        joined_df.unpersist()
