In [None]:
# Databricks notebook source
# COMMAND ----------
# Import necessary libraries
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DateType, DoubleType, IntegerType

# Logging setup: obtain this module's logger, then make sure the root logger
# emits records at INFO level and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------
try:
    # Customer-360 ETL: load the source tables, join demographics -> policies
    # -> claims, aggregate to one row per customer, derive a few metrics and
    # overwrite the target Delta table. Any failure is logged and re-raised so
    # the notebook/job run is marked as failed.

    # Step 1: Data Ingestion from Unity Catalog Tables
    logger.info("Loading data from Unity Catalog tables...")
    policy_df = spark.table("catalog.source_db.policy_data")
    claims_df = spark.table("catalog.source_db.claims_data")
    demographics_df = spark.table("catalog.source_db.demographics_data")
    # NOTE(review): the two frames below are not used anywhere in this block.
    # They are kept in case later cells reference them; confirm and remove if not.
    scores_df = spark.table("catalog.source_db.scores_data")
    aiml_insights_df = spark.table("catalog.source_db.aiml_insights_data")

    # Step 2: Data Selection
    # Every cast column gets an explicit alias so later steps can rely on the
    # column name instead of Spark's implicit naming of cast expressions.
    logger.info("Selecting relevant fields from each dataset...")
    selected_demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address",
        "City", "State", "Postal_Code",
        F.col("Date_of_Birth").cast(DateType()).alias("Date_of_Birth"),
        "Gender", "Marital_Status", "Occupation", "Income_Level",
        "Customer_Segment",
    )

    selected_claims_df = claims_df.select(
        "Claim_ID", "Policy_ID",
        F.col("Claim_Date").cast(DateType()).alias("Claim_Date"),
        "Claim_Type", "Claim_Status",
        F.col("Claim_Amount").cast(DoubleType()).alias("Claim_Amount"),
        F.col("Claim_Payout").cast(DoubleType()).alias("Claim_Payout"),
    )

    selected_policy_df = policy_df.select(
        "policy_id", "customer_id", "policy_type", "policy_status",
        F.col("policy_start_date").cast(DateType()).alias("policy_start_date"),
        F.col("policy_end_date").cast(DateType()).alias("policy_end_date"),
        F.col("policy_term").cast(IntegerType()).alias("policy_term"),
        F.col("policy_premium").cast(DoubleType()).alias("policy_premium"),
        F.col("total_premium_paid").cast(DoubleType()).alias("total_premium_paid"),
        "renewal_status", "policy_addons",
    )

    # Step 3: Data Integration
    # Join on a shared key *name* so each key appears once in the result.
    # Joining on an expression keeps both Customer_ID/customer_id and
    # policy_id/Policy_ID, which makes later references ambiguous under
    # Spark's default case-insensitive column resolution.
    logger.info("Merging datasets based on common identifiers...")
    policy_keyed_df = selected_policy_df.withColumnRenamed("customer_id", "Customer_ID")
    claims_keyed_df = selected_claims_df.withColumnRenamed("Policy_ID", "policy_id")
    # NOTE(review): inner joins drop customers without policies and policies
    # without claims, so Policy_Count below only counts policies that have at
    # least one claim. Confirm this is the intended population.
    joined_df = (
        selected_demographics_df
        .join(policy_keyed_df, on="Customer_ID", how="inner")
        .join(claims_keyed_df, on="policy_id", how="inner")
    )

    # Step 4: Data Aggregation
    logger.info("Computing aggregate metrics...")
    aggregated_df = joined_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.countDistinct("policy_id").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
        # Helper columns (underscore-prefixed) feed Step 5 and are dropped
        # before the write; the original lost these inputs in the groupBy.
        F.sum("Claim_Amount").alias("_total_claim_amount"),
        F.first("Date_of_Birth", ignorenulls=True).alias("_date_of_birth"),
    )

    # Premium is summed from the policy data directly: in joined_df each policy
    # row is repeated once per claim, which would multiply the premium.
    premium_df = policy_keyed_df.groupBy("Customer_ID").agg(
        F.sum("total_premium_paid").alias("_total_premium_paid")
    )
    enriched_df = aggregated_df.join(premium_df, on="Customer_ID", how="left")

    # Step 5: Custom Calculations
    logger.info("Implementing custom calculations...")
    # Whole years between date of birth and today; months_between accounts for
    # leap years, unlike dividing a day count by 365.
    age_col = F.floor(
        F.months_between(F.current_date(), F.col("_date_of_birth")) / 12
    ).cast(IntegerType())
    # A null or zero premium falls through to 0 (a null comparison is not true).
    claim_to_premium_col = F.when(
        F.col("_total_premium_paid") != 0,
        F.col("_total_claim_amount") / F.col("_total_premium_paid"),
    ).otherwise(F.lit(0.0))
    claims_per_policy_col = F.when(
        F.col("Policy_Count") != 0,
        F.col("Total_Claims") / F.col("Policy_Count"),
    ).otherwise(F.lit(0.0))

    final_df = (
        enriched_df
        .withColumn("Age", age_col)
        .withColumn("Claim_To_Premium_Ratio", claim_to_premium_col)
        .withColumn("Claims_Per_Policy", claims_per_policy_col)
        # NOTE(review): the three values below are hard-coded constants applied
        # to every customer, not derived from data.
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn("Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on"))
        .withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))
        .drop("_total_claim_amount", "_total_premium_paid", "_date_of_birth")
    )

    # Step 6: Output Generation
    logger.info("Writing the final DataFrame to Unity Catalog table...")
    final_df.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.customer_360_output")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log with traceback, then re-raise: swallowing the error would let the
    # run finish "successfully" without the output table being written.
    logger.exception("An error occurred during the ETL process")
    raise
