In [None]:
# Databricks notebook source
# COMMAND ----------
# Import necessary libraries
import pyspark.sql.functions as F
import logging
from pyspark.sql.functions import broadcast

# COMMAND ----------
# Logging setup: grab this module's logger, then ask the root logger to emit
# INFO and above (basicConfig does nothing if handlers are already installed).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------
try:
    # Customer 360 ETL: load five CSV extracts, join demographics -> policy ->
    # claims, attach per-customer claim aggregates and derived metrics, enrich
    # with scores and AI/ML insights, and overwrite the target Delta table.
    #
    # NOTE: the pipeline is kept in one piece without "# COMMAND ----------"
    # markers inside it. Databricks treats each marker as a cell boundary and
    # a try statement cannot span cells.

    # Step 1: Data Source Configuration
    logger.info("Loading data from CSV files into DataFrames.")
    source_dir = "tfs://dataeconomy-9k42/62457/uploads/62457/29aabe9d-c354-4176-8f98-2dd7a5fd7216"
    policy_df = spark.read.csv(f"{source_dir}/policy.csv", header=True, inferSchema=True)
    claims_df = spark.read.csv(f"{source_dir}/claims.csv", header=True, inferSchema=True)
    demographics_df = spark.read.csv(f"{source_dir}/demographics.csv", header=True, inferSchema=True)
    scores_df = spark.read.csv(f"{source_dir}/scores.csv", header=True, inferSchema=True)
    aiml_insights_df = spark.read.csv(f"{source_dir}/aiml_insights.csv", header=True, inferSchema=True)

    # Step 2: Data Selection and Filtering
    logger.info("Selecting relevant fields from demographics data.")
    selected_demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City", "State", "Postal_Code",
        "Date_of_Birth", "Gender", "Marital_Status", "Occupation", "Income_Level", "Customer_Segment"
    )

    # Step 3: Data Integration
    logger.info("Integrating demographics and policy data.")
    integrated_df = selected_demographics_df.join(
        policy_df, selected_demographics_df.Customer_ID == policy_df.customer_id, "inner"
    )
    integrated_df = integrated_df.drop(policy_df.customer_id)  # Remove duplicate key column

    logger.info("Integrating claims data.")
    # Step 4 groups claims_df by Customer_ID, so claims is expected to carry that
    # column too. Drop it before joining so the integrated frame keeps a single,
    # unambiguous Customer_ID for the name-based joins below. drop() by name is
    # a no-op when the column is absent.
    claims_detail_df = claims_df.drop("Customer_ID")
    integrated_df = integrated_df.join(
        claims_detail_df, integrated_df.policy_id == claims_detail_df.Policy_ID, "inner"
    )
    integrated_df = integrated_df.drop(claims_detail_df.Policy_ID)  # Remove duplicate key column

    # Step 4: Data Aggregation
    logger.info("Aggregating claims data.")
    aggregated_df = claims_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.countDistinct("policy_id").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount")
    )

    # Step 5: Custom Calculations
    logger.info("Performing custom calculations.")
    # Attach the per-customer aggregates first: Claims_Per_Policy below reads
    # Total_Claims and Policy_Count, which only exist on aggregated_df. A left
    # join is used so adding the metrics never removes integrated rows.
    final_df = integrated_df.join(aggregated_df, "Customer_ID", "left")
    # Age in (fractional) years. months_between / 12 is calendar-aware, unlike
    # dividing a day count by 365, which drifts with every leap year.
    final_df = final_df.withColumn(
        "Age",
        F.months_between(F.current_date(), F.to_date("Date_of_Birth", "yyyy-MM-dd")) / 12
    )
    # Guard both ratios against division by zero; a null denominator also falls
    # through to the otherwise(0) branch.
    final_df = final_df.withColumn(
        "Claim_To_Premium_Ratio",
        F.when(F.col("total_premium_paid") != 0, F.col("Claim_Amount") / F.col("total_premium_paid")).otherwise(0)
    )
    final_df = final_df.withColumn(
        "Claims_Per_Policy",
        F.when(F.col("Policy_Count") != 0, F.col("Total_Claims") / F.col("Policy_Count")).otherwise(0)
    )
    # The next three columns are constant literals applied to every row.
    final_df = final_df.withColumn("Retention_Rate", F.lit(0.85))
    final_df = final_df.withColumn("Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on"))
    final_df = final_df.withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))

    # Step 6: Comprehensive Data Consolidation
    logger.info("Consolidating all data sources for a complete customer profile.")
    customer_360_df = (
        final_df
        .join(broadcast(scores_df), "Customer_ID", "inner")
        .join(broadcast(aiml_insights_df), "Customer_ID", "inner")
    )

    # Step 7: Output Generation
    logger.info("Writing the final Customer 360 View to a Unity Catalog table.")
    customer_360_df.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.customer_360_view")

except Exception as e:
    # Top-level boundary: record the failure with its traceback, then re-raise
    # so the job run is marked as failed.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
