In [0]:
# Databricks notebook source
# MAGIC %md 
# MAGIC # ETL Process for Customer 360 Data - demo
# MAGIC This notebook performs an ETL process to create a comprehensive Customer 360 view by integrating data from various sources.

# COMMAND ----------


import logging
from pyspark.sql import functions as F

# Module-level logger used by every step of the ETL below.
logger = logging.getLogger(__name__)
# Ask the root logger to let INFO-and-above records through so the
# progress messages of each step are not filtered out.
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# Customer 360 ETL: load the source tables, join them, derive per-customer
# metrics and display the result.
#
# The whole pipeline is kept in ONE notebook cell on purpose: a `try`/`except`
# cannot be split across `# COMMAND ----------` separators, because every
# separator starts a new cell and a cell holding only `try:` (or starting with
# indented code) does not compile on its own.
#
# NOTE(review): `spark` is not defined in this file; presumably it is the
# SparkSession provided by the notebook runtime - confirm when running elsewhere.

# Lets the `finally` clause tell whether the cached join was ever created.
full_joined_df = None

try:
    # Step 1: Data Source Configuration
    logger.info("Loading data from Unity Catalog tables.")
    claims_df = spark.table("genai_demo.guardian.claims")
    demographics_df = spark.table("genai_demo.guardian.demographics")
    policy_df = spark.table("genai_demo.guardian.policy")
    scores_df = spark.table("genai_demo.guardian.scores")
    aiml_insights_df = spark.table("genai_demo.guardian.aiml_insights")

    # Step 2: Data Transformation
    # Field Selection: keep only the columns used downstream. The scores and
    # AI/ML insight tables are joined later with all of their columns.
    logger.info("Selecting relevant fields from each DataFrame.")
    demographics_selected = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City", "State", "Postal_Code",
        "Date_of_Birth", "Gender", "Marital_Status", "Occupation", "Income_Level", "Customer_Segment"
    )
    claims_selected = claims_df.select(
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type", "Claim_Status", "Claim_Amount", "Claim_Payout"
    )
    policy_selected = policy_df.select(
        "Policy_ID", "Customer_ID", "Policy_Type", "Policy_Status", "Policy_Start_Date", "Policy_End_Date",
        "Policy_Term", "Policy_Premium", "Total_Premium_Paid", "Renewal_Status", "Policy_Addons"
    )

    # Data Integration: customer -> policy -> claim. Both joins are inner, so
    # the result has one row per claim and customers without any policy or
    # claim do not appear.
    logger.info("Joining datasets based on key identifiers.")
    # Used exactly once below, so caching it would only cost memory.
    demographics_policy_joined = demographics_selected.join(policy_selected, "Customer_ID", "inner")
    full_joined_df = demographics_policy_joined.join(claims_selected, "Policy_ID", "inner")
    # Read twice (aggregation + join back to the aggregates), hence cached.
    full_joined_df.cache()

    # Data Aggregation: per-customer claim statistics.
    logger.info("Computing aggregate metrics.")
    aggregated_df = full_joined_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        # Rows are one-per-claim here, so a plain count of Policy_ID would just
        # repeat Total_Claims; count each policy once instead.
        F.countDistinct("Policy_ID").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount")
    )
    # Attach the customer-level aggregates back onto every claim row.
    full_joined_df_1 = full_joined_df.join(aggregated_df, "Customer_ID", "inner")

    # Custom Calculations
    logger.info("Deriving additional metrics.")
    calculated_df = (
        full_joined_df_1
        # Age in completed years; month-based arithmetic avoids the drift of
        # dividing a day count by 365 and yields a whole number.
        .withColumn(
            "Age",
            F.expr("CAST(FLOOR(MONTHS_BETWEEN(current_date(), Date_of_Birth) / 12) AS INT)"),
        )
        # The CASE guards protect against division by zero; a NULL denominator
        # also falls through to the ELSE branch and yields 0.
        .withColumn(
            "Claim_To_Premium_Ratio",
            F.expr("CASE WHEN Total_Premium_Paid != 0 THEN Claim_Amount/Total_Premium_Paid ELSE 0 END"),
        )
        .withColumn(
            "Claims_Per_Policy",
            F.expr("CASE WHEN Policy_Count != 0 THEN Total_Claims/Policy_Count ELSE 0 END"),
        )
        # The three columns below are fixed literals, identical for every row.
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn("Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on"))
        .withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))
    )

    # Comprehensive Data Assembly
    # NOTE(review): the broadcast hints assume both lookup tables are small
    # enough to ship to every executor - confirm against real table sizes.
    # Inner joins: customers missing from either table are dropped.
    logger.info("Integrating AI/ML insights and scores.")
    final_df = (
        calculated_df
        .join(F.broadcast(aiml_insights_df), "Customer_ID", "inner")
        .join(F.broadcast(scores_df), "Customer_ID", "inner")
    )

    # Step 3: Output Data
    logger.info("Writing the final DataFrame to a Unity Catalog table.")
    # The demo only displays the result; the Delta write is left disabled.
    final_df.show()
    # final_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.guardian.customer_360")

except Exception as e:
    # Top-level boundary: record the failure together with its traceback,
    # then re-raise so the notebook run is still marked as failed.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
finally:
    # Release the cached join whether the pipeline succeeded or failed.
    if full_joined_df is not None:
        full_joined_df.unpersist()
