In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Guardian Data
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables, integrating and transforming it into a comprehensive dataset.

# COMMAND ----------

import logging
from pyspark.sql.functions import col, to_date, count, avg, max, when, datediff, current_date, lit, broadcast

# Initialize logging
# basicConfig() does nothing when the root logger already has handlers
# (NOTE(review): managed notebook runtimes may pre-configure the root logger
# -- confirm for this cluster), so the level is also set on this module's own
# logger to make sure the INFO progress messages below are not filtered out.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# COMMAND ----------

# ETL pipeline: load -> type-convert -> join -> aggregate -> derive -> write.
#
# The whole pipeline is kept in a single notebook cell. Databricks treats each
# "# COMMAND ----------" line as a cell boundary, and a try/except statement
# cannot span cells, so no cell separators appear inside this block.
#
# Extra aggregate functions used only by this block. `sum` is aliased so the
# Python builtin is not shadowed.
from pyspark.sql.functions import countDistinct, first, sum as spark_sum

try:
    # Load data from Unity Catalog tables
    logger.info("Loading data from Unity Catalog tables...")
    policy_df = spark.table("genai_demo.guardian.policy")
    claims_df = spark.table("genai_demo.guardian.claims")
    demographics_df = spark.table("genai_demo.guardian.demographics")
    scores_df = spark.table("genai_demo.guardian.scores")
    aiml_insights_df = spark.table("genai_demo.guardian.aiml_insights")

    # Data Selection and Type Conversion
    # Dates are parsed with the "yyyy-MM-dd" pattern; monetary columns are cast
    # to double and policy_term to int.
    logger.info("Selecting relevant fields and performing type conversions...")
    demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City", "State", "Postal_Code",
        to_date(col("Date_of_Birth"), "yyyy-MM-dd").alias("Date_of_Birth"), "Gender", "Marital_Status",
        "Occupation", "Income_Level", "Customer_Segment"
    )

    claims_df = claims_df.select(
        "Claim_ID", "Policy_ID", to_date(col("Claim_Date"), "yyyy-MM-dd").alias("Claim_Date"),
        "Claim_Type", "Claim_Status", col("Claim_Amount").cast("double").alias("Claim_Amount"),
        col("Claim_Payout").cast("double").alias("Claim_Payout")
    )

    policy_df = policy_df.select(
        "policy_id", "customer_id", "policy_type", "policy_status",
        to_date(col("policy_start_date"), "yyyy-MM-dd").alias("policy_start_date"),
        to_date(col("policy_end_date"), "yyyy-MM-dd").alias("policy_end_date"),
        col("policy_term").cast("int").alias("policy_term"),
        col("policy_premium").cast("double").alias("policy_premium"),
        col("total_premium_paid").cast("double").alias("total_premium_paid"),
        "renewal_status", "policy_addons"
    )

    # Data Integration
    # Inner joins: only customers that have at least one policy with at least
    # one claim survive. The result has one row per (customer, policy, claim).
    # It is consumed by a single aggregation below, so it is not cached.
    logger.info("Performing data integration through joins...")
    joined_df = demographics_df.join(policy_df, demographics_df.Customer_ID == policy_df.customer_id, "inner") \
                               .drop(policy_df.customer_id) \
                               .join(claims_df, "policy_id", "inner")

    # Data Aggregation
    logger.info("Aggregating claims data...")
    # Claim-level metrics per customer (`max` here is the Spark aggregate
    # imported at the top of the file, not the Python builtin).
    # Date_of_Birth is a customer attribute repeated on every joined row; it is
    # carried through the aggregation because the Age calculation needs it.
    claims_summary_df = joined_df.groupBy("Customer_ID").agg(
        count("Claim_ID").alias("Total_Claims"),
        max("Claim_Date").alias("Recent_Claim_Date"),
        avg("Claim_Amount").alias("Average_Claim_Amount"),
        first("Date_of_Birth", ignorenulls=True).alias("Date_of_Birth"),
    )

    # Policy-level metrics per customer, computed from the policy table rather
    # than joined_df: the claims join repeats each policy once per claim, which
    # would inflate both the policy count and the premium total.
    # NOTE(review): this counts every policy the customer holds, including
    # policies without claims -- confirm that is the intended denominator.
    policy_summary_df = policy_df.groupBy(col("customer_id").alias("Customer_ID")).agg(
        countDistinct("policy_id").alias("Policy_Count"),
        spark_sum("total_premium_paid").alias("total_premium_paid"),
    )

    aggregated_df = claims_summary_df.join(policy_summary_df, "Customer_ID", "inner")

    # Custom Calculations
    # Age is in fractional years; 365.25 accounts for leap days.
    # Both ratios fall back to 0 when the denominator is 0 or null.
    # Retention_Rate, Cross_Sell_Opportunities and Upsell_Potential are
    # constants: every row receives the same literal value.
    logger.info("Implementing custom calculations...")
    final_df = aggregated_df.withColumn("Age", datediff(current_date(), col("Date_of_Birth")) / 365.25) \
                            .withColumn("Claim_To_Premium_Ratio", when(col("total_premium_paid") != 0, col("Average_Claim_Amount") / col("total_premium_paid")).otherwise(0)) \
                            .withColumn("Claims_Per_Policy", when(col("Policy_Count") != 0, col("Total_Claims") / col("Policy_Count")).otherwise(0)) \
                            .withColumn("Retention_Rate", lit(0.85)) \
                            .withColumn("Cross_Sell_Opportunities", lit("Multi-Policy Discount, Home Coverage Add-on")) \
                            .withColumn("Upsell_Potential", lit("Premium Vehicle Coverage"))

    # Data Consolidation
    # Inner joins again: customers missing from either table are dropped.
    # NOTE(review): broadcast() is a hint to ship the whole table to every
    # executor -- confirm scores and aiml_insights are small enough for that.
    logger.info("Consolidating data into a final comprehensive dataset...")
    final_df = final_df.join(broadcast(scores_df), "Customer_ID", "inner") \
                       .join(broadcast(aiml_insights_df), "Customer_ID", "inner")

    # Output Data
    # mode("overwrite") replaces the existing contents of the target table.
    logger.info("Writing the final dataset to Unity Catalog table...")
    final_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.guardian.customer_360")

    logger.info("ETL process completed successfully.")

except Exception as e:
    # Top-level boundary: log with the full traceback, then re-raise so the
    # notebook run is marked as failed.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
