# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Customer 360 View
# MAGIC This notebook performs an ETL process to create a comprehensive customer 360 view by integrating data from various sources.

# COMMAND ----------

import logging
from pyspark.sql.functions import count, max, avg, datediff, current_date, when, lit, broadcast

# Configure logging.
# basicConfig() does nothing when the root logger already has handlers
# (presumably the case in hosted notebook runtimes -- confirm for this
# cluster), so the level is also set directly on this module's logger to make
# sure the INFO progress messages are not filtered out.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Assume the Spark session is already initialized as 'spark'

# COMMAND ----------

def main():
    """Build the customer 360 view and write it to Unity Catalog.

    Reads the claims, demographics, policy, scores and AI/ML insights tables
    from ``genai_demo.guardian``, derives per-customer claim and policy
    metrics, joins them with the demographic, score and insight data, and
    overwrites ``genai_demo.guardian.customer_360_view`` (Delta format).

    Relies on the notebook-provided global ``spark`` session.

    Raises:
        Exception: any failure is logged with its traceback and re-raised so
            the calling notebook or job is reported as failed instead of
            finishing silently without having written the table.
    """
    # NOTE: keep this function in a single notebook cell. In a Databricks
    # source-format notebook every `# COMMAND ----------` line is a cell
    # boundary, so separators inside the body would split the `def`/`try`
    # across cells.

    # Imported locally under an alias so Spark's max()/sum() do not shadow the
    # Python builtins and the block does not depend on extra file-level imports.
    from pyspark.sql import functions as F

    try:
        # Load data from Unity Catalog tables
        logger.info("Loading data from Unity Catalog tables...")
        claims_df = spark.table("genai_demo.guardian.claims")
        demographics_df = spark.table("genai_demo.guardian.demographics")
        policy_df = spark.table("genai_demo.guardian.policy")
        scores_df = spark.table("genai_demo.guardian.scores")
        aiml_insights_df = spark.table("genai_demo.guardian.aiml_insights")

        # Select only the fields that are used downstream
        logger.info("Selecting necessary fields...")
        selected_demographics_df = demographics_df.select(
            "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address",
            "City", "State", "Postal_Code", "Date_of_Birth", "Gender",
            "Marital_Status", "Occupation", "Income_Level", "Customer_Segment",
        )
        selected_claims_df = claims_df.select(
            "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type",
            "Claim_Status", "Claim_Amount", "Claim_Payout",
        )
        selected_policy_df = policy_df.select(
            "Customer_ID", "Policy_ID", "Total_Premium_Paid"
        )

        # Data Integration: attach each claim to its customer through the policy.
        # Only the key columns of the policy table are carried into this join so
        # that per-policy values are not repeated once per claim.
        logger.info("Joining data...")
        customer_claims_df = selected_policy_df.select(
            "Customer_ID", "Policy_ID"
        ).join(selected_claims_df, "Policy_ID")

        # Data Aggregation
        logger.info("Aggregating data...")
        claim_stats_df = customer_claims_df.groupBy("Customer_ID").agg(
            F.count("Claim_ID").alias("Total_Claims"),
            F.max("Claim_Date").alias("Recent_Claim_Date"),
            F.avg("Claim_Amount").alias("Average_Claim_Amount"),
            F.sum("Claim_Amount").alias("Total_Claim_Amount"),
        )
        # Policy metrics are aggregated from the policy table itself. Counting
        # Policy_ID on the claim-level join would count claim rows, which makes
        # Claims_Per_Policy always 1 and repeats premiums once per claim.
        # Assumes one row per policy in the policy table -- TODO confirm.
        policy_stats_df = selected_policy_df.groupBy("Customer_ID").agg(
            F.countDistinct("Policy_ID").alias("Policy_Count"),
            F.sum("Total_Premium_Paid").alias("Total_Premium_Paid"),
        )

        # Inner joins: only customers that have demographics, at least one
        # policy and at least one claim are kept (same as the original join
        # chain). Assumes Customer_ID is unique in demographics -- TODO confirm.
        profile_df = selected_demographics_df.join(
            claim_stats_df, "Customer_ID"
        ).join(policy_stats_df, "Customer_ID")

        # Custom Calculations
        logger.info("Performing custom calculations...")
        premium_paid = F.col("Total_Premium_Paid")
        policy_count = F.col("Policy_Count")
        # Whole years between the birth date and today; months_between avoids
        # the leap-year drift of dividing a day count by 365.
        age_in_years = F.floor(
            F.months_between(F.current_date(), F.col("Date_of_Birth")) / 12
        ).cast("int")
        final_df = (
            profile_df
            .withColumn("Age", age_in_years)
            .withColumn(
                "Claim_To_Premium_Ratio",
                F.when(
                    premium_paid != 0,
                    F.col("Total_Claim_Amount") / premium_paid,
                ).otherwise(F.lit(0.0)),
            )
            .withColumn(
                "Claims_Per_Policy",
                F.when(
                    policy_count != 0,
                    F.col("Total_Claims") / policy_count,
                ).otherwise(F.lit(0.0)),
            )
            # The next three columns are hard-coded placeholder values, not
            # derived from the data.
            .withColumn("Retention_Rate", F.lit(0.85))
            .withColumn(
                "Cross_Sell_Opportunities",
                F.lit("Multi-Policy Discount, Home Coverage Add-on"),
            )
            .withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))
            # Helper totals were only needed to compute the ratio above.
            .drop("Total_Claim_Amount", "Total_Premium_Paid")
        )

        # Join with additional data sources: AI/ML insights and scores.
        # The broadcast hints assume both tables are small enough to ship to
        # every executor -- TODO confirm table sizes.
        logger.info("Joining with additional data sources...")
        comprehensive_df = final_df.join(
            F.broadcast(scores_df), "Customer_ID"
        ).join(F.broadcast(aiml_insights_df), "Customer_ID")

        # Output Data Source: save the final dataset to a Unity Catalog table
        logger.info("Writing the final dataset to Unity Catalog...")
        comprehensive_df.write.format("delta").mode("overwrite").saveAsTable(
            "genai_demo.guardian.customer_360_view"
        )

        logger.info("ETL process completed successfully.")

    except Exception:
        # Log with traceback, then propagate so the run is reported as failed.
        logger.exception("An error occurred during the ETL process")
        raise

# COMMAND ----------

# Run the ETL only when this file is executed as the main program, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
