In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Guardian Data
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables, including data selection, integration, aggregation, and output to a Delta table.

# COMMAND ----------

import logging

import pyspark.sql.functions as F

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# COMMAND ----------

# The whole pipeline lives in ONE cell on purpose: a `try` block cannot span
# `# COMMAND ----------` delimiters, because each delimiter starts a new,
# independently parsed notebook cell.
try:
    # Step 1: Data Source Configuration
    # `spark` is the session object supplied by the notebook runtime.
    logger.info("Loading data from Unity Catalog tables.")
    policy_df = spark.table("genai_demo.guardian.policy")
    claims_df = spark.table("genai_demo.guardian.claims")
    demographics_df = spark.table("genai_demo.guardian.demographics")
    scores_df = spark.table("genai_demo.guardian.scores")
    aiml_insights_df = spark.table("genai_demo.guardian.aiml_insights")

    # Step 2: Data Selection and Filtering
    logger.info("Selecting relevant fields from demographics data.")
    demographics_selected = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address",
        "City", "State", "Postal_Code", "Date_of_Birth", "Gender",
        "Marital_Status", "Occupation", "Income_Level", "Customer_Segment",
    )

    # Step 3: Data Integration
    logger.info("Joining datasets on common identifiers.")
    # Drop Customer_ID from claims so the joined frame has a single,
    # unambiguous Customer_ID column (the one from demographics/policy).
    claims_df = claims_df.drop("Customer_ID")
    # Both joins are inner joins: customers without a policy, and policies
    # without a claim, do not appear in joined_df.
    joined_df = (
        demographics_selected
        .join(policy_df, "Customer_ID")
        .join(claims_df, "Policy_ID")
    )

    # Step 4: Data Aggregation
    logger.info("Aggregating data to calculate metrics.")
    # groupBy/agg keeps ONLY the grouping key and the aggregates listed here,
    # so every column Step 5 reads must be carried through explicitly.
    claim_metrics_df = joined_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.countDistinct("policy_id").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
        # NOTE(review): assumes one demographics row per Customer_ID, so every
        # row in a group carries the same Date_of_Birth - confirm.
        F.first("Date_of_Birth").alias("Date_of_Birth"),
    )
    # joined_df has one row per claim, so a policy-level value is repeated for
    # every claim on that policy. Collapse to one row per policy before
    # summing so the premium is not multiplied by the claim count.
    # NOTE(review): assumes total_premium_paid is a per-policy column present
    # in joined_df (policy or claims table) - confirm against the schema.
    premium_df = (
        joined_df
        .select("Customer_ID", "Policy_ID", "total_premium_paid")
        .dropDuplicates(["Customer_ID", "Policy_ID"])
        .groupBy("Customer_ID")
        .agg(F.sum("total_premium_paid").alias("total_premium_paid"))
    )
    # Both frames derive from joined_df, so they hold the same Customer_IDs.
    aggregated_df = claim_metrics_df.join(premium_df, "Customer_ID")

    # Step 5: Custom Calculations
    logger.info("Performing custom calculations.")
    final_df = aggregated_df.withColumn(
        # Fractional age in years; the 365-day divisor ignores leap days.
        "Age", F.datediff(F.current_date(), F.to_date(F.col("Date_of_Birth"))) / 365
    ).withColumn(
        # Guard against division by zero; a zero or NULL premium yields 0.
        "Claim_To_Premium_Ratio",
        F.when(
            F.col("total_premium_paid") != 0,
            F.col("Average_Claim_Amount") / F.col("total_premium_paid"),
        ).otherwise(0),
    ).withColumn(
        "Claims_Per_Policy",
        F.when(
            F.col("Policy_Count") != 0,
            F.col("Total_Claims") / F.col("Policy_Count"),
        ).otherwise(0),
    ).withColumn(
        # The next three columns are constant literals applied to every row.
        "Retention_Rate", F.lit(0.85)
    ).withColumn(
        "Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on")
    ).withColumn(
        "Upsell_Potential", F.lit("Premium Vehicle Coverage")
    )

    # Step 6: Comprehensive Data Assembly
    logger.info("Combining all processed data into a single dataset.")
    # Inner joins: only customers present in both scores and aiml_insights
    # reach the output.
    customer_360_df = (
        final_df
        .join(scores_df, "Customer_ID")
        .join(aiml_insights_df, "Customer_ID")
    )

    # Step 7: Output Data
    logger.info("Writing the final dataset to Unity Catalog as a Delta table.")
    customer_360_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.guardian.customer_360")

    logger.info("ETL process completed successfully.")

except Exception as e:
    # Top-level boundary: log with the full traceback, then re-raise so the
    # notebook/job run is marked as failed.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
