In [None]:
# Databricks notebook source
# Customer 360 ETL.
#
# Reads the policy, claims, demographics, scores and aiml_insights tables from
# the genai_demo.guardian schema, derives per-customer claim metrics, and
# overwrites genai_demo.guardian.customer_360 with the consolidated result.
#
# `spark` is the session object supplied by the notebook runtime.
import logging
from contextlib import contextmanager

# Namespaced import: importing `max` (and friends) by name would shadow the
# Python builtins for every later cell in the notebook.
from pyspark.sql import functions as F

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@contextmanager
def _etl_step(start_message, error_message):
    """Log the start of a pipeline step; log and re-raise any failure.

    Args:
        start_message: INFO message emitted before the step body runs.
        error_message: Prefix for the ERROR message if the body raises.

    Raises:
        Exception: whatever the step body raised, unchanged, after logging it
            together with its traceback.

    Note:
        Spark transformations are lazy, so for the transformation-only steps
        this mainly catches analysis-time errors (for example unresolved
        columns). Execution-time failures surface in the write step.
    """
    logger.info(start_message)
    try:
        yield
    except Exception as e:
        logger.exception("%s: %s", error_message, e)
        raise


# COMMAND ----------

# Step 1: Data Loading
with _etl_step("Loading data from Unity Catalog tables.", "Error loading data"):
    policy_df = spark.table("genai_demo.guardian.policy")
    claims_df = spark.table("genai_demo.guardian.claims")
    demographics_df = spark.table("genai_demo.guardian.demographics")
    scores_df = spark.table("genai_demo.guardian.scores")
    aiml_insights_df = spark.table("genai_demo.guardian.aiml_insights")

# COMMAND ----------

# Step 2: Data Selection and Type Conversion
with _etl_step("Performing type conversions on data.", "Error in type conversion"):
    demographics_df = demographics_df.withColumn(
        "Date_of_Birth", F.to_date(F.col("Date_of_Birth"), "yyyy-MM-dd")
    )
    claims_df = (
        claims_df
        .withColumn("Claim_Date", F.to_date(F.col("Claim_Date"), "yyyy-MM-dd"))
        .withColumn("Claim_Amount", F.col("Claim_Amount").cast("double"))
        .withColumn("Claim_Payout", F.col("Claim_Payout").cast("double"))
    )
    policy_df = (
        policy_df
        .withColumn("policy_start_date", F.to_date(F.col("policy_start_date"), "yyyy-MM-dd"))
        .withColumn("policy_end_date", F.to_date(F.col("policy_end_date"), "yyyy-MM-dd"))
        .withColumn("policy_term", F.col("policy_term").cast("int"))
        .withColumn("policy_premium", F.col("policy_premium").cast("double"))
        .withColumn("total_premium_paid", F.col("total_premium_paid").cast("double"))
    )

# COMMAND ----------

# Step 3: Data Integration
with _etl_step("Joining datasets to create a unified dataset.", "Error during data integration"):
    # Join on the column *name* instead of an equality expression: a name-based
    # join emits the key once, so later references to Customer_ID / policy_id
    # are not ambiguous. This relies on Spark's default case-insensitive
    # column resolution to match Customer_ID with customer_id.
    # NOTE(review): if spark.sql.caseSensitive is enabled, rename the key on
    # one side first.
    demographics_policy_df = demographics_df.join(policy_df, on="Customer_ID", how="inner")

    # Keep the join key and only the claim columns used by this pipeline. The
    # previous code dropped Policy_ID (the join key itself under
    # case-insensitive resolution) right before joining on it.
    # NOTE(review): assumes the claims table has no other columns needed
    # downstream - confirm.
    claim_facts_df = claims_df.select(
        "policy_id", "Claim_ID", "Claim_Date", "Claim_Amount", "Claim_Payout"
    )

    # Inner join: policies (and therefore customers) without any claim are not
    # carried forward, as in the original pipeline.
    integrated_df = demographics_policy_df.join(claim_facts_df, on="policy_id", how="inner")

# COMMAND ----------

# Step 4: Data Aggregation
with _etl_step("Aggregating data to compute metrics.", "Error during data aggregation"):
    # integrated_df holds one row per claim, so claim metrics aggregate directly.
    claim_metrics_df = integrated_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        # Distinct count: a plain count would equal the number of claim rows.
        F.countDistinct("policy_id").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
        F.sum("Claim_Amount").alias("Total_Claim_Amount"),
        # Carried through for the Age calculation in Step 5.
        # NOTE(review): assumes one Date_of_Birth per customer; max() is just a
        # deterministic way to pick it.
        F.max("Date_of_Birth").alias("Date_of_Birth"),
    )

    # total_premium_paid repeats on every claim row of a policy, so collapse to
    # one row per policy before summing to avoid double counting.
    # NOTE(review): assumes policy_id identifies a single policy row - confirm.
    premium_df = (
        integrated_df
        .select("Customer_ID", "policy_id", "total_premium_paid")
        .dropDuplicates(["Customer_ID", "policy_id"])
        .groupBy("Customer_ID")
        .agg(F.sum("total_premium_paid").alias("Total_Premium_Paid"))
    )

    aggregated_df = claim_metrics_df.join(premium_df, on="Customer_ID", how="inner")

# COMMAND ----------

# Step 5: Custom Calculations
with _etl_step("Performing custom calculations.", "Error during custom calculations"):
    # Whole years elapsed since birth; months_between follows the calendar, so
    # leap years do not skew the result the way a days / 365 division does.
    age_years = F.floor(
        F.months_between(F.current_date(), F.col("Date_of_Birth")) / 12
    ).cast("int")

    # A NULL or zero denominator falls through to 0 rather than dividing.
    claim_to_premium_ratio = F.when(
        F.col("Total_Premium_Paid") != 0,
        F.col("Total_Claim_Amount") / F.col("Total_Premium_Paid"),
    ).otherwise(F.lit(0.0))
    claims_per_policy = F.when(
        F.col("Policy_Count") != 0,
        F.col("Total_Claims") / F.col("Policy_Count"),
    ).otherwise(F.lit(0.0))

    calculated_df = (
        aggregated_df
        .withColumn("Age", age_years)
        .withColumn("Claim_To_Premium_Ratio", claim_to_premium_ratio)
        .withColumn("Claims_Per_Policy", claims_per_policy)
        # The next three columns are constants applied to every customer.
        # NOTE(review): these look like placeholders - confirm before relying
        # on them.
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn(
            "Cross_Sell_Opportunities",
            F.lit("Multi-Policy Discount, Home Coverage Add-on"),
        )
        .withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))
    )

# COMMAND ----------

# Step 6: Comprehensive Data Consolidation
with _etl_step(
    "Consolidating all processed data into a single dataset.",
    "Error during data consolidation",
):
    # Inner joins: customers missing from either table are dropped.
    # NOTE(review): the broadcast hints assume both tables are small enough to
    # ship to every executor, and that they share no column names other than
    # Customer_ID - confirm.
    final_df = (
        calculated_df
        .join(F.broadcast(aiml_insights_df), "Customer_ID", "inner")
        .join(F.broadcast(scores_df), "Customer_ID", "inner")
    )

# COMMAND ----------

# Step 7: Output Data
with _etl_step(
    "Writing the final dataset to a Unity Catalog table.",
    "Error writing output data",
):
    # This is the action that triggers execution of the whole lazy plan.
    final_df.write.format("delta").mode("overwrite").saveAsTable(
        "genai_demo.guardian.customer_360"
    )

logger.info("ETL process completed successfully.")
