In [None]:
"# Databricks notebook source\n# COMMAND ----------\n# %md\n# # ETL Process for Customer 360 Data\n# This notebook performs an ETL process to create a comprehensive Customer 360 dataset using data from Unity Catalog tables.\n\n# COMMAND ----------\n#
\nimport logging\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import count, avg, max, when, datediff, current_date, lit, broadcast\n\n# Configure logging\nlogging.basicConfig(level=logging.INFO)\nlogger = logging.getLogger(__name__)\n\n# Assume the Spark session is pre-initialized as 'spark'\n\n# COMMAND ----------\n#
def load_data():
    """Read the five source tables from Unity Catalog.

    Relies on the notebook-provided ``spark`` session.

    Returns:
        A 5-tuple of DataFrames in the order
        ``(policy, claims, demographics, scores, aiml_insights)``.
    """
    logger.info("Loading data from Unity Catalog tables")
    # Order matters: callers unpack the result positionally.
    source_tables = (
        "catalog.source_db.policy",
        "catalog.source_db.claims",
        "catalog.source_db.demographics",
        "catalog.source_db.scores",
        "catalog.source_db.aiml_insights",
    )
    return tuple(spark.table(table_name) for table_name in source_tables)

# COMMAND ----------
def select_and_filter_data(demographics_df, claims_df, policy_df):
    """Project each source DataFrame down to the columns the pipeline uses.

    Despite the name, no row filtering is applied here; only column
    selection.

    Args:
        demographics_df: Customer demographics DataFrame.
        claims_df: Claims DataFrame.
        policy_df: Policy DataFrame.

    Returns:
        A 3-tuple ``(demographics, claims, policy)`` of projected DataFrames.
    """
    logger.info("Selecting relevant fields from each dataset")

    demographics_columns = [
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address",
        "City", "State", "Postal_Code", "Date_of_Birth", "Gender",
        "Marital_Status", "Occupation", "Income_Level", "Customer_Segment",
    ]
    claims_columns = [
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type",
        "Claim_Status", "Claim_Amount", "Claim_Payout",
    ]
    policy_columns = [
        "policy_id", "customer_id", "policy_type", "policy_status",
        "policy_start_date", "policy_end_date", "policy_term",
        "policy_premium", "total_premium_paid", "renewal_status",
        "policy_addons",
    ]

    return (
        demographics_df.select(*demographics_columns),
        claims_df.select(*claims_columns),
        policy_df.select(*policy_columns),
    )

# COMMAND ----------
def integrate_data(selected_demographics_df, selected_claims_df, selected_policy_df):
    """Join demographics, policies and claims into one row per claim.

    Demographics are inner-joined to policies on the customer key, and the
    result is inner-joined to claims on the policy key. Because both joins
    are inner joins, customers without a policy and policies without a claim
    do not appear in the output.

    The right-hand join keys (``customer_id`` from policies, ``Policy_ID``
    from claims) are dropped after each join. Spark resolves column names
    case-insensitively by default, so leaving both ``Customer_ID`` and
    ``customer_id`` (or ``policy_id`` and ``Policy_ID``) in the result makes
    any later by-name reference to those columns ambiguous.

    Args:
        selected_demographics_df: Demographics with a ``Customer_ID`` column.
        selected_claims_df: Claims with a ``Policy_ID`` column.
        selected_policy_df: Policies with ``policy_id`` and ``customer_id``.

    Returns:
        The joined DataFrame, marked for caching, with a single
        ``Customer_ID`` and a single ``policy_id`` column.
    """
    logger.info("Joining datasets based on key identifiers")

    customer_key_matches = (
        selected_demographics_df["Customer_ID"] == selected_policy_df["customer_id"]
    )
    policy_key_matches = (
        selected_policy_df["policy_id"] == selected_claims_df["Policy_ID"]
    )

    joined_df = (
        selected_demographics_df
        .join(selected_policy_df, customer_key_matches, "inner")
        # Keep only the demographics-side customer key.
        .drop(selected_policy_df["customer_id"])
        .join(selected_claims_df, policy_key_matches, "inner")
        # Keep only the policy-side policy key.
        .drop(selected_claims_df["Policy_ID"])
    )
    return joined_df.cache()

# COMMAND ----------
def aggregate_data(joined_df):
    """Compute per-customer aggregate metrics from the joined claim rows.

    Expects one row per claim, with a single unambiguous ``Customer_ID`` and
    ``policy_id`` column, plus ``Claim_ID``, ``Claim_Date``, ``Claim_Amount``,
    ``Date_of_Birth`` and ``total_premium_paid``.

    Args:
        joined_df: Customer/policy/claim rows, one row per claim.

    Returns:
        One row per ``Customer_ID`` with the columns ``Total_Claims``,
        ``Policy_Count`` (distinct policies), ``Recent_Claim_Date``,
        ``Average_Claim_Amount``, ``Total_Claim_Amount``, ``Date_of_Birth``
        and ``Total_Premium_Paid``.
    """
    # Local import keeps this block independent of the file-level import list.
    from pyspark.sql import functions as F

    logger.info("Computing aggregate metrics")

    claims_per_customer_df = joined_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        # A policy repeats once per claim row, so it must be counted distinctly;
        # a plain count would always equal Total_Claims.
        F.countDistinct("policy_id").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
        F.sum("Claim_Amount").alias("Total_Claim_Amount"),
        # Date of birth is a per-customer attribute; max() just carries it
        # through the aggregation deterministically.
        F.max("Date_of_Birth").alias("Date_of_Birth"),
    )

    # total_premium_paid is a per-policy value repeated on every claim row.
    # Reduce to one row per policy before summing so it is not multiplied by
    # the number of claims on that policy.
    premium_per_customer_df = (
        joined_df.select("Customer_ID", "policy_id", "total_premium_paid")
        .dropDuplicates(["Customer_ID", "policy_id"])
        .groupBy("Customer_ID")
        .agg(F.sum("total_premium_paid").alias("Total_Premium_Paid"))
    )

    return claims_per_customer_df.join(premium_per_customer_df, "Customer_ID", "left")

# COMMAND ----------

def derive_insights(aggregated_df):
    """Add derived per-customer metrics to the aggregated DataFrame.

    Expects the columns ``Date_of_Birth``, ``Total_Claim_Amount``,
    ``Total_Premium_Paid``, ``Total_Claims`` and ``Policy_Count``.

    Added columns:
        Age: whole years between ``Date_of_Birth`` and the current date.
        Claim_To_Premium_Ratio: total claim amount / total premium paid,
            or 0 when the premium is zero or null.
        Claims_Per_Policy: total claims / distinct policy count, or 0 when
            the policy count is zero or null.
        Retention_Rate, Cross_Sell_Opportunities, Upsell_Potential:
            constant literals applied to every row.

    Args:
        aggregated_df: One row per customer with the columns listed above.

    Returns:
        ``aggregated_df`` with the derived columns appended.
    """
    # Local import keeps this block independent of the file-level import list.
    from pyspark.sql import functions as F

    logger.info("Deriving additional insights")

    # months_between / 12 gives calendar years; dividing a day count by 365
    # drifts with leap years and yields a fractional age.
    age_in_years = F.floor(
        F.months_between(F.current_date(), F.col("Date_of_Birth")) / 12
    ).cast("int")

    claim_to_premium_ratio = F.when(
        F.col("Total_Premium_Paid") != 0,
        F.col("Total_Claim_Amount") / F.col("Total_Premium_Paid"),
    ).otherwise(0.0)

    claims_per_policy = F.when(
        F.col("Policy_Count") != 0,
        F.col("Total_Claims") / F.col("Policy_Count"),
    ).otherwise(0.0)

    return (
        aggregated_df
        .withColumn("Age", age_in_years)
        .withColumn("Claim_To_Premium_Ratio", claim_to_premium_ratio)
        .withColumn("Claims_Per_Policy", claims_per_policy)
        # The three columns below are fixed values, not computed from data.
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn(
            "Cross_Sell_Opportunities",
            F.lit("Multi-Policy Discount, Home Coverage Add-on"),
        )
        .withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))
    )

# COMMAND ----------
def assemble_data(final_df, scores_df, aiml_insights_df):
    """Attach scores and AI/ML insights to the per-customer dataset.

    Both lookups are inner-joined on ``Customer_ID`` with a broadcast hint,
    so customers missing from either ``scores_df`` or ``aiml_insights_df``
    are excluded from the result.

    Args:
        final_df: Per-customer DataFrame with derived metrics.
        scores_df: Scores DataFrame keyed by ``Customer_ID``.
        aiml_insights_df: AI/ML insights DataFrame keyed by ``Customer_ID``.

    Returns:
        The combined DataFrame.
    """
    logger.info("Combining all transformed data into a single dataset")
    with_scores_df = final_df.join(broadcast(scores_df), "Customer_ID", "inner")
    with_insights_df = with_scores_df.join(
        broadcast(aiml_insights_df), "Customer_ID", "inner"
    )
    return with_insights_df

# COMMAND ----------
def write_output(comprehensive_df):
    """Persist the final dataset as a Delta table in Unity Catalog.

    The target table ``catalog.target_db.customer_360`` is written in
    ``overwrite`` mode.

    Args:
        comprehensive_df: The assembled Customer 360 DataFrame.
    """
    logger.info("Writing the final dataset to Unity Catalog table")
    target_table = "catalog.target_db.customer_360"
    delta_writer = comprehensive_df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(target_table)

# COMMAND ----------
def main():
    """Run the Customer 360 ETL pipeline end to end.

    Steps: load source tables, select columns, join, aggregate per customer,
    derive metrics, attach scores/insights, and write the result table.

    Raises:
        Exception: Any error raised by a pipeline step is logged with its
            traceback and then re-raised, so the calling job or notebook run
            is marked as failed instead of silently succeeding.
    """
    try:
        # Step 1: Data Ingestion
        policy_df, claims_df, demographics_df, scores_df, aiml_insights_df = load_data()

        # Step 2: Data Selection and Filtering
        selected_demographics_df, selected_claims_df, selected_policy_df = (
            select_and_filter_data(demographics_df, claims_df, policy_df)
        )

        # Step 3: Data Integration
        joined_df = integrate_data(
            selected_demographics_df, selected_claims_df, selected_policy_df
        )

        # Step 4: Data Aggregation
        aggregated_df = aggregate_data(joined_df)

        # Step 5: Custom Calculations
        final_df = derive_insights(aggregated_df)

        # Step 6: Comprehensive Data Assembly
        comprehensive_df = assemble_data(final_df, scores_df, aiml_insights_df)

        # Step 7: Output Data
        write_output(comprehensive_df)

    except Exception:
        # Log with traceback, then propagate: swallowing the error here would
        # let a failed run finish as if it had succeeded.
        logger.exception("An error occurred during the ETL process")
        raise

# COMMAND ----------
# Entry point: run the pipeline only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()
