In [None]:
"# Databricks notebook source\n# COMMAND ----------\n# MAGIC %md\n# MAGIC # ETL Process for Customer 360 Data\n# MAGIC This notebook performs an ETL process to create a comprehensive Customer 360 dataset using data from Unity Catalog tables.\n\n# COMMAND ----------\n# MAGIC
# Import necessary libraries
import logging
from pyspark.sql import functions as F
# NOTE: importing `max` (and `count`) by bare name shadows the Python builtins
# of the same name for the rest of this notebook; later cells that call
# `max(...)` get the Spark column function, not the builtin.
from pyspark.sql.functions import expr, datediff, current_date, count, max, avg

# Initialize logging
# NOTE(review): basicConfig does nothing when the root logger already has
# handlers - confirm INFO messages actually show up in this runtime.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by every ETL stage defined below.
logger = logging.getLogger(__name__)

# Assume the Spark session is already initialized as 'spark'

# COMMAND ----------
# MAGIC
def load_data():
    """Read the five source tables from Unity Catalog.

    Relies on the notebook-provided ``spark`` session and the module-level
    ``logger``.

    Returns:
        A 5-tuple of DataFrames, in this order: policy, claims,
        demographics, scores, AI/ML insights.

    Raises:
        Exception: whatever ``spark.table`` raises is logged and re-raised
            unchanged.
    """
    try:
        logger.info("Loading data from Unity Catalog tables")
        # Order matters: callers unpack the result positionally.
        source_tables = (
            "catalog.source_db.policycsv",
            "catalog.source_db.claimscsv",
            "catalog.source_db.demographicscsv",
            "catalog.source_db.scorescsv",
            "catalog.source_db.aiml_insightscsv",
        )
        return tuple(spark.table(table_name) for table_name in source_tables)
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        raise

# COMMAND ----------
# MAGIC
def select_fields(policy_df, claims_df, demographics_df, scores_df, aiml_insights_df):
    """Project each source DataFrame down to the columns the ETL uses.

    Args:
        policy_df: Policy table.
        claims_df: Claims table.
        demographics_df: Customer demographics table.
        scores_df: Customer scores table.
        aiml_insights_df: AI/ML insights table.

    Returns:
        A 5-tuple of projected DataFrames, in this order: demographics,
        claims, policy, scores, AI/ML insights. Note that this differs from
        the argument order.

    Raises:
        Exception: any error raised by ``select`` is logged and re-raised.
    """
    demographics_columns = [
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address",
        "City", "State", "Postal_Code", "Date_of_Birth", "Gender",
        "Marital_Status", "Occupation", "Income_Level", "Customer_Segment",
    ]
    claims_columns = [
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type", "Claim_Status",
        "Claim_Amount", "Claim_Payout",
    ]
    policy_columns = [
        "policy_id", "customer_id", "policy_type", "policy_status",
        "policy_start_date", "policy_end_date", "policy_term",
        "policy_premium", "total_premium_paid", "renewal_status",
        "policy_addons",
    ]
    scores_columns = [
        "Customer_ID", "Credit_Score", "Fraud_Score", "Customer_Risk_Score",
    ]
    insights_columns = [
        "Customer_ID", "Churn_Probability", "Next_Best_Offer",
        "Claims_Fraud_Probability", "Revenue_Potential",
    ]

    try:
        logger.info("Selecting relevant fields from each dataset")
        demographics_view = demographics_df.select(*demographics_columns)
        claims_view = claims_df.select(*claims_columns)
        policy_view = policy_df.select(*policy_columns)
        scores_view = scores_df.select(*scores_columns)
        insights_view = aiml_insights_df.select(*insights_columns)
        return demographics_view, claims_view, policy_view, scores_view, insights_view
    except Exception as e:
        logger.error(f"Error selecting fields: {e}")
        raise

# COMMAND ----------
# MAGIC
def transform_data(selected_demographics_df, selected_claims_df, selected_policy_df):
    """Join demographics, policies and claims, then attach per-customer claim stats.

    The result keeps one row per joined claim. Each row carries the
    demographics, policy and claim columns plus four customer-level
    aggregates: ``Total_Claims``, ``Policy_Count``, ``Recent_Claim_Date`` and
    ``Average_Claim_Amount``. Both joins are inner joins, so customers
    without a policy, and policies without a claim, do not appear.

    The policy-side ``customer_id`` and the claims-side ``Policy_ID`` key
    columns are dropped after their joins; the surviving keys are
    ``Customer_ID`` (demographics) and ``policy_id`` (policy).

    Args:
        selected_demographics_df: Demographics projection with ``Customer_ID``.
        selected_claims_df: Claims projection with ``Policy_ID``.
        selected_policy_df: Policy projection with ``policy_id`` and
            ``customer_id``.

    Returns:
        DataFrame whose first columns are ``Customer_ID`` and the four
        aggregates, followed by the remaining joined columns.

    Raises:
        Exception: any Spark error is logged and re-raised.
    """
    try:
        logger.info("Performing data transformations and joins")

        # The key columns differ only by letter case ("Customer_ID" vs
        # "customer_id", "policy_id" vs "Policy_ID"). Spark resolves names
        # case-insensitively by default, so a bare F.col(...) on either name
        # is ambiguous in the joined frame. Qualify each key through the
        # DataFrame that owns it and drop the redundant copy right away, so
        # later by-name references (groupBy, the join back, the Delta write)
        # see exactly one column per key.
        customer_policies_df = selected_demographics_df.join(
            selected_policy_df,
            selected_demographics_df["Customer_ID"] == selected_policy_df["customer_id"],
            "inner",
        ).drop(selected_policy_df["customer_id"])

        customer_claims_df = customer_policies_df.join(
            selected_claims_df,
            customer_policies_df["policy_id"] == selected_claims_df["Policy_ID"],
            "inner",
        ).drop(selected_claims_df["Policy_ID"])

        summarized_df = customer_claims_df.groupBy("Customer_ID").agg(
            F.count("Claim_ID").alias("Total_Claims"),
            # Each joined row is one claim, so a plain count of policy_id
            # would always equal Total_Claims. Count distinct policies.
            F.countDistinct("policy_id").alias("Policy_Count"),
            F.max("Claim_Date").alias("Recent_Claim_Date"),
            F.avg("Claim_Amount").alias("Average_Claim_Amount"),
        )

        # Attach the customer-level aggregates back onto the claim-level rows.
        final_joined_df = summarized_df.join(customer_claims_df, "Customer_ID", "inner")
        return final_joined_df
    except Exception as e:
        logger.error(f"Error transforming data: {e}")
        raise

# COMMAND ----------
# MAGIC
def calculate_custom_metrics(final_joined_df):
    """Append derived and placeholder metric columns to the joined dataset.

    Columns added, in this order:
        * ``Age`` - days between today and ``Date_of_Birth`` divided by 365
          (fractional; leap days are not accounted for).
        * ``Claim_To_Premium_Ratio`` - ``Claim_Amount / total_premium_paid``
          when the premium is positive, otherwise 0.
        * ``Claims_Per_Policy`` - ``Total_Claims / Policy_Count`` when the
          policy count is positive, otherwise 0.
        * ``Retention_Rate``, ``Cross_Sell_Opportunities``,
          ``Upsell_Potential`` - the same constant literal on every row.

    Args:
        final_joined_df: DataFrame providing ``Date_of_Birth``,
            ``Claim_Amount``, ``total_premium_paid``, ``Total_Claims`` and
            ``Policy_Count``.

    Returns:
        The input DataFrame with the six columns above appended.

    Raises:
        Exception: any Spark error is logged and re-raised.
    """
    try:
        logger.info("Calculating custom metrics")
        # (column name, column expression) pairs, applied strictly in order.
        metric_columns = [
            ("Age", expr("datediff(current_date(), Date_of_Birth) / 365")),
            (
                "Claim_To_Premium_Ratio",
                expr("CASE WHEN total_premium_paid > 0 THEN Claim_Amount / total_premium_paid ELSE 0 END"),
            ),
            (
                "Claims_Per_Policy",
                expr("CASE WHEN Policy_Count > 0 THEN Total_Claims / Policy_Count ELSE 0 END"),
            ),
            ("Retention_Rate", F.lit(0.85)),
            ("Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Addon")),
            ("Upsell_Potential", F.lit("Premium Vehicle Coverage")),
        ]

        enriched_df = final_joined_df
        for metric_name, metric_value in metric_columns:
            enriched_df = enriched_df.withColumn(metric_name, metric_value)
        return enriched_df
    except Exception as e:
        logger.error(f"Error calculating custom metrics: {e}")
        raise

# COMMAND ----------
# MAGIC
def integrate_data(final_df, selected_scores_df, selected_aiml_insights_df):
    """Combine the metrics dataset with the scores and AI/ML insights.

    Both joins are inner joins on ``Customer_ID``, so a customer missing
    from either the scores or the insights DataFrame is dropped.

    Args:
        final_df: Dataset produced by the custom-metrics stage.
        selected_scores_df: Scores projection keyed by ``Customer_ID``.
        selected_aiml_insights_df: Insights projection keyed by
            ``Customer_ID``.

    Returns:
        The combined Customer 360 DataFrame.

    Raises:
        Exception: any Spark error is logged and re-raised.
    """
    try:
        logger.info("Integrating all processed data into a single dataset")
        with_scores_df = final_df.join(selected_scores_df, on="Customer_ID", how="inner")
        with_insights_df = with_scores_df.join(
            selected_aiml_insights_df, on="Customer_ID", how="inner"
        )
        return with_insights_df
    except Exception as e:
        logger.error(f"Error integrating data: {e}")
        raise

# COMMAND ----------
# MAGIC
def export_data(customer_360_df, target_table="catalog.target_db.Customer_360"):
    """Write the Customer 360 dataset to a Delta table, replacing its contents.

    The table is overwritten in place rather than dropped and recreated.
    Dropping first leaves a window in which the table does not exist, and a
    failure between the drop and the write would leave no table at all.
    ``overwriteSchema`` lets the overwrite also replace the table schema,
    which is the case the previous drop-then-write sequence handled.

    NOTE(review): dropping a table also discards its Delta history and any
    grants on it - confirm nothing downstream relied on the table being
    recreated from scratch on every run.

    Args:
        customer_360_df: Final DataFrame to persist.
        target_table: Fully qualified name of the destination table.
            Defaults to the production Customer 360 table.

    Raises:
        Exception: any error raised by the write is logged and re-raised.
    """
    try:
        logger.info("Exporting the final dataset to Unity Catalog")
        (
            customer_360_df.write.format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .saveAsTable(target_table)
        )
    except Exception as e:
        logger.error(f"Error exporting data: {e}")
        raise

# COMMAND ----------
# MAGIC
def main():
    """Run the Customer 360 ETL end to end.

    Stages, in order: load the source tables, project the needed columns,
    join and aggregate, compute custom metrics, merge in scores and AI/ML
    insights, and export the result.

    Raises:
        Exception: any failure from a stage is logged here and re-raised, so
            the notebook run fails instead of ending as if it had succeeded.
    """
    try:
        policy_df, claims_df, demographics_df, scores_df, aiml_insights_df = load_data()
        (
            selected_demographics_df,
            selected_claims_df,
            selected_policy_df,
            selected_scores_df,
            selected_aiml_insights_df,
        ) = select_fields(policy_df, claims_df, demographics_df, scores_df, aiml_insights_df)
        final_joined_df = transform_data(selected_demographics_df, selected_claims_df, selected_policy_df)
        final_df = calculate_custom_metrics(final_joined_df)
        customer_360_df = integrate_data(final_df, selected_scores_df, selected_aiml_insights_df)
        export_data(customer_360_df)
        logger.info("ETL process completed successfully")
    except Exception as e:
        logger.error(f"ETL process failed: {e}")
        # Propagate the failure: swallowing it here would let a broken run
        # finish without any error surfacing outside the log.
        raise


# Execute the main function
main()
