In [0]:
spark.version

'3.5.0'

In [0]:
!python --version

Python 3.11.0rc1


In [0]:
# Import necessary libraries
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

import requests
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [0]:
# Assuming the catalogs are already set up in Databricks
# Read data from the catalogs
policy_df = spark.table("genai_demo.guardian.policy")
claims_df = spark.table("genai_demo.guardian.claims")
demographics_df = spark.table("genai_demo.guardian.demographics")
scores_df = spark.table("genai_demo.guardian.scores")
aiml_insights_df = spark.table("genai_demo.guardian.aiml_insights")


INFO:py4j.clientserver:Received command c on object id p0


In [0]:

# Join dataframes
joined_df = policy_df.join(demographics_df, "Customer_ID").join(claims_df, "Policy_ID").join(scores_df, "Customer_ID").join(aiml_insights_df, "Customer_ID")



In [0]:
joined_df.printSchema()

INFO:py4j.clientserver:Received command c on object id p0


root
 |-- customer_id: string (nullable = true)
 |-- policy_id: string (nullable = true)
 |-- policy_type: string (nullable = true)
 |-- policy_status: string (nullable = true)
 |-- policy_start_date: date (nullable = true)
 |-- policy_end_date: date (nullable = true)
 |-- policy_term: long (nullable = true)
 |-- policy_premium: double (nullable = true)
 |-- total_premium_paid: double (nullable = true)
 |-- renewal_status: string (nullable = true)
 |-- policy_addons: string (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone_Number: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal_Code: long (nullable = true)
 |-- Date_of_Birth: date (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Marital_Status: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Income_Level: string (nullable = true)
 |-- Custom

In [0]:
# Aggregation and derived metrics
agg_df = joined_df.groupBy("Customer_ID").agg(
    F.count("Claim_ID").alias("Total_Claims"),
    F.avg("Claim_Amount").alias("Average_Claim_Amount"),
    F.max("Claim_Date").alias("Recent_Claim_Date"),
    F.countDistinct("Policy_ID").alias("Policy_Count")
)
print("Aggregations done")


INFO:py4j.clientserver:Received command c on object id p0


Aggregations done


In [0]:
insights_df = joined_df.join(agg_df, "Customer_ID")

INFO:py4j.clientserver:Received command c on object id p0


In [0]:
insights_df.show()

+-----------+---------+-----------+-------------+-----------------+---------------+-----------+--------------+------------------+--------------+--------------------+---------------+--------------------+------------+--------------+-------------+-----+-----------+-------------+------+--------------+----------+------------+----------------+--------+----------+----------+------------+------------+------------+------------+-----------+-------------------+-----------------+--------------------+------------------------+-----------------+------------+--------------------+-----------------+------------+
|customer_id|policy_id|policy_type|policy_status|policy_start_date|policy_end_date|policy_term|policy_premium|total_premium_paid|renewal_status|       policy_addons|  Customer_Name|               Email|Phone_Number|       Address|         City|State|Postal_Code|Date_of_Birth|Gender|Marital_Status|Occupation|Income_Level|Customer_Segment|Claim_ID|Claim_Date|Claim_Type|Claim_Status|Claim_Amount|Cl