In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Data Processing with PySpark
# MAGIC This notebook demonstrates a data processing pipeline using PySpark, including data loading, selection, integration, aggregation, and custom calculations.

# COMMAND ----------

import logging
from contextlib import contextmanager

from pyspark.sql import functions as F

# Notebook-wide logger, named after the running module.
logger = logging.getLogger(__name__)
# Request INFO-and-above on the root logger (basicConfig does nothing if the
# root logger already has handlers installed).
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Data Loading
# MAGIC Load data from Unity Catalog tables.

# COMMAND ----------

@contextmanager
def _etl_step(description):
    """Run the body of one pipeline step with shared logging.

    Logs ``description`` at INFO level, then runs the ``with`` body. If the
    body raises, the exception is logged together with its traceback and
    re-raised, so the notebook or job run still fails.

    Every code cell opens its own ``with _etl_step(...)`` because each
    ``# COMMAND ----------`` cell is executed on its own; a single
    ``try``/``except`` cannot span several cells.

    Args:
        description: Message logged before the step starts.

    Raises:
        Exception: Whatever the step body raised, unchanged.
    """
    logger.info(description)
    try:
        yield
    except Exception as e:
        logger.exception("An error occurred during the ETL process: %s", e)
        raise


with _etl_step("Loading data from Unity Catalog tables."):
    policy_df = spark.table("genai_demo.jnj.policy")
    claims_df = spark.table("genai_demo.jnj.claims")
    demographics_df = spark.table("genai_demo.jnj.demographics")
    # NOTE(review): the two tables below are loaded but not referenced by any
    # later step in this section -- confirm whether they are still needed.
    scores_df = spark.table("genai_demo.jnj.scores")
    aiml_insights_df = spark.table("genai_demo.jnj.aiml_insights")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Data Selection and Filtering
# MAGIC Select relevant fields from each dataset.

# COMMAND ----------

with _etl_step("Selecting relevant fields from each dataset."):
    selected_demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City",
        "State", "Postal_Code", "Date_of_Birth", "Gender", "Marital_Status",
        "Occupation", "Income_Level", "Customer_Segment"
    )

    selected_claims_df = claims_df.select(
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type", "Claim_Status",
        "Claim_Amount", "Claim_Payout"
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Data Integration
# MAGIC Join datasets on common identifiers.

# COMMAND ----------

with _etl_step("Joining datasets on common identifiers."):
    # The join result keeps the key columns of BOTH sides of each join
    # (Customer_ID/customer_id and policy_id/Policy_ID), so later steps must
    # refer to them through their source DataFrame, not by bare name.
    # NOTE(review): both joins are inner, so customers without a policy or
    # without a claim do not reach the output -- confirm this is intended.
    joined_df = selected_demographics_df.join(
        policy_df,
        selected_demographics_df["Customer_ID"] == policy_df["customer_id"],
        "inner",
    ).join(
        selected_claims_df,
        policy_df["policy_id"] == selected_claims_df["Policy_ID"],
        "inner",
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Data Aggregation
# MAGIC Aggregate data to calculate metrics.

# COMMAND ----------

with _etl_step("Aggregating data to calculate metrics."):
    # Columns are referenced through the DataFrame they came from: a bare
    # "Customer_ID" or "policy_id" matches two columns of joined_df under
    # case-insensitive name resolution and is rejected as ambiguous.
    aggregated_df = joined_df.groupBy(
        selected_demographics_df["Customer_ID"]
    ).agg(
        F.count(selected_claims_df["Claim_ID"]).alias("Total_Claims"),
        # joined_df has one row per claim, so a plain count of policy_id would
        # just repeat Total_Claims; count each policy once instead.
        F.countDistinct(policy_df["policy_id"]).alias("Policy_Count"),
        F.max(selected_claims_df["Claim_Date"]).alias("Recent_Claim_Date"),
        F.avg(selected_claims_df["Claim_Amount"]).alias("Average_Claim_Amount"),
        F.first(selected_demographics_df["Date_of_Birth"]).alias("Date_of_Birth"),
        # NOTE(review): first() keeps the value from an arbitrary row of the
        # group; if total_premium_paid differs between a customer's policies
        # this probably needs a different aggregate -- confirm.
        F.first(policy_df["total_premium_paid"]).alias("total_premium_paid"),
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Custom Calculations
# MAGIC Perform custom calculations on the aggregated data.

# COMMAND ----------

with _etl_step("Performing custom calculations."):
    final_df = aggregated_df.withColumn(
        # Days since Date_of_Birth divided by 365: an approximate, fractional
        # age in years (leap days are not accounted for).
        "Age", F.expr("DATEDIFF(current_date(), Date_of_Birth) / 365")
    ).withColumn(
        # The CASE guards yield 0 instead of dividing by zero.
        "Claim_To_Premium_Ratio", F.expr("CASE WHEN total_premium_paid != 0 THEN Average_Claim_Amount / total_premium_paid ELSE 0 END")
    ).withColumn(
        "Claims_Per_Policy", F.expr("CASE WHEN Policy_Count != 0 THEN Total_Claims / Policy_Count ELSE 0 END")
    ).withColumn(
        # The remaining three columns are constants applied to every row.
        "Retention_Rate", F.lit(0.85)
    ).withColumn(
        "Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on")
    ).withColumn(
        "Upsell_Potential", F.lit("Premium Vehicle Coverage")
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 6: Final Output
# MAGIC Write the final DataFrame to a Unity Catalog table.

# COMMAND ----------

with _etl_step("Writing the final DataFrame to a Unity Catalog table."):
    # The target table is dropped and then recreated from final_df on every run.
    spark.sql("DROP TABLE IF EXISTS genai_demo.guardian.customer_360")
    final_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.guardian.customer_360")
