# Databricks notebook source
# MAGIC %md
# MAGIC # Data Processing with PySpark
# MAGIC This notebook demonstrates loading, transforming, and writing data using PySpark in a Databricks environment.

# COMMAND ----------

import logging
from pyspark.sql.functions import col, to_date, count, avg, max, datediff, current_date, when, lit, broadcast

# Request INFO-level output from the root logger and create the module logger
# used by the functions below.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers -- confirm INFO messages actually show up on the target cluster.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# `spark` is never created in this file; the functions below assume the
# runtime already provides a SparkSession under that name.

# COMMAND ----------

def load_data(base_path="dbfs:/path/to"):
    """Read the source CSV files and register each one as a temporary view.

    For every name in ``policy``, ``claims``, ``demographics``, ``scores`` and
    ``aiml_insights`` the file ``<base_path>/<name>.csv`` is read with a header
    row and schema inference, then exposed as a temp view called ``<name>``.

    Args:
        base_path: Directory that holds the five CSV files. The default is the
            location this notebook previously hard-coded, so calling
            ``load_data()`` with no arguments reads the same paths as before.

    Raises:
        Exception: Any error raised while reading or registering is logged
            and re-raised unchanged.
    """
    view_names = ("policy", "claims", "demographics", "scores", "aiml_insights")
    # Tolerate a trailing slash so the joined path never contains "//".
    root = base_path.rstrip("/")

    try:
        # Read every file before touching the catalog: if one read fails, no
        # existing view has been replaced yet.
        frames = {
            name: spark.read.csv(f"{root}/{name}.csv", header=True, inferSchema=True)
            for name in view_names
        }

        for name, frame in frames.items():
            frame.createOrReplaceTempView(name)

        logger.info("Data loaded and temporary views created successfully.")
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        raise

# COMMAND ----------

def transform_data():
    """Build one summary row per customer from the registered temp views.

    Reads the ``demographics``, ``policy`` and ``claims`` temporary views,
    aggregates policies and claims per customer, and derives the ratio
    columns.

    Policies and claims are aggregated separately before being joined. That
    keeps a policy's premium from being counted once per claim, and makes
    ``Policy_Count`` the number of distinct policies a customer holds rather
    than the number of claim rows.

    Only customers present in demographics that have at least one policy and
    at least one claim on one of their policies are returned (inner joins).

    Returns:
        A DataFrame with the columns ``Customer_ID``, ``Total_Claims``,
        ``Policy_Count``, ``Recent_Claim_Date``, ``Average_Claim_Amount``,
        ``Age``, ``Claim_To_Premium_Ratio``, ``Claims_Per_Policy``,
        ``Retention_Rate``, ``Cross_Sell_Opportunities`` and
        ``Upsell_Potential``. The last three are constant literals.

    Raises:
        Exception: Any error raised by Spark is logged and re-raised
            unchanged.
    """
    # Imported locally and namespaced: the extra aggregates needed here
    # (sum, min, countDistinct) are not in the module-level import list, and
    # importing them bare would shadow more Python builtins.
    from pyspark.sql import functions as F

    try:
        # withColumn replaces a column in place. Selecting
        # `*, expr AS <existing name>` instead yields two columns with the
        # same name, which makes every later reference ambiguous.
        # Only the columns that feed the output are converted.
        demographics_df = spark.table("demographics").withColumn(
            "Date_of_Birth", F.to_date(F.col("Date_of_Birth"), "yyyy-MM-dd")
        )
        claims_df = (
            spark.table("claims")
            .withColumn("Claim_Date", F.to_date(F.col("Claim_Date"), "yyyy-MM-dd"))
            .withColumn("Claim_Amount", F.col("Claim_Amount").cast("double"))
        )
        policy_df = spark.table("policy").withColumn(
            "total_premium_paid", F.col("total_premium_paid").cast("double")
        )

        # One row per customer; min() picks a deterministic birth date should
        # a customer appear more than once.
        customer_df = demographics_df.groupBy("Customer_ID").agg(
            F.min("Date_of_Birth").alias("Date_of_Birth")
        )

        # Policy figures come from the policy table alone so that they are
        # not multiplied by the number of claims on each policy. The key is
        # renamed so it cannot clash with Customer_ID (Spark column names are
        # case-insensitive by default).
        policy_stats_df = (
            policy_df.groupBy("customer_id")
            .agg(
                F.countDistinct("policy_id").alias("Policy_Count"),
                F.sum("total_premium_paid").alias("Total_Premium_Paid"),
            )
            .withColumnRenamed("customer_id", "policy_customer_id")
        )

        # Policy -> owning customer lookup, de-duplicated so a repeated
        # policy row cannot duplicate claims in the join below.
        policy_owner_df = policy_df.select(
            F.col("policy_id").alias("owned_policy_id"),
            F.col("customer_id").alias("claim_customer_id"),
        ).dropDuplicates()

        claim_stats_df = (
            claims_df.join(
                policy_owner_df,
                F.col("Policy_ID") == F.col("owned_policy_id"),
                "inner",
            )
            .groupBy("claim_customer_id")
            .agg(
                F.count("Claim_ID").alias("Total_Claims"),
                F.max("Claim_Date").alias("Recent_Claim_Date"),
                F.avg("Claim_Amount").alias("Average_Claim_Amount"),
                F.sum("Claim_Amount").alias("Total_Claim_Amount"),
            )
        )

        summary_df = customer_df.join(
            policy_stats_df,
            F.col("Customer_ID") == F.col("policy_customer_id"),
            "inner",
        ).join(
            claim_stats_df,
            F.col("Customer_ID") == F.col("claim_customer_id"),
            "inner",
        )

        # The derived columns are computed here, where their inputs still
        # exist; the helper columns are then left out of the final projection.
        aggregated_df = summary_df.select(
            "Customer_ID",
            "Total_Claims",
            "Policy_Count",
            "Recent_Claim_Date",
            "Average_Claim_Amount",
            # Approximate age in years (ignores leap days).
            (F.datediff(F.current_date(), F.col("Date_of_Birth")) / 365).alias("Age"),
            # A null or zero premium total falls through to 0.
            F.when(
                F.col("Total_Premium_Paid") != 0,
                F.col("Total_Claim_Amount") / F.col("Total_Premium_Paid"),
            ).otherwise(0).alias("Claim_To_Premium_Ratio"),
            F.when(
                F.col("Policy_Count") != 0,
                F.col("Total_Claims") / F.col("Policy_Count"),
            ).otherwise(0).alias("Claims_Per_Policy"),
            F.lit(0.85).alias("Retention_Rate"),
            F.lit("Multi-Policy Discount, Home Coverage Add-on").alias("Cross_Sell_Opportunities"),
            F.lit("Premium Vehicle Coverage").alias("Upsell_Potential"),
        )

        logger.info("Data transformation completed successfully.")
        return aggregated_df
    except Exception as e:
        logger.error(f"Error during data transformation: {e}")
        raise

# COMMAND ----------

def write_data(aggregated_df, table_name="catalog.target_db.customer_360"):
    """Overwrite a Delta table with the contents of ``aggregated_df``.

    Args:
        aggregated_df: DataFrame to persist.
        table_name: Fully qualified name of the target table. The default is
            the table this notebook previously hard-coded, so existing calls
            that pass only the DataFrame write to the same place as before.

    Raises:
        Exception: Any error raised by the write is logged and re-raised
            unchanged.
    """
    try:
        # mode("overwrite") replaces whatever the table currently holds.
        aggregated_df.write.format("delta").mode("overwrite").saveAsTable(table_name)
        logger.info("Data written to Delta table successfully.")
    except Exception as e:
        logger.error(f"Error writing data to Delta table: {e}")
        raise

# COMMAND ----------

def main():
    """Run the pipeline end to end: load the views, transform, then write."""
    load_data()
    # The transform result is handed straight to the writer.
    write_data(transform_data())


if __name__ == "__main__":
    main()
