In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC 
# MAGIC # Data Enrichment with Caching and Broadcasting
# MAGIC This notebook demonstrates how to cache a DataFrame and use broadcasting for efficient joins in PySpark.

# COMMAND ----------

# Import necessary libraries
from pyspark.sql import functions as F
import logging

# Initialize logger
logger = logging.getLogger(__name__)

# COMMAND ----------

# Enrich the orders DataFrame with return information via a broadcast join.
#
# Inputs expected in scope (defined outside this cell): `orders_central_df`,
# the `spark` session, `F` (pyspark.sql.functions) and `logger`.
# On success, `orders_central_df` is rebound to the join result with an extra
# "Returned?" column, and `returns_df` holds the broadcast-hinted returns data.
# Any exception is logged with its traceback and re-raised.

# Separate handle to the DataFrame that is actually cached. `orders_central_df`
# is rebound to the join result below, so cleanup must not go through that
# name: calling unpersist() on the join result would leave the cached input
# DataFrame cached.
cached_orders_df = None
try:
    # Mark the input orders DataFrame for caching; cache() returns the same DataFrame
    cached_orders_df = orders_central_df.cache()

    # Load the returns table and attach a broadcast hint (only sensible if it is small)
    returns_df = F.broadcast(
        spark.sql("SELECT * FROM catalog.source_db.returns").alias("returns")
    )

    # Left join on "Order ID" keeps every order row. "Return Reason" is presumably
    # supplied by the returns table (verify against its schema); a non-null value
    # marks the order as returned.
    orders_central_df = cached_orders_df.join(returns_df, "Order ID", "left").withColumn(
        "Returned?",
        F.when(F.col("Return Reason").isNotNull(), "Yes").otherwise("No"),
    )

    logger.info("Data enrichment with broadcast join completed.")
except Exception as e:
    # logger.exception logs at ERROR level and appends the traceback
    logger.exception("Error during data enrichment with broadcast join: %s", e)
    raise
finally:
    # Release the cache on the DataFrame that was actually cached. The guard
    # covers the case where cache() itself was never reached.
    # NOTE(review): no Spark action runs between cache() and unpersist() in this
    # cell, so the cache is likely never materialized here. If the enriched
    # result is reused downstream, consider caching that instead. Confirm intent.
    if cached_orders_df is not None:
        cached_orders_df.unpersist()
