In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process with PySpark
# MAGIC This notebook performs an ETL process using PySpark, extracting data from a source table, cleaning and transforming it, and then loading it into a target table.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Configure logging
# basicConfig() is a no-op when the root logger already has handlers (hosted
# notebook runtimes commonly pre-configure it), which would leave the
# effective level at the root default and drop every INFO message below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Set the level on this logger explicitly so INFO records are emitted
# regardless of whether basicConfig() took effect. Only this logger is
# changed; other libraries' loggers keep their existing verbosity.
logger.setLevel(logging.INFO)

# COMMAND ----------

# Configuration settings

# Fully qualified names of the table that is read and the table that is written.
source_table = "catalog.source_db.source_table"
target_table = "catalog.target_db.target_table"

# Rows holding a null in any of these columns are dropped during cleaning.
columns_to_check = [
    "column1",
    "column2",
]

# Per-column replacement values handed to DataFrame.fillna().
default_values = dict(column1="default_value", column2=0)

# COMMAND ----------

# ETL pipeline: extract -> drop invalid rows -> impute -> cast -> load.
#
# All five steps are kept in ONE notebook cell on purpose. A
# "# COMMAND ----------" marker starts a new Databricks cell and every cell is
# compiled on its own, so a `try:` whose body is split across markers leaves
# the first cell with an unterminated `try:` and the following cells starting
# with unexpectedly indented code.
#
# Reads:  source_table, columns_to_check, default_values, spark, logger
# Writes: source_df, cleaned_df, filled_df, converted_df (module level) and
#         overwrites the Delta table named by target_table.
# Raises: any exception from Spark is logged with its traceback and re-raised.
try:
    # Step 1: Data Extraction
    logger.info(f"Starting data extraction from Unity Catalog table: {source_table}")
    source_df = spark.table(source_table)
    # NOTE: count() is a Spark action, so this log line runs a full job over
    # the source table in addition to the final write.
    logger.info(f"Data extraction completed successfully. Number of records: {source_df.count()}")

    # Step 2: Data Cleaning - Remove Invalid Entries
    # dropna(subset=...) removes every row that has a null in any listed column.
    logger.info("Starting data cleaning by removing invalid entries.")
    cleaned_df = source_df.dropna(subset=columns_to_check)
    # NOTE: second count() action; recomputes the read + filter.
    logger.info(f"Invalid entries removed. Number of records after cleaning: {cleaned_df.count()}")

    # Step 3: Handle Missing Values
    # NOTE(review): step 2 already dropped rows with nulls in columns_to_check,
    # so fillna() can only change columns that appear in default_values but not
    # in columns_to_check. If both settings name the same columns this step is
    # effectively a no-op - confirm whether imputation was meant to run
    # instead of (or before) the drop.
    logger.info("Handling missing values by imputing with default values.")
    filled_df = cleaned_df.fillna(default_values)
    logger.info("Missing values handled successfully.")

    # Step 4: Convert Data Types
    logger.info("Converting data types for accuracy.")
    if "column1" in filled_df.columns:
        # NOTE(review): a non-numeric string (such as a textual default for
        # column1) does not survive an integer cast - presumably it becomes
        # NULL or raises, depending on the Spark ANSI setting. TODO confirm the
        # intended default for this column.
        converted_df = filled_df.withColumn("column1", filled_df["column1"].cast(IntegerType()))
    else:
        logger.warning("Column 'column1' not found for type conversion.")
        converted_df = filled_df
    logger.info("Data type conversion completed successfully.")

    # Step 5: Data Loading
    # mode("overwrite") replaces the existing contents of the target table.
    logger.info(f"Loading transformed data into Unity Catalog target table: {target_table}")
    converted_df.write.format("delta").mode("overwrite").saveAsTable(target_table)
    logger.info("Data loading completed successfully.")

except Exception as e:
    # logger.exception() logs at ERROR level like logger.error() did, and also
    # attaches the traceback to the log record before the error is re-raised.
    logger.exception(f"An error occurred during the ETL process: {e}")
    raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Performance Optimizations
# MAGIC Consider caching the DataFrame if it is reused multiple times, using broadcast joins for small tables, and applying predicate pushdown and column pruning for performance improvements.

# COMMAND ----------

# Performance Optimizations
# The snippets below are illustrative and intentionally left commented out;
# nothing in this cell except the final log call is executed.

# Cache the DataFrame if it is reused multiple times
# NOTE(review): to have any effect, cache() would need to be applied before
# the actions that reuse the DataFrame, not after the write has completed.
# cleaned_df.cache()

# Use broadcast join if applicable
# (broadcast is reached through the functions module imported as F.)
# small_df = spark.table("catalog.db.small_table")
# broadcasted_df = F.broadcast(small_df)
# joined_df = converted_df.join(broadcasted_df, "join_key")

# Apply predicate pushdown and column pruning
# Select only the needed columns and filter as early as possible so less data
# is read and carried through the pipeline.
# selected_df = converted_df.select("column1", "column2").filter(F.col("column1") > 0)

# When the notebook runs top to bottom this line is reached only if the ETL
# step did not raise, because that step re-raises on failure.
logger.info("ETL process completed successfully.")
