In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process with PySpark
# MAGIC This notebook demonstrates an ETL process using PySpark, including data ingestion, cleaning, transformation, and output to a Unity Catalog table.

# COMMAND ----------

import logging
from contextlib import contextmanager

from pyspark.sql import functions as F

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@contextmanager
def etl_error_logging():
    """Log any exception raised inside the ``with`` body, then re-raise it.

    Each ETL cell wraps its work in this context manager so that every cell
    is a complete, independently runnable statement, instead of sharing one
    ``try``/``except`` that is split across ``# COMMAND`` cell boundaries.

    Raises:
        Exception: whatever the wrapped body raised, unchanged.
    """
    try:
        yield
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("An error occurred during the ETL process: %s", e)
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Data Ingestion
# MAGIC Ingest data from a Unity Catalog table into a DataFrame.

# COMMAND ----------

# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session object injected by the Databricks runtime - confirm.
with etl_error_logging():
    logger.info("Starting data ingestion from Unity Catalog table.")
    df = spark.table("catalog.source_db.source_table")
    logger.info("Data ingestion completed successfully. Schema: %s", df.schema)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Remove Invalid Entries
# MAGIC Filter out invalid entries based on business logic.

# COMMAND ----------

with etl_error_logging():
    logger.info("Removing invalid entries based on business logic.")
    # Keep only rows with a strictly positive quantity.
    df_cleaned = df.filter(df['quantity'] > 0)
    logger.info("Invalid entries removed. Remaining rows: %d", df_cleaned.count())

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Handle Missing Values
# MAGIC Fill missing values in the DataFrame.

# COMMAND ----------

with etl_error_logging():
    logger.info("Handling missing values in the DataFrame.")
    avg_price = df_cleaned.agg(F.mean('price')).first()[0]
    if avg_price is None:
        # The mean is None when there is no non-null price to average;
        # na.fill cannot accept None, so fail with a clear message before
        # anything is written to the target table.
        raise ValueError(
            "Cannot fill missing 'price' values: no non-null prices remain "
            "after filtering."
        )
    df_filled = df_cleaned.na.fill({'price': avg_price})
    # DataFrame.show() prints and returns None, so log the rows via take().
    logger.info("Missing values handled. Sample data: %s", df_filled.take(5))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Convert Data Types
# MAGIC Convert data types for accuracy.

# COMMAND ----------

with etl_error_logging():
    logger.info("Converting data types for accuracy.")
    # Parse the 'date' column using the MM/dd/yyyy pattern.
    df_final = df_filled.withColumn('date', F.to_date(df_filled['date'], 'MM/dd/yyyy'))
    # DataFrame.show() prints and returns None, so log the rows via take().
    logger.info("Data type conversion completed. Sample data: %s", df_final.take(5))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Data Output
# MAGIC Write the final DataFrame to a Unity Catalog target table.

# COMMAND ----------

with etl_error_logging():
    logger.info("Writing the final DataFrame to Unity Catalog target table.")
    # mode("overwrite") replaces the existing contents of the target table.
    df_final.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.target_table")
    logger.info("Data successfully written to target table.")
