In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process with PySpark
# MAGIC This notebook demonstrates an ETL process using PySpark, including data loading, transformation, and writing to Unity Catalog.

# COMMAND ----------

import logging
from pyspark.sql.functions import to_date, col, broadcast
from pyspark.sql import DataFrame

# Configure logging: request INFO-and-above on the root logger and create a
# module-level logger that every function below writes to.
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers attached, so the INFO level may not take effect in runtimes that
# pre-configure logging — confirm for the target environment.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# COMMAND ----------

def reformat_date_fields(
    df: DataFrame, date_columns: list, input_format: str = "MM/dd/yyyy"
) -> DataFrame:
    """
    Parse string date columns into Spark date values.

    Each column named in ``date_columns`` is overwritten, under the same name,
    with ``to_date(col, input_format)``.

    Args:
        df: Input DataFrame.
        date_columns: Names of the columns to convert.
        input_format: Datetime pattern the incoming values are written in.
            Defaults to "MM/dd/yyyy", so existing two-argument callers keep
            their behavior.

    Returns:
        The DataFrame with the listed columns replaced by their parsed form.
        An empty ``date_columns`` returns ``df`` unchanged.

    Raises:
        Exception: Any error raised while building the columns is logged and
            re-raised unchanged.
    """
    # NOTE(review): how values that do not match ``input_format`` are handled
    # (null vs. error) depends on Spark configuration — confirm for your runtime.
    try:
        for date_col in date_columns:
            df = df.withColumn(date_col, to_date(col(date_col), input_format))
        logger.info("Date fields reformatted successfully.")
    except Exception as e:
        logger.error(f"Error in reformatting date fields: {e}")
        raise
    return df

# COMMAND ----------

def standardize_column_names(df: DataFrame, column_mapping: dict) -> DataFrame:
    """
    Rename columns according to ``column_mapping``.

    Args:
        df: Input DataFrame.
        column_mapping: Dict of {current column name: new column name}. The
            renames are applied one at a time, in the dict's iteration order.

    Returns:
        The DataFrame after all renames have been applied.

    Raises:
        Exception: Any error raised during renaming is logged and re-raised
            unchanged.
    """
    try:
        renamed = df
        for current_name, target_name in column_mapping.items():
            renamed = renamed.withColumnRenamed(current_name, target_name)
        logger.info("Column names standardized successfully.")
        return renamed
    except Exception as exc:
        logger.error(f"Error in standardizing column names: {exc}")
        raise

# COMMAND ----------

def standardize_state_names(df: DataFrame, state_name_mapping: dict) -> DataFrame:
    """
    Replace values in the ``state_name`` column using a lookup mapping.

    Args:
        df: Input DataFrame; the replacement is restricted to its
            ``state_name`` column.
        state_name_mapping: Dict of {existing value: replacement value},
            e.g. ``{"California": "CA"}``.

    Returns:
        The DataFrame with matching ``state_name`` values replaced.

    Raises:
        Exception: Any error raised by the replacement is logged and re-raised
            unchanged.
    """
    try:
        # Pass the plain dict straight to DataFrame.replace. Do not wrap it in
        # pyspark.sql.functions.broadcast(): that is a join hint for DataFrames
        # and fails when handed a dict.
        df = df.replace(state_name_mapping, subset=["state_name"])
        logger.info("State names standardized successfully.")
    except Exception as e:
        logger.error(f"Error in standardizing state names: {e}")
        raise
    return df

# COMMAND ----------

def load_data_from_unity_catalog(table_name: str) -> DataFrame:
    """
    Return the table ``table_name`` as a DataFrame.

    The lookup goes through a ``spark`` object that is not defined in this
    file; presumably it is the session the notebook runtime injects — verify
    when running outside a notebook.

    Args:
        table_name: Name of the table to read, passed to ``spark.table``.

    Returns:
        The DataFrame returned by ``spark.table(table_name)``.

    Raises:
        Exception: Any error raised by the lookup is logged and re-raised
            unchanged.
    """
    try:
        loaded = spark.table(table_name)
        logger.info(f"Data loaded successfully from {table_name}.")
        return loaded
    except Exception as exc:
        logger.error(f"Error loading data from {table_name}: {exc}")
        raise

# COMMAND ----------

def write_data_to_unity_catalog(df: DataFrame, table_name: str):
    """
    Save ``df`` as the table ``table_name`` using the Delta format.

    The writer is configured with mode "overwrite" before ``saveAsTable`` is
    called.

    Args:
        df: DataFrame to persist.
        table_name: Name of the destination table.

    Raises:
        Exception: Any error raised by the write is logged and re-raised
            unchanged.
    """
    try:
        # Configure the writer step by step, then trigger the save.
        delta_writer = df.write.format("delta")
        overwrite_writer = delta_writer.mode("overwrite")
        overwrite_writer.saveAsTable(table_name)
        logger.info(f"Data written successfully to {table_name}.")
    except Exception as exc:
        logger.error(f"Error writing data to {table_name}: {exc}")
        raise

# COMMAND ----------

def main():
    """
    Run the ETL pipeline end to end.

    Steps, in order: load ``catalog.source_db.source_table``, rename columns,
    parse the date column, map state names, and write the result to
    ``catalog.target_db.target_table``.

    Raises:
        Exception: A failure in any step is logged and re-raised unchanged.
    """
    try:
        # Load data from Unity Catalog table
        input_df = load_data_from_unity_catalog("catalog.source_db.source_table")

        # Standardize column names first. The later steps address columns by
        # the standardized names ("date_column", "state_name") that this
        # mapping produces, so the rename has to happen before them.
        column_mapping = {
            "Date Column": "date_column",
            "State Name": "state_name",
        }
        renamed_df = standardize_column_names(input_df, column_mapping)

        # Reformat date fields (now that "date_column" exists under that name)
        date_columns = ["date_column"]  # List of date columns to reformat
        reformatted_df = reformat_date_fields(renamed_df, date_columns)

        # Standardize state names
        state_name_mapping = {"California": "CA", "New York": "NY"}
        standardized_df = standardize_state_names(reformatted_df, state_name_mapping)

        # Write the output to a Delta table in Unity Catalog
        write_data_to_unity_catalog(standardized_df, "catalog.target_db.target_table")

    except Exception as e:
        logger.error(f"An error occurred during the ETL process: {e}")
        raise

# Run the pipeline only when this file is the entry module, not when it is
# imported by other code.
if __name__ == "__main__":
    main()
