# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process with PySpark
# MAGIC This notebook demonstrates an ETL process using PySpark, including data loading, standardization, cleaning, integration, and application of business rules.

# COMMAND ----------

import logging
from pyspark.sql import functions as F
from pyspark.sql.utils import AnalysisException

# Configure logging
# logging.basicConfig() does nothing when the root logger already has handlers
# attached (which a hosted notebook runtime may have done before this cell
# runs), so the level is also set explicitly on this module's logger to make
# sure its INFO messages are not filtered out.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Load Data from Unity Catalog
# MAGIC Function to load data from Unity Catalog tables.

# COMMAND ----------

def load_data_from_catalog(table_name):
    """Read a table into a DataFrame via ``spark.table``.

    Relies on a ``spark`` session object that is not defined in this file
    (presumably supplied by the Databricks runtime -- confirm).

    Args:
        table_name: Table identifier passed straight to ``spark.table``.

    Returns:
        The DataFrame returned by ``spark.table(table_name)``.

    Raises:
        AnalysisException: Logged and re-raised if raised while loading the
            table or reading its schema.
    """
    try:
        logger.info(f"Loading data from Unity Catalog table: {table_name}")
        table_df = spark.table(table_name)
        logger.info(f"Loaded data schema: {table_df.schema}")
    except AnalysisException as load_error:
        logger.error(f"Error loading data from table {table_name}: {load_error}")
        raise
    else:
        return table_df

# COMMAND ----------

# MAGIC %md
# MAGIC ## Standardize Data
# MAGIC Function to standardize data formats by parsing the `sale_date` column into a date using the `MM/dd/yyyy` pattern.

# COMMAND ----------

def standardize_data(df):
    """Replace ``sale_date`` with its value parsed as a date.

    The column is parsed with ``F.to_date`` using the ``MM/dd/yyyy`` pattern.
    NOTE(review): this assumes ``sale_date`` arrives as text in that format --
    confirm against the source tables.

    Args:
        df: DataFrame containing a ``sale_date`` column.

    Returns:
        A new DataFrame with ``sale_date`` overwritten by the parsed value.

    Raises:
        Exception: Any error raised while building the column is logged and
            re-raised unchanged.
    """
    try:
        logger.info("Standardizing data formats")
        parsed_sale_date = F.to_date(F.col("sale_date"), "MM/dd/yyyy")
        standardized = df.withColumn("sale_date", parsed_sale_date)
        logger.info(f"Standardized data schema: {standardized.schema}")
    except Exception as standardize_error:
        logger.error(f"Error during data standardization: {standardize_error}")
        raise
    else:
        return standardized

# COMMAND ----------

# MAGIC %md
# MAGIC ## Clean Data
# MAGIC Function to clean data by removing duplicate rows, replacing null `sales_amount` values with 0, and resetting negative `sales_amount` values to 0.

# COMMAND ----------

def clean_data(df):
    """Deduplicate rows and sanitize the ``sales_amount`` column.

    Three steps are applied in order:
      1. ``dropDuplicates()`` with no column subset.
      2. ``fillna`` with 0 for ``sales_amount``.
      3. Any ``sales_amount`` below 0 is replaced by 0.

    Args:
        df: DataFrame containing a ``sales_amount`` column.

    Returns:
        A new DataFrame with the three steps applied.

    Raises:
        Exception: Any error raised while building the transformation is
            logged and re-raised unchanged.
    """
    try:
        logger.info("Cleaning data")
        amount = F.col("sales_amount")
        # Negative amounts are clamped to zero; everything else is kept as is.
        non_negative_amount = F.when(amount < 0, 0).otherwise(amount)
        cleaned = (
            df.dropDuplicates()
            .fillna({"sales_amount": 0})
            .withColumn("sales_amount", non_negative_amount)
        )
        logger.info(f"Cleaned data schema: {cleaned.schema}")
    except Exception as clean_error:
        logger.error(f"Error during data cleaning: {clean_error}")
        raise
    else:
        return cleaned

# COMMAND ----------

# MAGIC %md
# MAGIC ## Integrate Data
# MAGIC Function to integrate data from multiple sources.

# COMMAND ----------

def integrate_data(df_list):
    """Union a sequence of DataFrames into one, matching columns by name.

    The first element is the starting point and each remaining element is
    appended to it with ``unionByName``.

    Args:
        df_list: Indexable, sliceable sequence of DataFrames. It must contain
            at least one element, because the first element is read by index.

    Returns:
        The DataFrame produced by successively applying ``unionByName``; for a
        single-element sequence this is that element itself.

    Raises:
        Exception: Any error (including the one raised by indexing an empty
            sequence) is logged and re-raised unchanged.
    """
    try:
        logger.info("Integrating data from multiple sources")
        combined = df_list[0]
        remaining = df_list[1:]
        for next_df in remaining:
            combined = combined.unionByName(next_df)
        logger.info(f"Integrated data schema: {combined.schema}")
    except Exception as integrate_error:
        logger.error(f"Error during data integration: {integrate_error}")
        raise
    else:
        return combined

# COMMAND ----------

# MAGIC %md
# MAGIC ## Apply Business Rules
# MAGIC Function to apply business rules to the data by adding a `total_sales` column computed as `quantity * price`.

# COMMAND ----------

def apply_business_rules(df):
    """Add a ``total_sales`` column equal to ``quantity * price``.

    Args:
        df: DataFrame containing ``quantity`` and ``price`` columns.

    Returns:
        A new DataFrame with the ``total_sales`` column added (or overwritten
        if a column of that name already exists).

    Raises:
        Exception: Any error raised while building the column is logged and
            re-raised unchanged.
    """
    try:
        logger.info("Applying business rules")
        total_sales = F.col("quantity") * F.col("price")
        enriched = df.withColumn("total_sales", total_sales)
        logger.info(f"Final data schema: {enriched.schema}")
    except Exception as rules_error:
        logger.error(f"Error applying business rules: {rules_error}")
        raise
    else:
        return enriched

# COMMAND ----------

# MAGIC %md
# MAGIC ## Write Data to Unity Catalog
# MAGIC Function to write the final DataFrame to a Unity Catalog table in Delta format, overwriting any existing contents of the target table.

# COMMAND ----------

def write_data_to_catalog(df, target_table):
    """Save a DataFrame as a Delta table, overwriting the target.

    Args:
        df: DataFrame to persist.
        target_table: Table identifier passed to ``saveAsTable``.

    Returns:
        None.

    Raises:
        Exception: Any error raised while writing is logged and re-raised
            unchanged.
    """
    try:
        logger.info(f"Writing data to Unity Catalog table: {target_table}")
        # "overwrite" mode replaces whatever the target table currently holds.
        delta_writer = df.write.format("delta").mode("overwrite")
        delta_writer.saveAsTable(target_table)
    except Exception as write_error:
        logger.error(f"Error writing data to table {target_table}: {write_error}")
        raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Main ETL Process
# MAGIC Execute the main ETL process, which includes loading, standardizing, cleaning, integrating, applying business rules, and writing data.

# COMMAND ----------

def main(
    source_tables=(
        "catalog.source_db.region1_sales",
        "catalog.source_db.region2_sales",
    ),
    target_table="catalog.target_db.sales_data_transformed",
):
    """Run the full ETL pipeline from source tables to the target table.

    Stages, in order: load every source table, standardize each, clean each,
    union them, apply business rules, and write the result.

    Args:
        source_tables: Iterable of table identifiers to load and combine.
            Defaults to the two regional sales tables.
        target_table: Table identifier the final DataFrame is written to.

    Returns:
        None.

    Raises:
        Exception: Any failure in a stage is logged with its traceback and
            re-raised, so the caller (for example a scheduled job run) sees
            the failure instead of a run that silently did nothing.
    """
    try:
        # Each stage runs for every source before the next stage starts.
        loaded = [load_data_from_catalog(name) for name in source_tables]
        standardized = [standardize_data(df) for df in loaded]
        cleaned = [clean_data(df) for df in standardized]

        # Integrate data
        df_integrated = integrate_data(cleaned)

        # Apply business rules
        df_final = apply_business_rules(df_integrated)

        # Write the final DataFrame to Unity Catalog
        write_data_to_catalog(df_final, target_table)

    except Exception as e:
        # Re-raise after logging: swallowing the error here would let a failed
        # ETL run finish as if it had succeeded.
        logger.exception(f"ETL process failed: {e}")
        raise

# Execute the main ETL process
if __name__ == "__main__":
    main()
