# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Global World Indicators
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables, transforming and joining datasets, and writing the results back to a target table.

# COMMAND ----------

import logging
from pyspark.sql.functions import col, lit, substring, expr

# Configure logging
# Request INFO-level output on the root logger and create this notebook's own
# logger, named after the current module (__name__).
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers installed, so the INFO level may not take effect in such an
# environment.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# COMMAND ----------

# ETL pipeline: load two Unity Catalog tables, transform the indicators table,
# join it with the consumer price indices and overwrite the target Delta table.
#
# NOTE: the whole pipeline is kept in ONE notebook cell on purpose. A
# "# COMMAND ----------" line splits the source into separately executed cells,
# and a try/except statement cannot span cells, so no separators appear inside
# the try block below.
try:
    # Step 1: Load Data from Unity Catalog Tables
    # `spark` is not defined in this file; presumably it is the SparkSession
    # injected by the notebook runtime.
    logger.info("Loading data from Unity Catalog tables.")
    global_world_indicators_df = spark.table("catalog.source_db.global_world_indicators_2000")
    consumer_price_indices_df = spark.table("catalog.source_db.ConsumerPriceIndices")

    # Step 2: Data Transformation
    logger.info("Starting data transformation process.")

    # Remove Columns
    columns_to_drop = ["UnnecessaryColumn1", "UnnecessaryColumn2"]  # Replace with actual column names
    global_world_indicators_df = global_world_indicators_df.drop(*columns_to_drop)

    # Change Column Type
    global_world_indicators_df = global_world_indicators_df.withColumn("GDP", col("GDP").cast("double"))

    # Add Column
    global_world_indicators_df = global_world_indicators_df.withColumn("NewColumn", lit("DefaultValue"))

    # Rename Column
    global_world_indicators_df = global_world_indicators_df.withColumnRenamed("OldColumnName", "NewColumnName")

    # Filter Operation
    # NOTE(review): "Year" is compared as a number here but treated as a string
    # by substring() in Step 3 -- confirm the column's actual type/format.
    global_world_indicators_df = global_world_indicators_df.filter(col("Year") > 2000)

    # Aggregate: total GDP per country/region (result column is "sum(GDP)").
    aggregated_df = global_world_indicators_df.groupBy("Country/Region").agg({"GDP": "sum"})

    # Unpivot: turn the three indicator columns into (Indicator, Value) rows.
    # selectExpr() parses each argument as a SQL expression, so the column name
    # must be backtick-quoted; unquoted, "Country/Region" would be read as the
    # division Country / Region.
    # NOTE(review): stack() needs the three value columns to share one type;
    # only GDP is cast to double above -- verify the other two.
    unpivoted_df = global_world_indicators_df.selectExpr(
        "`Country/Region`",
        "stack(3, 'GDP', GDP, 'CO2 Emissions', CO2_Emissions, 'Health Exp % GDP', Health_Exp_Percent_GDP) as (Indicator, Value)"
    )

    # Step 3: Custom Calculations
    # Keep only the first four characters of "Year" (the column becomes a string).
    logger.info("Applying custom calculations.")
    global_world_indicators_df = global_world_indicators_df.withColumn("Year", substring(col("Year"), 1, 4))

    # Step 4: Data Join
    # Join on the column NAME rather than on an equality of the two columns:
    # the name-based form keeps a single "Country/Region" column in the result,
    # whereas the equality form keeps both copies and the duplicated column
    # name makes the Delta write below fail.
    # NOTE(review): any other column names shared by both tables would still be
    # duplicated -- check the two schemas.
    logger.info("Joining data sources.")
    joined_df = global_world_indicators_df.join(
        consumer_price_indices_df,
        on="Country/Region",
        how="inner",
    )

    # Step 5: Output Data Configuration
    # Overwrites the target table with the joined result.
    logger.info("Writing transformed data to Unity Catalog target table.")
    joined_df.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.GlobalIndicators")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log the full traceback, then re-raise so the notebook/job run is reported
    # as failed instead of silently finishing after a swallowed error.
    logger.exception("An error occurred during the ETL process.")
    raise
