# In [None]:
# Databricks notebook source
# ETL Process for Global Indicators
# This notebook performs an ETL process on global indicators data using PySpark in Databricks.

# COMMAND ----------
import logging
from pyspark.sql.functions import col, when, lit, sum as spark_sum, expr, broadcast

# Configure root logging at INFO level and create the logger used by this notebook
logging.basicConfig(level="INFO")
logger = logging.getLogger(name=__name__)

# Assume the Spark session is already available as 'spark'

# COMMAND ----------
# ETL pipeline: load -> clean/transform -> join -> write.
#
# The whole pipeline is one try/except statement, so it has to live in a single
# notebook cell. Databricks splits a source file into cells at every
# "# COMMAND ----------" marker; a marker inside the try body would cut the
# statement in half, so the steps below are separated by plain comments only.
try:
    # --- Extract: load data from Unity Catalog tables ---
    logger.info("Loading data from Unity Catalog tables.")
    df_indicators = spark.table("catalog.source_db.global_world_indicators_2000")
    df_indices = spark.table("catalog.source_db.ConsumerPriceIndices")

    # --- Transformation: Remove Columns ---
    logger.info("Removing unnecessary columns.")
    df_indicators = df_indicators.drop("UnnecessaryColumn")  # Replace with actual column name

    # --- Transformation: Change Column Type ---
    logger.info("Changing column types.")
    df_indicators = df_indicators.withColumn("GDP", col("GDP").cast("double"))

    # --- Transformation: Remap ---
    # "Easy" becomes "1"; every other value (including NULL) becomes "0".
    logger.info("Remapping values in 'Ease of Business' column.")
    df_indicators = df_indicators.withColumn(
        "Ease of Business",
        when(col("Ease of Business") == "Easy", "1").otherwise("0")
    )

    # --- Transformation: Add Column ---
    logger.info("Adding new column with default value.")
    df_indicators = df_indicators.withColumn("NewColumn", lit("DefaultValue"))

    # --- Transformation: Rename Column ---
    logger.info("Renaming columns.")
    df_indicators = df_indicators.withColumnRenamed("OldColumnName", "NewColumnName")  # Replace with actual column names

    # --- Transformation: Filter Operation ---
    logger.info("Applying filter operation on 'Year' column.")
    df_indicators = df_indicators.filter(col("Year") > 2000)

    # --- Transformation: Range Filter (exclusive bounds on both sides) ---
    logger.info("Applying range filter on 'GDP' column.")
    df_indicators = df_indicators.filter((col("GDP") > 1000) & (col("GDP") < 5000))

    # --- Transformation: Aggregate ---
    # NOTE(review): df_aggregated is not used by the write step below; kept in
    # case a later cell reads it -- confirm.
    logger.info("Aggregating data by 'Country/Region'.")
    df_aggregated = df_indicators.groupBy("Country/Region").agg(spark_sum("GDP").alias("TotalGDP"))

    # --- Transformation: Unpivot ---
    # selectExpr parses each string as a SQL expression, so the column name
    # must be backtick-quoted; unquoted, "Country/Region" would be read as
    # the division Country / Region.
    # NOTE(review): assumes columns CO2_Emissions and Health_Exp_Percent_GDP
    # exist and share a type compatible with GDP (stack needs matching value
    # types) -- confirm against the source schema. df_unpivoted is likewise
    # not used by the write step below.
    logger.info("Unpivoting data.")
    df_unpivoted = df_indicators.selectExpr(
        "`Country/Region`",
        "stack(3, 'GDP', GDP, 'CO2 Emissions', CO2_Emissions, 'Health Exp % GDP', Health_Exp_Percent_GDP) as (Indicator, Value)"
    )

    # --- Transformation: Join Condition ---
    # Inner join on the shared 'Country/Region' column, with a broadcast hint
    # on the indices table.
    logger.info("Joining indicators and indices data.")
    df_joined = df_indicators.join(broadcast(df_indices), "Country/Region", "inner")

    # --- Custom Calculation: Extract first 4 characters from 'Year' ---
    # NOTE(review): if df_indices also has a 'Year' column, this reference is
    # ambiguous after the join -- confirm against the source schema.
    logger.info("Applying custom calculation to extract year.")
    df_joined = df_joined.withColumn("Year", expr("substring(Year, 1, 4)"))

    # --- Load: write the transformed data to a Unity Catalog target table ---
    logger.info("Writing transformed data to Unity Catalog target table.")
    df_joined.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.Global_Indicators")

    logger.info("ETL process completed successfully.")
except Exception:
    # Log the full traceback, then re-raise so the notebook/job run is marked
    # as failed instead of finishing as if the ETL had succeeded.
    logger.exception("An error occurred during the ETL process.")
    raise
