# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Global Indicators
# MAGIC This notebook performs an ETL process on global indicators data using PySpark.

# COMMAND ----------

import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, lit, avg, col, broadcast

# COMMAND ----------

# Module-level logger for this notebook; the root logger is configured to emit INFO and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Load Data from Unity Catalog Tables

# COMMAND ----------

def main():
    """Run the global-indicators ETL end to end.

    Steps, in order:
      1. Load ``catalog.global_world_indicators_2000`` and
         ``catalog.ConsumerPriceIndices``.
      2. Clean the indicators data (drop / cast / remap / add / rename columns).
      3. Keep rows with ``Year >= 2000`` and ``1000 < GDP < 5000``.
      4. Build an average-GDP-per-country aggregate and an unpivoted view
         (both are currently built but not persisted).
      5. Inner-join the indicators with the ``CPI`` column of the price-index
         table on ``Country/Region`` and add ``Year_Extracted``.
      6. Overwrite the Delta table ``catalog.Global_Indicators`` with the result.

    Returns:
        None.

    Raises:
        Exception: any failure is logged with its traceback and then
            re-raised, so the calling job/notebook run is marked as failed
            instead of silently reporting success.
    """
    # NOTE: "# COMMAND ----------" markers are deliberately not used inside this
    # function; Databricks treats them as cell boundaries, which would split
    # the function definition across cells.
    try:
        # Reuse the already-running session (e.g. the one a notebook provides)
        # or create one when this file is executed as a plain script.
        spark = SparkSession.builder.getOrCreate()

        # --- Load ---------------------------------------------------------
        logger.info("Loading data from Unity Catalog tables.")
        global_indicators_df = spark.table("catalog.global_world_indicators_2000")
        consumer_price_indices_df = spark.table("catalog.ConsumerPriceIndices")

        # --- Remove columns -----------------------------------------------
        logger.info("Removing unnecessary columns.")
        # Replace with actual column names to be removed
        global_indicators_df = global_indicators_df.drop("UnwantedColumn1", "UnwantedColumn2")

        # --- Change column type -------------------------------------------
        logger.info("Changing column types.")
        global_indicators_df = global_indicators_df.withColumn("GDP", col("GDP").cast("double"))

        # --- Remap values -------------------------------------------------
        logger.info("Remapping column values.")
        global_indicators_df = global_indicators_df.withColumn(
            "Ease_of_Business",
            when(col("Ease_of_Business") == "OldValue", "NewValue")
            .otherwise(col("Ease_of_Business")),
        )

        # --- Add column ---------------------------------------------------
        logger.info("Adding new columns.")
        global_indicators_df = global_indicators_df.withColumn("NewColumn", lit("DefaultValue"))

        # --- Rename column ------------------------------------------------
        logger.info("Renaming columns.")
        # Replace with actual column names to be renamed
        global_indicators_df = global_indicators_df.withColumnRenamed("OldColumnName", "NewColumnName")

        # --- Filters ------------------------------------------------------
        logger.info("Applying filters to data.")
        global_indicators_df = global_indicators_df.filter(col("Year") >= 2000)

        logger.info("Applying range filters.")
        # Both bounds are exclusive.
        global_indicators_df = global_indicators_df.filter(
            (col("GDP") > 1000) & (col("GDP") < 5000)
        )

        # --- Aggregate ----------------------------------------------------
        logger.info("Performing aggregation.")
        # NOTE(review): aggregated_df is built but never written or returned.
        aggregated_df = global_indicators_df.groupBy("Country/Region").agg(
            avg("GDP").alias("Average_GDP")
        )

        # --- Unpivot ------------------------------------------------------
        logger.info("Unpivoting data.")
        # selectExpr parses its arguments as SQL expressions, so the column
        # name must be backtick-quoted; unquoted it reads as Country / Region.
        # NOTE(review): stack() needs the three value columns to share a
        # compatible type - confirm Health_Exp_Percent_GDP and CO2_Emissions
        # are numeric. unpivoted_df is also built but never written.
        unpivoted_df = global_indicators_df.selectExpr(
            "`Country/Region`",
            "stack(3, 'GDP', GDP, 'Health Exp % GDP', Health_Exp_Percent_GDP, 'CO2 Emissions', CO2_Emissions) as (Indicator, Value)",
        )

        # --- Join ---------------------------------------------------------
        logger.info("Joining datasets.")
        # Select only necessary columns to avoid conflicts
        consumer_price_indices_df = consumer_price_indices_df.select("Country/Region", "CPI")

        # Joining on the column *name* keeps a single "Country/Region" column
        # in the output. A Column equality condition would keep both copies,
        # and the Delta write below rejects duplicate column names.
        joined_df = global_indicators_df.join(
            broadcast(consumer_price_indices_df),
            on="Country/Region",
            how="inner",
        )

        # --- Custom calculations ------------------------------------------
        logger.info("Performing custom calculations.")
        # Column.substr positions are 1-based: take the first four characters.
        joined_df = joined_df.withColumn("Year_Extracted", col("Year").substr(1, 4))

        # --- Write --------------------------------------------------------
        logger.info("Writing the final output to Unity Catalog table.")
        joined_df.write.format("delta").mode("overwrite").saveAsTable("catalog.Global_Indicators")

        logger.info("ETL process completed successfully.")

    except Exception:
        # Log with traceback, then propagate so the run is reported as failed.
        logger.exception("An error occurred during the ETL process.")
        raise

# COMMAND ----------

# Entry point: run the ETL only when this file is executed as the main module,
# not when it is imported by other code.
if __name__ == "__main__":
    main()
