# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Global Indicators
# MAGIC This notebook performs an ETL process on global indicators data using PySpark in Databricks.

# COMMAND ----------

import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, sum as spark_sum

# Logging setup: create this module's logger, then configure the root
# logger to emit records at INFO level and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# NOTE: the whole ETL intentionally lives in a single notebook cell. The
# Databricks COMMAND separator comment marks a cell boundary, so placing one
# inside the try block would split the try/except across cells.
try:
    # Step 1: Data Source Configuration
    # Nothing is registered here; the source tables are assumed to already
    # exist in Unity Catalog.
    logger.info("Registering data sources in Databricks Unity Catalog.")

    # Step 2: Data Ingestion
    # `spark` is not defined in this file; it is expected to be supplied by
    # the notebook runtime.
    logger.info("Loading data from Unity Catalog tables.")
    indicators_df = spark.table("catalog.global_world_indicators_2000")
    indices_df = spark.table("catalog.ConsumerPriceIndices")

    # Step 3: Data Transformation
    logger.info("Applying transformations to the data.")

    # Remove Columns
    indicators_df = indicators_df.drop("UnnecessaryColumn")

    # Change Column Type
    indicators_df = indicators_df.withColumn("GDP", col("GDP").cast("double"))

    # Remap: "Easy" becomes "1", every other value (including null) becomes "0"
    indicators_df = indicators_df.withColumn(
        "Ease of Business",
        when(col("Ease of Business") == "Easy", "1").otherwise("0"),
    )

    # Add Column
    indicators_df = indicators_df.withColumn("NewColumn", lit("DefaultValue"))

    # Rename Column
    indicators_df = indicators_df.withColumnRenamed("OldColumnName", "NewColumnName")

    # Filter Operation
    indicators_df = indicators_df.filter(col("Year") > 2000)

    # Range Filter (both bounds exclusive)
    indicators_df = indicators_df.filter((col("GDP") > 1000) & (col("GDP") < 5000))

    # Aggregate
    aggregated_df = indicators_df.groupBy("Country/Region").agg(
        spark_sum("GDP").alias("TotalGDP")
    )

    # Unpivot
    # selectExpr parses its arguments as SQL, so the key column must be
    # backtick-quoted; unquoted, "Country/Region" means Country divided by Region.
    # stack() needs every value argument to share one type; GDP is already a
    # double, so the other two inputs are cast to double as well.
    # NOTE(review): CO2_Emissions / Health_Exp_Percent_GDP must match the real
    # column names of the source table -- confirm against the schema.
    unpivoted_df = indicators_df.selectExpr(
        "`Country/Region`",
        "stack(3, "
        "'GDP', GDP, "
        "'CO2 Emissions', cast(CO2_Emissions as double), "
        "'Health Exp % GDP', cast(Health_Exp_Percent_GDP as double)"
        ") as (Indicator, Value)",
    )

    # Step 4: Custom Calculations
    # Applied before the join so the result actually reaches the written table.
    # Spark string positions are 1-based, so substr(1, 4) keeps the first four
    # characters. The result is a string column.
    logger.info("Performing custom calculations.")
    indicators_df = indicators_df.withColumn("Year", col("Year").substr(1, 4))

    # Join Condition
    # Joining on the column name keeps a single "Country/Region" column; an
    # expression join would keep both copies and the table write would be
    # rejected for duplicate column names.
    joined_df = indicators_df.join(indices_df, on="Country/Region", how="inner")

    # Step 5: Output Data Configuration
    # NOTE(review): some column names contain spaces (e.g. "Ease of Business");
    # confirm the target Delta table accepts such names.
    logger.info("Writing transformed data to Unity Catalog as a Delta table.")
    final_df = joined_df  # The joined DataFrame is the final output
    final_df.write.format("delta").mode("overwrite").saveAsTable("catalog.Global_Indicators")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log with traceback, then re-raise so the notebook/job run is reported as
    # failed instead of silently finishing after an error.
    logger.exception("An error occurred during the ETL process.")
    raise
