# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Global Indicators
# MAGIC This notebook performs an ETL process on global indicators data using PySpark.

# COMMAND ----------

import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, avg, expr, substring

# Initialize logging: request INFO-level output on the root logger and create
# the module-level logger that main() uses to report each ETL step.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Load Data from Unity Catalog Tables

# COMMAND ----------

def main():
    """Run the global-indicators ETL pipeline end to end.

    Steps: load the two source tables, clean and reshape the indicators data,
    filter it, inner-join it with the consumer price indices on
    ``Country/Region``, and overwrite the target Delta table.

    Relies on the module-level ``logger`` and on a ``spark`` session being
    available as a global; this function never creates one.

    The body is intentionally free of notebook cell-separator lines: a
    separator inside the function would split the ``try`` block across cells
    when the file is imported as a notebook.

    Raises:
        ValueError: If either DataFrame lacks the ``Country/Region`` join
            column.
        Exception: Any failure is logged with its traceback and then
            re-raised, so the calling job is reported as failed instead of
            silently finishing without writing the target table.
    """
    join_key = "Country/Region"

    try:
        # ---- Load data from Unity Catalog tables ----
        logger.info("Loading data from Unity Catalog tables.")
        indicators_df = spark.table("catalog.source_db.global_world_indicators_2000")
        cpi_df = spark.table("catalog.source_db.ConsumerPriceIndices")

        # ---- Data cleaning and transformation ----
        logger.info("Dropping unnecessary columns.")
        # DataFrame.drop() ignores names that are not in the schema, so no
        # explicit existence check is needed.
        indicators_df = indicators_df.drop("UnnecessaryColumn1", "UnnecessaryColumn2")
        cpi_df = cpi_df.drop("UnnecessaryColumn3", "UnnecessaryColumn4")

        logger.info("Changing column types.")
        if "Birth Rate" in indicators_df.columns:
            indicators_df = indicators_df.withColumn(
                "Birth Rate", col("Birth Rate").cast("double")
            )

        logger.info("Remapping values.")
        if "Region" in indicators_df.columns:
            indicators_df = indicators_df.withColumn(
                "Region",
                when(col("Region") == "OldValue", "NewValue").otherwise(col("Region")),
            )

        logger.info("Adding new columns.")
        indicators_df = indicators_df.withColumn("NewColumn", lit("DefaultValue"))

        logger.info("Renaming columns.")
        if "OldColumnName" in indicators_df.columns:
            indicators_df = indicators_df.withColumnRenamed("OldColumnName", "NewColumnName")

        # ---- Filtering and aggregation ----
        logger.info("Applying filters.")
        if "Year" in indicators_df.columns:
            indicators_df = indicators_df.filter(col("Year") > 2000)

        logger.info("Applying range filters.")
        if "GDP" in indicators_df.columns:
            indicators_df = indicators_df.filter((col("GDP") > 1000) & (col("GDP") < 5000))

        logger.info("Performing aggregation.")
        if join_key in indicators_df.columns and "GDP" in indicators_df.columns:
            # NOTE(review): this result is built but never written or returned.
            aggregated_df = indicators_df.groupBy(join_key).agg(
                avg("GDP").alias("Average GDP")
            )

        # ---- Unpivoting and joining DataFrames ----
        logger.info("Unpivoting data.")
        unpivot_columns = ["Birth Rate", "GDP", "CO2 Emissions"]
        if all(name in indicators_df.columns for name in [join_key, *unpivot_columns]):
            # stack() needs every value expression to share one type, so each
            # measure is cast to double.
            stack_args = ", ".join(
                f"'{name}', cast(`{name}` as double)" for name in unpivot_columns
            )
            # The key must be backtick-quoted: unquoted, the SQL parser reads
            # Country/Region as a division of two columns.
            # NOTE(review): this result is built but never written or returned.
            unpivoted_df = indicators_df.selectExpr(
                f"`{join_key}`",
                f"stack({len(unpivot_columns)}, {stack_args}) as (Indicator, Value)",
            )

        logger.info("Joining DataFrames.")
        missing_in = [
            label
            for label, frame in (("indicators", indicators_df), ("CPI", cpi_df))
            if join_key not in frame.columns
        ]
        if missing_in:
            raise ValueError(
                f"Join column {join_key!r} is missing from: {', '.join(missing_in)}"
            )
        # Joining on the column *name* keeps a single copy of the key in the
        # result. Dropping the key from the right side first would leave
        # nothing to join on.
        # NOTE(review): other column names shared by both tables would still
        # be duplicated in the result - confirm against the CPI schema.
        final_df = indicators_df.join(cpi_df, on=join_key, how="inner")

        # ---- Custom calculations and writing to Unity Catalog ----
        logger.info("Applying custom calculations.")
        if "Year" in final_df.columns:
            # Keep only the first four characters of Year.
            final_df = final_df.withColumn("Year", substring("Year", 1, 4))

        logger.info("Writing the final DataFrame to Unity Catalog table.")
        final_df.write.format("delta").mode("overwrite").saveAsTable(
            "catalog.target_db.GlobalIndicators"
        )

        logger.info("ETL process completed successfully.")

    except Exception:
        # Log with traceback, then propagate so the run is marked as failed.
        logger.exception("An error occurred during the ETL process.")
        raise

# Run the ETL only when this file is executed directly (module name is
# "__main__"), not when it is imported by other code.
if __name__ == "__main__":
    main()
