In [None]:
from config.config import java_home_location
from pyspark.sql import SparkSession

# Project helper imported from config.config; presumably prepares the Java
# home setting that Spark's JVM needs -- confirm in that module.
java_home_location()

# Reuse the already-active Spark session if there is one, otherwise build a
# new session with default settings.
spark = SparkSession.builder.getOrCreate()

# Bare expression: lets the notebook render the session as the cell output.
spark

In [None]:
from functools import reduce
from pathlib import Path
from pyspark.sql import functions as F
from CVM import *

def consolidate_year(
    spark: SparkSession,
    base_folder: Path,
    entity: str,
    year: str,
    acronyms: list
):
    """
    Consolidate every acronym folder of one entity/year into one DataFrame.

    Each existing ``base_folder / entity / year / acronym`` path is read as
    semicolon-separated, ISO-8859-1 encoded CSV with a header row and
    inferred column types. The result is tagged with ``YEAR``, ``ACRONYM``
    and ``ENTITY`` literal columns and unioned by column name; columns that
    are missing on one side of a union are filled with nulls.

    Args:
        spark: Active Spark session used to read the files.
        base_folder: Root folder of the data; a ``Path`` or a plain string.
        entity: Name of the entity sub-folder.
        year: Name of the year sub-folder. An ``int`` is also accepted and
            is converted to its string form.
        acronyms: Names of the acronym sub-folders to load.

    Returns:
        The combined DataFrame, or ``None`` when none of the paths exist
        (including when ``acronyms`` is empty).
    """
    # Normalise once so the path join works for str/int inputs (it used to
    # raise TypeError) and the YEAR column is a string whatever was passed.
    root = Path(base_folder)
    year = str(year)

    frames = []

    for acronym in acronyms:
        path = root / entity / year / acronym

        # pathlib checks the local filesystem; missing folders are reported
        # and skipped rather than aborting the whole consolidation.
        if not path.exists():
            print(f"[WARNING] Path not found: {path}")
            continue

        frame = (
            spark.read
                .format("csv")
                .option("header", "true")
                .option("sep", ";")
                .option("inferSchema", "true")
                .option("encoding", "ISO-8859-1")
                .load(str(path))
        )

        # Tag every row with where it came from so the origin survives the
        # union below.
        frames.append(
            frame
            .withColumn("YEAR", F.lit(year))
            .withColumn("ACRONYM", F.lit(acronym))
            .withColumn("ENTITY", F.lit(entity))
        )

    if not frames:
        return None

    # Union by column name because the files are not guaranteed to share
    # the same set of columns.
    return reduce(
        lambda left, right: left.unionByName(right, allowMissingColumns=True),
        frames
    )


In [None]:
def consolidate_all_years(
    spark: SparkSession,
    base_folder: Path,
    entity: str,
    years: list,
    acronyms: list
):
    """
    Build one DataFrame covering every requested year and acronym.

    Calls ``consolidate_year`` once per entry in ``years`` (in order), drops
    the years for which it returned ``None``, and unions the remaining
    DataFrames by column name, allowing columns to be missing on either side.

    Returns the combined DataFrame, or ``None`` when no year produced data.
    """
    # Lazily evaluated, so the per-year loads still run one by one in order.
    per_year = (
        consolidate_year(spark, base_folder, entity, year, acronyms)
        for year in years
    )
    available = [frame for frame in per_year if frame is not None]

    if len(available) == 0:
        return None

    def _merge(left, right):
        # Name-based union; columns absent from one side are tolerated.
        return left.unionByName(right, allowMissingColumns=True)

    return reduce(_merge, available)
