In [5]:
import polars as pl
from polars.exceptions import NoDataError
from glob import iglob
import os
import logging

# Configure the root logger once: INFO and above, each record prefixed with
# its timestamp and level name.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# Logger named after the current module, shared by the code that follows.
logger = logging.getLogger(__name__)

In [6]:
# Materialize every CSV path directly under databases/ so the list can be
# iterated (and inspected) more than once.
csv_files = [*iglob("databases/*.csv")]

In [7]:
# Split every source CSV into one file per distinct COMP value, written to
# partitioned_files/<COMP>/<source file name>. Files with no data are skipped
# with a warning.
for file in csv_files:
    logger.info("Processing file: %s", file)

    # Only the read is guarded by the NoDataError handler, so the try body is
    # limited to that call.
    try:
        df = pl.read_csv(file, has_header=True, encoding="latin1")
    except NoDataError:
        logger.warning("Empty file: %s", file)
        continue

    logger.info("Read %d rows from %s", len(df), file)

    # A null COMP never matches an equality filter; keeping it would only
    # create an empty CSV under a "None" directory, so nulls are dropped here.
    comps = df["COMP"].drop_nulls().unique().to_list()
    logger.info("Found %d unique COMP values", len(comps))

    for c in comps:
        # Equality, not `<=`: a partition must contain only the rows of its
        # own COMP value. The previous `<=` filter made each output cumulative,
        # duplicating rows across the per-COMP directories.
        partitioned_df = df.filter(pl.col("COMP") == c)
        filepath = os.path.join("partitioned_files", f"{c}")

        os.makedirs(filepath, exist_ok=True)
        filename = os.path.join(filepath, os.path.basename(file))

        partitioned_df.write_csv(filename)
        logger.debug("Wrote partition for COMP == %s to %s", c, filename)

2026-02-21 19:33:12,899 INFO Processing file: databases\Leitos_2009.csv
2026-02-21 19:33:12,992 INFO Read 89094 rows from databases\Leitos_2009.csv
2026-02-21 19:33:12,995 INFO Found 12 unique COMP values
2026-02-21 19:33:13,483 INFO Processing file: databases\Leitos_2012.csv
2026-02-21 19:33:13,567 INFO Read 87083 rows from databases\Leitos_2012.csv
2026-02-21 19:33:13,569 INFO Found 12 unique COMP values
2026-02-21 19:33:14,016 INFO Processing file: databases\Leitos_2015.csv
2026-02-21 19:33:14,080 INFO Read 83558 rows from databases\Leitos_2015.csv
2026-02-21 19:33:14,083 INFO Found 12 unique COMP values
2026-02-21 19:33:14,549 INFO Processing file: databases\Leitos_2018.csv
2026-02-21 19:33:14,620 INFO Read 82832 rows from databases\Leitos_2018.csv
2026-02-21 19:33:14,622 INFO Found 12 unique COMP values
2026-02-21 19:33:15,099 INFO Processing file: databases\Leitos_2021.csv
2026-02-21 19:33:15,165 INFO Read 85783 rows from databases\Leitos_2021.csv
2026-02-21 19:33:15,167 INFO Fou