## EPACEMS Polarization Regression Testing

In [None]:
import polars as pl
from polars.testing import assert_frame_equal

from pudl.workspace.setup import PudlPaths

### Scan table from nightly builds and local workspace
Requires that you've materialized the latest version of the `core_epacems__hourly_emissions` asset.

In [None]:
# URL of the table as published by the nightly builds (fetched over HTTPS from S3).
nightly_path = "https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/nightly/core_epacems__hourly_emissions.parquet"
# Path PudlPaths reports for the same table in the local workspace.
dev_path = PudlPaths().parquet_path("core_epacems__hourly_emissions")

# scan_parquet builds a lazy query; no data is read until it is collected.
nightly_lf = pl.scan_parquet(nightly_path)
# NOTE: `dev_path` is rebound here from the parquet path to the lazy frame
# scanning it. The comparison cell below uses `dev_path` as a LazyFrame.
dev_path = pl.scan_parquet(dev_path)

### Compare old and new versions
Compare one year at a time to avoid memory issues. Currently, enabling `check_dtypes` leads to failures due to a mismatch between string and categorical columns, so it is disabled below. If we update the `Resource.enforce_schema` method to support Polars, we could make it much easier to avoid type mismatches like this.

In [None]:
import traceback

# Columns used to put both frames into the same row order before comparing
# (presumably the table's primary key -- confirm against the table schema).
pk_cols = [
    "plant_id_epa",
    "emissions_unit_id_epa",
    "operating_datetime_utc",
]

# `dev_path` is the LazyFrame over the local table at this point, not a path.
# unique() makes no ordering guarantee, so sort to check (and log) the years
# in a reproducible order. Only years present in the dev table are compared.
dev_years = dev_path.select("year").unique().sort("year").collect()

for (year,) in dev_years.iter_rows():
    print(f"Checking frames equal for year: {year}")
    try:
        # Filtering to one year at a time keeps the collected data small.
        assert_frame_equal(
            dev_path.filter(pl.col("year") == year).sort(by=pk_cols),
            nightly_lf.filter(pl.col("year") == year).sort(by=pk_cols),
            check_column_order=False,
            # The nightly version has categorical cols where dev has string cols
            # Need to modify existing schema enforcement to support Polars
            check_dtypes=False,
        )
    except AssertionError:
        # Report the mismatch and keep going so every year gets checked.
        print(f"Failed validation for year: {year}")
        traceback.print_exc()