Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update CEMS partitions to handle year-quarter files #3096

Merged
merged 44 commits into dev from cems-quarterly on Dec 12, 2023
Merged
Changes from 1 commit
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
3946f71
WIP move to year-quarter partitions
e-belfer Nov 24, 2023
e5d3ab4
WIP change extraction partitions and etl settings, fix unit tests
e-belfer Nov 29, 2023
f0d3d03
Resolve merge conflict and update settings_test
e-belfer Nov 30, 2023
78f1992
Merge branch 'dev' into cems-quarterly
e-belfer Nov 30, 2023
ec00457
Update conda-lock.yml and rendered conda environment files.
e-belfer Nov 30, 2023
77db673
Update integration tests to use quarter
e-belfer Nov 30, 2023
a0f729c
Update DOI to production
e-belfer Dec 4, 2023
9320d8a
Merge branch 'dev' into cems-quarterly
e-belfer Dec 4, 2023
748c09b
Merge branch 'cems-quarterly' of https://github.com/catalyst-cooperat…
e-belfer Dec 4, 2023
8e01373
Fix EPACEMS integration test
e-belfer Dec 4, 2023
58bffe3
Update conda-lock.yml and rendered conda environment files.
e-belfer Dec 4, 2023
610ef4c
Repartition row groups in monolith parquet, update integration test, …
e-belfer Dec 5, 2023
70c720a
Merge branch 'cems-quarterly' of https://github.com/catalyst-cooperat…
e-belfer Dec 5, 2023
64560bb
Drop year from fast ETL and add concurrency limiting
e-belfer Dec 6, 2023
e8c5542
Merge branch 'dev' into cems-quarterly
e-belfer Dec 6, 2023
2f567cf
Update conda-lock.yml and rendered conda environment files.
e-belfer Dec 6, 2023
9607248
Drop concurrency further and update integration test to use 2022 data
e-belfer Dec 6, 2023
441a9b5
Update conda-lock.yml and rendered conda environment files.
e-belfer Dec 6, 2023
7c895af
Merge branch 'dev' into cems-quarterly
cmgosnell Dec 7, 2023
f1cd9d9
point cems to a new (draft!) archive w/ year_quarter partitions
cmgosnell Dec 8, 2023
ca1b2e7
Update conda-lock.yml and rendered conda environment files.
cmgosnell Dec 8, 2023
e2f60b5
Merge branch 'dev' into cems-quarterly
cmgosnell Dec 8, 2023
078ae5b
Merge branch 'cems-quarterly' of github.com:catalyst-cooperative/pudl…
cmgosnell Dec 8, 2023
f01aef9
Merge branch 'cems-quarterly' into cems-year_quarters
cmgosnell Dec 8, 2023
99c3c37
Merge branch 'cems-year_quarters' of github.com:catalyst-cooperative/…
cmgosnell Dec 8, 2023
a0ddb5b
Update conda-lock.yml and rendered conda environment files.
cmgosnell Dec 8, 2023
e4188e2
Update conda-lock.yml and rendered conda environment files.
cmgosnell Dec 8, 2023
7197e64
Merge branch 'cems-quarterly' into cems-year_quarters
cmgosnell Dec 11, 2023
1815560
address pr concerns
cmgosnell Dec 11, 2023
5c7eaba
Merge branch 'cems-year_quarters' of github.com:catalyst-cooperative/…
cmgosnell Dec 11, 2023
1ac9e48
Merge branch 'dev' into cems-quarterly
cmgosnell Dec 11, 2023
51d9a03
Merge branch 'cems-quarterly' of github.com:catalyst-cooperative/pudl…
cmgosnell Dec 11, 2023
27c415d
Update conda-lock.yml and rendered conda environment files.
cmgosnell Dec 11, 2023
f33db7b
Merge branch 'cems-quarterly' into cems-year_quarters
cmgosnell Dec 11, 2023
55828de
Update conda-lock.yml and rendered conda environment files.
cmgosnell Dec 11, 2023
0db0d4a
Merge branch 'cems-quarterly' into cems-year_quarters
cmgosnell Dec 11, 2023
15c6069
Merge branch 'cems-year_quarters' of github.com:catalyst-cooperative/…
cmgosnell Dec 11, 2023
98861eb
Merge pull request #3139 from catalyst-cooperative/cems-year_quarters
cmgosnell Dec 11, 2023
fbd4689
add release notes for quarterly cems
cmgosnell Dec 11, 2023
84c5330
Fix some comments/docstrings; clarify Zenodo RECID regex
zaneselvans Dec 11, 2023
1c3e47a
Remove comment about epacems DOI being draft archive.
zaneselvans Dec 11, 2023
003f1d7
Adjust epacems output tests to reflect quarterly partitions.
zaneselvans Dec 12, 2023
bbef1e2
Merge branch 'dev' into cems-quarterly
zaneselvans Dec 12, 2023
bfe6203
add tests to cover a few uncovered lines
cmgosnell Dec 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
56 changes: 35 additions & 21 deletions test/integration/epacems_test.py
Expand Up @@ -10,8 +10,8 @@


@pytest.fixture(scope="module")
def epacems_year_and_state(etl_settings):
"""Find the year and state defined in pudl/package_data/settings/etl_*.yml."""
def epacems_year_and_quarter(etl_settings):
"""Find the year and quarter defined in pudl/package_data/settings/etl_*.yml."""
# the etl_settings data structure alternates dicts and lists so indexing is a pain.
return etl_settings.datasets.epacems

Expand All @@ -25,23 +25,23 @@ def epacems_parquet_path(
return epacems_io_manager(context)._base_path / "hourly_emissions_epacems.parquet"


def test_epacems_subset(epacems_year_and_state, epacems_parquet_path):
def test_epacems_subset(epacems_year_and_quarter, epacems_parquet_path):
"""Minimal integration test of epacems().

Check if it returns a DataFrame.
"""
if not epacems_year_and_state:
if not epacems_year_and_quarter:
pytest.skip("EPA CEMS not in settings file and so is not being tested.")
path = epacems_parquet_path
years = epacems_year_and_state.years
# Use only Idaho if multiple states are given
states = (
epacems_year_and_state.states
if len(epacems_year_and_state.states) == 1
else ["ID"]
years = epacems_year_and_quarter.years
# Use only the first quarter if multiple quarters are given
e-belfer marked this conversation as resolved.
Show resolved Hide resolved
quarters = (
epacems_year_and_quarter.quarters
if len(epacems_year_and_quarter.quarters) == 1
else [1]
)
actual = epacems(
columns=["gross_load_mw"], epacems_path=path, years=years, states=states
columns=["gross_load_mw"], epacems_path=path, years=years, quarters=quarters
)
assert isinstance(actual, dd.DataFrame) # nosec: B101
assert actual.shape[0].compute() > 0 # nosec: B101 n rows
Expand All @@ -52,30 +52,44 @@ def test_epacems_missing_partition(pudl_datastore_fixture):

Note that this should pass for both the Fast and Full ETL because the behavior
towards a missing file is identical."""
df = extract(year=1996, state="UT", ds=pudl_datastore_fixture)
df = extract(year=1996, quarter=1, ds=pudl_datastore_fixture)
epacems_res = Resource.from_id("hourly_emissions_epacems")
expected_cols = list(epacems_res.get_field_names())
assert df.shape[0] == 0 # Check that no rows of data are there
# Check that all columns expected of EPACEMS data are present.
assert sorted(df.columns) == sorted(expected_cols)


def test_epacems_subset_input_validation(epacems_year_and_state, epacems_parquet_path):
def test_epacems_subset_input_validation(
epacems_year_and_quarter, epacems_parquet_path
):
"""Check if invalid inputs raise exceptions."""
if not epacems_year_and_state:
if not epacems_year_and_quarter:
pytest.skip("EPA CEMS not in settings file and so is not being tested.")
path = epacems_parquet_path
valid_year = epacems_year_and_state.years[-1]
valid_state = epacems_year_and_state.states[-1]
valid_year = epacems_year_and_quarter.years[-1]
valid_quarter = epacems_year_and_quarter.quarters[-1]
valid_column = "gross_load_mw"

invalid_state = "confederacy"
invalid_quarter = 6
invalid_year = 1775
invalid_column = "clean_coal"
combos = [
{"years": [valid_year], "states": [valid_state], "columns": [invalid_column]},
{"years": [valid_year], "states": [invalid_state], "columns": [valid_column]},
{"years": [invalid_year], "states": [valid_state], "columns": [valid_column]},
{
"years": [valid_year],
"quarters": [valid_quarter],
"columns": [invalid_column],
},
{
"years": [valid_year],
"quarters": [invalid_quarter],
"columns": [valid_column],
},
{
"years": [invalid_year],
"quarters": [valid_quarter],
"columns": [valid_column],
},
]
for combo in combos:
with pytest.raises(ValueError):
Expand All @@ -89,7 +103,7 @@ def test_epacems_parallel(pudl_engine, epacems_parquet_path):
# monolithic outputs.
df = dd.read_parquet(
epacems_parquet_path,
filters=year_state_filter(years=[2020], states=["ME"]),
filters=year_state_filter(years=[2020], quarters=["ME"]),
index=False,
engine="pyarrow",
split_row_groups=True,
Expand Down