Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ferc2 full etl integration test issues #2652

Merged
merged 4 commits on Jun 10, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 8 additions & 8 deletions src/pudl/package_data/ferc2/dbc_file_map.csv
@@ -1,12 +1,12 @@
year,path
1996,UPLOADERS/FORM2/tmpwork/F2_PUB.DBC
1997,UPLOADERS/FORM2/tmpwork/F2_PUB.DBC
1998,UPLOADERS/FORM2/tmpwork/F2_PUB.DBC
1999,UPLOADERS/FORM2/tmpwork/F2_PUB.DBC
2000,UPLOADERS/FORM2/tmpwork/F2_PUB.DBC
2001,UPLOADERS/FORM2/tmpwork/F2_PUB.DBC
2002,UPLOADERS/FORM2/tmpwork/F2_PUB.DBC
2003,UPLOADERS/FORM2/tmpwork/F2_PUB.DBC
1996,FORMSADMIN/FORM2/tmpwork/F2_PUB.DBC
1997,FORMSADMIN/FORM2/tmpwork/F2_PUB.DBC
1998,FORMSADMIN/FORM2/tmpwork/F2_PUB.DBC
1999,FORMSADMIN/FORM2/tmpwork/F2_PUB.DBC
2000,FORMSADMIN/FORM2/tmpwork/F2_PUB.DBC
2001,FORMSADMIN/FORM2/tmpwork/F2_PUB.DBC
2002,FORMSADMIN/FORM2/tmpwork/F2_PUB.DBC
2003,FORMSADMIN/FORM2/tmpwork/F2_PUB.DBC
Comment on lines -2 to +9
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be easy to raise an exception in the case of invalid / non-existent DBC paths rather than silently failing?

2004,UPLOADERS/FORM2/working/F2_PUB.DBC
2005,UPLOADERS/FORM2/working/F2_PUB.DBC
2006,UPLOADERS/FORM2/working/F2_PUB.DBC
Expand Down
35 changes: 28 additions & 7 deletions test/integration/ferc_dbf_extract_test.py
Expand Up @@ -6,6 +6,8 @@
import sqlalchemy as sa

from pudl.extract.dbf import FercDbfReader
from pudl.extract.ferc1 import Ferc1DbfExtractor
from pudl.extract.ferc2 import Ferc2DbfExtractor

logger = logging.getLogger(__name__)

Expand All @@ -19,13 +21,13 @@ def test_ferc1_dbf2sqlite(ferc1_engine_dbf):


@pytest.mark.parametrize(
"dataset",
"extractor_class",
[
pytest.param("ferc1", id="ferc1"),
pytest.param("ferc2", id="ferc2"),
pytest.param(Ferc1DbfExtractor, id="ferc1"),
pytest.param(Ferc2DbfExtractor, id="ferc2"),
],
)
def test_ferc_schema(ferc_to_sqlite_settings, pudl_datastore_fixture, dataset):
def test_ferc_schema(ferc_to_sqlite_settings, pudl_datastore_fixture, extractor_class):
"""Check to make sure we aren't missing any old FERC Form N tables or fields.

Exhaustively enumerate all historical sets of FERC Form N database tables and their
Expand All @@ -34,6 +36,7 @@ def test_ferc_schema(ferc_to_sqlite_settings, pudl_datastore_fixture, dataset):
mapping from 2015, includes every single table and field that appears in the
historical FERC Form 1 data.
"""
dataset = extractor_class.DATASET
dbf_settings = getattr(ferc_to_sqlite_settings, f"{dataset}_dbf_to_sqlite_settings")
refyear = dbf_settings.refyear
dbf_reader = FercDbfReader(pudl_datastore_fixture, dataset=dataset)
Expand All @@ -47,11 +50,29 @@ def test_ferc_schema(ferc_to_sqlite_settings, pudl_datastore_fixture, dataset):
f"New {dataset} table '{table}' in {refyear} "
f"does not exist in canonical list of tables"
)

# Retrieve all supported partitions for the dataset
descriptor = pudl_datastore_fixture.get_datapackage_descriptor(dataset)
parts = list(descriptor.get_partition_filters(data_format="dbf"))
for yr in dbf_settings.years:
# Check that for each year in the settings, there are partitions defined.
yr_parts = [p for p in parts if p.get("year", None) == yr]
if not yr_parts:
logger.debug(f"Partitions supported by {dataset} are: {parts}")
raise AssertionError(f"No partitions found for {dataset} in year {yr}.")
# Check that validation function picks exactly one partition for each year.
yr_valid_parts = [p for p in yr_parts if extractor_class.is_valid_partition(p)]
if len(yr_valid_parts) != 1:
logger.debug(
f"Filter for {dataset} for year {yr} is: {yr_valid_parts} "
f"(from {yr_parts})"
)
raise AssertionError(
f"is_valid_partition() function for {dataset} "
f"should select exactly one partition for year {yr}."
)
logger.info(f"Searching for lost {dataset} tables and fields in {yr}.")
# Some early years might need part=None to eliminate split-respondent
# strange archives, but let's assume this is not needed here for now.
yr_archive = dbf_reader.get_archive(year=yr, data_format="dbf")
yr_archive = dbf_reader.get_archive(**yr_valid_parts[0])
for table in yr_archive.get_db_schema():
if table not in dbf_reader.get_table_names():
raise AssertionError(
Expand Down