Skip to content

Commit

Permalink
Get type and value checks working on SQLite load
Browse files Browse the repository at this point in the history
* Added an explicit dtype dictionary in the SQLite load call to df.to_sql() that pulls
  the column types from the SQLAlchemy metadata object we've just used to create the
  database, so they should always line up.
* Turned on check_types and check_values in the integration tests.

Other minor stuff:
* Omitted deprecated datapackage modules from the test coverage reporting.
* Replaced instances of in-place list.sort() with sorted(list) in a few places, since
  it's idiomatically more similar to what we're used to with dataframes, and I've been
  bitten several times by in-place sorting returning None when I didn't expect it.

Closes: #1197
  • Loading branch information
zaneselvans committed Sep 17, 2021
1 parent 578ac85 commit 50c2d53
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 21 deletions.
8 changes: 4 additions & 4 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
[run]
omit =
*glue/zipper.py
*analysis/analysis.py
*analysis/demand_mapping.py
*epaipm.py
*convert/datapkg_to_sqlite.py # Deprecated with direct SQLite/Parquet ETL
*convert/merge_datapkgs.py # Deprecated with direct SQLite/Parquet ETL
*load/csv.py # Deprecated with direct SQLite/Parquet ETL
*load/metadata.py # Deprecated with direct SQLite/Parquet ETL
*__main__.py
*__init__.py
*_test.py
Expand Down
6 changes: 3 additions & 3 deletions src/pudl/convert/ferc1_to_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ def main(): # noqa: C901
f"({min(pc.data_years['ferc1'])}-"
f"{max(pc.data_years['ferc1'])})."
)
script_settings["ferc1_to_sqlite_years"] = list(
set(script_settings["ferc1_to_sqlite_years"]))
script_settings["ferc1_to_sqlite_years"].sort()
script_settings["ferc1_to_sqlite_years"] = sorted(
set(script_settings["ferc1_to_sqlite_years"])
)

# This field is optional and generally unused...
bad_cols = script_settings.get("ferc1_to_sqlite_bad_cols", ())
Expand Down
15 changes: 5 additions & 10 deletions src/pudl/etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,7 @@ def _validate_params_eia(etl_params):
# when nothing is set in the settings file, the years will default as none
eia_input_dict['eia860_years'] = etl_params.get('eia860_years', [])
# Ensure there are no duplicate years:
eia_input_dict["eia860_years"] = list(set(eia_input_dict["eia860_years"]))
eia_input_dict["eia860_years"].sort()
eia_input_dict["eia860_years"] = sorted(set(eia_input_dict["eia860_years"]))

# the tables will default to all of the tables if nothing is given
eia_input_dict['eia860_tables'] = etl_params.get(
Expand All @@ -91,8 +90,7 @@ def _validate_params_eia(etl_params):

eia_input_dict['eia923_years'] = etl_params.get('eia923_years', [])
# Ensure no duplicate years:
eia_input_dict['eia923_years'] = list(set(eia_input_dict['eia923_years']))
eia_input_dict['eia923_years'].sort()
eia_input_dict['eia923_years'] = sorted(set(eia_input_dict['eia923_years']))

eia_input_dict['eia923_tables'] = etl_params.get(
'eia923_tables', pc.pudl_tables['eia923']
Expand Down Expand Up @@ -256,8 +254,7 @@ def _validate_params_ferc1(etl_params):
# pull out the etl_params from the dictionary passed into this function
ferc1_dict['ferc1_years'] = etl_params.get('ferc1_years', [])
# Ensure there are no duplicate years by converting to set and back:
ferc1_dict["ferc1_years"] = list(set(ferc1_dict["ferc1_years"]))
ferc1_dict["ferc1_years"].sort()
ferc1_dict["ferc1_years"] = sorted(set(ferc1_dict["ferc1_years"]))

# the tables will default to all of the tables if nothing is given
ferc1_dict['ferc1_tables'] = etl_params.get(
Expand Down Expand Up @@ -415,10 +412,8 @@ def etl_epacems(etl_params, pudl_settings, ds_kwargs) -> None:
# Deduplicate the years and states just in case. This happens outside of
# the settings validation because this ETL function is also called directly
# in the epacems_to_parquet() script.
epacems_years = list(set(epacems_dict['epacems_years']))
epacems_years.sort()
epacems_states = list(set(epacems_dict['epacems_states']))
epacems_states.sort()
epacems_years = sorted(set(epacems_dict['epacems_years']))
epacems_states = sorted(set(epacems_dict['epacems_states']))

# If we're not doing CEMS, just stop here to avoid printing messages like
# "Reading EPA CEMS data...", which could be confusing.
Expand Down
3 changes: 1 addition & 2 deletions src/pudl/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,7 @@ def organize_cols(df, cols):
"""
# Generate a list of all the columns in the dataframe that are not
# included in cols
data_cols = [c for c in df.columns.tolist() if c not in cols]
data_cols.sort()
data_cols = sorted([c for c in df.columns.tolist() if c not in cols])
organized_cols = cols + data_cols
return df[organized_cols]

Expand Down
1 change: 1 addition & 0 deletions src/pudl/load/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,5 @@ def _set_sqlite_pragma(dbapi_connection, connection_record):
engine,
if_exists="append",
index=False,
dtype={c.name: c.type for c in table.columns},
)
4 changes: 2 additions & 2 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,8 @@ def pudl_sql_engine(
# These checks should all be True but there are bugs at the moment.
# See: https://github.com/catalyst-cooperative/pudl/issues/1196
check_foreign_keys=False,
check_types=False,
check_values=False
check_types=True,
check_values=True,
)
# Grab a connection to the freshly populated PUDL DB, and hand it off.
# All the hard work here is being done by the datapkg and
Expand Down

0 comments on commit 50c2d53

Please sign in to comment.