Skip to content

Commit

Permalink
Get type and value checks working on SQLite load
Browse files Browse the repository at this point in the history
* Added an explicit dtype dictionary in the SQLite load call to df.to_sql() that pulls
  the column types from the SQLAlchemy metadata object we've just used to create the
  database, so they should always line up.
* Turned on check_types and check_values in the integration tests.

Other minor stuff:
* Omitted deprecated datapackage modules from the test coverage reporting.
* Replaced instances of in-place list.sort() with sorted(list) in a few places, since
  it's idiomatically more similar to what we're used to with dataframes, and I've been
  bitten several times by in-place sorting returning None when I didn't expect it.

Closes: #1197
  • Loading branch information
zaneselvans committed Sep 17, 2021
1 parent 578ac85 commit 50c2d53
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 21 deletions.
8 changes: 4 additions & 4 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
[run]
omit =
*glue/zipper.py
*analysis/analysis.py
*analysis/demand_mapping.py
*epaipm.py
*convert/datapkg_to_sqlite.py # Deprecated with direct SQLite/Parquet ETL
*convert/merge_datapkgs.py # Deprecated with direct SQLite/Parquet ETL
*load/csv.py # Deprecated with direct SQLite/Parquet ETL
*load/metadata.py # Deprecated with direct SQLite/Parquet ETL
*__main__.py
*__init__.py
*_test.py
Expand Down
6 changes: 3 additions & 3 deletions src/pudl/convert/ferc1_to_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ def main(): # noqa: C901
f"({min(pc.data_years['ferc1'])}-"
f"{max(pc.data_years['ferc1'])})."
)
script_settings["ferc1_to_sqlite_years"] = list(
set(script_settings["ferc1_to_sqlite_years"]))
script_settings["ferc1_to_sqlite_years"].sort()
script_settings["ferc1_to_sqlite_years"] = sorted(
set(script_settings["ferc1_to_sqlite_years"])
)

# This field is optional and generally unused...
bad_cols = script_settings.get("ferc1_to_sqlite_bad_cols", ())
Expand Down
15 changes: 5 additions & 10 deletions src/pudl/etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,7 @@ def _validate_params_eia(etl_params):
# when nothing is set in the settings file, the years will default as none
eia_input_dict['eia860_years'] = etl_params.get('eia860_years', [])
# Ensure there are no duplicate years:
eia_input_dict["eia860_years"] = list(set(eia_input_dict["eia860_years"]))
eia_input_dict["eia860_years"].sort()
eia_input_dict["eia860_years"] = sorted(set(eia_input_dict["eia860_years"]))

# the tables will default to all of the tables if nothing is given
eia_input_dict['eia860_tables'] = etl_params.get(
Expand All @@ -91,8 +90,7 @@ def _validate_params_eia(etl_params):

eia_input_dict['eia923_years'] = etl_params.get('eia923_years', [])
# Ensure no duplicate years:
eia_input_dict['eia923_years'] = list(set(eia_input_dict['eia923_years']))
eia_input_dict['eia923_years'].sort()
eia_input_dict['eia923_years'] = sorted(set(eia_input_dict['eia923_years']))

eia_input_dict['eia923_tables'] = etl_params.get(
'eia923_tables', pc.pudl_tables['eia923']
Expand Down Expand Up @@ -256,8 +254,7 @@ def _validate_params_ferc1(etl_params):
# pull out the etl_params from the dictionary passed into this function
ferc1_dict['ferc1_years'] = etl_params.get('ferc1_years', [])
# Ensure there are no duplicate years by converting to set and back:
ferc1_dict["ferc1_years"] = list(set(ferc1_dict["ferc1_years"]))
ferc1_dict["ferc1_years"].sort()
ferc1_dict["ferc1_years"] = sorted(set(ferc1_dict["ferc1_years"]))

# the tables will default to all of the tables if nothing is given
ferc1_dict['ferc1_tables'] = etl_params.get(
Expand Down Expand Up @@ -415,10 +412,8 @@ def etl_epacems(etl_params, pudl_settings, ds_kwargs) -> None:
# Deduplicate the years and states just in case. This happens outside of
# the settings validation because this ETL function is also called directly
# in the epacems_to_parquet() script.
epacems_years = list(set(epacems_dict['epacems_years']))
epacems_years.sort()
epacems_states = list(set(epacems_dict['epacems_states']))
epacems_states.sort()
epacems_years = sorted(set(epacems_dict['epacems_years']))
epacems_states = sorted(set(epacems_dict['epacems_states']))

# If we're not doing CEMS, just stop here to avoid printing messages like
# "Reading EPA CEMS data...", which could be confusing.
Expand Down
3 changes: 1 addition & 2 deletions src/pudl/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,7 @@ def organize_cols(df, cols):
"""
# Generate a list of all the columns in the dataframe that are not
# included in cols
data_cols = [c for c in df.columns.tolist() if c not in cols]
data_cols.sort()
data_cols = sorted([c for c in df.columns.tolist() if c not in cols])
organized_cols = cols + data_cols
return df[organized_cols]

Expand Down
1 change: 1 addition & 0 deletions src/pudl/load/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,5 @@ def _set_sqlite_pragma(dbapi_connection, connection_record):
engine,
if_exists="append",
index=False,
dtype={c.name: c.type for c in table.columns},
)
4 changes: 2 additions & 2 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,8 @@ def pudl_sql_engine(
# These checks should all be True but there are bugs at the moment.
# See: https://github.com/catalyst-cooperative/pudl/issues/1196
check_foreign_keys=False,
check_types=False,
check_values=False
check_types=True,
check_values=True,
)
# Grab a connection to the freshly populated PUDL DB, and hand it off.
# All the hard work here is being done by the datapkg and
Expand Down

0 comments on commit 50c2d53

Please sign in to comment.