Address loose ends in versioned release mechanics (#3421)
* Silence verbose GCS/S3 terminal output in logs

* Rename column valid_till_date to valid_until_date

* Use a context manager when writing datastore archives

* Limit concurrency to 4 CPUs and don't collect coverage during validation

* Ignore Pydantic deprecation warning coming from mlflow

* Relock dependencies to include pandas 2.2.1 for testing

* Skip schema-based asset checks on core_epacems__hourly_emissions

* Place hold on versioned data to prevent accidental deletion

* Update release notes
zaneselvans committed Feb 25, 2024
1 parent c9e1f70 commit 730cb52
Showing 14 changed files with 511 additions and 385 deletions.
8 changes: 4 additions & 4 deletions Makefile
@@ -129,12 +129,12 @@ pytest-coverage: coverage-erase docs-build pytest-ci

.PHONY: pytest-integration-full
pytest-integration-full:
-pytest ${pytest_args} -n auto --live-dbs --etl-settings ${etl_full_yml} test/integration
+pytest ${pytest_args} -n 4 --no-cov --live-dbs --etl-settings ${etl_full_yml} test/integration

.PHONY: pytest-validate
pytest-validate:
pudl_check_fks
-pytest ${pytest_args} -n auto --live-dbs test/validate
+pytest ${pytest_args} -n 4 --no-cov --live-dbs test/validate

# Run the full ETL, generating new FERC & PUDL SQLite DBs and EPA CEMS Parquet files.
# Then run the full integration tests and data validations on all years of data.
@@ -144,8 +144,8 @@ pytest-validate:
.PHONY: nuke
nuke: coverage-erase docs-build pytest-unit ferc pudl
pudl_check_fks
-pytest ${pytest_args} -n auto --live-dbs --etl-settings ${etl_full_yml} test/integration
-pytest ${pytest_args} -n auto --live-dbs test/validate
+pytest ${pytest_args} -n 4 --live-dbs --etl-settings ${etl_full_yml} test/integration
+pytest ${pytest_args} -n 4 --live-dbs test/validate
coverage report

# Check that designated Jupyter notebooks can be run against the current DB
25 changes: 17 additions & 8 deletions docker/gcp_pudl_etl.sh
@@ -71,7 +71,7 @@ function run_pudl_etl() {

function save_outputs_to_gcs() {
echo "Copying outputs to GCP bucket $PUDL_GCS_OUTPUT" && \
-gsutil -m cp -r "$PUDL_OUTPUT" "$PUDL_GCS_OUTPUT" && \
+gsutil -q -m cp -r "$PUDL_OUTPUT" "$PUDL_GCS_OUTPUT" && \
rm -f "$PUDL_OUTPUT/success"
}

@@ -85,14 +85,14 @@ function upload_to_dist_path() {
# If the old outputs don't exist, these will exit with status 1, so we
# don't && them with the rest of the commands.
echo "Removing old outputs from $GCS_PATH."
-gsutil -m -u "$GCP_BILLING_PROJECT" rm -r "$GCS_PATH"
+gsutil -q -m -u "$GCP_BILLING_PROJECT" rm -r "$GCS_PATH"
echo "Removing old outputs from $AWS_PATH."
-aws s3 rm --recursive "$AWS_PATH"
+aws s3 rm --quiet --recursive "$AWS_PATH"

echo "Copying outputs to $GCS_PATH:" && \
-gsutil -m -u "$GCP_BILLING_PROJECT" cp -r "$PUDL_OUTPUT/*" "$GCS_PATH" && \
+gsutil -q -m -u "$GCP_BILLING_PROJECT" cp -r "$PUDL_OUTPUT/*" "$GCS_PATH" && \
echo "Copying outputs to $AWS_PATH" && \
-aws s3 cp --recursive "$PUDL_OUTPUT/" "$AWS_PATH"
+aws s3 cp --quiet --recursive "$PUDL_OUTPUT/" "$AWS_PATH"
else
echo "No distribution path provided. Not updating outputs."
exit 1
Expand All @@ -113,12 +113,12 @@ function distribute_parquet() {
DIST_PATH="$BUILD_REF"
fi
echo "Copying outputs to $PARQUET_BUCKET/$DIST_PATH" && \
-gsutil -m -u "$GCP_BILLING_PROJECT" cp -r "$PUDL_OUTPUT/parquet/*" "$PARQUET_BUCKET/$DIST_PATH"
+gsutil -q -m -u "$GCP_BILLING_PROJECT" cp -r "$PUDL_OUTPUT/parquet/*" "$PARQUET_BUCKET/$DIST_PATH"

# If running a tagged release, ALSO update the stable distribution bucket path:
if [[ "$GITHUB_ACTION_TRIGGER" == "push" && "$BUILD_REF" == v20* ]]; then
echo "Copying outputs to $PARQUET_BUCKET/stable" && \
-gsutil -m -u "$GCP_BILLING_PROJECT" cp -r "$PUDL_OUTPUT/parquet/*" "$PARQUET_BUCKET/stable"
+gsutil -q -m -u "$GCP_BILLING_PROJECT" cp -r "$PUDL_OUTPUT/parquet/*" "$PARQUET_BUCKET/stable"
fi
fi
}
@@ -175,6 +175,7 @@ function notify_slack() {
message+="DATASETTE_SUCCESS: $DATASETTE_SUCCESS\n"
message+="CLEAN_UP_OUTPUTS_SUCCESS: $CLEAN_UP_OUTPUTS_SUCCESS\n"
message+="DISTRIBUTION_BUCKET_SUCCESS: $DISTRIBUTION_BUCKET_SUCCESS\n"
+message+="GCLOUD_TEMPORARY_HOLD_SUCCESS: $GCLOUD_TEMPORARY_HOLD_SUCCESS \n"
message+="ZENODO_SUCCESS: $ZENODO_SUCCESS\n\n"

message+="*Query* logs on <https://console.cloud.google.com/batch/jobsDetail/regions/us-west1/jobs/run-etl-$BUILD_ID/logs?project=catalyst-cooperative-pudl|Google Batch Console>.\n\n"
@@ -228,6 +229,7 @@ DISTRIBUTE_PARQUET_SUCCESS=0
CLEAN_UP_OUTPUTS_SUCCESS=0
DISTRIBUTION_BUCKET_SUCCESS=0
ZENODO_SUCCESS=0
+GCLOUD_TEMPORARY_HOLD_SUCCESS=0

# Set these variables *only* if they are not already set by the container or workflow:
: "${PUDL_GCS_OUTPUT:=gs://builds.catalyst.coop/$BUILD_ID}"
@@ -281,10 +283,16 @@ if [[ $ETL_SUCCESS == 0 ]]; then
zenodo_data_release "$ZENODO_TARGET_ENV" 2>&1 | tee -a "$LOGFILE"
ZENODO_SUCCESS=${PIPESTATUS[0]}
fi
+# If running a tagged release, ensure that outputs can't be accidentally deleted
+# It's not clear that an object lock can be applied in S3 with the AWS CLI
+if [[ "$GITHUB_ACTION_TRIGGER" == "push" && "$BUILD_REF" == v20* ]]; then
+gcloud storage objects update "gs://pudl.catalyst.coop/$BUILD_REF/*" --temporary-hold 2>&1 | tee -a "$LOGFILE"
+GCLOUD_TEMPORARY_HOLD_SUCCESS=${PIPESTATUS[0]}
+fi
fi

# This way we also save the logs from latter steps in the script
-gsutil cp "$LOGFILE" "$PUDL_GCS_OUTPUT"
+gsutil -q cp "$LOGFILE" "$PUDL_GCS_OUTPUT"

# Notify slack about entire pipeline's success or failure;
if [[ $ETL_SUCCESS == 0 && \
Expand All @@ -295,6 +303,7 @@ if [[ $ETL_SUCCESS == 0 && \
$DISTRIBUTE_PARQUET_SUCCESS == 0 && \
$CLEAN_UP_OUTPUTS_SUCCESS == 0 && \
$DISTRIBUTION_BUCKET_SUCCESS == 0 && \
+$GCLOUD_TEMPORARY_HOLD_SUCCESS == 0 && \
$ZENODO_SUCCESS == 0
]]; then
notify_slack "success"
21 changes: 19 additions & 2 deletions docs/release_notes.rst
@@ -11,11 +11,18 @@ New Data Coverage
* Add EIA860M data through December 2023 :issue:`3313`, :pr:`3367`.
* Add 2023 Q4 of CEMS data. See :issue:`3315`, :pr:`3379`.
* Add EIA923 monthly data through November 2023 :issue:`3314`, :pr:`3398,3422`.
+* Create a new table :ref:`core_eia860m__changelog_generators` which tracks the
+  evolution of all generator data reported in the EIA860M, in particular the stated
+  retirement dates. See issue :issue:`3330` and PR :pr:`3331`. Previously only the most
+  recent month of reported EIA860M data was available within the PUDL DB.
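
As a rough illustration (not part of this commit) of how the new changelog table could
be used, the sketch below reads it from a locally built PUDL SQLite database; the
database path is a placeholder.

.. code-block:: python

    import sqlite3

    import pandas as pd

    # Placeholder path to a locally built PUDL SQLite DB.
    conn = sqlite3.connect("pudl.sqlite")

    # The changelog tracks how reported EIA860M generator attributes (notably
    # stated retirement dates) evolve over time, not just the most recent month.
    changelog = pd.read_sql(
        "SELECT * FROM core_eia860m__changelog_generators", conn
    )
    conn.close()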

Release Infrastructure
^^^^^^^^^^^^^^^^^^^^^^
* Use the same logic to merge version tags into the ``stable`` branch as we are using
to merge the nightly build tags into the ``nightly`` branch. See PR :pr:`3347`
+* Automatically place a `temporary object hold <https://cloud.google.com/storage/docs/holding-objects#use-object-holds>`__
+  on all versioned data releases that we publish to GCS, to ensure that they can't be
+  accidentally deleted. See issue :issue:`3400` and PR :pr:`3421`.
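
The release script applies this hold with ``gcloud storage objects update
--temporary-hold``. A rough Python equivalent using the ``google-cloud-storage``
client is sketched below; the version prefix is a placeholder, and the bucket name
is taken from the script above.

.. code-block:: python

    from google.cloud import storage

    BUCKET = "pudl.catalyst.coop"
    VERSION_PREFIX = "v2024.2.6/"  # placeholder for a tagged release like $BUILD_REF

    client = storage.Client()
    for blob in client.list_blobs(BUCKET, prefix=VERSION_PREFIX):
        blob.temporary_hold = True
        blob.patch()  # objects with a temporary hold can't be deleted or replaced
    # To release the hold later: set blob.temporary_hold = False and patch() again.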

Schema Changes
^^^^^^^^^^^^^^
@@ -27,6 +34,16 @@ Schema Changes
:ref:`out_ferc1__yearly_pumped_storage_plants_sched408`
See issue :issue:`3416` & PR :pr:`3417`

+Data Validation with Pandera
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+We've started integrating :mod:`pandera` dataframe schemas and checks with
+:mod:`dagster` `asset checks <https://docs.dagster.io/concepts/assets/asset-checks>`__
+to validate data while our ETL pipeline is running instead of only after all the data
+has been produced. Initially we are using the various database schema checks that are
+generated by our metadata, but the goal is to migrate all of our data validation tests
+into this framework over time, and to start using it to encode any new data validations
+immediately. See issues :issue:`941,1572,3318,3412` and PR :pr:`3282`.
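
A minimal sketch of this pattern is shown below. The asset, its columns, and the
schema are invented for illustration; PUDL's actual checks are generated from its
table metadata.

.. code-block:: python

    import pandas as pd
    import pandera as pa
    from dagster import AssetCheckResult, asset, asset_check

    # Hypothetical schema standing in for one generated from PUDL's metadata.
    example_schema = pa.DataFrameSchema(
        {
            "plant_id_eia": pa.Column(int, pa.Check.ge(1)),
            "fuel_cost_per_mmbtu": pa.Column(float, pa.Check.ge(0), nullable=True),
        },
        strict=False,
    )

    @asset
    def example_core_table() -> pd.DataFrame:
        """Stand-in for a core PUDL asset."""
        return pd.DataFrame(
            {"plant_id_eia": [1, 2], "fuel_cost_per_mmbtu": [2.5, None]}
        )

    @asset_check(asset=example_core_table)
    def example_schema_check(example_core_table: pd.DataFrame) -> AssetCheckResult:
        """Validate the dataframe while the pipeline runs, not after the fact."""
        try:
            example_schema.validate(example_core_table, lazy=True)
            return AssetCheckResult(passed=True)
        except pa.errors.SchemaErrors as err:
            return AssetCheckResult(
                passed=False, metadata={"failures": str(err.failure_cases)}
            )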

Pandas 2.2
^^^^^^^^^^
We've updated to Pandas 2.2, which has a number of changes and deprecations. See PRs
@@ -43,8 +60,8 @@ We've updated to Pandas 2.2, which has a number of changes and deprecations.
* We've switched to using the ``calamine`` engine for reading Excel files, which is
much faster than the old ``openpyxl`` library.
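
For illustration, reading an Excel file with the new engine looks like the snippet
below; the filename is a placeholder, and the optional ``python-calamine`` package
must be installed for this engine to be available.

.. code-block:: python

    import pandas as pd

    # pandas 2.2 can dispatch Excel parsing to the Rust-based calamine reader.
    df = pd.read_excel("eia860m_2023_12.xlsx", engine="calamine")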

-Parquet Output
-^^^^^^^^^^^^^^
+Parquet Outputs
+^^^^^^^^^^^^^^^
The ETL now outputs PyArrow Parquet files for all tables that are written to the PUDL
DB. The Parquet outputs are used as the interim storage for the ETL, rather than reading
all tables out of the SQLite DB. We aren't publicly distributing the Parquet outputs
47 changes: 24 additions & 23 deletions environments/conda-linux-64.lock.yml

Some generated files are not rendered by default.
