From 21e88da73a58b826d188b0592f80d0f8985349c0 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 2 Dec 2025 15:28:41 -0300 Subject: [PATCH 1/2] Catch archive command timeout Signed-off-by: Marcelo Henrique Neppel --- src/backups.py | 10 ++++++++++ tests/unit/test_backups.py | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/src/backups.py b/src/backups.py index 141b9cec6d..626d319319 100644 --- a/src/backups.py +++ b/src/backups.py @@ -698,9 +698,19 @@ def check_stanza(self) -> bool: f"--stanza={self.stanza_name}", "check", ]) + if return_code == 82: + # Raise an error if the archive command timeouts, so the user has the possibility + # to fix network issues and call juju resolve to re-trigger the hook that calls + # this method. + logger.error( + f"error: {stderr} - please fix the error and call juju resolve on this unit" + ) + raise TimeoutError if return_code != 0: raise Exception(stderr) self.charm._set_primary_status_message() + except TimeoutError as e: + raise e except Exception as e: # If the check command doesn't succeed, remove the stanza name # and rollback the configuration. diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index 18d5b73e0e..7cec03095b 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -814,6 +814,15 @@ def test_check_stanza(harness): FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE ) + # Test when the failure in the stanza check is due to an archive timeout. + _execute_command.reset_mock() + _s3_initialization_set_failure.reset_mock() + _execute_command.return_value = (82, "", "fake stderr") + with pytest.raises(TimeoutError): + harness.charm.backup.check_stanza() + assert False + _s3_initialization_set_failure.assert_not_called() + _execute_command.reset_mock() _s3_initialization_set_failure.reset_mock() _execute_command.return_value = (0, "fake stdout", "") From b6819f19d2c5e725517fa932dc65664f6f594ca9 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 2 Dec 2025 15:47:04 -0300 Subject: [PATCH 2/2] Extract warning and error messages from pgBackRest logs Sometimes the error message is sent to the stdout instead of the stderr, so it's important to check both Signed-off-by: Marcelo Henrique Neppel --- src/backups.py | 67 ++++++++++++++++++++++++++------ tests/unit/test_backups.py | 78 +++++++++++++++++++++++++++++++++++++- 2 files changed, 133 insertions(+), 12 deletions(-) diff --git a/src/backups.py b/src/backups.py index 626d319319..6a86815dc4 100644 --- a/src/backups.py +++ b/src/backups.py @@ -210,7 +210,8 @@ def can_use_s3_repository(self) -> tuple[bool, str | None]: else: if return_code != 0: - logger.error(f"Failed to run pgbackrest: {stderr}") + extracted_error = self._extract_error_message(stdout, stderr) + logger.error(f"Failed to run pgbackrest: {extracted_error}") return False, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE for stanza in json.loads(stdout): @@ -369,6 +370,41 @@ def result(): ) return process.returncode, process.stdout.decode(), process.stderr.decode() + @staticmethod + def _extract_error_message(stdout: str, stderr: str) -> str: + """Extract key error message from pgBackRest output. + + Args: + stdout: Standard output from pgBackRest command. + stderr: Standard error from pgBackRest command. + + Returns: + Extracted error message, prioritizing ERROR/WARN lines from output. + """ + combined_output = f"{stdout}\n{stderr}".strip() + if not combined_output: + return f"Unknown error occurred. Please check the logs at {PGBACKREST_LOGS_PATH}" + + # Extract lines with ERROR or WARN markers from pgBackRest output + error_lines = [] + for line in combined_output.splitlines(): + if "ERROR:" in line or "WARN:" in line: + # Clean up the line by removing debug prefixes like "P00 ERROR:" + cleaned = re.sub(r"^.*?(ERROR:|WARN:)", r"\1", line).strip() + error_lines.append(cleaned) + + # If we found error/warning lines, return them joined + if error_lines: + return " ".join(error_lines) + + # Otherwise return the last non-empty line from stderr or stdout + if stderr.strip(): + return stderr.strip().splitlines()[-1] + if stdout.strip(): + return stdout.strip().splitlines()[-1] + + return f"Unknown error occurred. Please check the logs at {PGBACKREST_LOGS_PATH}" + def _format_backup_list(self, backup_list) -> str: """Formats provided list of backups as a table.""" s3_parameters, _ = self._retrieve_s3_parameters() @@ -417,7 +453,8 @@ def _generate_backup_list_output(self) -> str: "--output=json", ]) if return_code != 0: - raise ListBackupsError(f"Failed to list backups with error: {stderr}") + extracted_error = self._extract_error_message(output, stderr) + raise ListBackupsError(f"Failed to list backups with error: {extracted_error}") backups = json.loads(output)[0]["backup"] for backup in backups: @@ -490,7 +527,8 @@ def _list_backups(self, show_failed: bool, parse=True) -> dict[str, tuple[str, s "--output=json", ]) if return_code != 0: - raise ListBackupsError(f"Failed to list backups with error: {stderr}") + extracted_error = self._extract_error_message(output, stderr) + raise ListBackupsError(f"Failed to list backups with error: {extracted_error}") repository_info = next(iter(json.loads(output)), None) @@ -525,7 +563,8 @@ def _list_timelines(self) -> dict[str, tuple[str, str]]: "--output=json", ]) if return_code != 0: - raise ListBackupsError(f"Failed to list repository with error: {stderr}") + extracted_error = self._extract_error_message(output, stderr) + raise ListBackupsError(f"Failed to list repository with error: {extracted_error}") repository = json.loads(output).items() if repository is None: @@ -692,7 +731,7 @@ def check_stanza(self) -> bool: # for that or else the s3 initialization sequence will fail. for attempt in Retrying(stop=stop_after_attempt(6), wait=wait_fixed(10), reraise=True): with attempt: - return_code, _, stderr = self._execute_command([ + return_code, stdout, stderr = self._execute_command([ PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, f"--stanza={self.stanza_name}", @@ -702,8 +741,9 @@ def check_stanza(self) -> bool: # Raise an error if the archive command timeouts, so the user has the possibility # to fix network issues and call juju resolve to re-trigger the hook that calls # this method. + extracted_error = self._extract_error_message(stdout, stderr) logger.error( - f"error: {stderr} - please fix the error and call juju resolve on this unit" + f"error: {extracted_error} - please fix the error and call juju resolve on this unit" ) raise TimeoutError if return_code != 0: @@ -757,15 +797,16 @@ def _is_primary_pgbackrest_service_running(self) -> bool: if not self.charm.primary_endpoint: logger.warning("Failed to contact pgBackRest TLS server: no primary endpoint") return False - return_code, _, stderr = self._execute_command([ + return_code, stdout, stderr = self._execute_command([ PGBACKREST_EXECUTABLE, "server-ping", "--io-timeout=10", self.charm.primary_endpoint, ]) if return_code != 0: + extracted_error = self._extract_error_message(stdout, stderr) logger.warning( - f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {stderr}" + f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {extracted_error}" ) return return_code == 0 @@ -980,7 +1021,8 @@ def _run_backup( f"backup/{self.stanza_name}/{backup_id}/backup.log", s3_parameters, ) - error_message = f"Failed to backup PostgreSQL with error: {stderr}" + extracted_error = self._extract_error_message(stdout, stderr) + error_message = f"Failed to backup PostgreSQL with error: {extracted_error}" logger.error(f"Backup failed: {error_message}") event.fail(error_message) else: @@ -1135,7 +1177,7 @@ def _on_restore_action(self, event): # noqa: C901 # Remove previous cluster information to make it possible to initialise a new cluster. logger.info("Removing previous cluster information") - return_code, _, stderr = self._execute_command( + return_code, stdout, stderr = self._execute_command( [ "charmed-postgresql.patronictl", "-c", @@ -1147,7 +1189,10 @@ def _on_restore_action(self, event): # noqa: C901 timeout=10, ) if return_code != 0: - error_message = f"Failed to remove previous cluster information with error: {stderr}" + extracted_error = self._extract_error_message(stdout, stderr) + error_message = ( + f"Failed to remove previous cluster information with error: {extracted_error}" + ) logger.error(f"Restore failed: {error_message}") event.fail(error_message) return diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index 7cec03095b..f6dddeb6a0 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -14,7 +14,7 @@ from ops.testing import Harness from tenacity import RetryError, wait_fixed -from backups import ListBackupsError +from backups import ListBackupsError, PostgreSQLBackups from charm import PostgresqlOperatorCharm from constants import PEER @@ -38,6 +38,82 @@ def harness(): harness.cleanup() +def test_extract_error_message_with_error_in_stdout(): + """Test extracting error from stdout with ERROR marker.""" + stdout = """2025-11-07 07:21:11.120 P00 ERROR: [056]: unable to find primary cluster - cannot proceed + HINT: are all available clusters in recovery?""" + stderr = "" + result = PostgreSQLBackups._extract_error_message(stdout, stderr) + assert result == "ERROR: [056]: unable to find primary cluster - cannot proceed" + + +def test_extract_error_message_with_error_in_stderr(): + """Test extracting error from stderr when no ERROR marker.""" + stdout = "" + stderr = "Connection refused: cannot connect to S3" + result = PostgreSQLBackups._extract_error_message(stdout, stderr) + assert result == "Connection refused: cannot connect to S3" + + +def test_extract_error_message_with_both_stdout_and_stderr(): + """Test extracting error when ERROR marker is in stdout and stderr has additional info.""" + stdout = "P00 ERROR: database error occurred" + stderr = "Additional stderr info" + result = PostgreSQLBackups._extract_error_message(stdout, stderr) + assert result == "ERROR: database error occurred" + + +def test_extract_error_message_with_warning(): + """Test extracting warning message from output.""" + stdout = "P00 WARN: configuration issue detected" + stderr = "" + result = PostgreSQLBackups._extract_error_message(stdout, stderr) + assert result == "WARN: configuration issue detected" + + +def test_extract_error_message_with_multiple_errors(): + """Test extracting multiple ERROR/WARN lines.""" + stdout = """P00 ERROR: first error +P00 WARN: warning message +P00 ERROR: second error""" + stderr = "" + result = PostgreSQLBackups._extract_error_message(stdout, stderr) + assert result == "ERROR: first error WARN: warning message ERROR: second error" + + +def test_extract_error_message_with_empty_output(): + """Test with empty stdout and stderr returns helpful message.""" + result = PostgreSQLBackups._extract_error_message("", "") + assert ( + result + == "Unknown error occurred. Please check the logs at /var/snap/charmed-postgresql/common/var/log/pgbackrest" + ) + + +def test_extract_error_message_fallback_to_stdout_last_line(): + """Test fallback to last line of stdout when no ERROR/WARN markers.""" + stdout = "Some generic output\nMore details here\nFinal error message" + stderr = "" + result = PostgreSQLBackups._extract_error_message(stdout, stderr) + assert result == "Final error message" + + +def test_extract_error_message_fallback_to_stderr_last_line(): + """Test fallback to last line of stderr when no ERROR/WARN markers and no stdout.""" + stdout = "" + stderr = "Line 1\nLine 2\nFinal error message" + result = PostgreSQLBackups._extract_error_message(stdout, stderr) + assert result == "Final error message" + + +def test_extract_error_message_cleans_debug_prefix(): + """Test that debug prefixes like 'P00 ERROR:' are cleaned up.""" + stdout = "2025-11-07 07:21:11.120 P00 ERROR: test error message" + stderr = "" + result = PostgreSQLBackups._extract_error_message(stdout, stderr) + assert result == "ERROR: test error message" + + def test_stanza_name(harness): assert ( harness.charm.backup.stanza_name