Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 65 additions & 10 deletions src/backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,8 @@ def can_use_s3_repository(self) -> tuple[bool, str | None]:

else:
if return_code != 0:
logger.error(f"Failed to run pgbackrest: {stderr}")
extracted_error = self._extract_error_message(stdout, stderr)
logger.error(f"Failed to run pgbackrest: {extracted_error}")
return False, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE

for stanza in json.loads(stdout):
Expand Down Expand Up @@ -369,6 +370,41 @@ def result():
)
return process.returncode, process.stdout.decode(), process.stderr.decode()

@staticmethod
def _extract_error_message(stdout: str, stderr: str) -> str:
"""Extract key error message from pgBackRest output.

Args:
stdout: Standard output from pgBackRest command.
stderr: Standard error from pgBackRest command.

Returns:
Extracted error message, prioritizing ERROR/WARN lines from output.
"""
combined_output = f"{stdout}\n{stderr}".strip()
if not combined_output:
return f"Unknown error occurred. Please check the logs at {PGBACKREST_LOGS_PATH}"

# Extract lines with ERROR or WARN markers from pgBackRest output
error_lines = []
for line in combined_output.splitlines():
if "ERROR:" in line or "WARN:" in line:
# Clean up the line by removing debug prefixes like "P00 ERROR:"
cleaned = re.sub(r"^.*?(ERROR:|WARN:)", r"\1", line).strip()
error_lines.append(cleaned)

# If we found error/warning lines, return them joined
if error_lines:
return " ".join(error_lines)

# Otherwise return the last non-empty line from stderr or stdout
if stderr.strip():
return stderr.strip().splitlines()[-1]
if stdout.strip():
return stdout.strip().splitlines()[-1]

return f"Unknown error occurred. Please check the logs at {PGBACKREST_LOGS_PATH}"

def _format_backup_list(self, backup_list) -> str:
"""Formats provided list of backups as a table."""
s3_parameters, _ = self._retrieve_s3_parameters()
Expand Down Expand Up @@ -417,7 +453,8 @@ def _generate_backup_list_output(self) -> str:
"--output=json",
])
if return_code != 0:
raise ListBackupsError(f"Failed to list backups with error: {stderr}")
extracted_error = self._extract_error_message(output, stderr)
raise ListBackupsError(f"Failed to list backups with error: {extracted_error}")

backups = json.loads(output)[0]["backup"]
for backup in backups:
Expand Down Expand Up @@ -490,7 +527,8 @@ def _list_backups(self, show_failed: bool, parse=True) -> dict[str, tuple[str, s
"--output=json",
])
if return_code != 0:
raise ListBackupsError(f"Failed to list backups with error: {stderr}")
extracted_error = self._extract_error_message(output, stderr)
raise ListBackupsError(f"Failed to list backups with error: {extracted_error}")

repository_info = next(iter(json.loads(output)), None)

Expand Down Expand Up @@ -525,7 +563,8 @@ def _list_timelines(self) -> dict[str, tuple[str, str]]:
"--output=json",
])
if return_code != 0:
raise ListBackupsError(f"Failed to list repository with error: {stderr}")
extracted_error = self._extract_error_message(output, stderr)
raise ListBackupsError(f"Failed to list repository with error: {extracted_error}")

repository = json.loads(output).items()
if repository is None:
Expand Down Expand Up @@ -692,15 +731,26 @@ def check_stanza(self) -> bool:
# for that or else the s3 initialization sequence will fail.
for attempt in Retrying(stop=stop_after_attempt(6), wait=wait_fixed(10), reraise=True):
with attempt:
return_code, _, stderr = self._execute_command([
return_code, stdout, stderr = self._execute_command([
PGBACKREST_EXECUTABLE,
PGBACKREST_CONFIGURATION_FILE,
f"--stanza={self.stanza_name}",
"check",
])
if return_code == 82:
# Raise an error if the archive command timeouts, so the user has the possibility
# to fix network issues and call juju resolve to re-trigger the hook that calls
# this method.
extracted_error = self._extract_error_message(stdout, stderr)
logger.error(
f"error: {extracted_error} - please fix the error and call juju resolve on this unit"
)
raise TimeoutError
if return_code != 0:
raise Exception(stderr)
self.charm._set_primary_status_message()
except TimeoutError as e:
raise e
except Exception as e:
# If the check command doesn't succeed, remove the stanza name
# and rollback the configuration.
Expand Down Expand Up @@ -747,15 +797,16 @@ def _is_primary_pgbackrest_service_running(self) -> bool:
if not self.charm.primary_endpoint:
logger.warning("Failed to contact pgBackRest TLS server: no primary endpoint")
return False
return_code, _, stderr = self._execute_command([
return_code, stdout, stderr = self._execute_command([
PGBACKREST_EXECUTABLE,
"server-ping",
"--io-timeout=10",
self.charm.primary_endpoint,
])
if return_code != 0:
extracted_error = self._extract_error_message(stdout, stderr)
logger.warning(
f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {stderr}"
f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {extracted_error}"
)
return return_code == 0

Expand Down Expand Up @@ -970,7 +1021,8 @@ def _run_backup(
f"backup/{self.stanza_name}/{backup_id}/backup.log",
s3_parameters,
)
error_message = f"Failed to backup PostgreSQL with error: {stderr}"
extracted_error = self._extract_error_message(stdout, stderr)
error_message = f"Failed to backup PostgreSQL with error: {extracted_error}"
logger.error(f"Backup failed: {error_message}")
event.fail(error_message)
else:
Expand Down Expand Up @@ -1125,7 +1177,7 @@ def _on_restore_action(self, event): # noqa: C901

# Remove previous cluster information to make it possible to initialise a new cluster.
logger.info("Removing previous cluster information")
return_code, _, stderr = self._execute_command(
return_code, stdout, stderr = self._execute_command(
[
"charmed-postgresql.patronictl",
"-c",
Expand All @@ -1137,7 +1189,10 @@ def _on_restore_action(self, event): # noqa: C901
timeout=10,
)
if return_code != 0:
error_message = f"Failed to remove previous cluster information with error: {stderr}"
extracted_error = self._extract_error_message(stdout, stderr)
error_message = (
f"Failed to remove previous cluster information with error: {extracted_error}"
)
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return
Expand Down
87 changes: 86 additions & 1 deletion tests/unit/test_backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ops.testing import Harness
from tenacity import RetryError, wait_fixed

from backups import ListBackupsError
from backups import ListBackupsError, PostgreSQLBackups
from charm import PostgresqlOperatorCharm
from constants import PEER

Expand All @@ -38,6 +38,82 @@ def harness():
harness.cleanup()


def test_extract_error_message_with_error_in_stdout():
"""Test extracting error from stdout with ERROR marker."""
stdout = """2025-11-07 07:21:11.120 P00 ERROR: [056]: unable to find primary cluster - cannot proceed
HINT: are all available clusters in recovery?"""
stderr = ""
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
assert result == "ERROR: [056]: unable to find primary cluster - cannot proceed"


def test_extract_error_message_with_error_in_stderr():
"""Test extracting error from stderr when no ERROR marker."""
stdout = ""
stderr = "Connection refused: cannot connect to S3"
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
assert result == "Connection refused: cannot connect to S3"


def test_extract_error_message_with_both_stdout_and_stderr():
"""Test extracting error when ERROR marker is in stdout and stderr has additional info."""
stdout = "P00 ERROR: database error occurred"
stderr = "Additional stderr info"
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
assert result == "ERROR: database error occurred"


def test_extract_error_message_with_warning():
"""Test extracting warning message from output."""
stdout = "P00 WARN: configuration issue detected"
stderr = ""
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
assert result == "WARN: configuration issue detected"


def test_extract_error_message_with_multiple_errors():
"""Test extracting multiple ERROR/WARN lines."""
stdout = """P00 ERROR: first error
P00 WARN: warning message
P00 ERROR: second error"""
stderr = ""
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
assert result == "ERROR: first error WARN: warning message ERROR: second error"


def test_extract_error_message_with_empty_output():
"""Test with empty stdout and stderr returns helpful message."""
result = PostgreSQLBackups._extract_error_message("", "")
assert (
result
== "Unknown error occurred. Please check the logs at /var/snap/charmed-postgresql/common/var/log/pgbackrest"
)


def test_extract_error_message_fallback_to_stdout_last_line():
"""Test fallback to last line of stdout when no ERROR/WARN markers."""
stdout = "Some generic output\nMore details here\nFinal error message"
stderr = ""
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
assert result == "Final error message"


def test_extract_error_message_fallback_to_stderr_last_line():
"""Test fallback to last line of stderr when no ERROR/WARN markers and no stdout."""
stdout = ""
stderr = "Line 1\nLine 2\nFinal error message"
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
assert result == "Final error message"


def test_extract_error_message_cleans_debug_prefix():
"""Test that debug prefixes like 'P00 ERROR:' are cleaned up."""
stdout = "2025-11-07 07:21:11.120 P00 ERROR: test error message"
stderr = ""
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
assert result == "ERROR: test error message"


def test_stanza_name(harness):
assert (
harness.charm.backup.stanza_name
Expand Down Expand Up @@ -814,6 +890,15 @@ def test_check_stanza(harness):
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE
)

# Test when the failure in the stanza check is due to an archive timeout.
_execute_command.reset_mock()
_s3_initialization_set_failure.reset_mock()
_execute_command.return_value = (82, "", "fake stderr")
with pytest.raises(TimeoutError):
harness.charm.backup.check_stanza()
assert False
_s3_initialization_set_failure.assert_not_called()

_execute_command.reset_mock()
_s3_initialization_set_failure.reset_mock()
_execute_command.return_value = (0, "fake stdout", "")
Expand Down