From 0437d47ee175bea23bd3b5ff73178e3ea6ed1032 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 3 Nov 2025 15:24:09 +0100 Subject: [PATCH 1/5] Reorder call to refresh logic and don't immediatelly exit hook when refresh is not ready Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 79 ++++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/src/charm.py b/src/charm.py index 3b918cf289..8be61c8b03 100755 --- a/src/charm.py +++ b/src/charm.py @@ -24,6 +24,7 @@ AuthorisationRulesChangeCharmEvents, AuthorisationRulesObserver, ) +from refresh import PostgreSQLRefresh # First platform-specific import, will fail on wrong architecture try: @@ -144,7 +145,6 @@ ) from ldap import PostgreSQLLDAP from patroni import NotReadyError, Patroni, SwitchoverFailedError, SwitchoverNotSyncError -from refresh import PostgreSQLRefresh from relations.async_replication import ( REPLICATION_CONSUMER_RELATION, REPLICATION_OFFER_RELATION, @@ -212,16 +212,6 @@ def __init__(self, *args): if isinstance(handler, JujuLogHandler): handler.setFormatter(logging.Formatter("{name}:{message}", style="{")) - # Support for disabling the operator. - disable_file = Path(f"{os.environ.get('CHARM_DIR')}/disable") - if disable_file.exists(): - logger.warning( - f"\n\tDisable file `{disable_file.resolve()}` found, the charm will skip all events." - "\n\tTo resume normal operations, please remove the file." - ) - self.set_unit_status(BlockedStatus("Disabled")) - sys.exit(0) - self.peer_relation_app = DataPeerData( self.model, relation_name=PEER, @@ -280,27 +270,12 @@ def __init__(self, *args): self.restart_manager = RollingOpsManager( charm=self, relation="restart", callback=self._restart ) - self._observer.start_authorisation_rules_observer() - self.grafana_dashboards = GrafanaDashboardProvider(self) - self.metrics_endpoint = MetricsEndpointProvider( - self, - refresh_event=[self.on.start], - jobs=self._generate_metrics_jobs(self.is_tls_enabled), - ) - self.loki_push = LogProxyConsumer( - self, - logs_scheme={"postgresql": {"log-files": POSTGRES_LOG_FILES}}, - relation_name="logging", - ) if self.model.juju_version.supports_open_port_on_k8s: try: self.unit.set_ports(5432, 8008) except ModelError: logger.exception("failed to open port") - self.tracing = TracingEndpointRequirer( - self, relation_name=TRACING_RELATION_NAME, protocols=[TRACING_PROTOCOL] - ) try: self.refresh = charm_refresh.Kubernetes( @@ -311,17 +286,24 @@ def __init__(self, *args): _charm=self, ) ) - except charm_refresh.KubernetesJujuAppNotTrusted: - sys.exit() - except charm_refresh.PeerRelationNotReady: - if self.unit.is_leader(): - self.app.status = MaintenanceStatus("Waiting for peer relation") - sys.exit() - except charm_refresh.UnitTearingDown: - self.set_unit_status(MaintenanceStatus("Tearing down")) - sys.exit() + except ( + charm_refresh.KubernetesJujuAppNotTrusted, + charm_refresh.PeerRelationNotReady, + charm_refresh.UnitTearingDown, + ): + self.refresh = None self._reconcile_refresh_status() + # Support for disabling the operator. + disable_file = Path(f"{os.environ.get('CHARM_DIR')}/disable") + if disable_file.exists(): + logger.warning( + f"\n\tDisable file `{disable_file.resolve()}` found, the charm will skip all events." + "\n\tTo resume normal operations, please remove the file." + ) + self.set_unit_status(BlockedStatus("Disabled")) + sys.exit(0) + if ( self.refresh is not None and self.refresh.workload_allowed_to_start @@ -332,6 +314,22 @@ def __init__(self, *args): else: self.refresh.next_unit_allowed_to_refresh = True + self._observer.start_authorisation_rules_observer() + self.grafana_dashboards = GrafanaDashboardProvider(self) + self.metrics_endpoint = MetricsEndpointProvider( + self, + refresh_event=[self.on.start], + jobs=self._generate_metrics_jobs(self.is_tls_enabled), + ) + self.loki_push = LogProxyConsumer( + self, + logs_scheme={"postgresql": {"log-files": POSTGRES_LOG_FILES}}, + relation_name="logging", + ) + self.tracing = TracingEndpointRequirer( + self, relation_name=TRACING_RELATION_NAME, protocols=[TRACING_PROTOCOL] + ) + def reconcile(self): """Reconcile the unit state on refresh.""" self.set_unit_status(MaintenanceStatus("starting services")) @@ -366,8 +364,9 @@ def reconcile(self): BlockedStatus("upgrade failed. Check logs for rollback instruction") ) else: - self.refresh.next_unit_allowed_to_refresh = True - self.set_unit_status(ActiveStatus()) + if self.refresh is not None: + self.refresh.next_unit_allowed_to_refresh = True + self.set_unit_status(ActiveStatus()) def _reconcile_refresh_status(self, _=None): if self.unit.is_leader(): @@ -1094,7 +1093,7 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None: return # Create resources and add labels needed for replication. - if not self.refresh.in_progress: + if self.refresh is not None and not self.refresh.in_progress: try: self._create_services() except ApiError: @@ -1296,7 +1295,7 @@ def _initialize_cluster(self, event: HookEvent) -> bool: return False # Create resources and add labels needed for replication - if not self.refresh.in_progress: + if self.refresh is not None and not self.refresh.in_progress: try: self._create_services() except ApiError: @@ -1608,7 +1607,7 @@ def _fix_pod(self) -> None: # Recreate k8s resources and add labels required for replication # when the pod loses them (like when it's deleted). self.push_tls_files_to_workload() - if not self.refresh.in_progress: + if self.refresh is not None and not self.refresh.in_progress: try: self._create_services() except ApiError: From 2dd2b23a66e4170b5269a71766320a967bb067e1 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 19 Nov 2025 09:02:59 -0300 Subject: [PATCH 2/5] Log more info when creating a backup and send SIGHUP when the pgBackRest is already active Signed-off-by: Marcelo Henrique Neppel --- src/backups.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/backups.py b/src/backups.py index 061949b1fc..029835d0e1 100644 --- a/src/backups.py +++ b/src/backups.py @@ -7,6 +7,7 @@ import logging import os import re +import signal import tempfile import time from datetime import UTC, datetime @@ -26,7 +27,7 @@ from ops.framework import Object from ops.jujuversion import JujuVersion from ops.model import ActiveStatus, MaintenanceStatus -from ops.pebble import ChangeError, ExecError +from ops.pebble import ChangeError, ExecError, ServiceStatus from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed from constants import ( @@ -820,6 +821,8 @@ def _on_create_backup_action(self, event) -> None: "pgbackrest", f"--stanza={self.stanza_name}", "--log-level-console=debug", + "--log-level-file=debug", + "--log-subprocess", f"--type={BACKUP_TYPE_OVERRIDES[backup_type]}", "backup", ] @@ -1303,7 +1306,18 @@ def start_stop_pgbackrest_service(self) -> bool: return False # Start the service. - self.container.restart(self.charm.pgbackrest_server_service) + services = self.container.pebble.get_services(names=[self.charm.pgbackrest_server_service]) + if len(services) == 0: + return False + + if services[0].current == ServiceStatus.ACTIVE: + logger.error("Sending SIGHUP to pgBackRest TLS server to reload configuration") + self.container.pebble.send_signal( + signal.SIGHUP, services=[self.charm.pgbackrest_server_service] + ) + else: + logger.error("Starting pgBackRest TLS server service") + self.container.restart(self.charm.pgbackrest_server_service) return True def _upload_content_to_s3( From 674e2a3addf19bb2d9ab72204626a997b94b32ff Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 19 Nov 2025 09:18:39 -0300 Subject: [PATCH 3/5] Change log level of messages to debug Signed-off-by: Marcelo Henrique Neppel --- src/backups.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backups.py b/src/backups.py index 029835d0e1..7b04cb1962 100644 --- a/src/backups.py +++ b/src/backups.py @@ -1311,12 +1311,12 @@ def start_stop_pgbackrest_service(self) -> bool: return False if services[0].current == ServiceStatus.ACTIVE: - logger.error("Sending SIGHUP to pgBackRest TLS server to reload configuration") + logger.debug("Sending SIGHUP to pgBackRest TLS server to reload configuration") self.container.pebble.send_signal( signal.SIGHUP, services=[self.charm.pgbackrest_server_service] ) else: - logger.error("Starting pgBackRest TLS server service") + logger.debug("Starting pgBackRest TLS server service") self.container.restart(self.charm.pgbackrest_server_service) return True From 0a2739067dc4193bcfa0d78b28d85644c63ea643 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 19 Nov 2025 15:32:27 -0300 Subject: [PATCH 4/5] Update unit test Signed-off-by: Marcelo Henrique Neppel --- tests/unit/test_backups.py | 44 +++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index d8e876c88f..063ba74024 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -1894,6 +1894,12 @@ def test_retrieve_s3_parameters( def test_start_stop_pgbackrest_service(harness): + # Enable Pebble connectivity + harness.set_can_connect("postgresql", True) + + # Get the container to set up pebble mocking + container = harness.model.unit.get_container("postgresql") + with ( patch( "charm.PostgreSQLBackups._is_primary_pgbackrest_service_running", @@ -1915,6 +1921,7 @@ def test_start_stop_pgbackrest_service(harness): "charm.PostgreSQLBackups._render_pgbackrest_conf_file" ) as _render_pgbackrest_conf_file, patch("charm.PostgreSQLBackups._are_backup_settings_ok") as _are_backup_settings_ok, + patch.object(container.pebble, "send_signal") as _send_signal, ): # Test when S3 parameters are not ok (no operation, but returns success). _are_backup_settings_ok.return_value = (False, "fake error message") @@ -1961,18 +1968,49 @@ def test_start_stop_pgbackrest_service(harness): _stop.assert_not_called() _restart.assert_not_called() - # Test when the service has already started in the primary. + # Test when the service has already started in the primary and is ACTIVE. + # This should send SIGHUP signal to reload configuration. _is_primary_pgbackrest_service_running.return_value = True + + # Add pgbackrest service to Pebble plan using the container's pebble client + container.pebble.add_layer( + "pgbackrest", + { + "services": { + harness.charm.pgbackrest_server_service: { + "override": "replace", + "summary": "pgbackrest server", + "command": "/bin/true", + "startup": "enabled", + } + } + }, + ) + + # Start the service to make it ACTIVE + container.start(harness.charm.pgbackrest_server_service) + assert harness.charm.backup.start_stop_pgbackrest_service() is True _stop.assert_not_called() - _restart.assert_called_once() + _restart.assert_not_called() + _send_signal.assert_called_once() - # Test when this unit is the primary. + # Test when this unit is the primary and service is NOT ACTIVE (INACTIVE). + # This should restart the service. _restart.reset_mock() + _send_signal.reset_mock() _is_primary.return_value = True _is_primary_pgbackrest_service_running.return_value = False + + # Stop the service using the Testing Pebble Client's stop_services method + container.pebble.stop_services([harness.charm.pgbackrest_server_service]) + + # Reset the stop mock (it was called by the testing framework when stopping) + _stop.reset_mock() + assert harness.charm.backup.start_stop_pgbackrest_service() is True _stop.assert_not_called() + _send_signal.assert_not_called() _restart.assert_called_once() From 5a963670433ed2acbabec700f5727443e04c1daf Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 19 Nov 2025 15:44:08 -0300 Subject: [PATCH 5/5] Fix integration test job collection Signed-off-by: Marcelo Henrique Neppel --- .github/workflows/integration_test.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 4d2e5449b5..12867cd3ec 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -19,19 +19,24 @@ jobs: uses: actions/checkout@v5 - name: Set up environment run: | - sudo snap install charmcraft --classic + sudo snap install go --classic + go install github.com/snapcore/spread/cmd/spread@latest pipx install tox poetry - name: Collect spread jobs id: collect-jobs shell: python run: | import json + import pathlib import os import subprocess spread_jobs = ( subprocess.run( - ["charmcraft", "test", "--list", "github-ci"], capture_output=True, check=True, text=True + [pathlib.Path.home() / "go/bin/spread", "-list", "github-ci"], + capture_output=True, + check=True, + text=True, ) .stdout.strip() .split("\n")