diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 4d2e5449b5..12867cd3ec 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -19,19 +19,24 @@ jobs: uses: actions/checkout@v5 - name: Set up environment run: | - sudo snap install charmcraft --classic + sudo snap install go --classic + go install github.com/snapcore/spread/cmd/spread@latest pipx install tox poetry - name: Collect spread jobs id: collect-jobs shell: python run: | import json + import pathlib import os import subprocess spread_jobs = ( subprocess.run( - ["charmcraft", "test", "--list", "github-ci"], capture_output=True, check=True, text=True + [pathlib.Path.home() / "go/bin/spread", "-list", "github-ci"], + capture_output=True, + check=True, + text=True, ) .stdout.strip() .split("\n") diff --git a/src/backups.py b/src/backups.py index 061949b1fc..7b04cb1962 100644 --- a/src/backups.py +++ b/src/backups.py @@ -7,6 +7,7 @@ import logging import os import re +import signal import tempfile import time from datetime import UTC, datetime @@ -26,7 +27,7 @@ from ops.framework import Object from ops.jujuversion import JujuVersion from ops.model import ActiveStatus, MaintenanceStatus -from ops.pebble import ChangeError, ExecError +from ops.pebble import ChangeError, ExecError, ServiceStatus from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed from constants import ( @@ -820,6 +821,8 @@ def _on_create_backup_action(self, event) -> None: "pgbackrest", f"--stanza={self.stanza_name}", "--log-level-console=debug", + "--log-level-file=debug", + "--log-subprocess", f"--type={BACKUP_TYPE_OVERRIDES[backup_type]}", "backup", ] @@ -1303,7 +1306,18 @@ def start_stop_pgbackrest_service(self) -> bool: return False # Start the service. - self.container.restart(self.charm.pgbackrest_server_service) + services = self.container.pebble.get_services(names=[self.charm.pgbackrest_server_service]) + if len(services) == 0: + return False + + if services[0].current == ServiceStatus.ACTIVE: + logger.debug("Sending SIGHUP to pgBackRest TLS server to reload configuration") + self.container.pebble.send_signal( + signal.SIGHUP, services=[self.charm.pgbackrest_server_service] + ) + else: + logger.debug("Starting pgBackRest TLS server service") + self.container.restart(self.charm.pgbackrest_server_service) return True def _upload_content_to_s3( diff --git a/src/charm.py b/src/charm.py index 3b918cf289..8be61c8b03 100755 --- a/src/charm.py +++ b/src/charm.py @@ -24,6 +24,7 @@ AuthorisationRulesChangeCharmEvents, AuthorisationRulesObserver, ) +from refresh import PostgreSQLRefresh # First platform-specific import, will fail on wrong architecture try: @@ -144,7 +145,6 @@ ) from ldap import PostgreSQLLDAP from patroni import NotReadyError, Patroni, SwitchoverFailedError, SwitchoverNotSyncError -from refresh import PostgreSQLRefresh from relations.async_replication import ( REPLICATION_CONSUMER_RELATION, REPLICATION_OFFER_RELATION, @@ -212,16 +212,6 @@ def __init__(self, *args): if isinstance(handler, JujuLogHandler): handler.setFormatter(logging.Formatter("{name}:{message}", style="{")) - # Support for disabling the operator. - disable_file = Path(f"{os.environ.get('CHARM_DIR')}/disable") - if disable_file.exists(): - logger.warning( - f"\n\tDisable file `{disable_file.resolve()}` found, the charm will skip all events." - "\n\tTo resume normal operations, please remove the file." - ) - self.set_unit_status(BlockedStatus("Disabled")) - sys.exit(0) - self.peer_relation_app = DataPeerData( self.model, relation_name=PEER, @@ -280,27 +270,12 @@ def __init__(self, *args): self.restart_manager = RollingOpsManager( charm=self, relation="restart", callback=self._restart ) - self._observer.start_authorisation_rules_observer() - self.grafana_dashboards = GrafanaDashboardProvider(self) - self.metrics_endpoint = MetricsEndpointProvider( - self, - refresh_event=[self.on.start], - jobs=self._generate_metrics_jobs(self.is_tls_enabled), - ) - self.loki_push = LogProxyConsumer( - self, - logs_scheme={"postgresql": {"log-files": POSTGRES_LOG_FILES}}, - relation_name="logging", - ) if self.model.juju_version.supports_open_port_on_k8s: try: self.unit.set_ports(5432, 8008) except ModelError: logger.exception("failed to open port") - self.tracing = TracingEndpointRequirer( - self, relation_name=TRACING_RELATION_NAME, protocols=[TRACING_PROTOCOL] - ) try: self.refresh = charm_refresh.Kubernetes( @@ -311,17 +286,24 @@ def __init__(self, *args): _charm=self, ) ) - except charm_refresh.KubernetesJujuAppNotTrusted: - sys.exit() - except charm_refresh.PeerRelationNotReady: - if self.unit.is_leader(): - self.app.status = MaintenanceStatus("Waiting for peer relation") - sys.exit() - except charm_refresh.UnitTearingDown: - self.set_unit_status(MaintenanceStatus("Tearing down")) - sys.exit() + except ( + charm_refresh.KubernetesJujuAppNotTrusted, + charm_refresh.PeerRelationNotReady, + charm_refresh.UnitTearingDown, + ): + self.refresh = None self._reconcile_refresh_status() + # Support for disabling the operator. + disable_file = Path(f"{os.environ.get('CHARM_DIR')}/disable") + if disable_file.exists(): + logger.warning( + f"\n\tDisable file `{disable_file.resolve()}` found, the charm will skip all events." + "\n\tTo resume normal operations, please remove the file." + ) + self.set_unit_status(BlockedStatus("Disabled")) + sys.exit(0) + if ( self.refresh is not None and self.refresh.workload_allowed_to_start @@ -332,6 +314,22 @@ def __init__(self, *args): else: self.refresh.next_unit_allowed_to_refresh = True + self._observer.start_authorisation_rules_observer() + self.grafana_dashboards = GrafanaDashboardProvider(self) + self.metrics_endpoint = MetricsEndpointProvider( + self, + refresh_event=[self.on.start], + jobs=self._generate_metrics_jobs(self.is_tls_enabled), + ) + self.loki_push = LogProxyConsumer( + self, + logs_scheme={"postgresql": {"log-files": POSTGRES_LOG_FILES}}, + relation_name="logging", + ) + self.tracing = TracingEndpointRequirer( + self, relation_name=TRACING_RELATION_NAME, protocols=[TRACING_PROTOCOL] + ) + def reconcile(self): """Reconcile the unit state on refresh.""" self.set_unit_status(MaintenanceStatus("starting services")) @@ -366,8 +364,9 @@ def reconcile(self): BlockedStatus("upgrade failed. Check logs for rollback instruction") ) else: - self.refresh.next_unit_allowed_to_refresh = True - self.set_unit_status(ActiveStatus()) + if self.refresh is not None: + self.refresh.next_unit_allowed_to_refresh = True + self.set_unit_status(ActiveStatus()) def _reconcile_refresh_status(self, _=None): if self.unit.is_leader(): @@ -1094,7 +1093,7 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None: return # Create resources and add labels needed for replication. - if not self.refresh.in_progress: + if self.refresh is not None and not self.refresh.in_progress: try: self._create_services() except ApiError: @@ -1296,7 +1295,7 @@ def _initialize_cluster(self, event: HookEvent) -> bool: return False # Create resources and add labels needed for replication - if not self.refresh.in_progress: + if self.refresh is not None and not self.refresh.in_progress: try: self._create_services() except ApiError: @@ -1608,7 +1607,7 @@ def _fix_pod(self) -> None: # Recreate k8s resources and add labels required for replication # when the pod loses them (like when it's deleted). self.push_tls_files_to_workload() - if not self.refresh.in_progress: + if self.refresh is not None and not self.refresh.in_progress: try: self._create_services() except ApiError: diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index d8e876c88f..063ba74024 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -1894,6 +1894,12 @@ def test_retrieve_s3_parameters( def test_start_stop_pgbackrest_service(harness): + # Enable Pebble connectivity + harness.set_can_connect("postgresql", True) + + # Get the container to set up pebble mocking + container = harness.model.unit.get_container("postgresql") + with ( patch( "charm.PostgreSQLBackups._is_primary_pgbackrest_service_running", @@ -1915,6 +1921,7 @@ def test_start_stop_pgbackrest_service(harness): "charm.PostgreSQLBackups._render_pgbackrest_conf_file" ) as _render_pgbackrest_conf_file, patch("charm.PostgreSQLBackups._are_backup_settings_ok") as _are_backup_settings_ok, + patch.object(container.pebble, "send_signal") as _send_signal, ): # Test when S3 parameters are not ok (no operation, but returns success). _are_backup_settings_ok.return_value = (False, "fake error message") @@ -1961,18 +1968,49 @@ def test_start_stop_pgbackrest_service(harness): _stop.assert_not_called() _restart.assert_not_called() - # Test when the service has already started in the primary. + # Test when the service has already started in the primary and is ACTIVE. + # This should send SIGHUP signal to reload configuration. _is_primary_pgbackrest_service_running.return_value = True + + # Add pgbackrest service to Pebble plan using the container's pebble client + container.pebble.add_layer( + "pgbackrest", + { + "services": { + harness.charm.pgbackrest_server_service: { + "override": "replace", + "summary": "pgbackrest server", + "command": "/bin/true", + "startup": "enabled", + } + } + }, + ) + + # Start the service to make it ACTIVE + container.start(harness.charm.pgbackrest_server_service) + assert harness.charm.backup.start_stop_pgbackrest_service() is True _stop.assert_not_called() - _restart.assert_called_once() + _restart.assert_not_called() + _send_signal.assert_called_once() - # Test when this unit is the primary. + # Test when this unit is the primary and service is NOT ACTIVE (INACTIVE). + # This should restart the service. _restart.reset_mock() + _send_signal.reset_mock() _is_primary.return_value = True _is_primary_pgbackrest_service_running.return_value = False + + # Stop the service using the Testing Pebble Client's stop_services method + container.pebble.stop_services([harness.charm.pgbackrest_server_service]) + + # Reset the stop mock (it was called by the testing framework when stopping) + _stop.reset_mock() + assert harness.charm.backup.start_stop_pgbackrest_service() is True _stop.assert_not_called() + _send_signal.assert_not_called() _restart.assert_called_once()