diff --git a/.github/renovate.json5 b/.github/renovate.json5 index c69f3357cb..31e3995853 100644 --- a/.github/renovate.json5 +++ b/.github/renovate.json5 @@ -15,6 +15,5 @@ allowedVersions: '<2.0.0', }, ], - customManagers: [ - ], + customManagers: [], } diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index f7120e7515..fe7ec21189 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -84,17 +84,8 @@ jobs: needs: - collect-integration-tests runs-on: ${{ matrix.job.runner }} - timeout-minutes: 226 # Sum of steps `timeout-minutes` + 5 + timeout-minutes: 216 # Sum of steps `timeout-minutes` + 5 steps: - - name: Free up disk space - timeout-minutes: 10 - run: | - printf '\nDisk usage before cleanup\n' - df --human-readable - # Based on https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 - rm -r /opt/hostedtoolcache/ - printf '\nDisk usage after cleanup\n' - df --human-readable - name: Checkout timeout-minutes: 3 uses: actions/checkout@v5 diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 773852181d..a6beae53fe 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -40,7 +40,7 @@ jobs: - ci-tests uses: canonical/data-platform-workflows/.github/workflows/release_charm_edge.yaml@v35.0.2 with: - track: ${{ needs.tag.outputs.track }} + track: 16 artifact-prefix: ${{ needs.ci-tests.outputs.artifact-prefix }} secrets: charmhub-token: ${{ secrets.CHARMHUB_TOKEN }} diff --git a/spread.yaml b/spread.yaml index fe01ada361..ce4b4e088c 100644 --- a/spread.yaml +++ b/spread.yaml @@ -82,6 +82,9 @@ backends: sudo passwd -d runner ADDRESS localhost + + sudo mkdir -p /var/snap/lxd/common/lxd/storage-pools + sudo mount --bind /mnt /var/snap/lxd/common/lxd/storage-pools # HACK: spread does not pass environment variables set on runner # Manually pass specific environment variables environment: diff --git 
a/src/charm.py b/src/charm.py index b18194124d..d037ac7d30 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1040,7 +1040,11 @@ def _reconfigure_cluster(self, event: HookEvent | RelationEvent) -> bool: return False if ip_to_remove in self.members_ips: self._remove_from_members_ips(ip_to_remove) - self._add_members(event) + try: + self._add_members(event) + except Exception: + logger.debug("Deferring on_peer_relation_changed: Unable to add members") + return False return True def _update_member_ip(self) -> bool: diff --git a/templates/patroni.yml.j2 b/templates/patroni.yml.j2 index aad1ed2a39..94e80f772d 100644 --- a/templates/patroni.yml.j2 +++ b/templates/patroni.yml.j2 @@ -53,6 +53,7 @@ bootstrap: retry_timeout: 10 maximum_lag_on_failover: 1048576 synchronous_mode: true + synchronous_mode_strict: false synchronous_node_count: {{ synchronous_node_count }} postgresql: use_pg_rewind: true diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index c592a9ab21..2b0d3fda65 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -241,7 +241,9 @@ async def is_cluster_updated( # Verify that no writes to the database were missed after stopping the writes. logger.info("checking that no writes to the database were missed after stopping the writes") - total_expected_writes = await check_writes(ops_test, use_ip_from_inside) + for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(5), reraise=True): + with attempt: + total_expected_writes = await check_writes(ops_test, use_ip_from_inside) # Verify that old primary is up-to-date. logger.info("checking that the former primary is up to date with the cluster after restarting") diff --git a/tests/unit/test_async_replication.py b/tests/unit/test_async_replication.py new file mode 100644 index 0000000000..ee8462b0ca --- /dev/null +++ b/tests/unit/test_async_replication.py @@ -0,0 +1,508 @@ +# Copyright 2025 Canonical Ltd. 
# See LICENSE file for licensing details.

"""Unit tests for the handlers of ``PostgreSQLAsyncReplication``.

Every test drives the handler against a ``MagicMock`` charm. Class-level
attributes of ``PostgreSQLAsyncReplication`` are only ever replaced through
scoped patches, so no test leaves state behind for the next one.
"""

from unittest.mock import MagicMock, PropertyMock, patch

import pytest
from ops import Application
from tenacity import RetryError

from src.relations.async_replication import (
    READ_ONLY_MODE_BLOCKING_MESSAGE,
    REPLICATION_CONSUMER_RELATION,
    PostgreSQLAsyncReplication,
)


def create_mock_unit(name="unit"):
    """Return a ``MagicMock`` whose ``name`` attribute is set to ``name``."""
    unit = MagicMock()
    unit.name = name
    return unit


def _patch_class_property(attribute, value, **patch_kwargs):
    """Return a patcher that swaps a class attribute for a ``PropertyMock``.

    The replacement lives only for the duration of the ``with`` block, so the
    class is restored afterwards and the tests stay independent of the order
    in which they run (assigning to ``type(instance).<attribute>`` would leak
    the mock into every later test).

    Args:
        attribute: name of the attribute on ``PostgreSQLAsyncReplication``.
        value: object returned when the attribute is read on an instance.
        **patch_kwargs: extra keyword arguments for ``patch.object``
            (for example ``create=True``).
    """
    return patch.object(
        PostgreSQLAsyncReplication,
        attribute,
        new_callable=PropertyMock,
        return_value=value,
        **patch_kwargs,
    )


def test_on_secret_changed():
    """Without a relation the handler logs an early exit and does not defer."""
    # 1. relation is None
    mock_charm = MagicMock()
    mock_event = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)

    with (
        patch.object(
            PostgreSQLAsyncReplication, "_relation", new_callable=PropertyMock, return_value=None
        ),
        patch("logging.Logger.debug") as mock_debug,
    ):
        relation._on_secret_changed(mock_event)

        mock_debug.assert_called_once_with("Early exit on_secret_changed: No relation found.")
        mock_event.defer.assert_not_called()


def test__configure_primary_cluster():
    """Exercise ``_configure_primary_cluster`` in four different set-ups."""
    # 1. No primary cluster is passed in.
    mock_charm = MagicMock()
    mock_event = MagicMock()
    mock_charm.app = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)

    result = relation._configure_primary_cluster(None, mock_event)
    assert result is False

    # 2. This app is the primary cluster, but the unit is not the leader.
    mock_charm = MagicMock()
    mock_event = MagicMock()
    mock_charm.app = MagicMock()
    mock_charm.unit.is_leader.return_value = False
    mock_charm.update_config = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)
    relation.is_primary_cluster = MagicMock(return_value=False)
    result = relation._configure_primary_cluster(mock_charm.app, mock_event)
    mock_charm.update_config.assert_called_once()
    assert result is True

    # 3. Leader of the primary cluster while a standby leader is reported.
    mock_charm = MagicMock()
    mock_event = MagicMock()
    mock_charm.app = MagicMock()
    mock_charm.unit.is_leader.return_value = True
    mock_charm.update_config = MagicMock()
    mock_charm._patroni.get_standby_leader.return_value = True
    mock_charm._patroni.promote_standby_cluster = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)
    relation.is_primary_cluster = MagicMock(return_value=True)

    relation._update_primary_cluster_data = MagicMock()

    result = relation._configure_primary_cluster(mock_charm.app, mock_event)

    mock_charm.update_config.assert_called_once()
    relation._update_primary_cluster_data.assert_called_once()
    # Assert on the mock; merely calling it would verify nothing.
    mock_charm._patroni.promote_standby_cluster.assert_called_once()
    assert result is True

    # 4. Leader of the primary cluster and no standby leader is reported.
    mock_charm = MagicMock()
    mock_event = MagicMock()
    mock_charm.app = MagicMock()
    mock_charm.unit.is_leader.return_value = True
    mock_charm.update_config = MagicMock()
    mock_charm._patroni.get_standby_leader.return_value = None

    relation = PostgreSQLAsyncReplication(mock_charm)
    relation.is_primary_cluster = MagicMock(return_value=True)

    relation._update_primary_cluster_data = MagicMock()

    result = relation._configure_primary_cluster(mock_charm.app, mock_event)

    mock_charm.update_config.assert_called_once()
    relation._update_primary_cluster_data.assert_called_once()
    assert result is True


def test__on_async_relation_departed():
    """The departing unit flags itself as departing in its peer data."""
    mock_charm = MagicMock()
    mock_event = MagicMock()
    mock_unit_data = {}
    mock_charm.unit_peer_data = mock_unit_data
    mock_event.departing_unit = MagicMock()
    # The unit handling the event is the very unit that departs.
    mock_charm.unit = mock_event.departing_unit

    relation = PostgreSQLAsyncReplication(mock_charm)

    result = relation._on_async_relation_departed(mock_event)
    assert result is None
    assert mock_unit_data == {"departing": "True"}


def test_on_async_relation_joined():
    """The highest promoted-cluster counter is copied into the unit peer data."""
    mock_charm = MagicMock()
    mock_event = MagicMock()
    mock_unit_data = {}
    mock_charm.unit_peer_data = mock_unit_data

    mock_charm._unit_ip = "10.0.0.1"

    relation = PostgreSQLAsyncReplication(mock_charm)

    relation._get_highest_promoted_cluster_counter_value = MagicMock(return_value="1")

    result = relation._on_async_relation_joined(mock_event)

    assert result is None

    assert mock_unit_data == {"unit-promoted-cluster-counter": "1"}

    relation._get_highest_promoted_cluster_counter_value.assert_called_once()


def test_on_create_replication():
    """Exercise the ``create-replication`` handler in four set-ups."""
    # 1. A primary cluster already exists.
    mock_charm = MagicMock()
    mock_event = MagicMock()
    relation = PostgreSQLAsyncReplication(mock_charm)

    mock_application = MagicMock(spec=Application)
    relation._get_primary_cluster = MagicMock(return_value=mock_application)

    result = relation._on_create_replication(mock_event)

    assert result is None
    mock_event.fail.assert_called_once_with("There is already a replication set up.")

    # 2. No primary cluster, and the relation is the consumer side.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)

    relation._get_primary_cluster = MagicMock(return_value=None)

    mock_relation = MagicMock()
    mock_relation.name = REPLICATION_CONSUMER_RELATION

    with _patch_class_property("_relation", mock_relation):
        result = relation._on_create_replication(mock_event)

    assert result is None
    mock_event.fail.assert_called_once_with(
        "This action must be run in the cluster where the offer was created."
    )

    # 3. Any other relation name; the replication change is handled.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)

    relation._get_primary_cluster = MagicMock(return_value=None)

    relation._handle_replication_change = MagicMock(return_value=True)

    mock_relation = MagicMock()
    mock_relation.name = "Something"

    with _patch_class_property("_relation", mock_relation):
        result = relation._on_create_replication(mock_event)

    assert result is None

    # 4. Any other relation name; the replication change is rejected.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)

    relation._get_primary_cluster = MagicMock(return_value=None)

    relation._handle_replication_change = MagicMock(return_value=False)

    mock_relation = MagicMock()
    mock_relation.name = "Something"

    with _patch_class_property("_relation", mock_relation):
        result = relation._on_create_replication(mock_event)

    assert result is None


def test_promote_to_primary():
    """Exercise ``promote_to_primary`` when no primary cluster is known."""
    # 1. No primary cluster and an arbitrary status message.
    mock_charm = MagicMock()
    mock_event = MagicMock()
    mock_relation = MagicMock()
    mock_relation.status = MagicMock()
    mock_relation.status.message = "Something"

    relation = PostgreSQLAsyncReplication(mock_charm)
    relation._get_primary_cluster = MagicMock(return_value=None)

    # ``create=True`` keeps the patch working whether or not the class
    # declares ``app`` itself.
    with _patch_class_property("app", mock_relation, create=True):
        result = relation.promote_to_primary(mock_event)
    assert result is None

    mock_event.fail.assert_called_once_with(
        "No primary cluster found. Run `create-replication` action in the cluster where the offer was created."
    )

    # 2. No primary cluster and the replication change is rejected.
    # NOTE(review): ``mock_relation`` carries READ_ONLY_MODE_BLOCKING_MESSAGE
    # but is not handed to the object under test, only ``mock_app`` is;
    # confirm which status the handler is supposed to read in this case.
    mock_charm = MagicMock()
    mock_event = MagicMock()
    mock_relation = MagicMock()
    mock_app = MagicMock(spec=Application)
    mock_relation.status = MagicMock()
    mock_relation.status.message = READ_ONLY_MODE_BLOCKING_MESSAGE

    relation = PostgreSQLAsyncReplication(mock_charm)
    relation._get_primary_cluster = MagicMock(return_value=None)

    relation._handle_replication_change = MagicMock(return_value=False)

    with _patch_class_property("app", mock_app, create=True):
        result = relation.promote_to_primary(mock_event)

    assert result is None


def test__configure_standby_cluster():
    """Exercise ``_configure_standby_cluster`` in three set-ups."""
    # 1. Consumer relation and the internal secret cannot be updated.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)
    mock_relation = MagicMock()
    mock_relation.name = REPLICATION_CONSUMER_RELATION
    relation._update_internal_secret = MagicMock(return_value=False)

    with _patch_class_property("_relation", mock_relation):
        result = relation._configure_standby_cluster(mock_event)

    assert result is False

    mock_event.defer.assert_called_once()

    # 2. Fetching the system identifier reports an error.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)
    mock_relation = MagicMock()
    mock_relation.name = "something_else"
    relation._update_internal_secret = MagicMock(return_value=True)
    relation.get_system_identifier = MagicMock(return_value=(None, 2))

    with (
        _patch_class_property("_relation", mock_relation),
        pytest.raises(Exception) as exc_info,
    ):
        relation._configure_standby_cluster(mock_event)

    assert str(exc_info.value) == "2"

    # 3. Local and remote system identifiers differ.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)
    mock_relation = MagicMock()
    mock_relation.name = "some_relation"
    mock_relation.app = "remote-app"
    mock_relation.data = {mock_relation.app: {"system-id": "123"}}

    relation._update_internal_secret = MagicMock(return_value=True)
    relation.get_system_identifier = MagicMock(return_value=("456", None))
    relation.charm = MagicMock()
    relation.charm.app_peer_data = {}

    with (
        _patch_class_property("_relation", mock_relation),
        patch("subprocess.check_call") as mock_check_call,
    ):
        result = relation._configure_standby_cluster(mock_event)

    assert result is True
    mock_check_call.assert_called_once()


def test_wait_for_standby_leader():
    """Exercise ``_wait_for_standby_leader`` when no standby leader is reported."""
    # 1. Non-leader unit whose member is isolated.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)

    mock_charm._patroni.get_standby_leader.return_value = None
    mock_charm.unit.is_leader.return_value = False
    mock_charm._patroni.is_member_isolated = True
    mock_charm._patroni.restart_patroni = MagicMock()

    result = relation._wait_for_standby_leader(mock_event)
    assert result is True
    mock_charm._patroni.restart_patroni.assert_called_once()
    mock_event.defer.assert_called_once()

    # 2. Non-leader unit whose member is not isolated.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)

    mock_charm._patroni.get_standby_leader.return_value = None
    mock_charm.unit.is_leader.return_value = False
    mock_charm._patroni.is_member_isolated = False

    result = relation._wait_for_standby_leader(mock_event)
    assert result is True
    mock_event.defer.assert_called_once()

    # 3. Leader unit.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)
    mock_charm._patroni.get_standby_leader.return_value = None
    mock_charm.unit.is_leader.return_value = True

    result = relation._wait_for_standby_leader(mock_event)
    assert result is False


def test_get_partner_addresses():
    """With no primary cluster the peer member IPs are returned."""
    mock_charm = MagicMock()

    mock_charm._peer_members_ips = ["str"]
    mock_charm.app = MagicMock()
    mock_charm.unit = MagicMock()
    mock_charm.unit.is_leader.return_value = True
    mock_charm._peers = MagicMock()
    mock_charm._peers.data = {mock_charm.unit: {}}

    relation = PostgreSQLAsyncReplication(mock_charm)
    relation._get_primary_cluster = MagicMock(return_value=None)
    relation._get_highest_promoted_cluster_counter_value = MagicMock(return_value=None)

    result = relation.get_partner_addresses()

    assert result == ["str"]


def test_handle_replication_change():
    """Exercise ``_handle_replication_change`` in three set-ups."""
    # 1. The cluster cannot be promoted.
    mock_charm = MagicMock()
    mock_event = MagicMock()
    relation = PostgreSQLAsyncReplication(mock_charm)
    relation._can_promote_cluster = MagicMock(return_value=False)
    result = relation._handle_replication_change(mock_event)
    assert result is False

    # 2. Fetching the system identifier reports an error.
    mock_charm = MagicMock()
    mock_event = MagicMock()
    relation = PostgreSQLAsyncReplication(mock_charm)
    relation._can_promote_cluster = MagicMock(return_value=True)
    relation.get_system_identifier = MagicMock(return_value=(12345, "some error"))
    result = relation._handle_replication_change(mock_event)
    assert result is False

    # 3. Every remote unit published its address and nothing fails.
    mock_charm = MagicMock()
    mock_event = MagicMock()
    mock_relation = MagicMock()

    mock_unit1 = MagicMock()
    mock_unit2 = MagicMock()
    mock_relation.units = [mock_unit1, mock_unit2]
    mock_relation.data = {
        mock_unit1: {"unit-address": "10.0.0.1"},
        mock_unit2: {"unit-address": "10.0.0.2"},
        mock_charm.app: {},
    }

    relation = PostgreSQLAsyncReplication(mock_charm)
    relation._can_promote_cluster = MagicMock(return_value=True)
    relation.get_system_identifier = MagicMock(return_value=(12345, None))
    relation._get_highest_promoted_cluster_counter_value = MagicMock(return_value="1")
    relation._update_primary_cluster_data = MagicMock()
    relation._re_emit_async_relation_changed_event = MagicMock()

    with _patch_class_property("_relation", mock_relation):
        result = relation._handle_replication_change(mock_event)

    assert result is True
    relation._can_promote_cluster.assert_called_once_with(mock_event)
    relation.get_system_identifier.assert_called_once()
    relation._get_highest_promoted_cluster_counter_value.assert_called_once()
    # The highest known counter ("1") is expected to be passed on as 2.
    relation._update_primary_cluster_data.assert_called_once_with(2, 12345)
    relation._re_emit_async_relation_changed_event.assert_called_once()
    mock_event.fail.assert_not_called()


def test_handle_forceful_promotion():
    """Exercise ``_handle_forceful_promotion`` in four set-ups."""
    # 1. The action parameter lookup returns True.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    mock_event.params.get.return_value = True
    relation = PostgreSQLAsyncReplication(mock_charm)
    result = relation._handle_forceful_promotion(mock_event)

    assert result is True

    # 2. Not forced, and looking up the primary raises ``RetryError``.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    mock_event.params.get.return_value = False

    relation = PostgreSQLAsyncReplication(mock_charm)

    mock_relation = MagicMock()
    mock_relation.app = MagicMock()
    mock_relation.app.name = "test-app"

    relation.get_all_primary_cluster_endpoints = MagicMock(return_value=[1, 2, 3])

    mock_charm._patroni.get_primary.side_effect = RetryError("timeout")

    with _patch_class_property("_relation", mock_relation):
        result = relation._handle_forceful_promotion(mock_event)

    mock_event.fail.assert_called_once_with(
        "test-app isn't reachable. Pass `force=true` to promote anyway."
    )
    assert result is False

    # 3. Not forced, endpoints known, and the primary lookup succeeds.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    mock_event.params.get.return_value = False

    relation = PostgreSQLAsyncReplication(mock_charm)

    mock_relation = MagicMock()
    mock_relation.app = MagicMock()
    mock_relation.app.name = "test-app"

    relation.get_all_primary_cluster_endpoints = MagicMock(return_value=[1, 2, 3])

    mock_charm._patroni.get_primary.side_effect = None

    with _patch_class_property("_relation", mock_relation):
        result = relation._handle_forceful_promotion(mock_event)

    assert result is True

    # 4. Not forced, and there are no primary cluster endpoints.
    mock_charm = MagicMock()
    mock_event = MagicMock()

    mock_event.params.get.return_value = False

    relation = PostgreSQLAsyncReplication(mock_charm)

    mock_relation = MagicMock()
    mock_relation.app = MagicMock()
    mock_relation.app.name = "test-app"

    relation.get_all_primary_cluster_endpoints = MagicMock(return_value=[])

    mock_charm._patroni.get_primary.side_effect = None

    with _patch_class_property("_relation", mock_relation):
        result = relation._handle_forceful_promotion(mock_event)

    assert result is True


def test_on_async_relation_broken():
    """Exercise ``_on_async_relation_broken`` in two set-ups."""
    # 1. ``is_unit_departing`` is left as a (truthy) MagicMock attribute.
    mock_charm = MagicMock()
    mock_event = MagicMock()
    mock_charm._peers = True

    relation = PostgreSQLAsyncReplication(mock_charm)

    result = relation._on_async_relation_broken(mock_event)

    assert result is None

    # 2. Leader unit that is not departing, with no standby leader reported.
    mock_charm = MagicMock()
    mock_charm._peers = MagicMock()
    mock_charm.is_unit_departing = False
    mock_charm._patroni.get_standby_leader.return_value = None
    mock_charm.unit.is_leader.return_value = True
    mock_event = MagicMock()

    relation = PostgreSQLAsyncReplication(mock_charm)
    relation._on_async_relation_broken(mock_event)

    assert mock_charm.update_config.called