diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index f7120e7515..9aed6c5657 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -86,15 +86,6 @@ jobs: runs-on: ${{ matrix.job.runner }} timeout-minutes: 226 # Sum of steps `timeout-minutes` + 5 steps: - - name: Free up disk space - timeout-minutes: 10 - run: | - printf '\nDisk usage before cleanup\n' - df --human-readable - # Based on https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 - rm -r /opt/hostedtoolcache/ - printf '\nDisk usage after cleanup\n' - df --human-readable - name: Checkout timeout-minutes: 3 uses: actions/checkout@v5 diff --git a/pyproject.toml b/pyproject.toml index 01a7af6bda..eeeca91230 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,6 +98,7 @@ minversion = "6.0" log_cli_level = "INFO" asyncio_mode = "auto" markers = ["juju3", "juju_secrets"] +addopts = "--exitfirst" # Formatting tools configuration [tool.black] diff --git a/spread.yaml b/spread.yaml index fe01ada361..ce4b4e088c 100644 --- a/spread.yaml +++ b/spread.yaml @@ -82,6 +82,9 @@ backends: sudo passwd -d runner ADDRESS localhost + + sudo mkdir -p /var/snap/lxd/common/lxd/storage-pools + sudo mount --bind /mnt /var/snap/lxd/common/lxd/storage-pools # HACK: spread does not pass environment variables set on runner # Manually pass specific environment variables environment: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 2740fa317d..b36aa192af 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -105,28 +105,15 @@ def juju(request: pytest.FixtureRequest): This adds command line parameter ``--keep-models`` (see help for details). """ - controller = request.config.getoption("--controller") model = request.config.getoption("--model") - controller_and_model = None - if controller and model: - controller_and_model = f"{controller}:{model}" - elif controller: - controller_and_model = controller - elif model: - controller_and_model = model keep_models = bool(request.config.getoption("--keep-models")) - if controller_and_model: - juju = jubilant.Juju(model=controller_and_model) # type: ignore + if model: + juju = jubilant.Juju(model=model) yield juju - log = juju.debug_log(limit=1000) else: with jubilant.temp_model(keep=keep_models) as juju: yield juju - log = juju.debug_log(limit=1000) - - if request.session.testsfailed: - print(log, end="") @pytest.fixture(scope="module") diff --git a/tests/integration/ha_tests/test_async_replication.py b/tests/integration/ha_tests/test_async_replication.py deleted file mode 100644 index 1e94786436..0000000000 --- a/tests/integration/ha_tests/test_async_replication.py +++ /dev/null @@ -1,566 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import contextlib -import logging -import subprocess -from asyncio import gather - -import psycopg2 -import pytest as pytest -from juju.model import Model -from pytest_operator.plugin import OpsTest -from tenacity import Retrying, stop_after_delay, wait_fixed - -from .. 
import architecture -from ..helpers import ( - APPLICATION_NAME, - DATABASE_APP_NAME, - get_leader_unit, - get_password, - get_primary, - get_unit_address, - scale_application, - wait_for_relation_removed_between, -) -from .helpers import ( - app_name, - are_writes_increasing, - check_writes, - get_standby_leader, - get_sync_standby, - start_continuous_writes, -) - -logger = logging.getLogger(__name__) - - -CLUSTER_SIZE = 3 -FAST_INTERVAL = "10s" -IDLE_PERIOD = 5 -TIMEOUT = 2000 - -DATA_INTEGRATOR_APP_NAME = "data-integrator" - - -@contextlib.asynccontextmanager -async def fast_forward(model: Model, fast_interval: str = "10s", slow_interval: str | None = None): - """Adaptation of OpsTest.fast_forward to work with different models.""" - update_interval_key = "update-status-hook-interval" - interval_after = ( - slow_interval if slow_interval else (await model.get_config())[update_interval_key] - ) - - await model.set_config({update_interval_key: fast_interval}) - yield - await model.set_config({update_interval_key: interval_after}) - - -@pytest.fixture(scope="module") -def first_model(ops_test: OpsTest) -> Model: - """Return the first model.""" - first_model = ops_test.model - return first_model - - -@pytest.fixture(scope="module") -async def second_model(ops_test: OpsTest, first_model, request) -> Model: - """Create and return the second model.""" - second_model_name = f"{first_model.info.name}-other" - if second_model_name not in await ops_test._controller.list_models(): - await ops_test._controller.add_model(second_model_name) - subprocess.run(["juju", "switch", second_model_name], check=True) - subprocess.run( - ["juju", "set-model-constraints", f"arch={architecture.architecture}"], check=True - ) - subprocess.run(["juju", "switch", first_model.info.name], check=True) - second_model = Model() - await second_model.connect(model_name=second_model_name) - yield second_model - if request.config.getoption("--keep-models"): - return - logger.info("Destroying second model") - await ops_test._controller.destroy_model(second_model_name, destroy_storage=True) - - -@pytest.fixture -async def second_model_continuous_writes(second_model) -> None: - """Cleans up continuous writes on the second model after a test run.""" - yield - # Clear the written data at the end. 
- for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3), reraise=True): - with attempt: - action = ( - await second_model.applications[APPLICATION_NAME] - .units[0] - .run_action("clear-continuous-writes") - ) - await action.wait() - assert action.results["result"] == "True", "Unable to clear up continuous_writes table" - - -@pytest.mark.abort_on_fail -async def test_deploy_async_replication_setup( - ops_test: OpsTest, first_model: Model, second_model: Model, charm -) -> None: - """Build and deploy two PostgreSQL cluster in two separate models to test async replication.""" - if not await app_name(ops_test): - await ops_test.model.deploy( - charm, - num_units=CLUSTER_SIZE, - config={"profile": "testing"}, - ) - if not await app_name(ops_test, DATA_INTEGRATOR_APP_NAME): - await ops_test.model.deploy( - DATA_INTEGRATOR_APP_NAME, - num_units=1, - channel="latest/edge", - config={"database-name": "testdb"}, - ) - await ops_test.model.relate(DATABASE_APP_NAME, DATA_INTEGRATOR_APP_NAME) - if not await app_name(ops_test, model=second_model): - await second_model.deploy( - charm, - num_units=CLUSTER_SIZE, - config={"profile": "testing"}, - ) - await ops_test.model.deploy( - APPLICATION_NAME, channel="latest/edge", num_units=1, config={"sleep_interval": 1000} - ) - await second_model.deploy( - APPLICATION_NAME, channel="latest/edge", num_units=1, config={"sleep_interval": 1000} - ) - - async with ops_test.fast_forward(), fast_forward(second_model): - await gather( - first_model.wait_for_idle(apps=[APPLICATION_NAME], status="blocked"), - second_model.wait_for_idle(apps=[APPLICATION_NAME], status="blocked"), - ) - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME, DATA_INTEGRATOR_APP_NAME], - status="active", - timeout=TIMEOUT, - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], - status="active", - timeout=TIMEOUT, - ), - ) - - -@pytest.mark.abort_on_fail -async def test_async_replication( - ops_test: OpsTest, - first_model: Model, - second_model: Model, - continuous_writes, -) -> None: - """Test async replication between two PostgreSQL clusters.""" - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - first_offer_command = f"offer {DATABASE_APP_NAME}:replication-offer replication-offer" - await ops_test.juju(*first_offer_command.split()) - first_consume_command = ( - f"consume -m {second_model.info.name} admin/{first_model.info.name}.replication-offer" - ) - await ops_test.juju(*first_consume_command.split()) - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - await second_model.relate(DATABASE_APP_NAME, "replication-offer") - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Run the 
promote action. - logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - assert leader_unit is not None, "No leader unit found" - logger.info("promoting the first cluster") - run_action = await leader_unit.run_action("create-replication") - await run_action.wait() - assert (run_action.results.get("return-code", None) == 0) or ( - run_action.results.get("Code", None) == "0" - ), "Promote action failed" - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("checking whether no writes were lost") - await check_writes(ops_test, extra_model=second_model) - - -@pytest.mark.abort_on_fail -async def test_get_data_integrator_credentials( - ops_test: OpsTest, -): - unit = ops_test.model.applications[DATA_INTEGRATOR_APP_NAME].units[0] - action = await unit.run_action(action_name="get-credentials") - result = await action.wait() - global data_integrator_credentials - data_integrator_credentials = result.results - - -@pytest.mark.abort_on_fail -async def test_switchover( - ops_test: OpsTest, - first_model: Model, - second_model: Model, - second_model_continuous_writes, -): - """Test switching over to the second cluster.""" - second_offer_command = f"offer {DATABASE_APP_NAME}:replication replication" - await ops_test.juju(*second_offer_command.split()) - second_consume_command = ( - f"consume -m {second_model.info.name} admin/{first_model.info.name}.replication" - ) - await ops_test.juju(*second_consume_command.split()) - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - # Run the promote action. 
- logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME, model=second_model) - assert leader_unit is not None, "No leader unit found" - logger.info("promoting the second cluster") - run_action = await leader_unit.run_action("promote-to-primary", scope="cluster", force=True) - await run_action.wait() - assert (run_action.results.get("return-code", None) == 0) or ( - run_action.results.get("Code", None) == "0" - ), "Promote action failed" - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME, model=second_model) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test, extra_model=second_model) - - -@pytest.mark.abort_on_fail -async def test_data_integrator_creds_keep_on_working( - ops_test: OpsTest, - second_model: Model, -) -> None: - user = data_integrator_credentials["postgresql"]["username"] - password = data_integrator_credentials["postgresql"]["password"] - database = data_integrator_credentials["postgresql"]["database"] - - any_unit = second_model.applications[DATABASE_APP_NAME].units[0].name - primary = await get_primary(ops_test, any_unit, second_model) - address = second_model.units.get(primary).public_address - - connstr = f"dbname='{database}' user='{user}' host='{address}' port='5432' password='{password}' connect_timeout=1" - try: - with psycopg2.connect(connstr) as connection: - pass - finally: - connection.close() - - -@pytest.mark.abort_on_fail -async def test_promote_standby( - ops_test: OpsTest, - first_model: Model, - second_model: Model, - second_model_continuous_writes, -) -> None: - """Test promoting the standby cluster.""" - logger.info("breaking the relations") - await first_model.applications[DATABASE_APP_NAME].remove_relation( - "database", f"{APPLICATION_NAME}:database" - ) - await second_model.applications[DATABASE_APP_NAME].remove_relation( - "replication", "replication-offer" - ) - wait_for_relation_removed_between(ops_test, "replication-offer", "replication", second_model) - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - first_model.block_until( - lambda: first_model.applications[DATABASE_APP_NAME].status == "blocked", - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - # Run the promote action. 
- logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - assert leader_unit is not None, "No leader unit found" - logger.info("promoting the first cluster") - run_action = await leader_unit.run_action("promote-to-primary", scope="cluster") - await run_action.wait() - assert (run_action.results.get("return-code", None) == 0) or ( - run_action.results.get("Code", None) == "0" - ), "Promote action failed" - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("removing the previous data") - any_unit = ops_test.model.applications[DATABASE_APP_NAME].units[0].name - primary = await get_primary(ops_test, any_unit) - address = get_unit_address(ops_test, primary) - password = await get_password(ops_test) - database_name = f"{APPLICATION_NAME.replace('-', '_')}_database" - connection = None - try: - connection = psycopg2.connect( - f"dbname={database_name} user=operator password={password} host={address}" - ) - connection.autocommit = True - cursor = connection.cursor() - cursor.execute("DROP TABLE IF EXISTS continuous_writes;") - except psycopg2.Error as e: - assert False, f"Failed to drop continuous writes table: {e}" - finally: - if connection is not None: - connection.close() - - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - -@pytest.mark.abort_on_fail -async def test_reestablish_relation( - ops_test: OpsTest, first_model: Model, second_model: Model, continuous_writes -) -> None: - """Test that the relation can be broken and re-established.""" - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - logger.info("reestablishing the relation") - await second_model.relate(DATABASE_APP_NAME, "replication-offer") - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Run the promote action. 
- logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - assert leader_unit is not None, "No leader unit found" - logger.info("promoting the first cluster") - run_action = await leader_unit.run_action("create-replication") - await run_action.wait() - assert (run_action.results.get("return-code", None) == 0) or ( - run_action.results.get("Code", None) == "0" - ), "Promote action failed" - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("checking whether no writes were lost") - await check_writes(ops_test, extra_model=second_model) - - -@pytest.mark.abort_on_fail -async def test_async_replication_failover_in_main_cluster( - ops_test: OpsTest, first_model: Model, second_model: Model, continuous_writes -) -> None: - """Test that async replication fails over correctly.""" - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - sync_standby = await get_sync_standby(ops_test, first_model, DATABASE_APP_NAME) - logger.info(f"Sync-standby: {sync_standby}") - logger.info("deleting the sync-standby") - await first_model.applications[DATABASE_APP_NAME].destroy_units(sync_standby) - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], - status="active", - idle_period=IDLE_PERIOD, - timeout=TIMEOUT, - wait_for_exact_units=(CLUSTER_SIZE - 1), - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - # Check that the sync-standby unit is not the same as before. - new_sync_standby = await get_sync_standby(ops_test, first_model, DATABASE_APP_NAME) - logger.info(f"New sync-standby: {new_sync_standby}") - assert new_sync_standby != sync_standby, "Sync-standby is the same as before" - - logger.info("Ensure continuous_writes after the crashed unit") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). 
- logger.info("checking whether no writes were lost") - await check_writes(ops_test, extra_model=second_model) - - -@pytest.mark.abort_on_fail -async def test_async_replication_failover_in_secondary_cluster( - ops_test: OpsTest, first_model: Model, second_model: Model, continuous_writes -) -> None: - """Test that async replication fails back correctly.""" - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - standby_leader = await get_standby_leader(second_model, DATABASE_APP_NAME) - logger.info(f"Standby leader: {standby_leader}") - logger.info("deleting the standby leader") - await second_model.applications[DATABASE_APP_NAME].destroy_units(standby_leader) - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("Ensure continuous_writes after the crashed unit") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("checking whether no writes were lost") - await check_writes(ops_test, extra_model=second_model) - - -@pytest.mark.abort_on_fail -async def test_scaling( - ops_test: OpsTest, first_model: Model, second_model: Model, continuous_writes -) -> None: - """Test that async replication works when scaling the clusters.""" - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - logger.info("scaling out the clusters") - first_cluster_original_size = len(first_model.applications[DATABASE_APP_NAME].units) - second_cluster_original_size = len(second_model.applications[DATABASE_APP_NAME].units) - await gather( - scale_application(ops_test, DATABASE_APP_NAME, first_cluster_original_size + 1), - scale_application( - ops_test, - DATABASE_APP_NAME, - second_cluster_original_size + 1, - model=second_model, - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test, extra_model=second_model) - - logger.info("scaling in the clusters") - await gather( - scale_application(ops_test, DATABASE_APP_NAME, first_cluster_original_size), - scale_application( - ops_test, DATABASE_APP_NAME, second_cluster_original_size, model=second_model - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test, extra_model=second_model) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("checking whether no writes were lost") - await check_writes(ops_test, extra_model=second_model) diff --git a/tests/integration/ha_tests/test_upgrade.py b/tests/integration/ha_tests/test_upgrade.py deleted file mode 100644 index c2714d7ac7..0000000000 --- a/tests/integration/ha_tests/test_upgrade.py +++ /dev/null @@ -1,226 +0,0 @@ -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. 
- -import logging -import platform -import shutil -import zipfile -from asyncio import gather -from pathlib import Path - -import pytest -import tomli -import tomli_w -from pytest_operator.plugin import OpsTest - -from ..helpers import ( - APPLICATION_NAME, - DATABASE_APP_NAME, - count_switchovers, - get_leader_unit, - get_primary, -) -from .helpers import ( - are_writes_increasing, - check_writes, - start_continuous_writes, -) - -logger = logging.getLogger(__name__) - -TIMEOUT = 30 * 60 - - -@pytest.mark.abort_on_fail -async def test_deploy_latest(ops_test: OpsTest) -> None: - """Simple test to ensure that the PostgreSQL and application charms get deployed.""" - await gather( - ops_test.model.deploy( - DATABASE_APP_NAME, num_units=3, channel="16/edge", config={"profile": "testing"} - ), - ops_test.model.deploy( - APPLICATION_NAME, - num_units=1, - channel="latest/edge", - config={"sleep_interval": 500}, - ), - ) - await ops_test.model.relate(DATABASE_APP_NAME, f"{APPLICATION_NAME}:database") - logger.info("Wait for applications to become active") - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active", timeout=1500 - ) - assert len(ops_test.model.applications[DATABASE_APP_NAME].units) == 3 - - -@pytest.mark.abort_on_fail -async def test_pre_refresh_check(ops_test: OpsTest) -> None: - """Test that the pre-refresh-check action runs successfully.""" - logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - assert leader_unit is not None, "No leader unit found" - - logger.info("Run pre-refresh-check action") - action = await leader_unit.run_action("pre-refresh-check") - await action.wait() - - -@pytest.mark.abort_on_fail -async def test_upgrade_from_edge(ops_test: OpsTest, continuous_writes, charm) -> None: - # Start an application that continuously writes data to the database. - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - # Check whether writes are increasing. 
- logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - primary_name = await get_primary(ops_test, f"{DATABASE_APP_NAME}/0") - initial_number_of_switchovers = count_switchovers(ops_test, primary_name) - - application = ops_test.model.applications[DATABASE_APP_NAME] - - logger.info("Refresh the charm") - await application.refresh(path=charm) - - logger.info("Wait for upgrade to start") - try: - # Blocked status is expected due to: - # (on PR) compatibility checks (on PR charm revision is '16/1.25.0+dirty...') - # (non-PR) the first unit upgraded and paused (pause-after-unit-refresh=first) - await ops_test.model.block_until(lambda: application.status == "blocked", timeout=60 * 3) - - logger.info("Wait for refresh to block as paused or incompatible") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT - ) - - # Highest to lowest unit number - refresh_order = sorted( - application.units, key=lambda unit: int(unit.name.split("/")[1]), reverse=True - ) - - if "Refresh incompatible" in application.status_message: - logger.info("Application refresh is blocked due to incompatibility") - - action = await refresh_order[0].run_action( - "force-refresh-start", **{"check-compatibility": False} - ) - await action.wait() - - logger.info("Wait for first incompatible unit to upgrade") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT - ) - - logger.info("Run resume-refresh action") - action = await refresh_order[1].run_action("resume-refresh") - await action.wait() - except TimeoutError: - # If the application didn't get into the blocked state, it should have upgraded only - # the charm code because the snap revision didn't change. - logger.info("Upgrade completed without snap refresh (charm.py upgrade only)") - assert application.status == "active", ( - "Application didn't reach blocked or active state after refresh attempt" - ) - - logger.info("Wait for upgrade to complete") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=30, timeout=TIMEOUT - ) - - # Check whether writes are increasing. - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("checking whether no writes were lost") - await check_writes(ops_test) - - logger.info("checking the number of switchovers") - final_number_of_switchovers = count_switchovers(ops_test, primary_name) - assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, ( - "Number of switchovers is greater than 2" - ) - - -@pytest.mark.abort_on_fail -async def test_fail_and_rollback(ops_test, charm, continuous_writes) -> None: - # Start an application that continuously writes data to the database. - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - # Check whether writes are increasing. 
- logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - assert leader_unit is not None, "No leader unit found" - - logger.info("Run pre-refresh-check action") - action = await leader_unit.run_action("pre-refresh-check") - await action.wait() - - filename = Path(charm).name - fault_charm = Path("/tmp", f"{filename}.fault.charm") - shutil.copy(charm, fault_charm) - - logger.info("Inject dependency fault") - await inject_dependency_fault(fault_charm) - - application = ops_test.model.applications[DATABASE_APP_NAME] - - logger.info("Refresh the charm") - await application.refresh(path=fault_charm) - - logger.info("Wait for upgrade to fail") - await ops_test.model.block_until( - lambda: application.status == "blocked" - and "incompatible" in application.status_message.lower(), - timeout=TIMEOUT, - ) - - logger.info("Ensure continuous_writes while in failure state on remaining units") - await are_writes_increasing(ops_test) - - logger.info("Re-refresh the charm") - await application.refresh(path=charm) - - logger.info("Wait for upgrade to start") - await ops_test.model.block_until(lambda: application.status == "blocked", timeout=TIMEOUT) - - logger.info("Wait for application to recover") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", timeout=TIMEOUT - ) - - logger.info("Ensure continuous_writes after rollback procedure") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("Checking whether no writes were lost") - await check_writes(ops_test) - - # Remove fault charm file. - fault_charm.unlink() - - -async def inject_dependency_fault(charm_file: str | Path) -> None: - """Inject a dependency fault into the PostgreSQL charm.""" - with Path("refresh_versions.toml").open("rb") as file: - versions = tomli.load(file) - - versions["charm"] = "16/0.0.0" - versions["snap"]["revisions"][platform.machine()] = "1" - - # Overwrite refresh_versions.toml with incompatible version. - with zipfile.ZipFile(charm_file, mode="a") as charm_zip: - charm_zip.writestr("refresh_versions.toml", tomli_w.dumps(versions)) diff --git a/tests/integration/ha_tests/test_upgrade_from_stable.py b/tests/integration/ha_tests/test_upgrade_from_stable.py deleted file mode 100644 index d95e4567be..0000000000 --- a/tests/integration/ha_tests/test_upgrade_from_stable.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. 
-import logging -from asyncio import gather - -import pytest -from pytest_operator.plugin import OpsTest - -from ..helpers import ( - APPLICATION_NAME, - DATABASE_APP_NAME, - count_switchovers, - get_leader_unit, - get_primary, -) -from .helpers import ( - are_writes_increasing, - check_writes, - start_continuous_writes, -) - -logger = logging.getLogger(__name__) - -TIMEOUT = 25 * 60 - - -@pytest.mark.abort_on_fail -async def test_deploy_stable(ops_test: OpsTest) -> None: - """Simple test to ensure that the PostgreSQL and application charms get deployed.""" - await gather( - ops_test.model.deploy( - DATABASE_APP_NAME, num_units=3, channel="16/stable", config={"profile": "testing"} - ), - ops_test.model.deploy( - APPLICATION_NAME, - num_units=1, - channel="latest/edge", - config={"sleep_interval": 500}, - ), - ) - await ops_test.model.relate(DATABASE_APP_NAME, f"{APPLICATION_NAME}:database") - logger.info("Wait for applications to become active") - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active", timeout=(20 * 60) - ) - assert len(ops_test.model.applications[DATABASE_APP_NAME].units) == 3 - - -@pytest.mark.abort_on_fail -async def test_pre_refresh_check(ops_test: OpsTest) -> None: - """Test that the pre-refresh-check action runs successfully.""" - logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - assert leader_unit is not None, "No leader unit found" - - logger.info("Run pre-refresh-check action") - action = await leader_unit.run_action("pre-refresh-check") - await action.wait() - - -@pytest.mark.abort_on_fail -async def test_upgrade_from_stable(ops_test: OpsTest, charm): - """Test updating from stable channel.""" - # Start an application that continuously writes data to the database. - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - # Check whether writes are increasing. 
- logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - primary_name = await get_primary(ops_test, f"{DATABASE_APP_NAME}/0") - initial_number_of_switchovers = count_switchovers(ops_test, primary_name) - - application = ops_test.model.applications[DATABASE_APP_NAME] - - logger.info("Refresh the charm") - await application.refresh(path=charm) - - logger.info("Wait for upgrade to start") - try: - # Blocked status is expected due to: - # (on PR) compatibility checks (on PR charm revision is '16/1.25.0+dirty...') - # (non-PR) the first unit upgraded and paused (pause-after-unit-refresh=first) - await ops_test.model.block_until(lambda: application.status == "blocked", timeout=60 * 3) - - logger.info("Wait for refresh to block as paused or incompatible") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT - ) - - # Highest to lowest unit number - refresh_order = sorted( - application.units, key=lambda unit: int(unit.name.split("/")[1]), reverse=True - ) - - if "Refresh incompatible" in application.status_message: - logger.info("Application refresh is blocked due to incompatibility") - - action = await refresh_order[0].run_action( - "force-refresh-start", **{"check-compatibility": False} - ) - await action.wait() - - logger.info("Wait for first incompatible unit to upgrade") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT - ) - - logger.info("Run resume-refresh action") - action = await refresh_order[1].run_action("resume-refresh") - await action.wait() - except TimeoutError: - # If the application didn't get into the blocked state, it should have upgraded only - # the charm code because the snap revision didn't change. - logger.info("Upgrade completed without snap refresh (charm.py upgrade only)") - assert application.status == "active", ( - "Application didn't reach blocked or active state after refresh attempt" - ) - - logger.info("Wait for upgrade to complete") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=30, timeout=TIMEOUT - ) - - # Check whether writes are increasing. - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("checking whether no writes were lost") - await check_writes(ops_test) - - logger.info("checking the number of switchovers") - final_number_of_switchovers = count_switchovers(ops_test, primary_name) - assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, ( - "Number of switchovers is greater than 2" - ) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index a047038990..eb39cac6bb 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -1431,3 +1431,40 @@ async def backup_operations( "backup wasn't correctly restored: table 'backup_table_3' exists" ) connection.close() + + +### Ported Mysql jubilant helpers + + +def execute_queries_on_unit( + unit_address: str, + username: str, + password: str, + queries: list[str], + database: str, + commit: bool = False, +) -> list: + """Execute given MySQL queries on a unit. 
+ + Args: + unit_address: The public IP address of the unit to execute the queries on + username: The PostgreSQL username + password: The PostgreSQL password + queries: A list of queries to execute + database: Database to execute in + commit: A keyword arg indicating whether there are any writes queries + + Returns: + A list of rows that were potentially queried + """ + with ( + psycopg2.connect( + f"dbname='{database}' user='{username}' host='{unit_address}' password='{password}' connect_timeout=10" + ) as connection, + connection.cursor() as cursor, + ): + for query in queries: + cursor.execute(query) + output = list(itertools.chain(*cursor.fetchall())) + + return output diff --git a/tests/integration/high_availability/__init__.py b/tests/integration/high_availability/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/high_availability/conftest.py b/tests/integration/high_availability/conftest.py new file mode 100644 index 0000000000..7d02b894fa --- /dev/null +++ b/tests/integration/high_availability/conftest.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging + +import pytest +from tenacity import Retrying, stop_after_attempt + +from .high_availability_helpers_new import get_app_leader + +logger = logging.getLogger(__name__) + +DB_TEST_APP_NAME = "postgresql-test-app" + + +@pytest.fixture() +def continuous_writes(juju): + """Starts continuous writes to the MySQL cluster for a test and clear the writes at the end.""" + application_unit = get_app_leader(juju, DB_TEST_APP_NAME) + + logger.info("Clearing continuous writes") + juju.run(unit=application_unit, action="clear-continuous-writes", wait=120).raise_on_failure() + + logger.info("Starting continuous writes") + + for attempt in Retrying(stop=stop_after_attempt(10), reraise=True): + with attempt: + result = juju.run(unit=application_unit, action="start-continuous-writes") + result.raise_on_failure() + + assert result.results["result"] == "True" + + yield + + logger.info("Clearing continuous writes") + juju.run(unit=application_unit, action="clear-continuous-writes", wait=120).raise_on_failure() diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py new file mode 100644 index 0000000000..0c09581fee --- /dev/null +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. + +import json +import subprocess +from collections.abc import Callable + +import jubilant +import requests +from jubilant import Juju +from jubilant.statustypes import Status, UnitStatus +from tenacity import Retrying, stop_after_delay, wait_fixed + +from constants import PEER + +from ..helpers import execute_queries_on_unit + +MINUTE_SECS = 60 +SERVER_CONFIG_USERNAME = "operator" + +JujuModelStatusFn = Callable[[Status], bool] +JujuAppsStatusFn = Callable[[Status, str], bool] + + +def check_db_units_writes_increment( + juju: Juju, + app_name: str, + app_units: list[str] | None = None, + db_name: str = "postgresql_test_app_database", +) -> None: + """Ensure that continuous writes is incrementing on all units. + + Also, ensure that all continuous writes up to the max written value is available + on all units (ensure that no committed data is lost). 
+ """ + if not app_units: + app_units = get_app_units(juju, app_name) + + app_primary = get_db_primary_unit(juju, app_name) + app_max_value = get_db_max_written_value(juju, app_name, app_primary, db_name) + + for unit_name in app_units: + for attempt in Retrying( + reraise=True, + stop=stop_after_delay(5 * MINUTE_SECS), + wait=wait_fixed(10), + ): + with attempt: + unit_max_value = get_db_max_written_value(juju, app_name, unit_name, db_name) + assert unit_max_value > app_max_value, "Writes not incrementing" + app_max_value = unit_max_value + + +def get_app_leader(juju: Juju, app_name: str) -> str: + """Get the leader unit for the given application.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + for name, status in app_status.units.items(): + if status.leader: + return name + + raise Exception("No leader unit found") + + +def get_app_name(juju: Juju, charm_name: str) -> str | None: + """Get the application name for the given charm.""" + model_status = juju.status() + app_statuses = model_status.apps + for name, status in app_statuses.items(): + if status.charm_name == charm_name: + return name + + raise Exception("No application name found") + + +def get_app_units(juju: Juju, app_name: str) -> dict[str, UnitStatus]: + """Get the units for the given application.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + return app_status.units + + +def get_unit_by_number(juju: Juju, app_name: str, unit_number: int) -> str: + """Get unit by number.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + for name in app_status.units: + if name == f"{app_name}/{unit_number}": + return name + + raise Exception("No application unit found") + + +def get_unit_ip(juju: Juju, app_name: str, unit_name: str) -> str: + """Get the application unit IP.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + for name, status in app_status.units.items(): + if name == unit_name: + return status.public_address + + raise Exception("No application unit found") + + +def get_unit_info(juju: Juju, unit_name: str) -> dict: + """Return a dictionary with the show-unit data.""" + output = subprocess.check_output( + ["juju", "show-unit", f"--model={juju.model}", "--format=json", unit_name], + text=True, + ) + + return json.loads(output) + + +def get_unit_status_log(juju: Juju, unit_name: str, log_lines: int = 0) -> list[dict]: + """Get the status log for a unit. + + Args: + juju: The juju instance to use. + unit_name: The name of the unit to retrieve the status log for + log_lines: The number of status logs to retrieve (optional) + """ + # fmt: off + output = subprocess.check_output( + ["juju", "show-status-log", f"--model={juju.model}", "--format=json", unit_name, "-n", f"{log_lines}"], + text=True, + ) + + return json.loads(output) + + +def get_relation_data(juju: Juju, app_name: str, rel_name: str) -> list[dict]: + """Returns a list that contains the relation-data. + + Args: + juju: The juju instance to use. 
+ app_name: The name of the application + rel_name: name of the relation to get connection data from + + Returns: + A list that contains the relation-data + """ + app_leader = get_app_leader(juju, app_name) + app_leader_info = get_unit_info(juju, app_leader) + if not app_leader_info: + raise ValueError(f"No unit info could be grabbed for unit {app_leader}") + + relation_data = [ + value + for value in app_leader_info[app_leader]["relation-info"] + if value["endpoint"] == rel_name + ] + if not relation_data: + raise ValueError(f"No relation data could be grabbed for relation {rel_name}") + + return relation_data + + +def get_db_unit_name(instance_label: str) -> str: + """Builds a Juju unit name out of a MySQL instance label.""" + return "/".join(instance_label.rsplit("-", 1)) + + +def get_db_primary_unit(juju: Juju, app_name: str) -> str: + """Get the current primary node of the cluster.""" + postgresql_primary = get_app_leader(juju, app_name) + task = juju.run(unit=postgresql_primary, action="get-primary", wait=5 * MINUTE_SECS) + task.raise_on_failure() + + primary = task.results.get("primary") + if primary != "None": + return primary + + raise Exception("No primary node found") + + +def get_db_standby_leader_unit(juju: Juju, app_name: str) -> str: + """Get the current standby node of the cluster.""" + unit_address = get_unit_ip(juju, app_name, get_app_leader(juju, app_name)) + + for member in requests.get(f"https://{unit_address}:8008/history", verify=False).json()[ + "members" + ]: + if member["role"] == "standby_leader": + return member["name"][::-1].replace("-", "/")[::-1] + + raise Exception("No standby primary node found") + + +def get_db_max_written_value( + juju: Juju, app_name: str, unit_name: str, db_name: str = "postgresql_test_app_database" +) -> int: + """Retrieve the max written value in the MySQL database. + + Args: + juju: The Juju model. + app_name: The application name. + unit_name: The unit name. + db_name: The database to connect to. + """ + password = get_user_password(juju, app_name, SERVER_CONFIG_USERNAME) + + output = execute_queries_on_unit( + get_unit_ip(juju, app_name, unit_name), + SERVER_CONFIG_USERNAME, + password, + ["SELECT MAX(number) FROM continuous_writes;"], + db_name, + ) + return output[0] + + +def wait_for_apps_status(jubilant_status_func: JujuAppsStatusFn, *apps: str) -> JujuModelStatusFn: + """Waits for Juju agents to be idle, and for applications to reach a certain status. + + Args: + jubilant_status_func: The Juju apps status function to wait for. + apps: The applications to wait for. + + Returns: + Juju model status function. 
+ """ + return lambda status: all(( + jubilant.all_agents_idle(status, *apps), + jubilant_status_func(status, *apps), + )) + + +def wait_for_unit_status(app_name: str, unit_name: str, unit_status: str) -> JujuModelStatusFn: + """Returns whether a Juju unit to have a specific status.""" + return lambda status: ( + status.apps[app_name].units[unit_name].workload_status.current == unit_status + ) + + +def wait_for_unit_message(app_name: str, unit_name: str, unit_message: str) -> JujuModelStatusFn: + """Returns whether a Juju unit to have a specific message.""" + return lambda status: ( + status.apps[app_name].units[unit_name].workload_status.message == unit_message + ) + + +# PG helpers + + +def get_user_password(juju: Juju, app_name: str, user: str) -> str | None: + """Get a system user's password.""" + for secret in juju.secrets(): + if secret.label == f"{PEER}.{app_name}.app": + revealed_secret = juju.show_secret(secret.uri, reveal=True) + return revealed_secret.content.get(f"{user}-password") + + +def count_switchovers(juju: Juju, app_name: str) -> int: + """Return the number of performed switchovers.""" + app_primary = get_db_primary_unit(juju, app_name) + unit_address = get_unit_ip(juju, app_name, app_primary) + switchover_history_info = requests.get(f"https://{unit_address}:8008/history", verify=False) + return len(switchover_history_info.json()) diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py new file mode 100644 index 0000000000..72f1b46c4d --- /dev/null +++ b/tests/integration/high_availability/test_async_replication.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging +import time +from collections.abc import Generator + +import jubilant +import pytest +from jubilant import Juju +from tenacity import Retrying, stop_after_attempt + +from .. 
import architecture +from .high_availability_helpers_new import ( + get_app_leader, + get_app_units, + get_db_max_written_value, + get_db_primary_unit, + get_db_standby_leader_unit, + wait_for_apps_status, +) + +DB_APP_1 = "db1" +DB_APP_2 = "db2" +DB_TEST_APP_NAME = "postgresql-test-app" +DB_TEST_APP_1 = "test-app1" +DB_TEST_APP_2 = "test-app2" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +@pytest.fixture(scope="module") +def first_model(juju: Juju, request: pytest.FixtureRequest) -> Generator: + """Creates and return the first model.""" + yield juju.model + + +@pytest.fixture(scope="module") +def second_model(juju: Juju, request: pytest.FixtureRequest) -> Generator: + """Creates and returns the second model.""" + model_name = f"{juju.model}-other" + + logging.info(f"Creating model: {model_name}") + juju.add_model(model_name) + + yield model_name + if request.config.getoption("--keep-models"): + return + + logging.info(f"Destroying model: {model_name}") + juju.destroy_model(model_name, destroy_storage=True, force=True) + + +@pytest.fixture() +def first_model_continuous_writes(first_model: str) -> Generator: + """Starts continuous writes to the cluster for a test and clear the writes at the end.""" + model_1 = Juju(model=first_model) + application_unit = get_app_leader(model_1, DB_TEST_APP_1) + + logging.info("Clearing continuous writes") + model_1.run( + unit=application_unit, action="clear-continuous-writes", wait=120 + ).raise_on_failure() + + logging.info("Starting continuous writes") + + for attempt in Retrying(stop=stop_after_attempt(10), reraise=True): + with attempt: + result = model_1.run(unit=application_unit, action="start-continuous-writes") + result.raise_on_failure() + + assert result.results["result"] == "True" + + yield + + logging.info("Clearing continuous writes") + model_1.run( + unit=application_unit, action="clear-continuous-writes", wait=120 + ).raise_on_failure() + + +def test_deploy(first_model: str, second_model: str, charm: str) -> None: + """Simple test to ensure that the database application charms get deployed.""" + configuration = {"profile": "testing"} + constraints = {"arch": architecture.architecture} + + logging.info("Deploying postgresql clusters") + model_1 = Juju(model=first_model) + model_1.deploy( + charm=charm, + app=DB_APP_1, + base="ubuntu@24.04", + config=configuration, + constraints=constraints, + num_units=3, + ) + model_2 = Juju(model=second_model) + model_2.deploy( + charm=charm, + app=DB_APP_2, + base="ubuntu@24.04", + config=configuration, + constraints=constraints, + num_units=3, + ) + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=20 * MINUTE_SECS + ) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=20 * MINUTE_SECS + ) + + +def test_async_relate(first_model: str, second_model: str) -> None: + """Relate the two MySQL clusters.""" + logging.info("Creating offers in first model") + model_1 = Juju(model=first_model) + model_1.offer(f"{first_model}.{DB_APP_1}", endpoint="replication-offer") + + logging.info("Consuming offer in second model") + model_2 = Juju(model=second_model) + model_2.consume(f"{first_model}.{DB_APP_1}") + + logging.info("Relating the two postgresql clusters") + model_2.integrate(f"{DB_APP_1}", f"{DB_APP_2}:replication") + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant.any_active, DB_APP_1), 
+ timeout=10 * MINUTE_SECS, + ) + model_2.wait( + ready=wait_for_apps_status(jubilant.any_active, DB_APP_2), + timeout=10 * MINUTE_SECS, + ) + + +def test_deploy_app(first_model: str, second_model: str) -> None: + """Deploy the router and the test application.""" + constraints = {"arch": architecture.architecture} + logging.info("Deploying test application") + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + model_1.deploy( + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_1, + base="ubuntu@22.04", + channel="latest/edge", + num_units=1, + constraints=constraints, + ) + model_2.deploy( + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_2, + base="ubuntu@22.04", + channel="latest/edge", + num_units=1, + constraints=constraints, + ) + + logging.info("Relating test application") + model_1.integrate(f"{DB_TEST_APP_1}:database", f"{DB_APP_1}:database") + model_2.integrate(f"{DB_TEST_APP_2}:database", f"{DB_APP_2}:database") + + model_1.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_TEST_APP_1), timeout=10 * MINUTE_SECS + ) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_TEST_APP_2), timeout=10 * MINUTE_SECS + ) + + +def test_create_replication(first_model: str, second_model: str) -> None: + """Run the create-replication action and wait for the applications to settle.""" + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + + logging.info("Running create replication action") + task = model_1.run( + unit=get_app_leader(model_1, DB_APP_1), action="create-replication", wait=5 * MINUTE_SECS + ) + task.raise_on_failure() + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=20 * MINUTE_SECS + ) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=20 * MINUTE_SECS + ) + + +def test_data_replication( + first_model: str, second_model: str, first_model_continuous_writes +) -> None: + """Test to write to primary, and read the same data back from replicas.""" + logging.info("Testing data replication") + results = get_db_max_written_values(first_model, second_model, first_model, DB_TEST_APP_1) + + assert len(results) == 6 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + +def test_standby_promotion(first_model: str, second_model: str) -> None: + """Test graceful promotion of a standby cluster to primary.""" + model_2 = Juju(model=second_model) + model_2_postgresql_leader = get_app_leader(model_2, DB_APP_2) + + logging.info("Promoting standby cluster to primary") + promotion_task = model_2.run( + unit=model_2_postgresql_leader, action="promote-to-primary", params={"scope": "cluster"} + ) + promotion_task.raise_on_failure() + + rerelate_test_app(model_2, DB_APP_2, DB_TEST_APP_2) + + results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_2) + assert len(results) == 6 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + +def test_failover_in_main_cluster(first_model: str, second_model: str) -> None: + """Test that async replication fails over correctly.""" + model_2 = Juju(model=second_model) + + rerelate_test_app(model_2, DB_APP_2, DB_TEST_APP_2) + + primary = get_db_primary_unit(model_2, DB_APP_2) + model_2.remove_unit(primary, force=True) + model_2.wait( + 
ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=10 * MINUTE_SECS + ) + + results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_2) + + assert len(results) == 5 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + assert primary != get_db_primary_unit(model_2, DB_APP_2) + + +def test_failover_in_standby_cluster(first_model: str, second_model: str) -> None: + """Test that async replication fails over correctly.""" + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + + rerelate_test_app(model_2, DB_APP_2, DB_TEST_APP_2) + + standby = get_db_standby_leader_unit(model_1, DB_APP_2) + model_1.remove_unit(standby, force=True) + + results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_2) + + assert len(results) == 4 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + assert standby != get_db_standby_leader_unit(model_1, DB_APP_2) + + +def test_unrelate_and_relate(first_model: str, second_model: str) -> None: + """Test removing and re-relating the two postgresql clusters.""" + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + + logging.info("Remove async relation") + model_2.remove_relation(f"{DB_APP_1}", f"{DB_APP_2}:replication") + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=10 * MINUTE_SECS + ) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_blocked, DB_APP_2), timeout=10 * MINUTE_SECS + ) + + logging.info("Re-relating the two postgresql clusters") + model_2.integrate(f"{DB_APP_1}", f"{DB_APP_2}:replication") + model_1.wait( + ready=wait_for_apps_status(jubilant.any_blocked, DB_APP_1), timeout=5 * MINUTE_SECS + ) + + rerelate_test_app(model_1, DB_APP_1, DB_TEST_APP_1) + + logging.info("Running create replication action") + model_1.run( + unit=get_app_leader(model_1, DB_APP_1), action="create-replication", wait=5 * MINUTE_SECS + ).raise_on_failure() + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=20 * MINUTE_SECS + ) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=20 * MINUTE_SECS + ) + + results = get_db_max_written_values(first_model, second_model, first_model, DB_TEST_APP_1) + assert len(results) == 6 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + +def get_db_max_written_values( + first_model: str, second_model: str, test_model: str, test_app: str +) -> list[int]: + """Return list with max written value from all units.""" + db_name = f"{test_app.replace('-', '_')}_database" + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + test_app_model = model_1 if test_model == first_model else model_2 + + logging.info("Stopping continuous writes") + test_app_model.run( + unit=get_app_leader(test_app_model, test_app), action="stop-continuous-writes" + ).raise_on_failure() + + time.sleep(5) + results = [] + + logging.info(f"Querying max value on all {DB_APP_1} units") + for unit_name in get_app_units(model_1, DB_APP_1): + unit_max_value = get_db_max_written_value(model_1, DB_APP_1, unit_name, db_name) + 
+        results.append(unit_max_value)
+
+    logging.info(f"Querying max value on all {DB_APP_2} units")
+    for unit_name in get_app_units(model_2, DB_APP_2):
+        unit_max_value = get_db_max_written_value(model_2, DB_APP_2, unit_name, db_name)
+        results.append(unit_max_value)
+
+    return results
+
+
+def rerelate_test_app(juju: Juju, db_name: str, test_app_name: str) -> None:
+    """Re-relate the test application to the database and restart continuous writes."""
+    logging.info(f"Reintegrating {db_name} and {test_app_name}")
+    juju.remove_relation(db_name, f"{test_app_name}:database")
+    # A bare `and` between two ready callables would drop the first check, so
+    # combine them explicitly in a single predicate.
+    juju.wait(
+        ready=lambda status: wait_for_apps_status(jubilant.all_blocked, test_app_name)(status)
+        and wait_for_apps_status(jubilant.all_active, db_name)(status),
+        timeout=10 * MINUTE_SECS,
+    )
+
+    juju.integrate(f"{db_name}:database", f"{test_app_name}:database")
+    juju.wait(
+        ready=wait_for_apps_status(jubilant.all_active, test_app_name, db_name),
+        timeout=10 * MINUTE_SECS,
+    )
+
+    logging.info("Clearing continuous writes")
+    application_unit = get_app_leader(juju, test_app_name)
+    juju.run(unit=application_unit, action="clear-continuous-writes", wait=120).raise_on_failure()
+
+    logging.info("Starting continuous writes")
+    for attempt in Retrying(stop=stop_after_attempt(10), reraise=True):
+        with attempt:
+            result = juju.run(unit=application_unit, action="start-continuous-writes")
+            result.raise_on_failure()
+
+            assert result.results["result"] == "True"
diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py
new file mode 100644
index 0000000000..ebea59537b
--- /dev/null
+++ b/tests/integration/high_availability/test_upgrade.py
@@ -0,0 +1,184 @@
+# Copyright 2023 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+import logging
+import platform
+import shutil
+import zipfile
+from pathlib import Path
+
+import jubilant
+import tomli
+import tomli_w
+from jubilant import Juju
+
+from .high_availability_helpers_new import (
+    check_db_units_writes_increment,
+    count_switchovers,
+    get_app_leader,
+    get_app_units,
+    wait_for_apps_status,
+)
+
+DB_APP_NAME = "postgresql"
+DB_TEST_APP_NAME = "postgresql-test-app"
+
+MINUTE_SECS = 60
+
+logging.getLogger("jubilant.wait").setLevel(logging.WARNING)
+
+
+def test_deploy_latest(juju: Juju) -> None:
+    """Simple test to ensure that the PostgreSQL and application charms get deployed."""
+    logging.info("Deploying PostgreSQL cluster")
+    juju.deploy(
+        charm=DB_APP_NAME,
+        app=DB_APP_NAME,
+        base="ubuntu@24.04",
+        channel="16/edge",
+        config={"profile": "testing"},
+        num_units=3,
+    )
+    juju.deploy(
+        charm=DB_TEST_APP_NAME,
+        app=DB_TEST_APP_NAME,
+        base="ubuntu@22.04",
+        channel="latest/edge",
+        num_units=1,
+    )
+
+    juju.integrate(
+        f"{DB_APP_NAME}:database",
+        f"{DB_TEST_APP_NAME}:database",
+    )
+
+    logging.info("Wait for applications to become active")
+    juju.wait(
+        ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME, DB_TEST_APP_NAME),
+        timeout=20 * MINUTE_SECS,
+    )
+
+
+def test_pre_refresh_check(juju: Juju) -> None:
+    """Test that the pre-refresh-check action runs successfully."""
+    db_leader = get_app_leader(juju, DB_APP_NAME)
+
+    logging.info("Run pre-refresh-check action")
+    juju.run(unit=db_leader, action="pre-refresh-check")
+
+    juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS)
+
+
+def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None:
+    """Refresh the cluster from the edge channel to the locally built charm."""
+    logging.info("Ensure continuous writes are incrementing")
+    check_db_units_writes_increment(juju, DB_APP_NAME)
+
+    initial_number_of_switchovers = count_switchovers(juju, DB_APP_NAME)
+
+    logging.info("Refresh the charm")
+    juju.refresh(app=DB_APP_NAME, path=charm)
+
+    logging.info("Wait for refresh to block as paused or incompatible")
+    try:
+        juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS)
+
+        units = get_app_units(juju, DB_APP_NAME)
+        unit_names = sorted(units.keys())
+
+        if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message:
+            logging.info("Application refresh is blocked due to incompatibility")
+            juju.run(
+                unit=unit_names[-1],
+                action="force-refresh-start",
+                params={"check-compatibility": False},
+                wait=5 * MINUTE_SECS,
+            )
+
+            juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS)
+
+        logging.info("Run resume-refresh action")
+        juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS)
+    except TimeoutError:
+        logging.info("Upgrade completed without snap refresh (charm.py upgrade only)")
+        assert juju.status().apps[DB_APP_NAME].is_active
+
+    logging.info("Wait for upgrade to complete")
+    juju.wait(
+        ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME),
+        timeout=20 * MINUTE_SECS,
+    )
+
+    logging.info("Ensure continuous writes are incrementing")
+    check_db_units_writes_increment(juju, DB_APP_NAME)
+
+    logging.info("Checking the number of switchovers")
+    final_number_of_switchovers = count_switchovers(juju, DB_APP_NAME)
+    assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, (
+        "Number of switchovers is greater than 2"
+    )
+
+
+def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None:
+    """Test an upgrade failure and its rollback."""
+    db_app_leader = get_app_leader(juju, DB_APP_NAME)
+    db_app_units = get_app_units(juju, DB_APP_NAME)
+
+    logging.info("Run pre-refresh-check action")
+    juju.run(unit=db_app_leader, action="pre-refresh-check")
+
+    juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS)
+
+    tmp_folder = Path("tmp")
+    tmp_folder.mkdir(exist_ok=True)
+    tmp_folder_charm = Path(tmp_folder, charm).absolute()
+
+    shutil.copy(charm, tmp_folder_charm)
+
+    logging.info("Inject dependency fault")
+    inject_dependency_fault(juju, DB_APP_NAME, tmp_folder_charm)
+
+    logging.info("Refresh the charm")
+    juju.refresh(app=DB_APP_NAME, path=tmp_folder_charm)
+
+    logging.info("Wait for upgrade to fail on leader")
+    juju.wait(
+        ready=wait_for_apps_status(jubilant.any_blocked, DB_APP_NAME),
+        timeout=10 * MINUTE_SECS,
+    )
+
+    logging.info("Ensure continuous writes on all units")
+    check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units))
+
+    logging.info("Re-refresh the charm")
+    juju.refresh(app=DB_APP_NAME, path=charm)
+
+    logging.info("Wait for upgrade to complete")
+    juju.wait(
+        ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME),
+        timeout=20 * MINUTE_SECS,
+    )
+
+    logging.info("Ensure continuous writes after rollback procedure")
+    check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units))
+
+    # Remove fault charm file
+    tmp_folder_charm.unlink()
+
+
+def inject_dependency_fault(juju: Juju, app_name: str, charm_file: str | Path) -> None:
+    """Inject a dependency fault into the PostgreSQL charm."""
+    with Path("refresh_versions.toml").open("rb") as file:
+        versions = tomli.load(file)
+
+    versions["charm"] = "16/0.0.0"
+    versions["snap"]["revisions"][platform.machine()] = "1"
+
+    # Overwrite refresh_versions.toml with incompatible version.
+    with zipfile.ZipFile(charm_file, mode="a") as charm_zip:
+        charm_zip.writestr("refresh_versions.toml", tomli_w.dumps(versions))
diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py
new file mode 100644
index 0000000000..cc594d7fb3
--- /dev/null
+++ b/tests/integration/high_availability/test_upgrade_from_stable.py
@@ -0,0 +1,113 @@
+# Copyright 2023 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+import logging
+
+import jubilant
+from jubilant import Juju
+
+from .high_availability_helpers_new import (
+    check_db_units_writes_increment,
+    count_switchovers,
+    get_app_leader,
+    get_app_units,
+    wait_for_apps_status,
+)
+
+DB_APP_NAME = "postgresql"
+DB_TEST_APP_NAME = "postgresql-test-app"
+
+MINUTE_SECS = 60
+
+logging.getLogger("jubilant.wait").setLevel(logging.WARNING)
+
+
+def test_deploy_stable(juju: Juju) -> None:
+    """Simple test to ensure that the PostgreSQL and application charms get deployed."""
+    logging.info("Deploying PostgreSQL cluster")
+    juju.deploy(
+        charm=DB_APP_NAME,
+        app=DB_APP_NAME,
+        base="ubuntu@24.04",
+        channel="16/stable",
+        config={"profile": "testing"},
+        num_units=3,
+    )
+    juju.deploy(
+        charm=DB_TEST_APP_NAME,
+        app=DB_TEST_APP_NAME,
+        base="ubuntu@22.04",
+        channel="latest/edge",
+        num_units=1,
+    )
+
+    juju.integrate(
+        f"{DB_APP_NAME}:database",
+        f"{DB_TEST_APP_NAME}:database",
+    )
+
+    logging.info("Wait for applications to become active")
+    juju.wait(
+        ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME, DB_TEST_APP_NAME),
+        timeout=20 * MINUTE_SECS,
+    )
+
+
+def test_pre_refresh_check(juju: Juju) -> None:
+    """Test that the pre-refresh-check action runs successfully."""
+    db_leader = get_app_leader(juju, DB_APP_NAME)
+
+    logging.info("Run pre-refresh-check action")
+    juju.run(unit=db_leader, action="pre-refresh-check")
+
+    juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS)
+
+
+def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> None:
+    """Refresh the cluster from the stable channel to the locally built charm."""
+    logging.info("Ensure continuous writes are incrementing")
+    check_db_units_writes_increment(juju, DB_APP_NAME)
+
+    initial_number_of_switchovers = count_switchovers(juju, DB_APP_NAME)
+
+    logging.info("Refresh the charm")
+    juju.refresh(app=DB_APP_NAME, path=charm)
+
+    logging.info("Wait for refresh to block as paused or incompatible")
+    try:
+        juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS)
+
+        units = get_app_units(juju, DB_APP_NAME)
+        unit_names = sorted(units.keys())
+
+        if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message:
+            logging.info("Application refresh is blocked due to incompatibility")
+            juju.run(
+                unit=unit_names[-1],
+                action="force-refresh-start",
+                params={"check-compatibility": False},
+                wait=5 * MINUTE_SECS,
+            )
+
+            juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS)
+
+        logging.info("Run resume-refresh action")
+        juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS)
+    except TimeoutError:
+        logging.info("Upgrade completed without snap refresh (charm.py upgrade only)")
+        assert juju.status().apps[DB_APP_NAME].is_active
+
+    logging.info("Wait for upgrade to complete")
+    juju.wait(
+        ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME),
+        timeout=20 * MINUTE_SECS,
+    )
+
+    logging.info("Ensure continuous writes are incrementing")
+    check_db_units_writes_increment(juju, DB_APP_NAME)
+
+    logging.info("Checking the number of switchovers")
+    final_number_of_switchovers = count_switchovers(juju, DB_APP_NAME)
+    assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, (
+        "Number of switchovers is greater than 2"
+    )
diff --git a/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py b/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py
new file mode 100644
index 0000000000..67806e3796
--- /dev/null
+++ b/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py
@@ -0,0 +1,140 @@
+# Copyright 2023 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+import logging
+
+import jubilant
+from jubilant import Juju
+
+from .high_availability_helpers_new import (
+    check_db_units_writes_increment,
+    count_switchovers,
+    get_app_units,
+    wait_for_apps_status,
+)
+
+DB_APP_NAME = "postgresql"
+DB_TEST_APP_NAME = "postgresql-test-app"
+
+MINUTE_SECS = 60
+
+logging.getLogger("jubilant.wait").setLevel(logging.WARNING)
+
+
+def test_deploy_stable(juju: Juju) -> None:
+    """Simple test to ensure that the PostgreSQL and application charms get deployed."""
+    logging.info("Deploying PostgreSQL cluster")
+    juju.deploy(
+        charm=DB_APP_NAME,
+        app=DB_APP_NAME,
+        base="ubuntu@24.04",
+        channel="16/stable",
+        config={"profile": "testing"},
+        num_units=3,
+    )
+    juju.deploy(
+        charm=DB_TEST_APP_NAME,
+        app=DB_TEST_APP_NAME,
+        base="ubuntu@22.04",
+        channel="latest/edge",
+        num_units=1,
+    )
+
+    juju.integrate(
+        f"{DB_APP_NAME}:database",
+        f"{DB_TEST_APP_NAME}:database",
+    )
+
+    logging.info("Wait for applications to become active")
+    juju.wait(
+        ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME, DB_TEST_APP_NAME),
+        timeout=20 * MINUTE_SECS,
+    )
+
+
+def test_refresh_without_pre_refresh_check(juju: Juju, charm: str, continuous_writes) -> None:
+    """Test refreshing from the stable channel without running pre-refresh-check first."""
+    initial_number_of_switchovers = count_switchovers(juju, DB_APP_NAME)
+
+    logging.info("Refresh the charm")
+    juju.refresh(app=DB_APP_NAME, path=charm)
+
+    logging.info("Wait for refresh to block as paused or incompatible")
+    try:
+        juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS)
+
+        units = get_app_units(juju, DB_APP_NAME)
+        unit_names = sorted(units.keys())
+
+        if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message:
+            logging.info("Application refresh is blocked due to incompatibility")
+            juju.run(
+                unit=unit_names[-1],
+                action="force-refresh-start",
+                params={"check-compatibility": False},
+                wait=5 * MINUTE_SECS,
+            )
+
+            juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS)
+
+        logging.info("Run resume-refresh action")
+        juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS)
+    except TimeoutError:
+        logging.info("Upgrade completed without snap refresh (charm.py upgrade only)")
+        assert juju.status().apps[DB_APP_NAME].is_active
+
+    logging.info("Wait for upgrade to complete")
+    juju.wait(
+        ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME),
+        timeout=20 * MINUTE_SECS,
+    )
+
+    logging.info("Ensure continuous writes are incrementing")
+    check_db_units_writes_increment(juju, DB_APP_NAME)
+
+    logging.info("Checking the number of switchovers")
+    final_number_of_switchovers = count_switchovers(juju, DB_APP_NAME)
+    assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, (
+        "Number of switchovers is greater than 2"
+    )
+
+
+def test_rollback_without_pre_refresh_check(juju: Juju, charm: str, continuous_writes) -> None:
+    """Test refresh back to stable channel."""
+    # Early Jubilant 1.X.Y versions do not support the `switch` option
+    logging.info("Refresh the charm to stable channel")
+    juju.cli("refresh", "--channel=16/stable", f"--switch={DB_APP_NAME}", DB_APP_NAME)
+
+    logging.info("Wait for refresh to block as paused or incompatible")
+    try:
+        juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS)
+
+        units = get_app_units(juju, DB_APP_NAME)
+        unit_names = sorted(units.keys())
+
+        if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message:
+            logging.info("Application refresh is blocked due to incompatibility")
+            juju.run(
+                unit=unit_names[-1],
+                action="force-refresh-start",
+                params={"check-compatibility": False},
+                wait=5 * MINUTE_SECS,
+            )
+
+            juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS)
+
+        logging.info("Run resume-refresh action")
+        juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS)
+    except TimeoutError:
+        logging.info("Upgrade completed without snap refresh (charm.py upgrade only)")
+        assert juju.status().apps[DB_APP_NAME].is_active
+
+    logging.info("Wait for upgrade to complete")
+    juju.wait(
+        ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME),
+        timeout=20 * MINUTE_SECS,
+    )
+
+    check_db_units_writes_increment(juju, DB_APP_NAME)
diff --git a/tests/spread/test_async_replication.py/task.yaml b/tests/spread/test_async_replication.py/task.yaml
index 4fbf3b6b36..cfadb00ee5 100644
--- a/tests/spread/test_async_replication.py/task.yaml
+++ b/tests/spread/test_async_replication.py/task.yaml
@@ -1,9 +1,7 @@
 summary: test_async_replication.py
 environment:
-  TEST_MODULE: ha_tests/test_async_replication.py
+  TEST_MODULE: high_availability/test_async_replication.py
 execute: |
   tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
 artifacts:
   - allure-results
-variants:
-  - -juju29
diff --git a/tests/spread/test_scaling.py/task.yaml b/tests/spread/test_scaling.py/task.yaml
index 32358243db..656780e30d 100644
--- a/tests/spread/test_scaling.py/task.yaml
+++ b/tests/spread/test_scaling.py/task.yaml
@@ -5,5 +5,3 @@ execute: |
   tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
 artifacts:
   - allure-results
-variants:
-  - -juju29
diff --git a/tests/spread/test_scaling_three_units.py/task.yaml b/tests/spread/test_scaling_three_units.py/task.yaml
index ae8dcc1006..f46a54dab3 100644
--- a/tests/spread/test_scaling_three_units.py/task.yaml
+++ b/tests/spread/test_scaling_three_units.py/task.yaml
@@ -5,5 +5,3 @@ execute: |
   tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
 artifacts:
   - allure-results
-variants:
-  - -juju29
diff --git a/tests/spread/test_scaling_three_units_async.py/task.yaml b/tests/spread/test_scaling_three_units_async.py/task.yaml
index cd8a7ba5aa..686116f361 100644
--- a/tests/spread/test_scaling_three_units_async.py/task.yaml
+++ b/tests/spread/test_scaling_three_units_async.py/task.yaml
@@ -5,5 +5,3 @@ execute: |
   tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
 artifacts:
   - allure-results
-variants:
-  - -juju29
diff --git a/tests/spread/test_upgrade.py/task.yaml b/tests/spread/test_upgrade.py/task.yaml
index b3be366921..f99ac69384 100644
--- a/tests/spread/test_upgrade.py/task.yaml
+++ b/tests/spread/test_upgrade.py/task.yaml
@@ -1,6 +1,6 @@
 summary: test_upgrade.py
 environment:
-  TEST_MODULE: ha_tests/test_upgrade.py
+  TEST_MODULE: high_availability/test_upgrade.py
 execute: |
   tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
 artifacts:
diff --git a/tests/spread/test_upgrade_from_stable.py/task.yaml b/tests/spread/test_upgrade_from_stable.py/task.yaml
index 047617ab39..ffdb002d25 100644
--- a/tests/spread/test_upgrade_from_stable.py/task.yaml
+++ b/tests/spread/test_upgrade_from_stable.py/task.yaml
@@ -1,6 +1,6 @@
 summary: test_upgrade_from_stable.py
 environment:
-  TEST_MODULE: ha_tests/test_upgrade_from_stable.py
+  TEST_MODULE: high_availability/test_upgrade_from_stable.py
 execute: |
   tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
 artifacts:
diff --git a/tests/spread/test_upgrade_skip_pre_upgrade_check.py/task.yaml b/tests/spread/test_upgrade_skip_pre_upgrade_check.py/task.yaml
new file mode 100644
index 0000000000..79ed8357d0
--- /dev/null
+++ b/tests/spread/test_upgrade_skip_pre_upgrade_check.py/task.yaml
@@ -0,0 +1,7 @@
+summary: test_upgrade_skip_pre_upgrade_check.py
+environment:
+  TEST_MODULE: high_availability/test_upgrade_skip_pre_upgrade_check.py
+execute: |
+  tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
+artifacts:
+  - allure-results