From d67f7a47af4ba373119c2cf3acf843fd7fcfd77f Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 3 Oct 2025 02:13:34 +0300 Subject: [PATCH 01/33] Porting tests WIP --- poetry.lock | 21 +- pyproject.toml | 1 + tests/integration/conftest.py | 16 +- tests/integration/helpers.py | 37 ++ .../integration/high_availability/__init__.py | 0 .../high_availability_helpers_new.py | 274 ++++++++++++ .../test_async_replication.py | 397 ++++++++++++++++++ .../test_primary_switchover.py | 105 +++++ .../high_availability/test_upgrade.py | 186 ++++++++ .../test_upgrade_from_stable.py | 106 +++++ .../test_upgrade_rollback_incompat.py | 269 ++++++++++++ .../test_upgrade_skip_pre_upgrade_check.py | 97 +++++ 12 files changed, 1495 insertions(+), 14 deletions(-) create mode 100644 tests/integration/high_availability/__init__.py create mode 100644 tests/integration/high_availability/high_availability_helpers_new.py create mode 100644 tests/integration/high_availability/test_async_replication.py create mode 100644 tests/integration/high_availability/test_primary_switchover.py create mode 100644 tests/integration/high_availability/test_upgrade.py create mode 100644 tests/integration/high_availability/test_upgrade_from_stable.py create mode 100644 tests/integration/high_availability/test_upgrade_rollback_incompat.py create mode 100644 tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py diff --git a/poetry.lock b/poetry.lock index 0cb4009a74..cdc98e7557 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.0 and should not be changed by hand. [[package]] name = "allure-pytest" @@ -1216,6 +1216,21 @@ files = [ [package.dependencies] PyYAML = "==6.*" +[[package]] +name = "jubilant-backports" +version = "1.0.0a1" +description = "Extends Jubilant to include support for Juju 2.9" +optional = false +python-versions = ">=3.8" +groups = ["integration"] +files = [ + {file = "jubilant_backports-1.0.0a1-py3-none-any.whl", hash = "sha256:ff8d73e17afaae4418c588496978ac42ee9eb9d6d4e77ce103102772038796cc"}, + {file = "jubilant_backports-1.0.0a1.tar.gz", hash = "sha256:03f0788a2301e1a71ebab56bc59515361c37e5686e40a985caba5b2907514e3f"}, +] + +[package.dependencies] +jubilant = ">=1.2,<2.0" + [[package]] name = "juju" version = "3.6.1.3" @@ -1802,6 +1817,7 @@ files = [ {file = "psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4"}, {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"}, {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"}, + {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"}, {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"}, {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"}, {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"}, @@ -1862,6 +1878,7 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, 
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -3085,4 +3102,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "e150f4e46ff0872c1d5da8e9e4f6fde42a7f710b09dde10db97b268e3026b9c2" +content-hash = "b18a577c36974feed06083a767c252cfd05e1e75af60cf11549fca02a2fdc150" diff --git a/pyproject.toml b/pyproject.toml index dd1b495591..689af84831 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ pytest-operator = "^0.43.1" # renovate caret doesn't work: https://github.com/renovatebot/renovate/issues/26940 juju = "<=3.6.1.3" jubilant = "^1.4.0" +jubilant-backports = "^1.0.0a1" boto3 = "*" tenacity = "*" landscape-api-py3 = "^0.9.0" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 2740fa317d..2740c72605 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -5,7 +5,7 @@ import uuid import boto3 -import jubilant +import jubilant_backports import pytest from pytest_operator.plugin import OpsTest @@ -105,23 +105,15 @@ def juju(request: pytest.FixtureRequest): This adds command line parameter ``--keep-models`` (see help for details). """ - controller = request.config.getoption("--controller") model = request.config.getoption("--model") - controller_and_model = None - if controller and model: - controller_and_model = f"{controller}:{model}" - elif controller: - controller_and_model = controller - elif model: - controller_and_model = model keep_models = bool(request.config.getoption("--keep-models")) - if controller_and_model: - juju = jubilant.Juju(model=controller_and_model) # type: ignore + if model: + juju = jubilant_backports.Juju(model=model) # type: ignore yield juju log = juju.debug_log(limit=1000) else: - with jubilant.temp_model(keep=keep_models) as juju: + with jubilant_backports.temp_model(keep=keep_models) as juju: yield juju log = juju.debug_log(limit=1000) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index a047038990..793313a470 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -1431,3 +1431,40 @@ async def backup_operations( "backup wasn't correctly restored: table 'backup_table_3' exists" ) connection.close() + + +### Ported Mysql jubilant helpers + + +async def execute_queries_on_unit( + unit_address: str, + username: str, + password: str, + queries: list[str], + database: str, + commit: bool = False, +) -> list: + """Execute given MySQL queries on a unit. 
+ + Args: + unit_address: The public IP address of the unit to execute the queries on + username: The PostgreSQL username + password: The PostgreSQL password + queries: A list of queries to execute + database: Database to execute in + commit: A keyword arg indicating whether there are any writes queries + + Returns: + A list of rows that were potentially queried + """ + with ( + psycopg2.connect( + f"dbname='{database}' user='{username}' host='{unit_address}' password='{password}' connect_timeout=10" + ) as connection, + connection.cursor() as cursor, + ): + for query in queries: + cursor.execute(query) + output = list(itertools.chain(*cursor.fetchall())) + + return output diff --git a/tests/integration/high_availability/__init__.py b/tests/integration/high_availability/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py new file mode 100644 index 0000000000..7329ca0bce --- /dev/null +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. + +import json +import subprocess +from collections.abc import Callable + +import jubilant_backports +from jubilant_backports import Juju +from jubilant_backports.statustypes import Status, UnitStatus +from tenacity import Retrying, stop_after_delay, wait_fixed + +from constants import SERVER_CONFIG_USERNAME + +from ..helpers import execute_queries_on_unit + +MINUTE_SECS = 60 + +JujuModelStatusFn = Callable[[Status], bool] +JujuAppsStatusFn = Callable[[Status, str], bool] + + +async def check_mysql_units_writes_increment( + juju: Juju, app_name: str, app_units: list[str] | None = None +) -> None: + """Ensure that continuous writes is incrementing on all units. + + Also, ensure that all continuous writes up to the max written value is available + on all units (ensure that no committed data is lost). 
+ """ + if not app_units: + app_units = get_app_units(juju, app_name) + + app_primary = get_mysql_primary_unit(juju, app_name) + app_max_value = await get_mysql_max_written_value(juju, app_name, app_primary) + + juju.model_config({"update-status-hook-interval": "15s"}) + for unit_name in app_units: + for attempt in Retrying( + reraise=True, + stop=stop_after_delay(5 * MINUTE_SECS), + wait=wait_fixed(10), + ): + with attempt: + unit_max_value = await get_mysql_max_written_value(juju, app_name, unit_name) + assert unit_max_value > app_max_value, "Writes not incrementing" + app_max_value = unit_max_value + + +def get_app_leader(juju: Juju, app_name: str) -> str: + """Get the leader unit for the given application.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + for name, status in app_status.units.items(): + if status.leader: + return name + + raise Exception("No leader unit found") + + +def get_app_name(juju: Juju, charm_name: str) -> str | None: + """Get the application name for the given charm.""" + model_status = juju.status() + app_statuses = model_status.apps + for name, status in app_statuses.items(): + if status.charm_name == charm_name: + return name + + raise Exception("No application name found") + + +def get_app_units(juju: Juju, app_name: str) -> dict[str, UnitStatus]: + """Get the units for the given application.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + return app_status.units + + +def get_unit_by_number(juju: Juju, app_name: str, unit_number: int) -> str: + """Get unit by number.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + for name in app_status.units: + if name == f"{app_name}/{unit_number}": + return name + + raise Exception("No application unit found") + + +def get_unit_ip(juju: Juju, app_name: str, unit_name: str) -> str: + """Get the application unit IP.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + for name, status in app_status.units.items(): + if name == unit_name: + return status.public_address + + raise Exception("No application unit found") + + +def get_unit_info(juju: Juju, unit_name: str) -> dict: + """Return a dictionary with the show-unit data.""" + output = subprocess.check_output( + ["juju", "show-unit", f"--model={juju.model}", "--format=json", unit_name], + text=True, + ) + + return json.loads(output) + + +def get_unit_status_log(juju: Juju, unit_name: str, log_lines: int = 0) -> list[dict]: + """Get the status log for a unit. + + Args: + juju: The juju instance to use. + unit_name: The name of the unit to retrieve the status log for + log_lines: The number of status logs to retrieve (optional) + """ + # fmt: off + output = subprocess.check_output( + ["juju", "show-status-log", f"--model={juju.model}", "--format=json", unit_name, "-n", f"{log_lines}"], + text=True, + ) + + return json.loads(output) + + +def get_relation_data(juju: Juju, app_name: str, rel_name: str) -> list[dict]: + """Returns a list that contains the relation-data. + + Args: + juju: The juju instance to use. 
+ app_name: The name of the application + rel_name: name of the relation to get connection data from + + Returns: + A list that contains the relation-data + """ + app_leader = get_app_leader(juju, app_name) + app_leader_info = get_unit_info(juju, app_leader) + if not app_leader_info: + raise ValueError(f"No unit info could be grabbed for unit {app_leader}") + + relation_data = [ + value + for value in app_leader_info[app_leader]["relation-info"] + if value["endpoint"] == rel_name + ] + if not relation_data: + raise ValueError(f"No relation data could be grabbed for relation {rel_name}") + + return relation_data + + +def get_mysql_cluster_status(juju: Juju, unit: str, cluster_set: bool = False) -> dict: + """Get the cluster status by running the get-cluster-status action. + + Args: + juju: The juju instance to use. + unit: The unit on which to execute the action on + cluster_set: Whether to get the cluster-set instead (optional) + + Returns: + A dictionary representing the cluster status + """ + task = juju.run( + unit=unit, + action="get-cluster-status", + params={"cluster-set": cluster_set}, + wait=5 * MINUTE_SECS, + ) + task.raise_on_failure() + + return task.results.get("status", {}) + + +def get_mysql_unit_name(instance_label: str) -> str: + """Builds a Juju unit name out of a MySQL instance label.""" + return "/".join(instance_label.rsplit("-", 1)) + + +def get_mysql_primary_unit(juju: Juju, app_name: str) -> str: + """Get the current primary node of the cluster.""" + mysql_primary = get_app_leader(juju, app_name) + mysql_cluster_status = get_mysql_cluster_status(juju, mysql_primary) + mysql_cluster_topology = mysql_cluster_status["defaultreplicaset"]["topology"] + + for label, value in mysql_cluster_topology.items(): + if value["memberrole"] == "primary": + return get_mysql_unit_name(label) + + raise Exception("No MySQL primary node found") + + +async def get_mysql_max_written_value(juju: Juju, app_name: str, unit_name: str) -> int: + """Retrieve the max written value in the MySQL database. + + Args: + juju: The Juju model. + app_name: The application name. + unit_name: The unit name. + """ + credentials_task = juju.run( + unit=unit_name, + action="get-password", + params={"username": SERVER_CONFIG_USERNAME}, + ) + credentials_task.raise_on_failure() + + output = await execute_queries_on_unit( + get_unit_ip(juju, app_name, unit_name), + credentials_task.results["username"], + credentials_task.results["password"], + ["SELECT MAX(number) FROM `continuous_writes`.`data`;"], + ) + return output[0] + + +async def get_mysql_variable_value( + juju: Juju, app_name: str, unit_name: str, variable_name: str +) -> str: + """Retrieve a database variable value as a string. + + Args: + juju: The Juju model. + app_name: The application name. + unit_name: The unit name. + variable_name: The variable name. + """ + credentials_task = juju.run( + unit=unit_name, + action="get-password", + params={"username": SERVER_CONFIG_USERNAME}, + ) + credentials_task.raise_on_failure() + + output = await execute_queries_on_unit( + get_unit_ip(juju, app_name, unit_name), + credentials_task.results["username"], + credentials_task.results["password"], + [f"SELECT @@{variable_name};"], + ) + return output[0] + + +def wait_for_apps_status(jubilant_status_func: JujuAppsStatusFn, *apps: str) -> JujuModelStatusFn: + """Waits for Juju agents to be idle, and for applications to reach a certain status. + + Args: + jubilant_status_func: The Juju apps status function to wait for. + apps: The applications to wait for. 
+ + Returns: + Juju model status function. + """ + return lambda status: all(( + jubilant_backports.all_agents_idle(status, *apps), + jubilant_status_func(status, *apps), + )) + + +def wait_for_unit_status(app_name: str, unit_name: str, unit_status: str) -> JujuModelStatusFn: + """Returns whether a Juju unit to have a specific status.""" + return lambda status: ( + status.apps[app_name].units[unit_name].workload_status.current == unit_status + ) + + +def wait_for_unit_message(app_name: str, unit_name: str, unit_message: str) -> JujuModelStatusFn: + """Returns whether a Juju unit to have a specific message.""" + return lambda status: ( + status.apps[app_name].units[unit_name].workload_status.message == unit_message + ) diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py new file mode 100644 index 0000000000..a020fd88dd --- /dev/null +++ b/tests/integration/high_availability/test_async_replication.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python3 +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging +import time +from collections.abc import Generator + +import jubilant_backports +import pytest +from jubilant_backports import Juju + +from .. import architecture +from ..markers import juju3 +from .high_availability_helpers_new import ( + get_app_leader, + get_app_units, + get_mysql_cluster_status, + get_mysql_max_written_value, + wait_for_apps_status, +) + +MYSQL_APP_1 = "db1" +MYSQL_APP_2 = "db2" +MYSQL_ROUTER_NAME = "mysql-router" +MYSQL_TEST_APP_NAME = "mysql-test-app" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +@pytest.fixture(scope="module") +def first_model(juju: Juju, request: pytest.FixtureRequest) -> Generator: + """Creates and return the first model.""" + yield juju.model + + +@pytest.fixture(scope="module") +def second_model(juju: Juju, request: pytest.FixtureRequest) -> Generator: + """Creates and returns the second model.""" + model_name = f"{juju.model}-other" + + logging.info(f"Creating model: {model_name}") + juju.add_model(model_name) + + yield model_name + if request.config.getoption("--keep-models"): + return + + logging.info(f"Destroying model: {model_name}") + juju.destroy_model(model_name, destroy_storage=True, force=True) + + +@pytest.fixture() +def continuous_writes(first_model: str) -> Generator: + """Starts continuous writes to the MySQL cluster for a test and clear the writes at the end.""" + model_1 = Juju(model=first_model) + model_1_test_app_leader = get_app_leader(model_1, MYSQL_TEST_APP_NAME) + + logging.info("Clearing continuous writes") + model_1.run(model_1_test_app_leader, "clear-continuous-writes") + logging.info("Starting continuous writes") + model_1.run(model_1_test_app_leader, "start-continuous-writes") + + yield + + logging.info("Clearing continuous writes") + model_1.run(model_1_test_app_leader, "clear-continuous-writes") + + +@juju3 +@pytest.mark.abort_on_fail +def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> None: + """Simple test to ensure that the MySQL application charms get deployed.""" + configuration = {"profile": "testing"} + constraints = {"arch": architecture.architecture} + + logging.info("Deploying mysql clusters") + model_1 = Juju(model=first_model) + model_1.deploy( + charm=charm, + app=MYSQL_APP_1, + base="ubuntu@22.04", + config={**configuration, "cluster-name": "lima"}, + constraints=constraints, + num_units=3, + ) + model_2 = 
Juju(model=second_model) + model_2.deploy( + charm=charm, + app=MYSQL_APP_2, + base="ubuntu@22.04", + config={**configuration, "cluster-name": "cuzco"}, + constraints=constraints, + num_units=3, + ) + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_1), + timeout=10 * MINUTE_SECS, + ) + model_2.wait( + ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_2), + timeout=10 * MINUTE_SECS, + ) + + +@juju3 +@pytest.mark.abort_on_fail +def test_async_relate(first_model: str, second_model: str) -> None: + """Relate the two MySQL clusters.""" + logging.info("Creating offers in first model") + model_1 = Juju(model=first_model) + model_1.offer(MYSQL_APP_1, endpoint="replication-offer") + + logging.info("Consuming offer in second model") + model_2 = Juju(model=second_model) + model_2.consume(f"{first_model}.{MYSQL_APP_1}") + + logging.info("Relating the two mysql clusters") + model_2.integrate( + f"{MYSQL_APP_1}", + f"{MYSQL_APP_2}:replication", + ) + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant_backports.any_blocked, MYSQL_APP_1), + timeout=5 * MINUTE_SECS, + ) + model_2.wait( + ready=wait_for_apps_status(jubilant_backports.any_waiting, MYSQL_APP_2), + timeout=5 * MINUTE_SECS, + ) + + +@juju3 +@pytest.mark.abort_on_fail +def test_deploy_router_and_app(first_model: str) -> None: + """Deploy the router and the test application.""" + logging.info("Deploying the router and test application") + model_1 = Juju(model=first_model) + model_1.deploy( + charm=MYSQL_ROUTER_NAME, + app=MYSQL_ROUTER_NAME, + base="ubuntu@22.04", + channel="dpe/edge", + num_units=1, + trust=True, + ) + model_1.deploy( + charm=MYSQL_TEST_APP_NAME, + app=MYSQL_TEST_APP_NAME, + base="ubuntu@22.04", + channel="latest/edge", + num_units=1, + trust=False, + ) + + logging.info("Relating the router and test application") + model_1.integrate( + f"{MYSQL_ROUTER_NAME}:database", + f"{MYSQL_TEST_APP_NAME}:database", + ) + model_1.integrate( + f"{MYSQL_ROUTER_NAME}:backend-database", + f"{MYSQL_APP_1}:database", + ) + + model_1.wait( + ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_TEST_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + +@juju3 +@pytest.mark.abort_on_fail +def test_create_replication(first_model: str, second_model: str) -> None: + """Run the create-replication action and wait for the applications to settle.""" + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + + logging.info("Running create replication action") + task = model_1.run( + unit=get_app_leader(model_1, MYSQL_APP_1), + action="create-replication", + wait=5 * MINUTE_SECS, + ) + task.raise_on_failure() + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_1), + timeout=5 * MINUTE_SECS, + ) + model_2.wait( + ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_2), + timeout=5 * MINUTE_SECS, + ) + + +@juju3 +@pytest.mark.abort_on_fail +async def test_data_replication(first_model: str, second_model: str, continuous_writes) -> None: + """Test to write to primary, and read the same data back from replicas.""" + logging.info("Testing data replication") + results = await get_mysql_max_written_values(first_model, second_model) + + assert len(results) == 6 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, 
"No data was written to the database" + + +@juju3 +@pytest.mark.abort_on_fail +async def test_standby_promotion(first_model: str, second_model: str, continuous_writes) -> None: + """Test graceful promotion of a standby cluster to primary.""" + model_2 = Juju(model=second_model) + model_2_mysql_leader = get_app_leader(model_2, MYSQL_APP_2) + + logging.info("Promoting standby cluster to primary") + promotion_task = model_2.run( + unit=model_2_mysql_leader, + action="promote-to-primary", + params={"scope": "cluster"}, + ) + promotion_task.raise_on_failure() + + results = await get_mysql_max_written_values(first_model, second_model) + assert len(results) == 6 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + cluster_set_status = get_mysql_cluster_status( + juju=model_2, + unit=model_2_mysql_leader, + cluster_set=True, + ) + + assert cluster_set_status["clusters"]["cuzco"]["clusterrole"] == "primary", ( + "standby not promoted to primary" + ) + + +@juju3 +@pytest.mark.abort_on_fail +def test_failover(first_model: str, second_model: str) -> None: + """Test switchover on primary cluster fail.""" + logging.info("Freezing mysqld on primary cluster units") + model_2 = Juju(model=second_model) + model_2_mysql_units = get_app_units(model_2, MYSQL_APP_2) + + # Simulating a failure on the primary cluster + for unit_name in model_2_mysql_units: + model_2.exec("sudo pkill -x mysqld --signal SIGSTOP", unit=unit_name) + + logging.info("Promoting standby cluster to primary with force flag") + model_1 = Juju(model=first_model) + model_1_mysql_leader = get_app_leader(model_1, MYSQL_APP_1) + + promotion_task = model_1.run( + unit=model_1_mysql_leader, + action="promote-to-primary", + params={"scope": "cluster", "force": True}, + wait=5 * MINUTE_SECS, + ) + promotion_task.raise_on_failure() + + # Restore mysqld process + logging.info("Unfreezing mysqld on primary cluster units") + for unit_name in model_2_mysql_units: + model_2.exec("sudo pkill -x mysqld --signal SIGCONT", unit=unit_name) + + logging.info("Checking clusters statuses") + cluster_set_status = get_mysql_cluster_status( + juju=model_1, + unit=model_1_mysql_leader, + cluster_set=True, + ) + + assert cluster_set_status["clusters"]["lima"]["clusterrole"] == "primary", ( + "standby not promoted to primary", + ) + assert cluster_set_status["clusters"]["cuzco"]["globalstatus"] == "invalidated", ( + "old primary not invalidated" + ) + + +@juju3 +@pytest.mark.abort_on_fail +async def test_rejoin_invalidated_cluster( + first_model: str, second_model: str, continuous_writes +) -> None: + """Test rejoin invalidated cluster with.""" + model_1 = Juju(model=first_model) + model_1_mysql_leader = get_app_leader(model_1, MYSQL_APP_1) + + task = model_1.run( + unit=model_1_mysql_leader, + action="rejoin-cluster", + params={"cluster-name": "cuzco"}, + wait=5 * MINUTE_SECS, + ) + task.raise_on_failure() + + results = await get_mysql_max_written_values(first_model, second_model) + assert len(results) == 6 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + +@juju3 +@pytest.mark.abort_on_fail +async def test_unrelate_and_relate(first_model: str, second_model: str, continuous_writes) -> None: + """Test removing and re-relating the two mysql clusters.""" + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + + logging.info("Remove async relation") + 
model_2.remove_relation( + f"{MYSQL_APP_1}", + f"{MYSQL_APP_2}:replication", + ) + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_1), + timeout=10 * MINUTE_SECS, + ) + model_2.wait( + ready=wait_for_apps_status(jubilant_backports.all_blocked, MYSQL_APP_2), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Re relating the two mysql clusters") + model_2.integrate( + f"{MYSQL_APP_1}", + f"{MYSQL_APP_2}:replication", + ) + model_1.wait( + ready=wait_for_apps_status(jubilant_backports.any_blocked, MYSQL_APP_1), + timeout=5 * MINUTE_SECS, + ) + + logging.info("Running create replication action") + task = model_1.run( + unit=get_app_leader(model_1, MYSQL_APP_1), + action="create-replication", + wait=5 * MINUTE_SECS, + ) + task.raise_on_failure() + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_1), + timeout=10 * MINUTE_SECS, + ) + model_2.wait( + ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_2), + timeout=10 * MINUTE_SECS, + ) + + results = await get_mysql_max_written_values(first_model, second_model) + assert len(results) == 6 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + +async def get_mysql_max_written_values(first_model: str, second_model: str) -> list[int]: + """Return list with max written value from all units.""" + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + + logging.info("Stopping continuous writes") + stopping_task = model_1.run( + unit=get_app_leader(model_1, MYSQL_TEST_APP_NAME), + action="stop-continuous-writes", + params={}, + ) + stopping_task.raise_on_failure() + + time.sleep(5) + results = [] + + logging.info(f"Querying max value on all {MYSQL_APP_1} units") + for unit_name in get_app_units(model_1, MYSQL_APP_1): + unit_max_value = await get_mysql_max_written_value(model_1, MYSQL_APP_1, unit_name) + results.append(unit_max_value) + + logging.info(f"Querying max value on all {MYSQL_APP_2} units") + for unit_name in get_app_units(model_2, MYSQL_APP_2): + unit_max_value = await get_mysql_max_written_value(model_2, MYSQL_APP_2, unit_name) + results.append(unit_max_value) + + return results diff --git a/tests/integration/high_availability/test_primary_switchover.py b/tests/integration/high_availability/test_primary_switchover.py new file mode 100644 index 0000000000..6aa2266864 --- /dev/null +++ b/tests/integration/high_availability/test_primary_switchover.py @@ -0,0 +1,105 @@ +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. 
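+"""Integration tests for cluster primary switchover and forced failover via the promote-to-primary action."""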
+ +import logging +from subprocess import run + +import pytest +from jubilant_backports import Juju, all_active + +from .high_availability_helpers_new import ( + get_app_name, + get_app_units, + get_mysql_primary_unit, + wait_for_unit_message, + wait_for_unit_status, +) + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +@pytest.mark.abort_on_fail +def test_cluster_switchover(juju: Juju, highly_available_cluster) -> None: + """Test that the primary node can be switched over.""" + logging.info("Testing cluster switchover...") + app_name = get_app_name(juju, "mysql") + assert app_name, "MySQL application not found in the cluster" + + app_units = set(get_app_units(juju, app_name)) + assert len(app_units) > 1, "Not enough units to perform a switchover" + + primary_unit = get_mysql_primary_unit(juju, app_name) + assert primary_unit, "No primary unit found in the cluster" + logging.info(f"Current primary unit: {primary_unit}") + + logging.info("Selecting a new primary unit for switchover...") + app_units.discard(primary_unit) + new_primary_unit = app_units.pop() + logging.info(f"New primary unit selected: {new_primary_unit}") + + switchover_task = juju.run(new_primary_unit, "promote-to-primary", {"scope": "unit"}) + assert switchover_task.status == "completed", "Switchover failed" + + assert get_mysql_primary_unit(juju, app_name) == new_primary_unit, "Switchover failed" + + +@pytest.mark.abort_on_fail +def test_cluster_failover_after_majority_loss(juju: Juju, highly_available_cluster) -> None: + """Test the promote-to-primary command after losing the majority of nodes, with force flag.""" + app_name = get_app_name(juju, "mysql") + assert app_name, "MySQL application not found in the cluster" + + app_units = set(get_app_units(juju, app_name)) + assert len(app_units) > 1, "Not enough units to perform a switchover" + + primary_unit = get_mysql_primary_unit(juju, app_name) + assert primary_unit, "No primary unit found in the cluster" + logging.info(f"Current primary unit: {primary_unit}") + + non_primary_units = app_units - {primary_unit} + + unit_to_promote = non_primary_units.pop() + + logging.info(f"Unit selected for promotion: {unit_to_promote}") + + logging.info("Kill all but one unit to simulate majority loss...") + units_to_kill = [non_primary_units.pop(), primary_unit] + machine_name = [] + for unit in units_to_kill: + machine_name.append(get_unit_machine(juju, app_name, unit)) + + run(["lxc", "restart", "--force", machine_name[0], machine_name[1]], check=True) + + juju.model_config({"update-status-hook-interval": "45s"}) + logging.info("Waiting to settle in error state") + juju.wait( + ready=lambda status: all(( + wait_for_unit_status(app_name, unit_to_promote, "active")(status), + wait_for_unit_message(app_name, units_to_kill[0], "offline")(status), + wait_for_unit_message(app_name, units_to_kill[1], "offline")(status), + )), + timeout=60 * 15, + delay=15, + ) + + failover_task = juju.run( + unit_to_promote, + "promote-to-primary", + {"scope": "unit", "force": True}, + wait=600, + ) + + juju.model_config({"update-status-hook-interval": "15s"}) + + assert failover_task.status == "completed", "Switchover failed" + logging.info("Waiting for all units to become active after switchover...") + juju.wait(all_active, timeout=60 * 10, delay=5) + + assert get_mysql_primary_unit(juju, app_name) == unit_to_promote, "Failover failed" + + +def get_unit_machine(juju: Juju, app_name: str, unit_name: str) -> str: + """Get the machine name for the given unit.""" + status = juju.status() + 
machine_id = status.apps[app_name].units[unit_name].machine + return status.machines[machine_id].instance_id diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py new file mode 100644 index 0000000000..f45e04e7d2 --- /dev/null +++ b/tests/integration/high_availability/test_upgrade.py @@ -0,0 +1,186 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +import json +import logging +import shutil +import zipfile +from pathlib import Path + +import jubilant_backports +import pytest +from jubilant_backports import Juju + +from .high_availability_helpers_new import ( + check_mysql_units_writes_increment, + get_app_leader, + get_app_units, + get_mysql_primary_unit, + get_mysql_variable_value, + get_relation_data, + wait_for_apps_status, +) + +MYSQL_APP_NAME = "mysql" +MYSQL_TEST_APP_NAME = "mysql-test-app" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +@pytest.mark.abort_on_fail +def test_deploy_latest(juju: Juju) -> None: + """Simple test to ensure that the MySQL and application charms get deployed.""" + logging.info("Deploying MySQL cluster") + juju.deploy( + charm=MYSQL_APP_NAME, + app=MYSQL_APP_NAME, + base="ubuntu@22.04", + channel="8.0/edge", + config={"profile": "testing"}, + num_units=3, + ) + juju.deploy( + charm=MYSQL_TEST_APP_NAME, + app=MYSQL_TEST_APP_NAME, + base="ubuntu@22.04", + channel="latest/edge", + num_units=1, + ) + + juju.integrate( + f"{MYSQL_APP_NAME}:database", + f"{MYSQL_TEST_APP_NAME}:database", + ) + + logging.info("Wait for applications to become active") + juju.wait( + ready=wait_for_apps_status( + jubilant_backports.all_active, MYSQL_APP_NAME, MYSQL_TEST_APP_NAME + ), + error=jubilant_backports.any_blocked, + timeout=20 * MINUTE_SECS, + ) + + +@pytest.mark.abort_on_fail +async def test_pre_upgrade_check(juju: Juju) -> None: + """Test that the pre-upgrade-check action runs successfully.""" + mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) + mysql_units = get_app_units(juju, MYSQL_APP_NAME) + + logging.info("Run pre-upgrade-check action") + task = juju.run(unit=mysql_leader, action="pre-upgrade-check") + task.raise_on_failure() + + logging.info("Assert slow shutdown is enabled") + for unit_name in mysql_units: + value = await get_mysql_variable_value( + juju, MYSQL_APP_NAME, unit_name, "innodb_fast_shutdown" + ) + assert value == 0 + + logging.info("Assert primary is set to leader") + mysql_primary = get_mysql_primary_unit(juju, MYSQL_APP_NAME) + assert mysql_primary == mysql_leader, "Primary unit not set to leader" + + +@pytest.mark.abort_on_fail +async def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: + """Update the second cluster.""" + logging.info("Ensure continuous writes are incrementing") + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) + + logging.info("Refresh the charm") + juju.refresh(app=MYSQL_APP_NAME, path=charm) + + logging.info("Wait for upgrade to start") + juju.wait( + ready=lambda status: jubilant_backports.any_maintenance(status, MYSQL_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=lambda status: jubilant_backports.all_active(status, MYSQL_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes are incrementing") + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) + + +@pytest.mark.abort_on_fail +async def test_fail_and_rollback(juju: Juju, charm: str, 
continuous_writes) -> None: + """Test an upgrade failure and its rollback.""" + mysql_app_leader = get_app_leader(juju, MYSQL_APP_NAME) + mysql_app_units = get_app_units(juju, MYSQL_APP_NAME) + + logging.info("Run pre-upgrade-check action") + task = juju.run(unit=mysql_app_leader, action="pre-upgrade-check") + task.raise_on_failure() + + tmp_folder = Path("tmp") + tmp_folder.mkdir(exist_ok=True) + tmp_folder_charm = Path(tmp_folder, charm).absolute() + + shutil.copy(charm, tmp_folder_charm) + + logging.info("Inject dependency fault") + inject_dependency_fault(juju, MYSQL_APP_NAME, tmp_folder_charm) + + logging.info("Refresh the charm") + juju.refresh(app=MYSQL_APP_NAME, path=tmp_folder_charm) + + logging.info("Wait for upgrade to fail on leader") + juju.wait( + ready=wait_for_apps_status(jubilant_backports.any_blocked, MYSQL_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes on all units") + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME, list(mysql_app_units)) + + logging.info("Re-run pre-upgrade-check action") + task = juju.run(unit=mysql_app_leader, action="pre-upgrade-check") + task.raise_on_failure() + + logging.info("Re-refresh the charm") + juju.refresh(app=MYSQL_APP_NAME, path=charm) + + logging.info("Wait for upgrade to start") + juju.wait( + ready=lambda status: jubilant_backports.any_maintenance(status, MYSQL_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=lambda status: jubilant_backports.all_active(status, MYSQL_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes after rollback procedure") + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME, list(mysql_app_units)) + + # Remove fault charm file + tmp_folder_charm.unlink() + + +def inject_dependency_fault(juju: Juju, app_name: str, charm_file: str | Path) -> None: + """Inject a dependency fault into the mysql charm.""" + # Open dependency.json and load current charm version + with open("src/dependency.json") as dependency_file: + current_charm_version = json.load(dependency_file)["charm"]["version"] + + # Query running dependency to overwrite with incompatible version + relation_data = get_relation_data(juju, app_name, "upgrade") + + loaded_dependency_dict = json.loads(relation_data[0]["application-data"]["dependencies"]) + loaded_dependency_dict["charm"]["upgrade_supported"] = f">{current_charm_version}" + loaded_dependency_dict["charm"]["version"] = f"{int(current_charm_version) + 1}" + + # Overwrite dependency.json with incompatible version + with zipfile.ZipFile(charm_file, mode="a") as charm_zip: + charm_zip.writestr("src/dependency.json", json.dumps(loaded_dependency_dict)) diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py new file mode 100644 index 0000000000..d272d06473 --- /dev/null +++ b/tests/integration/high_availability/test_upgrade_from_stable.py @@ -0,0 +1,106 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. 
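+"""Integration tests for an in-place charm upgrade from the stable channel."""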
+ +import logging + +import jubilant_backports +import pytest +from jubilant_backports import Juju + +from .high_availability_helpers_new import ( + check_mysql_units_writes_increment, + get_app_leader, + get_app_units, + get_mysql_primary_unit, + get_mysql_variable_value, + wait_for_apps_status, +) + +MYSQL_APP_NAME = "mysql" +MYSQL_TEST_APP_NAME = "mysql-test-app" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +@pytest.mark.abort_on_fail +def test_deploy_stable(juju: Juju) -> None: + """Simple test to ensure that the MySQL and application charms get deployed.""" + logging.info("Deploying MySQL cluster") + juju.deploy( + charm=MYSQL_APP_NAME, + app=MYSQL_APP_NAME, + base="ubuntu@22.04", + channel="8.0/stable", + config={"profile": "testing"}, + num_units=3, + ) + juju.deploy( + charm=MYSQL_TEST_APP_NAME, + app=MYSQL_TEST_APP_NAME, + base="ubuntu@22.04", + channel="latest/edge", + num_units=1, + ) + + juju.integrate( + f"{MYSQL_APP_NAME}:database", + f"{MYSQL_TEST_APP_NAME}:database", + ) + + logging.info("Wait for applications to become active") + juju.wait( + ready=wait_for_apps_status( + jubilant_backports.all_active, MYSQL_APP_NAME, MYSQL_TEST_APP_NAME + ), + error=jubilant_backports.any_blocked, + timeout=20 * MINUTE_SECS, + ) + + +@pytest.mark.abort_on_fail +async def test_pre_upgrade_check(juju: Juju) -> None: + """Test that the pre-upgrade-check action runs successfully.""" + mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) + mysql_units = get_app_units(juju, MYSQL_APP_NAME) + + logging.info("Run pre-upgrade-check action") + task = juju.run(unit=mysql_leader, action="pre-upgrade-check") + task.raise_on_failure() + + logging.info("Assert slow shutdown is enabled") + for unit_name in mysql_units: + value = await get_mysql_variable_value( + juju, MYSQL_APP_NAME, unit_name, "innodb_fast_shutdown" + ) + assert value == 0 + + logging.info("Assert primary is set to leader") + mysql_primary = get_mysql_primary_unit(juju, MYSQL_APP_NAME) + assert mysql_primary == mysql_leader, "Primary unit not set to leader" + + +@pytest.mark.abort_on_fail +async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> None: + """Update the second cluster.""" + logging.info("Ensure continuous writes are incrementing") + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) + + logging.info("Refresh the charm") + juju.refresh(app=MYSQL_APP_NAME, path=charm) + + logging.info("Wait for upgrade to start") + juju.wait( + ready=lambda status: jubilant_backports.any_maintenance(status, MYSQL_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=lambda status: jubilant_backports.all_active(status, MYSQL_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes are incrementing") + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) diff --git a/tests/integration/high_availability/test_upgrade_rollback_incompat.py b/tests/integration/high_availability/test_upgrade_rollback_incompat.py new file mode 100644 index 0000000000..d0ccf86e1e --- /dev/null +++ b/tests/integration/high_availability/test_upgrade_rollback_incompat.py @@ -0,0 +1,269 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
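+"""Integration tests for an upgrade that fails against an incompatible revision, and its rollback."""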
+ +import json +import logging +import shutil +import time +import zipfile +from ast import literal_eval +from collections.abc import Generator +from pathlib import Path + +import jubilant_backports +import pytest +from jubilant_backports import Juju + +from ..markers import amd64_only +from .high_availability_helpers_new import ( + check_mysql_units_writes_increment, + get_app_leader, + get_relation_data, + get_unit_by_number, + get_unit_status_log, + wait_for_apps_status, + wait_for_unit_status, +) + +MYSQL_APP_NAME = "mysql" +MYSQL_TEST_APP_NAME = "mysql-test-app" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +@pytest.fixture() +def continuous_writes(juju: Juju) -> Generator: + """Starts continuous writes to the MySQL cluster for a test and clear the writes at the end.""" + test_app_leader = get_app_leader(juju, MYSQL_TEST_APP_NAME) + + logging.info("Clearing continuous writes") + juju.run(test_app_leader, "clear-continuous-writes") + logging.info("Starting continuous writes") + juju.run(test_app_leader, "start-continuous-writes") + + yield + + logging.info("Clearing continuous writes") + juju.run(test_app_leader, "clear-continuous-writes") + + +# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap +# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) +@amd64_only +@pytest.mark.abort_on_fail +async def test_build_and_deploy(juju: Juju, charm: str) -> None: + """Simple test to ensure that the MySQL and application charms get deployed.""" + snap_revisions = Path("snap_revisions.json") + with snap_revisions.open("r") as file: + old_revisions = json.load(file) + + # TODO: support arm64 & s390x + new_revisions = old_revisions.copy() + new_revisions["x86_64"] = "69" + + with snap_revisions.open("w") as file: + json.dump(new_revisions, file) + + local_charm = get_locally_built_charm(charm) + + with snap_revisions.open("w") as file: + json.dump(old_revisions, file) + + juju.deploy( + charm=local_charm, + app=MYSQL_APP_NAME, + base="ubuntu@22.04", + config={"profile": "testing", "plugin-audit-enabled": False}, + num_units=3, + ) + juju.deploy( + charm=MYSQL_TEST_APP_NAME, + app=MYSQL_TEST_APP_NAME, + base="ubuntu@22.04", + channel="latest/edge", + config={"auto_start_writes": False, "sleep_interval": 500}, + num_units=1, + ) + + juju.integrate( + f"{MYSQL_APP_NAME}:database", + f"{MYSQL_TEST_APP_NAME}:database", + ) + + logging.info("Wait for applications to become active") + juju.wait( + ready=wait_for_apps_status( + jubilant_backports.all_active, MYSQL_APP_NAME, MYSQL_TEST_APP_NAME + ), + error=jubilant_backports.any_blocked, + timeout=20 * MINUTE_SECS, + ) + + +# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap +# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) +@amd64_only +@pytest.mark.abort_on_fail +async def test_pre_upgrade_check(juju: Juju) -> None: + """Test that the pre-upgrade-check action runs successfully.""" + mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) + + logging.info("Run pre-upgrade-check action") + task = juju.run(unit=mysql_leader, action="pre-upgrade-check") + task.raise_on_failure() + + +# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap +# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) +@amd64_only +@pytest.mark.abort_on_fail +async def test_upgrade_to_failing(juju: Juju, 
charm: str, continuous_writes) -> None: + logging.info("Ensure continuous_writes") + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) + + with InjectFailure( + path="src/upgrade.py", + original_str="self.charm.recover_unit_after_restart()", + replace_str="raise Exception", + ): + logging.info("Build charm with failure injected") + new_charm = get_locally_built_charm(charm) + + logging.info("Refresh the charm") + juju.refresh(app=MYSQL_APP_NAME, path=new_charm) + + logging.info("Wait for upgrade to start") + juju.wait( + ready=lambda status: jubilant_backports.any_maintenance(status, MYSQL_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Get first upgrading unit") + relation_data = get_relation_data(juju, MYSQL_APP_NAME, "upgrade") + upgrade_stack = relation_data[0]["application-data"]["upgrade-stack"] + upgrade_unit = get_unit_by_number(juju, MYSQL_APP_NAME, literal_eval(upgrade_stack)[-1]) + + logging.info("Wait for upgrade to fail on upgrading unit") + juju.wait( + ready=wait_for_unit_status(MYSQL_APP_NAME, upgrade_unit, "blocked"), + timeout=10 * MINUTE_SECS, + ) + + +# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap +# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) +@amd64_only +@pytest.mark.abort_on_fail +async def test_rollback(juju: Juju, charm: str, continuous_writes) -> None: + """Test upgrade rollback to a healthy revision.""" + relation_data = get_relation_data(juju, MYSQL_APP_NAME, "upgrade") + upgrade_stack = relation_data[0]["application-data"]["upgrade-stack"] + upgrade_unit = get_unit_by_number(juju, MYSQL_APP_NAME, literal_eval(upgrade_stack)[-1]) + + snap_revisions = Path("snap_revisions.json") + with snap_revisions.open("r") as file: + old_revisions = json.load(file) + + # TODO: support arm64 & s390x + new_revisions = old_revisions.copy() + new_revisions["x86_64"] = "69" + + with snap_revisions.open("w") as file: + json.dump(new_revisions, file) + + mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) + local_charm = get_locally_built_charm(charm) + + time.sleep(10) + + logging.info("Run pre-upgrade-check action") + task = juju.run(unit=mysql_leader, action="pre-upgrade-check") + task.raise_on_failure() + + time.sleep(20) + + logging.info("Refresh with previous charm") + juju.refresh(app=MYSQL_APP_NAME, path=local_charm) + + logging.info("Wait for upgrade to start") + juju.wait( + ready=lambda status: jubilant_backports.any_maintenance(status, MYSQL_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + juju.wait( + ready=lambda status: jubilant_backports.all_active(status, MYSQL_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Ensure rollback has taken place") + unit_status_logs = get_unit_status_log(juju, upgrade_unit, 100) + + upgrade_failed_index = get_unit_log_message( + status_logs=unit_status_logs[:], + unit_message="upgrade failed. 
Check logs for rollback instruction", + ) + assert upgrade_failed_index is not None + + upgrade_complete_index = get_unit_log_message( + status_logs=unit_status_logs[upgrade_failed_index:], + unit_message="upgrade completed", + ) + assert upgrade_complete_index is not None + + logging.info("Ensure continuous writes after rollback procedure") + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) + + +class InjectFailure: + def __init__(self, path: str, original_str: str, replace_str: str): + self.path = path + self.original_str = original_str + self.replace_str = replace_str + with open(path) as file: + self.original_content = file.read() + + def __enter__(self): + logging.info("Injecting failure") + assert self.original_str in self.original_content, "replace content not found" + new_content = self.original_content.replace(self.original_str, self.replace_str) + assert self.original_str not in new_content, "original string not replaced" + with open(self.path, "w") as file: + file.write(new_content) + + def __exit__(self, exc_type, exc_value, traceback): + logging.info("Reverting failure") + with open(self.path, "w") as file: + file.write(self.original_content) + + +def get_unit_log_message(status_logs: list[dict], unit_message: str) -> int | None: + """Returns the index of a status log containing the desired message.""" + for index, status_log in enumerate(status_logs): + if status_log.get("message") == unit_message: + return index + + return None + + +def get_locally_built_charm(charm: str) -> str: + """Wrapper for a local charm build zip file updating.""" + local_charm_paths = Path().glob("local-*.charm") + + # Clean up local charms from previous runs + # to avoid pytest_operator_cache globbing them + for charm_path in local_charm_paths: + charm_path.unlink() + + # Create a copy of the charm to avoid modifying the original + local_charm_path = shutil.copy(charm, f"local-{Path(charm).stem}.charm") + local_charm_path = Path(local_charm_path) + + for path in ["snap_revisions.json", "src/upgrade.py"]: + with open(path) as f: + content = f.read() + with zipfile.ZipFile(local_charm_path, mode="a") as charm_zip: + charm_zip.writestr(path, content) + + return f"{local_charm_path.resolve()}" diff --git a/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py b/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py new file mode 100644 index 0000000000..c6031d0b26 --- /dev/null +++ b/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py @@ -0,0 +1,97 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +import logging + +import jubilant_backports +import pytest +from jubilant_backports import Juju + +from .high_availability_helpers_new import ( + check_mysql_units_writes_increment, + get_app_units, + wait_for_apps_status, + wait_for_unit_status, +) + +MYSQL_APP_NAME = "mysql" +MYSQL_TEST_APP_NAME = "mysql-test-app" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +@pytest.mark.abort_on_fail +def test_deploy_stable(juju: Juju) -> None: + """Simple test to ensure that the MySQL and application charms get deployed.""" + logging.info("Deploying MySQL cluster") + juju.deploy( + charm=MYSQL_APP_NAME, + app=MYSQL_APP_NAME, + base="ubuntu@22.04", + channel="8.0/stable", + config={"profile": "testing"}, + num_units=3, + ) + juju.deploy( + charm=MYSQL_TEST_APP_NAME, + app=MYSQL_TEST_APP_NAME, + base="ubuntu@22.04", + channel="latest/edge", + config={"sleep_interval": 50}, + num_units=1, + ) + + juju.integrate( + f"{MYSQL_APP_NAME}:database", + f"{MYSQL_TEST_APP_NAME}:database", + ) + + logging.info("Wait for applications to become active") + juju.wait( + ready=wait_for_apps_status( + jubilant_backports.all_active, MYSQL_APP_NAME, MYSQL_TEST_APP_NAME + ), + error=jubilant_backports.any_blocked, + timeout=20 * MINUTE_SECS, + ) + + +@pytest.mark.abort_on_fail +async def test_refresh_without_pre_upgrade_check(juju: Juju, charm: str) -> None: + """Test updating from stable channel.""" + logging.info("Refresh the charm") + juju.refresh(app=MYSQL_APP_NAME, path=charm) + + logging.info("Wait for rolling restart") + app_units = get_app_units(juju, MYSQL_APP_NAME) + app_units_funcs = [wait_for_unit_status(MYSQL_APP_NAME, unit, "error") for unit in app_units] + + juju.wait( + ready=lambda status: any(status_func(status) for status_func in app_units_funcs), + timeout=10 * MINUTE_SECS, + successes=1, + ) + + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) + + +@pytest.mark.abort_on_fail +async def test_rollback_without_pre_upgrade_check(juju: Juju, charm: str) -> None: + """Test refresh back to stable channel.""" + # Early Jubilant 1.X.Y versions do not support the `switch` option + logging.info("Refresh the charm to stable channel") + juju.cli("refresh", "--channel=8.0/stable", f"--switch={MYSQL_APP_NAME}", MYSQL_APP_NAME) + + logging.info("Wait for rolling restart") + app_units = get_app_units(juju, MYSQL_APP_NAME) + app_units_funcs = [wait_for_unit_status(MYSQL_APP_NAME, unit, "error") for unit in app_units] + + juju.wait( + ready=lambda status: any(status_func(status) for status_func in app_units_funcs), + timeout=10 * MINUTE_SECS, + successes=1, + ) + + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) From e5b7c2242158795eb7e18e054890192e7f790fcc Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Tue, 7 Oct 2025 14:05:38 +0300 Subject: [PATCH 02/33] Tweak async repl test --- .../ha_tests/test_async_replication.py | 566 ------------------ .../high_availability_helpers_new.py | 28 +- .../test_async_replication.py | 157 +++-- .../test_async_replication.py/task.yaml | 2 +- 4 files changed, 89 insertions(+), 664 deletions(-) delete mode 100644 tests/integration/ha_tests/test_async_replication.py diff --git a/tests/integration/ha_tests/test_async_replication.py b/tests/integration/ha_tests/test_async_replication.py deleted file mode 100644 index 1e94786436..0000000000 --- a/tests/integration/ha_tests/test_async_replication.py +++ /dev/null @@ -1,566 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 Canonical Ltd. 
-# See LICENSE file for licensing details. -import contextlib -import logging -import subprocess -from asyncio import gather - -import psycopg2 -import pytest as pytest -from juju.model import Model -from pytest_operator.plugin import OpsTest -from tenacity import Retrying, stop_after_delay, wait_fixed - -from .. import architecture -from ..helpers import ( - APPLICATION_NAME, - DATABASE_APP_NAME, - get_leader_unit, - get_password, - get_primary, - get_unit_address, - scale_application, - wait_for_relation_removed_between, -) -from .helpers import ( - app_name, - are_writes_increasing, - check_writes, - get_standby_leader, - get_sync_standby, - start_continuous_writes, -) - -logger = logging.getLogger(__name__) - - -CLUSTER_SIZE = 3 -FAST_INTERVAL = "10s" -IDLE_PERIOD = 5 -TIMEOUT = 2000 - -DATA_INTEGRATOR_APP_NAME = "data-integrator" - - -@contextlib.asynccontextmanager -async def fast_forward(model: Model, fast_interval: str = "10s", slow_interval: str | None = None): - """Adaptation of OpsTest.fast_forward to work with different models.""" - update_interval_key = "update-status-hook-interval" - interval_after = ( - slow_interval if slow_interval else (await model.get_config())[update_interval_key] - ) - - await model.set_config({update_interval_key: fast_interval}) - yield - await model.set_config({update_interval_key: interval_after}) - - -@pytest.fixture(scope="module") -def first_model(ops_test: OpsTest) -> Model: - """Return the first model.""" - first_model = ops_test.model - return first_model - - -@pytest.fixture(scope="module") -async def second_model(ops_test: OpsTest, first_model, request) -> Model: - """Create and return the second model.""" - second_model_name = f"{first_model.info.name}-other" - if second_model_name not in await ops_test._controller.list_models(): - await ops_test._controller.add_model(second_model_name) - subprocess.run(["juju", "switch", second_model_name], check=True) - subprocess.run( - ["juju", "set-model-constraints", f"arch={architecture.architecture}"], check=True - ) - subprocess.run(["juju", "switch", first_model.info.name], check=True) - second_model = Model() - await second_model.connect(model_name=second_model_name) - yield second_model - if request.config.getoption("--keep-models"): - return - logger.info("Destroying second model") - await ops_test._controller.destroy_model(second_model_name, destroy_storage=True) - - -@pytest.fixture -async def second_model_continuous_writes(second_model) -> None: - """Cleans up continuous writes on the second model after a test run.""" - yield - # Clear the written data at the end. 
- for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3), reraise=True): - with attempt: - action = ( - await second_model.applications[APPLICATION_NAME] - .units[0] - .run_action("clear-continuous-writes") - ) - await action.wait() - assert action.results["result"] == "True", "Unable to clear up continuous_writes table" - - -@pytest.mark.abort_on_fail -async def test_deploy_async_replication_setup( - ops_test: OpsTest, first_model: Model, second_model: Model, charm -) -> None: - """Build and deploy two PostgreSQL cluster in two separate models to test async replication.""" - if not await app_name(ops_test): - await ops_test.model.deploy( - charm, - num_units=CLUSTER_SIZE, - config={"profile": "testing"}, - ) - if not await app_name(ops_test, DATA_INTEGRATOR_APP_NAME): - await ops_test.model.deploy( - DATA_INTEGRATOR_APP_NAME, - num_units=1, - channel="latest/edge", - config={"database-name": "testdb"}, - ) - await ops_test.model.relate(DATABASE_APP_NAME, DATA_INTEGRATOR_APP_NAME) - if not await app_name(ops_test, model=second_model): - await second_model.deploy( - charm, - num_units=CLUSTER_SIZE, - config={"profile": "testing"}, - ) - await ops_test.model.deploy( - APPLICATION_NAME, channel="latest/edge", num_units=1, config={"sleep_interval": 1000} - ) - await second_model.deploy( - APPLICATION_NAME, channel="latest/edge", num_units=1, config={"sleep_interval": 1000} - ) - - async with ops_test.fast_forward(), fast_forward(second_model): - await gather( - first_model.wait_for_idle(apps=[APPLICATION_NAME], status="blocked"), - second_model.wait_for_idle(apps=[APPLICATION_NAME], status="blocked"), - ) - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME, DATA_INTEGRATOR_APP_NAME], - status="active", - timeout=TIMEOUT, - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], - status="active", - timeout=TIMEOUT, - ), - ) - - -@pytest.mark.abort_on_fail -async def test_async_replication( - ops_test: OpsTest, - first_model: Model, - second_model: Model, - continuous_writes, -) -> None: - """Test async replication between two PostgreSQL clusters.""" - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - first_offer_command = f"offer {DATABASE_APP_NAME}:replication-offer replication-offer" - await ops_test.juju(*first_offer_command.split()) - first_consume_command = ( - f"consume -m {second_model.info.name} admin/{first_model.info.name}.replication-offer" - ) - await ops_test.juju(*first_consume_command.split()) - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - await second_model.relate(DATABASE_APP_NAME, "replication-offer") - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Run the 
promote action. - logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - assert leader_unit is not None, "No leader unit found" - logger.info("promoting the first cluster") - run_action = await leader_unit.run_action("create-replication") - await run_action.wait() - assert (run_action.results.get("return-code", None) == 0) or ( - run_action.results.get("Code", None) == "0" - ), "Promote action failed" - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("checking whether no writes were lost") - await check_writes(ops_test, extra_model=second_model) - - -@pytest.mark.abort_on_fail -async def test_get_data_integrator_credentials( - ops_test: OpsTest, -): - unit = ops_test.model.applications[DATA_INTEGRATOR_APP_NAME].units[0] - action = await unit.run_action(action_name="get-credentials") - result = await action.wait() - global data_integrator_credentials - data_integrator_credentials = result.results - - -@pytest.mark.abort_on_fail -async def test_switchover( - ops_test: OpsTest, - first_model: Model, - second_model: Model, - second_model_continuous_writes, -): - """Test switching over to the second cluster.""" - second_offer_command = f"offer {DATABASE_APP_NAME}:replication replication" - await ops_test.juju(*second_offer_command.split()) - second_consume_command = ( - f"consume -m {second_model.info.name} admin/{first_model.info.name}.replication" - ) - await ops_test.juju(*second_consume_command.split()) - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - # Run the promote action. 
- logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME, model=second_model) - assert leader_unit is not None, "No leader unit found" - logger.info("promoting the second cluster") - run_action = await leader_unit.run_action("promote-to-primary", scope="cluster", force=True) - await run_action.wait() - assert (run_action.results.get("return-code", None) == 0) or ( - run_action.results.get("Code", None) == "0" - ), "Promote action failed" - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME, model=second_model) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test, extra_model=second_model) - - -@pytest.mark.abort_on_fail -async def test_data_integrator_creds_keep_on_working( - ops_test: OpsTest, - second_model: Model, -) -> None: - user = data_integrator_credentials["postgresql"]["username"] - password = data_integrator_credentials["postgresql"]["password"] - database = data_integrator_credentials["postgresql"]["database"] - - any_unit = second_model.applications[DATABASE_APP_NAME].units[0].name - primary = await get_primary(ops_test, any_unit, second_model) - address = second_model.units.get(primary).public_address - - connstr = f"dbname='{database}' user='{user}' host='{address}' port='5432' password='{password}' connect_timeout=1" - try: - with psycopg2.connect(connstr) as connection: - pass - finally: - connection.close() - - -@pytest.mark.abort_on_fail -async def test_promote_standby( - ops_test: OpsTest, - first_model: Model, - second_model: Model, - second_model_continuous_writes, -) -> None: - """Test promoting the standby cluster.""" - logger.info("breaking the relations") - await first_model.applications[DATABASE_APP_NAME].remove_relation( - "database", f"{APPLICATION_NAME}:database" - ) - await second_model.applications[DATABASE_APP_NAME].remove_relation( - "replication", "replication-offer" - ) - wait_for_relation_removed_between(ops_test, "replication-offer", "replication", second_model) - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - first_model.block_until( - lambda: first_model.applications[DATABASE_APP_NAME].status == "blocked", - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - # Run the promote action. 
- logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - assert leader_unit is not None, "No leader unit found" - logger.info("promoting the first cluster") - run_action = await leader_unit.run_action("promote-to-primary", scope="cluster") - await run_action.wait() - assert (run_action.results.get("return-code", None) == 0) or ( - run_action.results.get("Code", None) == "0" - ), "Promote action failed" - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("removing the previous data") - any_unit = ops_test.model.applications[DATABASE_APP_NAME].units[0].name - primary = await get_primary(ops_test, any_unit) - address = get_unit_address(ops_test, primary) - password = await get_password(ops_test) - database_name = f"{APPLICATION_NAME.replace('-', '_')}_database" - connection = None - try: - connection = psycopg2.connect( - f"dbname={database_name} user=operator password={password} host={address}" - ) - connection.autocommit = True - cursor = connection.cursor() - cursor.execute("DROP TABLE IF EXISTS continuous_writes;") - except psycopg2.Error as e: - assert False, f"Failed to drop continuous writes table: {e}" - finally: - if connection is not None: - connection.close() - - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - -@pytest.mark.abort_on_fail -async def test_reestablish_relation( - ops_test: OpsTest, first_model: Model, second_model: Model, continuous_writes -) -> None: - """Test that the relation can be broken and re-established.""" - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - logger.info("reestablishing the relation") - await second_model.relate(DATABASE_APP_NAME, "replication-offer") - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Run the promote action. 
- logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - assert leader_unit is not None, "No leader unit found" - logger.info("promoting the first cluster") - run_action = await leader_unit.run_action("create-replication") - await run_action.wait() - assert (run_action.results.get("return-code", None) == 0) or ( - run_action.results.get("Code", None) == "0" - ), "Promote action failed" - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("checking whether no writes were lost") - await check_writes(ops_test, extra_model=second_model) - - -@pytest.mark.abort_on_fail -async def test_async_replication_failover_in_main_cluster( - ops_test: OpsTest, first_model: Model, second_model: Model, continuous_writes -) -> None: - """Test that async replication fails over correctly.""" - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - sync_standby = await get_sync_standby(ops_test, first_model, DATABASE_APP_NAME) - logger.info(f"Sync-standby: {sync_standby}") - logger.info("deleting the sync-standby") - await first_model.applications[DATABASE_APP_NAME].destroy_units(sync_standby) - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], - status="active", - idle_period=IDLE_PERIOD, - timeout=TIMEOUT, - wait_for_exact_units=(CLUSTER_SIZE - 1), - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - # Check that the sync-standby unit is not the same as before. - new_sync_standby = await get_sync_standby(ops_test, first_model, DATABASE_APP_NAME) - logger.info(f"New sync-standby: {new_sync_standby}") - assert new_sync_standby != sync_standby, "Sync-standby is the same as before" - - logger.info("Ensure continuous_writes after the crashed unit") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). 
- logger.info("checking whether no writes were lost") - await check_writes(ops_test, extra_model=second_model) - - -@pytest.mark.abort_on_fail -async def test_async_replication_failover_in_secondary_cluster( - ops_test: OpsTest, first_model: Model, second_model: Model, continuous_writes -) -> None: - """Test that async replication fails back correctly.""" - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - standby_leader = await get_standby_leader(second_model, DATABASE_APP_NAME) - logger.info(f"Standby leader: {standby_leader}") - logger.info("deleting the standby leader") - await second_model.applications[DATABASE_APP_NAME].destroy_units(standby_leader) - - async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): - await gather( - first_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - second_model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=IDLE_PERIOD, timeout=TIMEOUT - ), - ) - - logger.info("Ensure continuous_writes after the crashed unit") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("checking whether no writes were lost") - await check_writes(ops_test, extra_model=second_model) - - -@pytest.mark.abort_on_fail -async def test_scaling( - ops_test: OpsTest, first_model: Model, second_model: Model, continuous_writes -) -> None: - """Test that async replication works when scaling the clusters.""" - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - logger.info("scaling out the clusters") - first_cluster_original_size = len(first_model.applications[DATABASE_APP_NAME].units) - second_cluster_original_size = len(second_model.applications[DATABASE_APP_NAME].units) - await gather( - scale_application(ops_test, DATABASE_APP_NAME, first_cluster_original_size + 1), - scale_application( - ops_test, - DATABASE_APP_NAME, - second_cluster_original_size + 1, - model=second_model, - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test, extra_model=second_model) - - logger.info("scaling in the clusters") - await gather( - scale_application(ops_test, DATABASE_APP_NAME, first_cluster_original_size), - scale_application( - ops_test, DATABASE_APP_NAME, second_cluster_original_size, model=second_model - ), - ) - - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test, extra_model=second_model) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). 
- logger.info("checking whether no writes were lost") - await check_writes(ops_test, extra_model=second_model) diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py index 7329ca0bce..42da85ac3c 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -21,7 +21,7 @@ JujuAppsStatusFn = Callable[[Status, str], bool] -async def check_mysql_units_writes_increment( +async def check_postgresql_units_writes_increment( juju: Juju, app_name: str, app_units: list[str] | None = None ) -> None: """Ensure that continuous writes is incrementing on all units. @@ -32,8 +32,8 @@ async def check_mysql_units_writes_increment( if not app_units: app_units = get_app_units(juju, app_name) - app_primary = get_mysql_primary_unit(juju, app_name) - app_max_value = await get_mysql_max_written_value(juju, app_name, app_primary) + app_primary = get_postgresql_primary_unit(juju, app_name) + app_max_value = await get_postgresql_max_written_value(juju, app_name, app_primary) juju.model_config({"update-status-hook-interval": "15s"}) for unit_name in app_units: @@ -43,7 +43,7 @@ async def check_mysql_units_writes_increment( wait=wait_fixed(10), ): with attempt: - unit_max_value = await get_mysql_max_written_value(juju, app_name, unit_name) + unit_max_value = await get_postgresql_max_written_value(juju, app_name, unit_name) assert unit_max_value > app_max_value, "Writes not incrementing" app_max_value = unit_max_value @@ -153,7 +153,7 @@ def get_relation_data(juju: Juju, app_name: str, rel_name: str) -> list[dict]: return relation_data -def get_mysql_cluster_status(juju: Juju, unit: str, cluster_set: bool = False) -> dict: +def get_postgresql_cluster_status(juju: Juju, unit: str, cluster_set: bool = False) -> dict: """Get the cluster status by running the get-cluster-status action. Args: @@ -175,25 +175,25 @@ def get_mysql_cluster_status(juju: Juju, unit: str, cluster_set: bool = False) - return task.results.get("status", {}) -def get_mysql_unit_name(instance_label: str) -> str: +def get_postgresql_unit_name(instance_label: str) -> str: """Builds a Juju unit name out of a MySQL instance label.""" return "/".join(instance_label.rsplit("-", 1)) -def get_mysql_primary_unit(juju: Juju, app_name: str) -> str: +def get_postgresql_primary_unit(juju: Juju, app_name: str) -> str: """Get the current primary node of the cluster.""" - mysql_primary = get_app_leader(juju, app_name) - mysql_cluster_status = get_mysql_cluster_status(juju, mysql_primary) - mysql_cluster_topology = mysql_cluster_status["defaultreplicaset"]["topology"] + postgresql_primary = get_app_leader(juju, app_name) + postgresql_cluster_status = get_postgresql_cluster_status(juju, postgresql_primary) + postgresql_cluster_topology = postgresql_cluster_status["defaultreplicaset"]["topology"] - for label, value in mysql_cluster_topology.items(): + for label, value in postgresql_cluster_topology.items(): if value["memberrole"] == "primary": - return get_mysql_unit_name(label) + return get_postgresql_unit_name(label) raise Exception("No MySQL primary node found") -async def get_mysql_max_written_value(juju: Juju, app_name: str, unit_name: str) -> int: +async def get_postgresql_max_written_value(juju: Juju, app_name: str, unit_name: str) -> int: """Retrieve the max written value in the MySQL database. 
Args: @@ -217,7 +217,7 @@ async def get_mysql_max_written_value(juju: Juju, app_name: str, unit_name: str) return output[0] -async def get_mysql_variable_value( +async def get_postgresql_variable_value( juju: Juju, app_name: str, unit_name: str, variable_name: str ) -> str: """Retrieve a database variable value as a string. diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index a020fd88dd..92a8dd9aa3 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -15,15 +15,14 @@ from .high_availability_helpers_new import ( get_app_leader, get_app_units, - get_mysql_cluster_status, - get_mysql_max_written_value, + get_postgresql_cluster_status, + get_postgresql_max_written_value, wait_for_apps_status, ) -MYSQL_APP_1 = "db1" -MYSQL_APP_2 = "db2" -MYSQL_ROUTER_NAME = "mysql-router" -MYSQL_TEST_APP_NAME = "mysql-test-app" +POSTGRESQL_APP_1 = "db1" +POSTGRESQL_APP_2 = "db2" +POSTGRESQL_TEST_APP_NAME = "postgresql-test-app" MINUTE_SECS = 60 @@ -56,7 +55,7 @@ def second_model(juju: Juju, request: pytest.FixtureRequest) -> Generator: def continuous_writes(first_model: str) -> Generator: """Starts continuous writes to the MySQL cluster for a test and clear the writes at the end.""" model_1 = Juju(model=first_model) - model_1_test_app_leader = get_app_leader(model_1, MYSQL_TEST_APP_NAME) + model_1_test_app_leader = get_app_leader(model_1, POSTGRESQL_TEST_APP_NAME) logging.info("Clearing continuous writes") model_1.run(model_1_test_app_leader, "clear-continuous-writes") @@ -76,11 +75,11 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No configuration = {"profile": "testing"} constraints = {"arch": architecture.architecture} - logging.info("Deploying mysql clusters") + logging.info("Deploying postgresql clusters") model_1 = Juju(model=first_model) model_1.deploy( charm=charm, - app=MYSQL_APP_1, + app=POSTGRESQL_APP_1, base="ubuntu@22.04", config={**configuration, "cluster-name": "lima"}, constraints=constraints, @@ -89,7 +88,7 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No model_2 = Juju(model=second_model) model_2.deploy( charm=charm, - app=MYSQL_APP_2, + app=POSTGRESQL_APP_2, base="ubuntu@22.04", config={**configuration, "cluster-name": "cuzco"}, constraints=constraints, @@ -98,11 +97,11 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_1), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_2), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_2), timeout=10 * MINUTE_SECS, ) @@ -113,25 +112,25 @@ def test_async_relate(first_model: str, second_model: str) -> None: """Relate the two MySQL clusters.""" logging.info("Creating offers in first model") model_1 = Juju(model=first_model) - model_1.offer(MYSQL_APP_1, endpoint="replication-offer") + model_1.offer(POSTGRESQL_APP_1, endpoint="replication-offer") logging.info("Consuming offer in second model") model_2 = Juju(model=second_model) - model_2.consume(f"{first_model}.{MYSQL_APP_1}") + model_2.consume(f"{first_model}.{POSTGRESQL_APP_1}") - logging.info("Relating the two mysql 
clusters") + logging.info("Relating the two postgresql clusters") model_2.integrate( - f"{MYSQL_APP_1}", - f"{MYSQL_APP_2}:replication", + f"{POSTGRESQL_APP_1}", + f"{POSTGRESQL_APP_2}:replication", ) logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant_backports.any_blocked, MYSQL_APP_1), + ready=wait_for_apps_status(jubilant_backports.any_blocked, POSTGRESQL_APP_1), timeout=5 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant_backports.any_waiting, MYSQL_APP_2), + ready=wait_for_apps_status(jubilant_backports.any_waiting, POSTGRESQL_APP_2), timeout=5 * MINUTE_SECS, ) @@ -140,37 +139,25 @@ def test_async_relate(first_model: str, second_model: str) -> None: @pytest.mark.abort_on_fail def test_deploy_router_and_app(first_model: str) -> None: """Deploy the router and the test application.""" - logging.info("Deploying the router and test application") + logging.info("Deploying test application") model_1 = Juju(model=first_model) model_1.deploy( - charm=MYSQL_ROUTER_NAME, - app=MYSQL_ROUTER_NAME, - base="ubuntu@22.04", - channel="dpe/edge", - num_units=1, - trust=True, - ) - model_1.deploy( - charm=MYSQL_TEST_APP_NAME, - app=MYSQL_TEST_APP_NAME, + charm=POSTGRESQL_TEST_APP_NAME, + app=POSTGRESQL_TEST_APP_NAME, base="ubuntu@22.04", channel="latest/edge", num_units=1, trust=False, ) - logging.info("Relating the router and test application") - model_1.integrate( - f"{MYSQL_ROUTER_NAME}:database", - f"{MYSQL_TEST_APP_NAME}:database", - ) + logging.info("Relating test application") model_1.integrate( - f"{MYSQL_ROUTER_NAME}:backend-database", - f"{MYSQL_APP_1}:database", + f"{POSTGRESQL_TEST_APP_NAME}:database", + f"{POSTGRESQL_APP_1}:database", ) model_1.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_TEST_APP_NAME), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_TEST_APP_NAME), timeout=10 * MINUTE_SECS, ) @@ -184,7 +171,7 @@ def test_create_replication(first_model: str, second_model: str) -> None: logging.info("Running create replication action") task = model_1.run( - unit=get_app_leader(model_1, MYSQL_APP_1), + unit=get_app_leader(model_1, POSTGRESQL_APP_1), action="create-replication", wait=5 * MINUTE_SECS, ) @@ -192,11 +179,11 @@ def test_create_replication(first_model: str, second_model: str) -> None: logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_1), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), timeout=5 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_2), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_2), timeout=5 * MINUTE_SECS, ) @@ -206,7 +193,7 @@ def test_create_replication(first_model: str, second_model: str) -> None: async def test_data_replication(first_model: str, second_model: str, continuous_writes) -> None: """Test to write to primary, and read the same data back from replicas.""" logging.info("Testing data replication") - results = await get_mysql_max_written_values(first_model, second_model) + results = await get_postgresql_max_written_values(first_model, second_model) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" @@ -218,24 +205,24 @@ async def test_data_replication(first_model: str, second_model: str, continuous_ async def test_standby_promotion(first_model: str, second_model: str, 
continuous_writes) -> None:
     """Test graceful promotion of a standby cluster to primary."""
     model_2 = Juju(model=second_model)
-    model_2_mysql_leader = get_app_leader(model_2, MYSQL_APP_2)
+    model_2_postgresql_leader = get_app_leader(model_2, POSTGRESQL_APP_2)
 
     logging.info("Promoting standby cluster to primary")
     promotion_task = model_2.run(
-        unit=model_2_mysql_leader,
+        unit=model_2_postgresql_leader,
         action="promote-to-primary",
         params={"scope": "cluster"},
     )
     promotion_task.raise_on_failure()
 
-    results = await get_mysql_max_written_values(first_model, second_model)
+    results = await get_postgresql_max_written_values(first_model, second_model)
     assert len(results) == 6
     assert all(results[0] == x for x in results), "Data is not consistent across units"
     assert results[0] > 1, "No data was written to the database"
 
-    cluster_set_status = get_mysql_cluster_status(
+    cluster_set_status = get_postgresql_cluster_status(
         juju=model_2,
-        unit=model_2_mysql_leader,
+        unit=model_2_postgresql_leader,
         cluster_set=True,
     )
 
@@ -248,35 +235,35 @@ async def test_standby_promotion(first_model: str, second_model: str, continuous
 @pytest.mark.abort_on_fail
 def test_failover(first_model: str, second_model: str) -> None:
     """Test switchover on primary cluster fail."""
-    logging.info("Freezing mysqld on primary cluster units")
+    logging.info("Freezing postgres on primary cluster units")
     model_2 = Juju(model=second_model)
-    model_2_mysql_units = get_app_units(model_2, MYSQL_APP_2)
+    model_2_postgresql_units = get_app_units(model_2, POSTGRESQL_APP_2)
 
     # Simulating a failure on the primary cluster
-    for unit_name in model_2_mysql_units:
-        model_2.exec("sudo pkill -x mysqld --signal SIGSTOP", unit=unit_name)
+    for unit_name in model_2_postgresql_units:
+        model_2.exec("sudo pkill -x postgres --signal SIGSTOP", unit=unit_name)
 
     logging.info("Promoting standby cluster to primary with force flag")
     model_1 = Juju(model=first_model)
-    model_1_mysql_leader = get_app_leader(model_1, MYSQL_APP_1)
+    model_1_postgresql_leader = get_app_leader(model_1, POSTGRESQL_APP_1)
 
     promotion_task = model_1.run(
-        unit=model_1_mysql_leader,
+        unit=model_1_postgresql_leader,
         action="promote-to-primary",
         params={"scope": "cluster", "force": True},
         wait=5 * MINUTE_SECS,
     )
     promotion_task.raise_on_failure()
 
-    # Restore mysqld process
-    logging.info("Unfreezing mysqld on primary cluster units")
-    for unit_name in model_2_mysql_units:
-        model_2.exec("sudo pkill -x mysqld --signal SIGCONT", unit=unit_name)
+    # Restore postgres processes
+    logging.info("Unfreezing postgres on primary cluster units")
+    for unit_name in model_2_postgresql_units:
+        model_2.exec("sudo pkill -x postgres --signal SIGCONT", unit=unit_name)
 
     logging.info("Checking clusters statuses")
-    cluster_set_status = get_mysql_cluster_status(
+    cluster_set_status = get_postgresql_cluster_status(
         juju=model_1,
-        unit=model_1_mysql_leader,
+        unit=model_1_postgresql_leader,
         cluster_set=True,
     )
 
@@ -295,17 +282,17 @@ async def test_rejoin_invalidated_cluster(
 ) -> None:
     """Test rejoin invalidated cluster with."""
     model_1 = Juju(model=first_model)
-    model_1_mysql_leader = get_app_leader(model_1, MYSQL_APP_1)
+    model_1_postgresql_leader = get_app_leader(model_1, POSTGRESQL_APP_1)
 
     task = model_1.run(
-        unit=model_1_mysql_leader,
+        unit=model_1_postgresql_leader,
         action="rejoin-cluster",
         params={"cluster-name": "cuzco"},
         wait=5 * MINUTE_SECS,
     )
     task.raise_on_failure()
 
-    results = await get_mysql_max_written_values(first_model, second_model)
+    results = await 
get_postgresql_max_written_values(first_model, second_model) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" @@ -314,39 +301,39 @@ async def test_rejoin_invalidated_cluster( @juju3 @pytest.mark.abort_on_fail async def test_unrelate_and_relate(first_model: str, second_model: str, continuous_writes) -> None: - """Test removing and re-relating the two mysql clusters.""" + """Test removing and re-relating the two postgresql clusters.""" model_1 = Juju(model=first_model) model_2 = Juju(model=second_model) logging.info("Remove async relation") model_2.remove_relation( - f"{MYSQL_APP_1}", - f"{MYSQL_APP_2}:replication", + f"{POSTGRESQL_APP_1}", + f"{POSTGRESQL_APP_2}:replication", ) logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_1), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant_backports.all_blocked, MYSQL_APP_2), + ready=wait_for_apps_status(jubilant_backports.all_blocked, POSTGRESQL_APP_2), timeout=10 * MINUTE_SECS, ) - logging.info("Re relating the two mysql clusters") + logging.info("Re relating the two postgresql clusters") model_2.integrate( - f"{MYSQL_APP_1}", - f"{MYSQL_APP_2}:replication", + f"{POSTGRESQL_APP_1}", + f"{POSTGRESQL_APP_2}:replication", ) model_1.wait( - ready=wait_for_apps_status(jubilant_backports.any_blocked, MYSQL_APP_1), + ready=wait_for_apps_status(jubilant_backports.any_blocked, POSTGRESQL_APP_1), timeout=5 * MINUTE_SECS, ) logging.info("Running create replication action") task = model_1.run( - unit=get_app_leader(model_1, MYSQL_APP_1), + unit=get_app_leader(model_1, POSTGRESQL_APP_1), action="create-replication", wait=5 * MINUTE_SECS, ) @@ -354,28 +341,28 @@ async def test_unrelate_and_relate(first_model: str, second_model: str, continuo logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_1), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_2), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_2), timeout=10 * MINUTE_SECS, ) - results = await get_mysql_max_written_values(first_model, second_model) + results = await get_postgresql_max_written_values(first_model, second_model) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" -async def get_mysql_max_written_values(first_model: str, second_model: str) -> list[int]: +async def get_postgresql_max_written_values(first_model: str, second_model: str) -> list[int]: """Return list with max written value from all units.""" model_1 = Juju(model=first_model) model_2 = Juju(model=second_model) logging.info("Stopping continuous writes") stopping_task = model_1.run( - unit=get_app_leader(model_1, MYSQL_TEST_APP_NAME), + unit=get_app_leader(model_1, POSTGRESQL_TEST_APP_NAME), action="stop-continuous-writes", params={}, ) @@ -384,14 +371,18 @@ async def get_mysql_max_written_values(first_model: str, second_model: str) -> l time.sleep(5) results = [] - logging.info(f"Querying max value on all {MYSQL_APP_1} units") - for unit_name in 
get_app_units(model_1, MYSQL_APP_1): - unit_max_value = await get_mysql_max_written_value(model_1, MYSQL_APP_1, unit_name) + logging.info(f"Querying max value on all {POSTGRESQL_APP_1} units") + for unit_name in get_app_units(model_1, POSTGRESQL_APP_1): + unit_max_value = await get_postgresql_max_written_value( + model_1, POSTGRESQL_APP_1, unit_name + ) results.append(unit_max_value) - logging.info(f"Querying max value on all {MYSQL_APP_2} units") - for unit_name in get_app_units(model_2, MYSQL_APP_2): - unit_max_value = await get_mysql_max_written_value(model_2, MYSQL_APP_2, unit_name) + logging.info(f"Querying max value on all {POSTGRESQL_APP_2} units") + for unit_name in get_app_units(model_2, POSTGRESQL_APP_2): + unit_max_value = await get_postgresql_max_written_value( + model_2, POSTGRESQL_APP_2, unit_name + ) results.append(unit_max_value) return results diff --git a/tests/spread/test_async_replication.py/task.yaml b/tests/spread/test_async_replication.py/task.yaml index 4fbf3b6b36..d1116ce09a 100644 --- a/tests/spread/test_async_replication.py/task.yaml +++ b/tests/spread/test_async_replication.py/task.yaml @@ -1,6 +1,6 @@ summary: test_async_replication.py environment: - TEST_MODULE: ha_tests/test_async_replication.py + TEST_MODULE: high_availability/test_async_replication.py execute: | tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" artifacts: From 76fe647b0d083c9414e7c9938c285cb4a710810b Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Tue, 7 Oct 2025 14:12:00 +0300 Subject: [PATCH 03/33] WIP --- poetry.lock | 17 +- pyproject.toml | 2 +- tests/integration/conftest.py | 6 +- .../high_availability_helpers_new.py | 8 +- .../test_async_replication.py | 28 +- .../test_primary_switchover.py | 105 ------- .../high_availability/test_upgrade.py | 186 ------------ .../test_upgrade_from_stable.py | 106 ------- .../test_upgrade_rollback_incompat.py | 269 ------------------ .../test_upgrade_skip_pre_upgrade_check.py | 97 ------- 10 files changed, 23 insertions(+), 801 deletions(-) delete mode 100644 tests/integration/high_availability/test_primary_switchover.py delete mode 100644 tests/integration/high_availability/test_upgrade.py delete mode 100644 tests/integration/high_availability/test_upgrade_from_stable.py delete mode 100644 tests/integration/high_availability/test_upgrade_rollback_incompat.py delete mode 100644 tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py diff --git a/poetry.lock b/poetry.lock index 1796b1018d..9b0d686104 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1216,21 +1216,6 @@ files = [ [package.dependencies] PyYAML = "==6.*" -[[package]] -name = "jubilant-backports" -version = "1.0.0a1" -description = "Extends Jubilant to include support for Juju 2.9" -optional = false -python-versions = ">=3.8" -groups = ["integration"] -files = [ - {file = "jubilant_backports-1.0.0a1-py3-none-any.whl", hash = "sha256:ff8d73e17afaae4418c588496978ac42ee9eb9d6d4e77ce103102772038796cc"}, - {file = "jubilant_backports-1.0.0a1.tar.gz", hash = "sha256:03f0788a2301e1a71ebab56bc59515361c37e5686e40a985caba5b2907514e3f"}, -] - -[package.dependencies] -jubilant = ">=1.2,<2.0" - [[package]] name = "juju" version = "3.6.1.3" @@ -3101,4 +3086,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "08804e0562ed263efbf3cbe3875c782ebb3a02ef4d33a5cad6bb38aa45a72ea3" +content-hash = "cde347f0b635694de15820dbbae25b4952c5845777d3bb19f34e7eeb01993b1f" diff --git 
a/pyproject.toml b/pyproject.toml index 0b42222e45..5bb1b13258 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ pytest = "^8.4.2" pytest-operator = "^0.43.1" # renovate caret doesn't work: https://github.com/renovatebot/renovate/issues/26940 juju = "<=3.6.1.3" -jubilant-backports = "^1.0.0a1" +jubilant = "^1.4.0" boto3 = "*" tenacity = "*" landscape-api-py3 = "^0.9.0" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 2740c72605..df4ef3d637 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -5,7 +5,7 @@ import uuid import boto3 -import jubilant_backports +import jubilant import pytest from pytest_operator.plugin import OpsTest @@ -109,11 +109,11 @@ def juju(request: pytest.FixtureRequest): keep_models = bool(request.config.getoption("--keep-models")) if model: - juju = jubilant_backports.Juju(model=model) # type: ignore + juju = jubilant.Juju(model=model) # type: ignore yield juju log = juju.debug_log(limit=1000) else: - with jubilant_backports.temp_model(keep=keep_models) as juju: + with jubilant.temp_model(keep=keep_models) as juju: yield juju log = juju.debug_log(limit=1000) diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py index 42da85ac3c..4a36283389 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -6,9 +6,9 @@ import subprocess from collections.abc import Callable -import jubilant_backports -from jubilant_backports import Juju -from jubilant_backports.statustypes import Status, UnitStatus +import jubilant +from jubilant import Juju +from jubilant.statustypes import Status, UnitStatus from tenacity import Retrying, stop_after_delay, wait_fixed from constants import SERVER_CONFIG_USERNAME @@ -255,7 +255,7 @@ def wait_for_apps_status(jubilant_status_func: JujuAppsStatusFn, *apps: str) -> Juju model status function. """ return lambda status: all(( - jubilant_backports.all_agents_idle(status, *apps), + jubilant.all_agents_idle(status, *apps), jubilant_status_func(status, *apps), )) diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index 92a8dd9aa3..365b2c6ef6 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -6,9 +6,9 @@ import time from collections.abc import Generator -import jubilant_backports +import jubilant import pytest -from jubilant_backports import Juju +from jubilant import Juju from .. 
import architecture from ..markers import juju3 @@ -97,11 +97,11 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_2), + ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_2), timeout=10 * MINUTE_SECS, ) @@ -126,11 +126,11 @@ def test_async_relate(first_model: str, second_model: str) -> None: logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant_backports.any_blocked, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant.any_blocked, POSTGRESQL_APP_1), timeout=5 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant_backports.any_waiting, POSTGRESQL_APP_2), + ready=wait_for_apps_status(jubilant.any_waiting, POSTGRESQL_APP_2), timeout=5 * MINUTE_SECS, ) @@ -157,7 +157,7 @@ def test_deploy_router_and_app(first_model: str) -> None: ) model_1.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_TEST_APP_NAME), + ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_TEST_APP_NAME), timeout=10 * MINUTE_SECS, ) @@ -179,11 +179,11 @@ def test_create_replication(first_model: str, second_model: str) -> None: logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), timeout=5 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_2), + ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_2), timeout=5 * MINUTE_SECS, ) @@ -313,11 +313,11 @@ async def test_unrelate_and_relate(first_model: str, second_model: str, continuo logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant_backports.all_blocked, POSTGRESQL_APP_2), + ready=wait_for_apps_status(jubilant.all_blocked, POSTGRESQL_APP_2), timeout=10 * MINUTE_SECS, ) @@ -327,7 +327,7 @@ async def test_unrelate_and_relate(first_model: str, second_model: str, continuo f"{POSTGRESQL_APP_2}:replication", ) model_1.wait( - ready=wait_for_apps_status(jubilant_backports.any_blocked, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant.any_blocked, POSTGRESQL_APP_1), timeout=5 * MINUTE_SECS, ) @@ -341,11 +341,11 @@ async def test_unrelate_and_relate(first_model: str, second_model: str, continuo logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_2), + ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_2), timeout=10 * MINUTE_SECS, ) diff --git a/tests/integration/high_availability/test_primary_switchover.py b/tests/integration/high_availability/test_primary_switchover.py deleted file mode 100644 index 6aa2266864..0000000000 --- 
a/tests/integration/high_availability/test_primary_switchover.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright 2025 Canonical Ltd. -# See LICENSE file for licensing details. - -import logging -from subprocess import run - -import pytest -from jubilant_backports import Juju, all_active - -from .high_availability_helpers_new import ( - get_app_name, - get_app_units, - get_mysql_primary_unit, - wait_for_unit_message, - wait_for_unit_status, -) - -logging.getLogger("jubilant.wait").setLevel(logging.WARNING) - - -@pytest.mark.abort_on_fail -def test_cluster_switchover(juju: Juju, highly_available_cluster) -> None: - """Test that the primary node can be switched over.""" - logging.info("Testing cluster switchover...") - app_name = get_app_name(juju, "mysql") - assert app_name, "MySQL application not found in the cluster" - - app_units = set(get_app_units(juju, app_name)) - assert len(app_units) > 1, "Not enough units to perform a switchover" - - primary_unit = get_mysql_primary_unit(juju, app_name) - assert primary_unit, "No primary unit found in the cluster" - logging.info(f"Current primary unit: {primary_unit}") - - logging.info("Selecting a new primary unit for switchover...") - app_units.discard(primary_unit) - new_primary_unit = app_units.pop() - logging.info(f"New primary unit selected: {new_primary_unit}") - - switchover_task = juju.run(new_primary_unit, "promote-to-primary", {"scope": "unit"}) - assert switchover_task.status == "completed", "Switchover failed" - - assert get_mysql_primary_unit(juju, app_name) == new_primary_unit, "Switchover failed" - - -@pytest.mark.abort_on_fail -def test_cluster_failover_after_majority_loss(juju: Juju, highly_available_cluster) -> None: - """Test the promote-to-primary command after losing the majority of nodes, with force flag.""" - app_name = get_app_name(juju, "mysql") - assert app_name, "MySQL application not found in the cluster" - - app_units = set(get_app_units(juju, app_name)) - assert len(app_units) > 1, "Not enough units to perform a switchover" - - primary_unit = get_mysql_primary_unit(juju, app_name) - assert primary_unit, "No primary unit found in the cluster" - logging.info(f"Current primary unit: {primary_unit}") - - non_primary_units = app_units - {primary_unit} - - unit_to_promote = non_primary_units.pop() - - logging.info(f"Unit selected for promotion: {unit_to_promote}") - - logging.info("Kill all but one unit to simulate majority loss...") - units_to_kill = [non_primary_units.pop(), primary_unit] - machine_name = [] - for unit in units_to_kill: - machine_name.append(get_unit_machine(juju, app_name, unit)) - - run(["lxc", "restart", "--force", machine_name[0], machine_name[1]], check=True) - - juju.model_config({"update-status-hook-interval": "45s"}) - logging.info("Waiting to settle in error state") - juju.wait( - ready=lambda status: all(( - wait_for_unit_status(app_name, unit_to_promote, "active")(status), - wait_for_unit_message(app_name, units_to_kill[0], "offline")(status), - wait_for_unit_message(app_name, units_to_kill[1], "offline")(status), - )), - timeout=60 * 15, - delay=15, - ) - - failover_task = juju.run( - unit_to_promote, - "promote-to-primary", - {"scope": "unit", "force": True}, - wait=600, - ) - - juju.model_config({"update-status-hook-interval": "15s"}) - - assert failover_task.status == "completed", "Switchover failed" - logging.info("Waiting for all units to become active after switchover...") - juju.wait(all_active, timeout=60 * 10, delay=5) - - assert get_mysql_primary_unit(juju, app_name) == unit_to_promote, 
"Failover failed" - - -def get_unit_machine(juju: Juju, app_name: str, unit_name: str) -> str: - """Get the machine name for the given unit.""" - status = juju.status() - machine_id = status.apps[app_name].units[unit_name].machine - return status.machines[machine_id].instance_id diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py deleted file mode 100644 index f45e04e7d2..0000000000 --- a/tests/integration/high_availability/test_upgrade.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. - -import json -import logging -import shutil -import zipfile -from pathlib import Path - -import jubilant_backports -import pytest -from jubilant_backports import Juju - -from .high_availability_helpers_new import ( - check_mysql_units_writes_increment, - get_app_leader, - get_app_units, - get_mysql_primary_unit, - get_mysql_variable_value, - get_relation_data, - wait_for_apps_status, -) - -MYSQL_APP_NAME = "mysql" -MYSQL_TEST_APP_NAME = "mysql-test-app" - -MINUTE_SECS = 60 - -logging.getLogger("jubilant.wait").setLevel(logging.WARNING) - - -@pytest.mark.abort_on_fail -def test_deploy_latest(juju: Juju) -> None: - """Simple test to ensure that the MySQL and application charms get deployed.""" - logging.info("Deploying MySQL cluster") - juju.deploy( - charm=MYSQL_APP_NAME, - app=MYSQL_APP_NAME, - base="ubuntu@22.04", - channel="8.0/edge", - config={"profile": "testing"}, - num_units=3, - ) - juju.deploy( - charm=MYSQL_TEST_APP_NAME, - app=MYSQL_TEST_APP_NAME, - base="ubuntu@22.04", - channel="latest/edge", - num_units=1, - ) - - juju.integrate( - f"{MYSQL_APP_NAME}:database", - f"{MYSQL_TEST_APP_NAME}:database", - ) - - logging.info("Wait for applications to become active") - juju.wait( - ready=wait_for_apps_status( - jubilant_backports.all_active, MYSQL_APP_NAME, MYSQL_TEST_APP_NAME - ), - error=jubilant_backports.any_blocked, - timeout=20 * MINUTE_SECS, - ) - - -@pytest.mark.abort_on_fail -async def test_pre_upgrade_check(juju: Juju) -> None: - """Test that the pre-upgrade-check action runs successfully.""" - mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) - mysql_units = get_app_units(juju, MYSQL_APP_NAME) - - logging.info("Run pre-upgrade-check action") - task = juju.run(unit=mysql_leader, action="pre-upgrade-check") - task.raise_on_failure() - - logging.info("Assert slow shutdown is enabled") - for unit_name in mysql_units: - value = await get_mysql_variable_value( - juju, MYSQL_APP_NAME, unit_name, "innodb_fast_shutdown" - ) - assert value == 0 - - logging.info("Assert primary is set to leader") - mysql_primary = get_mysql_primary_unit(juju, MYSQL_APP_NAME) - assert mysql_primary == mysql_leader, "Primary unit not set to leader" - - -@pytest.mark.abort_on_fail -async def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: - """Update the second cluster.""" - logging.info("Ensure continuous writes are incrementing") - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) - - logging.info("Refresh the charm") - juju.refresh(app=MYSQL_APP_NAME, path=charm) - - logging.info("Wait for upgrade to start") - juju.wait( - ready=lambda status: jubilant_backports.any_maintenance(status, MYSQL_APP_NAME), - timeout=10 * MINUTE_SECS, - ) - - logging.info("Wait for upgrade to complete") - juju.wait( - ready=lambda status: jubilant_backports.all_active(status, MYSQL_APP_NAME), - timeout=20 * MINUTE_SECS, - ) - - logging.info("Ensure continuous writes are 
incrementing") - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) - - -@pytest.mark.abort_on_fail -async def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None: - """Test an upgrade failure and its rollback.""" - mysql_app_leader = get_app_leader(juju, MYSQL_APP_NAME) - mysql_app_units = get_app_units(juju, MYSQL_APP_NAME) - - logging.info("Run pre-upgrade-check action") - task = juju.run(unit=mysql_app_leader, action="pre-upgrade-check") - task.raise_on_failure() - - tmp_folder = Path("tmp") - tmp_folder.mkdir(exist_ok=True) - tmp_folder_charm = Path(tmp_folder, charm).absolute() - - shutil.copy(charm, tmp_folder_charm) - - logging.info("Inject dependency fault") - inject_dependency_fault(juju, MYSQL_APP_NAME, tmp_folder_charm) - - logging.info("Refresh the charm") - juju.refresh(app=MYSQL_APP_NAME, path=tmp_folder_charm) - - logging.info("Wait for upgrade to fail on leader") - juju.wait( - ready=wait_for_apps_status(jubilant_backports.any_blocked, MYSQL_APP_NAME), - timeout=10 * MINUTE_SECS, - ) - - logging.info("Ensure continuous writes on all units") - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME, list(mysql_app_units)) - - logging.info("Re-run pre-upgrade-check action") - task = juju.run(unit=mysql_app_leader, action="pre-upgrade-check") - task.raise_on_failure() - - logging.info("Re-refresh the charm") - juju.refresh(app=MYSQL_APP_NAME, path=charm) - - logging.info("Wait for upgrade to start") - juju.wait( - ready=lambda status: jubilant_backports.any_maintenance(status, MYSQL_APP_NAME), - timeout=10 * MINUTE_SECS, - ) - - logging.info("Wait for upgrade to complete") - juju.wait( - ready=lambda status: jubilant_backports.all_active(status, MYSQL_APP_NAME), - timeout=20 * MINUTE_SECS, - ) - - logging.info("Ensure continuous writes after rollback procedure") - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME, list(mysql_app_units)) - - # Remove fault charm file - tmp_folder_charm.unlink() - - -def inject_dependency_fault(juju: Juju, app_name: str, charm_file: str | Path) -> None: - """Inject a dependency fault into the mysql charm.""" - # Open dependency.json and load current charm version - with open("src/dependency.json") as dependency_file: - current_charm_version = json.load(dependency_file)["charm"]["version"] - - # Query running dependency to overwrite with incompatible version - relation_data = get_relation_data(juju, app_name, "upgrade") - - loaded_dependency_dict = json.loads(relation_data[0]["application-data"]["dependencies"]) - loaded_dependency_dict["charm"]["upgrade_supported"] = f">{current_charm_version}" - loaded_dependency_dict["charm"]["version"] = f"{int(current_charm_version) + 1}" - - # Overwrite dependency.json with incompatible version - with zipfile.ZipFile(charm_file, mode="a") as charm_zip: - charm_zip.writestr("src/dependency.json", json.dumps(loaded_dependency_dict)) diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py deleted file mode 100644 index d272d06473..0000000000 --- a/tests/integration/high_availability/test_upgrade_from_stable.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. 
- -import logging - -import jubilant_backports -import pytest -from jubilant_backports import Juju - -from .high_availability_helpers_new import ( - check_mysql_units_writes_increment, - get_app_leader, - get_app_units, - get_mysql_primary_unit, - get_mysql_variable_value, - wait_for_apps_status, -) - -MYSQL_APP_NAME = "mysql" -MYSQL_TEST_APP_NAME = "mysql-test-app" - -MINUTE_SECS = 60 - -logging.getLogger("jubilant.wait").setLevel(logging.WARNING) - - -@pytest.mark.abort_on_fail -def test_deploy_stable(juju: Juju) -> None: - """Simple test to ensure that the MySQL and application charms get deployed.""" - logging.info("Deploying MySQL cluster") - juju.deploy( - charm=MYSQL_APP_NAME, - app=MYSQL_APP_NAME, - base="ubuntu@22.04", - channel="8.0/stable", - config={"profile": "testing"}, - num_units=3, - ) - juju.deploy( - charm=MYSQL_TEST_APP_NAME, - app=MYSQL_TEST_APP_NAME, - base="ubuntu@22.04", - channel="latest/edge", - num_units=1, - ) - - juju.integrate( - f"{MYSQL_APP_NAME}:database", - f"{MYSQL_TEST_APP_NAME}:database", - ) - - logging.info("Wait for applications to become active") - juju.wait( - ready=wait_for_apps_status( - jubilant_backports.all_active, MYSQL_APP_NAME, MYSQL_TEST_APP_NAME - ), - error=jubilant_backports.any_blocked, - timeout=20 * MINUTE_SECS, - ) - - -@pytest.mark.abort_on_fail -async def test_pre_upgrade_check(juju: Juju) -> None: - """Test that the pre-upgrade-check action runs successfully.""" - mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) - mysql_units = get_app_units(juju, MYSQL_APP_NAME) - - logging.info("Run pre-upgrade-check action") - task = juju.run(unit=mysql_leader, action="pre-upgrade-check") - task.raise_on_failure() - - logging.info("Assert slow shutdown is enabled") - for unit_name in mysql_units: - value = await get_mysql_variable_value( - juju, MYSQL_APP_NAME, unit_name, "innodb_fast_shutdown" - ) - assert value == 0 - - logging.info("Assert primary is set to leader") - mysql_primary = get_mysql_primary_unit(juju, MYSQL_APP_NAME) - assert mysql_primary == mysql_leader, "Primary unit not set to leader" - - -@pytest.mark.abort_on_fail -async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> None: - """Update the second cluster.""" - logging.info("Ensure continuous writes are incrementing") - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) - - logging.info("Refresh the charm") - juju.refresh(app=MYSQL_APP_NAME, path=charm) - - logging.info("Wait for upgrade to start") - juju.wait( - ready=lambda status: jubilant_backports.any_maintenance(status, MYSQL_APP_NAME), - timeout=10 * MINUTE_SECS, - ) - - logging.info("Wait for upgrade to complete") - juju.wait( - ready=lambda status: jubilant_backports.all_active(status, MYSQL_APP_NAME), - timeout=20 * MINUTE_SECS, - ) - - logging.info("Ensure continuous writes are incrementing") - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) diff --git a/tests/integration/high_availability/test_upgrade_rollback_incompat.py b/tests/integration/high_availability/test_upgrade_rollback_incompat.py deleted file mode 100644 index d0ccf86e1e..0000000000 --- a/tests/integration/high_availability/test_upgrade_rollback_incompat.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -import json -import logging -import shutil -import time -import zipfile -from ast import literal_eval -from collections.abc import Generator -from pathlib import Path - -import jubilant_backports -import pytest -from jubilant_backports import Juju - -from ..markers import amd64_only -from .high_availability_helpers_new import ( - check_mysql_units_writes_increment, - get_app_leader, - get_relation_data, - get_unit_by_number, - get_unit_status_log, - wait_for_apps_status, - wait_for_unit_status, -) - -MYSQL_APP_NAME = "mysql" -MYSQL_TEST_APP_NAME = "mysql-test-app" - -MINUTE_SECS = 60 - -logging.getLogger("jubilant.wait").setLevel(logging.WARNING) - - -@pytest.fixture() -def continuous_writes(juju: Juju) -> Generator: - """Starts continuous writes to the MySQL cluster for a test and clear the writes at the end.""" - test_app_leader = get_app_leader(juju, MYSQL_TEST_APP_NAME) - - logging.info("Clearing continuous writes") - juju.run(test_app_leader, "clear-continuous-writes") - logging.info("Starting continuous writes") - juju.run(test_app_leader, "start-continuous-writes") - - yield - - logging.info("Clearing continuous writes") - juju.run(test_app_leader, "clear-continuous-writes") - - -# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap -# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) -@amd64_only -@pytest.mark.abort_on_fail -async def test_build_and_deploy(juju: Juju, charm: str) -> None: - """Simple test to ensure that the MySQL and application charms get deployed.""" - snap_revisions = Path("snap_revisions.json") - with snap_revisions.open("r") as file: - old_revisions = json.load(file) - - # TODO: support arm64 & s390x - new_revisions = old_revisions.copy() - new_revisions["x86_64"] = "69" - - with snap_revisions.open("w") as file: - json.dump(new_revisions, file) - - local_charm = get_locally_built_charm(charm) - - with snap_revisions.open("w") as file: - json.dump(old_revisions, file) - - juju.deploy( - charm=local_charm, - app=MYSQL_APP_NAME, - base="ubuntu@22.04", - config={"profile": "testing", "plugin-audit-enabled": False}, - num_units=3, - ) - juju.deploy( - charm=MYSQL_TEST_APP_NAME, - app=MYSQL_TEST_APP_NAME, - base="ubuntu@22.04", - channel="latest/edge", - config={"auto_start_writes": False, "sleep_interval": 500}, - num_units=1, - ) - - juju.integrate( - f"{MYSQL_APP_NAME}:database", - f"{MYSQL_TEST_APP_NAME}:database", - ) - - logging.info("Wait for applications to become active") - juju.wait( - ready=wait_for_apps_status( - jubilant_backports.all_active, MYSQL_APP_NAME, MYSQL_TEST_APP_NAME - ), - error=jubilant_backports.any_blocked, - timeout=20 * MINUTE_SECS, - ) - - -# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap -# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) -@amd64_only -@pytest.mark.abort_on_fail -async def test_pre_upgrade_check(juju: Juju) -> None: - """Test that the pre-upgrade-check action runs successfully.""" - mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) - - logging.info("Run pre-upgrade-check action") - task = juju.run(unit=mysql_leader, action="pre-upgrade-check") - task.raise_on_failure() - - -# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap -# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) -@amd64_only -@pytest.mark.abort_on_fail -async def test_upgrade_to_failing(juju: Juju, 
charm: str, continuous_writes) -> None: - logging.info("Ensure continuous_writes") - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) - - with InjectFailure( - path="src/upgrade.py", - original_str="self.charm.recover_unit_after_restart()", - replace_str="raise Exception", - ): - logging.info("Build charm with failure injected") - new_charm = get_locally_built_charm(charm) - - logging.info("Refresh the charm") - juju.refresh(app=MYSQL_APP_NAME, path=new_charm) - - logging.info("Wait for upgrade to start") - juju.wait( - ready=lambda status: jubilant_backports.any_maintenance(status, MYSQL_APP_NAME), - timeout=10 * MINUTE_SECS, - ) - - logging.info("Get first upgrading unit") - relation_data = get_relation_data(juju, MYSQL_APP_NAME, "upgrade") - upgrade_stack = relation_data[0]["application-data"]["upgrade-stack"] - upgrade_unit = get_unit_by_number(juju, MYSQL_APP_NAME, literal_eval(upgrade_stack)[-1]) - - logging.info("Wait for upgrade to fail on upgrading unit") - juju.wait( - ready=wait_for_unit_status(MYSQL_APP_NAME, upgrade_unit, "blocked"), - timeout=10 * MINUTE_SECS, - ) - - -# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap -# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) -@amd64_only -@pytest.mark.abort_on_fail -async def test_rollback(juju: Juju, charm: str, continuous_writes) -> None: - """Test upgrade rollback to a healthy revision.""" - relation_data = get_relation_data(juju, MYSQL_APP_NAME, "upgrade") - upgrade_stack = relation_data[0]["application-data"]["upgrade-stack"] - upgrade_unit = get_unit_by_number(juju, MYSQL_APP_NAME, literal_eval(upgrade_stack)[-1]) - - snap_revisions = Path("snap_revisions.json") - with snap_revisions.open("r") as file: - old_revisions = json.load(file) - - # TODO: support arm64 & s390x - new_revisions = old_revisions.copy() - new_revisions["x86_64"] = "69" - - with snap_revisions.open("w") as file: - json.dump(new_revisions, file) - - mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) - local_charm = get_locally_built_charm(charm) - - time.sleep(10) - - logging.info("Run pre-upgrade-check action") - task = juju.run(unit=mysql_leader, action="pre-upgrade-check") - task.raise_on_failure() - - time.sleep(20) - - logging.info("Refresh with previous charm") - juju.refresh(app=MYSQL_APP_NAME, path=local_charm) - - logging.info("Wait for upgrade to start") - juju.wait( - ready=lambda status: jubilant_backports.any_maintenance(status, MYSQL_APP_NAME), - timeout=10 * MINUTE_SECS, - ) - juju.wait( - ready=lambda status: jubilant_backports.all_active(status, MYSQL_APP_NAME), - timeout=20 * MINUTE_SECS, - ) - - logging.info("Ensure rollback has taken place") - unit_status_logs = get_unit_status_log(juju, upgrade_unit, 100) - - upgrade_failed_index = get_unit_log_message( - status_logs=unit_status_logs[:], - unit_message="upgrade failed. 
Check logs for rollback instruction", - ) - assert upgrade_failed_index is not None - - upgrade_complete_index = get_unit_log_message( - status_logs=unit_status_logs[upgrade_failed_index:], - unit_message="upgrade completed", - ) - assert upgrade_complete_index is not None - - logging.info("Ensure continuous writes after rollback procedure") - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) - - -class InjectFailure: - def __init__(self, path: str, original_str: str, replace_str: str): - self.path = path - self.original_str = original_str - self.replace_str = replace_str - with open(path) as file: - self.original_content = file.read() - - def __enter__(self): - logging.info("Injecting failure") - assert self.original_str in self.original_content, "replace content not found" - new_content = self.original_content.replace(self.original_str, self.replace_str) - assert self.original_str not in new_content, "original string not replaced" - with open(self.path, "w") as file: - file.write(new_content) - - def __exit__(self, exc_type, exc_value, traceback): - logging.info("Reverting failure") - with open(self.path, "w") as file: - file.write(self.original_content) - - -def get_unit_log_message(status_logs: list[dict], unit_message: str) -> int | None: - """Returns the index of a status log containing the desired message.""" - for index, status_log in enumerate(status_logs): - if status_log.get("message") == unit_message: - return index - - return None - - -def get_locally_built_charm(charm: str) -> str: - """Wrapper for a local charm build zip file updating.""" - local_charm_paths = Path().glob("local-*.charm") - - # Clean up local charms from previous runs - # to avoid pytest_operator_cache globbing them - for charm_path in local_charm_paths: - charm_path.unlink() - - # Create a copy of the charm to avoid modifying the original - local_charm_path = shutil.copy(charm, f"local-{Path(charm).stem}.charm") - local_charm_path = Path(local_charm_path) - - for path in ["snap_revisions.json", "src/upgrade.py"]: - with open(path) as f: - content = f.read() - with zipfile.ZipFile(local_charm_path, mode="a") as charm_zip: - charm_zip.writestr(path, content) - - return f"{local_charm_path.resolve()}" diff --git a/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py b/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py deleted file mode 100644 index c6031d0b26..0000000000 --- a/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. 
- -import logging - -import jubilant_backports -import pytest -from jubilant_backports import Juju - -from .high_availability_helpers_new import ( - check_mysql_units_writes_increment, - get_app_units, - wait_for_apps_status, - wait_for_unit_status, -) - -MYSQL_APP_NAME = "mysql" -MYSQL_TEST_APP_NAME = "mysql-test-app" - -MINUTE_SECS = 60 - -logging.getLogger("jubilant.wait").setLevel(logging.WARNING) - - -@pytest.mark.abort_on_fail -def test_deploy_stable(juju: Juju) -> None: - """Simple test to ensure that the MySQL and application charms get deployed.""" - logging.info("Deploying MySQL cluster") - juju.deploy( - charm=MYSQL_APP_NAME, - app=MYSQL_APP_NAME, - base="ubuntu@22.04", - channel="8.0/stable", - config={"profile": "testing"}, - num_units=3, - ) - juju.deploy( - charm=MYSQL_TEST_APP_NAME, - app=MYSQL_TEST_APP_NAME, - base="ubuntu@22.04", - channel="latest/edge", - config={"sleep_interval": 50}, - num_units=1, - ) - - juju.integrate( - f"{MYSQL_APP_NAME}:database", - f"{MYSQL_TEST_APP_NAME}:database", - ) - - logging.info("Wait for applications to become active") - juju.wait( - ready=wait_for_apps_status( - jubilant_backports.all_active, MYSQL_APP_NAME, MYSQL_TEST_APP_NAME - ), - error=jubilant_backports.any_blocked, - timeout=20 * MINUTE_SECS, - ) - - -@pytest.mark.abort_on_fail -async def test_refresh_without_pre_upgrade_check(juju: Juju, charm: str) -> None: - """Test updating from stable channel.""" - logging.info("Refresh the charm") - juju.refresh(app=MYSQL_APP_NAME, path=charm) - - logging.info("Wait for rolling restart") - app_units = get_app_units(juju, MYSQL_APP_NAME) - app_units_funcs = [wait_for_unit_status(MYSQL_APP_NAME, unit, "error") for unit in app_units] - - juju.wait( - ready=lambda status: any(status_func(status) for status_func in app_units_funcs), - timeout=10 * MINUTE_SECS, - successes=1, - ) - - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) - - -@pytest.mark.abort_on_fail -async def test_rollback_without_pre_upgrade_check(juju: Juju, charm: str) -> None: - """Test refresh back to stable channel.""" - # Early Jubilant 1.X.Y versions do not support the `switch` option - logging.info("Refresh the charm to stable channel") - juju.cli("refresh", "--channel=8.0/stable", f"--switch={MYSQL_APP_NAME}", MYSQL_APP_NAME) - - logging.info("Wait for rolling restart") - app_units = get_app_units(juju, MYSQL_APP_NAME) - app_units_funcs = [wait_for_unit_status(MYSQL_APP_NAME, unit, "error") for unit in app_units] - - juju.wait( - ready=lambda status: any(status_func(status) for status_func in app_units_funcs), - timeout=10 * MINUTE_SECS, - successes=1, - ) - - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) From daa72c13321c288d9c07ff1290f34a8f1be8694a Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Tue, 7 Oct 2025 14:36:25 +0300 Subject: [PATCH 04/33] Remove disk cleanup --- .github/workflows/integration_test.yaml | 9 --------- spread.yaml | 3 +++ tests/spread/test_async_replication.py/task.yaml | 2 -- tests/spread/test_scaling.py/task.yaml | 2 -- tests/spread/test_scaling_three_units.py/task.yaml | 2 -- tests/spread/test_scaling_three_units_async.py/task.yaml | 2 -- 6 files changed, 3 insertions(+), 17 deletions(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index f7120e7515..9aed6c5657 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -86,15 +86,6 @@ jobs: runs-on: ${{ matrix.job.runner }} timeout-minutes: 226 # Sum of steps 
`timeout-minutes` + 5 steps: - - name: Free up disk space - timeout-minutes: 10 - run: | - printf '\nDisk usage before cleanup\n' - df --human-readable - # Based on https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 - rm -r /opt/hostedtoolcache/ - printf '\nDisk usage after cleanup\n' - df --human-readable - name: Checkout timeout-minutes: 3 uses: actions/checkout@v5 diff --git a/spread.yaml b/spread.yaml index fe01ada361..ce4b4e088c 100644 --- a/spread.yaml +++ b/spread.yaml @@ -82,6 +82,9 @@ backends: sudo passwd -d runner ADDRESS localhost + + sudo mkdir -p /var/snap/lxd/common/lxd/storage-pools + sudo mount --bind /mnt /var/snap/lxd/common/lxd/storage-pools # HACK: spread does not pass environment variables set on runner # Manually pass specific environment variables environment: diff --git a/tests/spread/test_async_replication.py/task.yaml b/tests/spread/test_async_replication.py/task.yaml index d1116ce09a..cfadb00ee5 100644 --- a/tests/spread/test_async_replication.py/task.yaml +++ b/tests/spread/test_async_replication.py/task.yaml @@ -5,5 +5,3 @@ execute: | tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" artifacts: - allure-results -variants: - - -juju29 diff --git a/tests/spread/test_scaling.py/task.yaml b/tests/spread/test_scaling.py/task.yaml index 32358243db..656780e30d 100644 --- a/tests/spread/test_scaling.py/task.yaml +++ b/tests/spread/test_scaling.py/task.yaml @@ -5,5 +5,3 @@ execute: | tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" artifacts: - allure-results -variants: - - -juju29 diff --git a/tests/spread/test_scaling_three_units.py/task.yaml b/tests/spread/test_scaling_three_units.py/task.yaml index ae8dcc1006..f46a54dab3 100644 --- a/tests/spread/test_scaling_three_units.py/task.yaml +++ b/tests/spread/test_scaling_three_units.py/task.yaml @@ -5,5 +5,3 @@ execute: | tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" artifacts: - allure-results -variants: - - -juju29 diff --git a/tests/spread/test_scaling_three_units_async.py/task.yaml b/tests/spread/test_scaling_three_units_async.py/task.yaml index cd8a7ba5aa..686116f361 100644 --- a/tests/spread/test_scaling_three_units_async.py/task.yaml +++ b/tests/spread/test_scaling_three_units_async.py/task.yaml @@ -5,5 +5,3 @@ execute: | tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" artifacts: - allure-results -variants: - - -juju29 From abfb660422834bae6bab6c07f329bd3ef5934ff4 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Tue, 7 Oct 2025 14:56:12 +0300 Subject: [PATCH 05/33] Add password getter --- .../high_availability_helpers_new.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py index 4a36283389..9737903df1 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -11,11 +11,12 @@ from jubilant.statustypes import Status, UnitStatus from tenacity import Retrying, stop_after_delay, wait_fixed -from constants import SERVER_CONFIG_USERNAME +from constants import PEER from ..helpers import execute_queries_on_unit MINUTE_SECS = 60 
+SERVER_CONFIG_USERNAME = "operator"
 
 JujuModelStatusFn = Callable[[Status], bool]
 JujuAppsStatusFn = Callable[[Status, str], bool]
@@ -201,17 +202,12 @@ async def get_postgresql_max_written_value(juju: Juju, app_name: str, unit_name:
         app_name: The application name.
         unit_name: The unit name.
     """
-    credentials_task = juju.run(
-        unit=unit_name,
-        action="get-password",
-        params={"username": SERVER_CONFIG_USERNAME},
-    )
-    credentials_task.raise_on_failure()
+    password = get_user_password(juju, app_name, SERVER_CONFIG_USERNAME)
 
     output = await execute_queries_on_unit(
         get_unit_ip(juju, app_name, unit_name),
-        credentials_task.results["username"],
-        credentials_task.results["password"],
+        SERVER_CONFIG_USERNAME,
+        password,
         ["SELECT MAX(number) FROM `continuous_writes`.`data`;"],
     )
     return output[0]
@@ -228,17 +224,12 @@ async def get_postgresql_variable_value(
         unit_name: The unit name.
         variable_name: The variable name.
     """
-    credentials_task = juju.run(
-        unit=unit_name,
-        action="get-password",
-        params={"username": SERVER_CONFIG_USERNAME},
-    )
-    credentials_task.raise_on_failure()
+    password = get_user_password(juju, app_name, SERVER_CONFIG_USERNAME)
 
     output = await execute_queries_on_unit(
         get_unit_ip(juju, app_name, unit_name),
-        credentials_task.results["username"],
-        credentials_task.results["password"],
+        SERVER_CONFIG_USERNAME,
+        password,
         [f"SELECT @@{variable_name};"],
     )
     return output[0]
@@ -272,3 +263,14 @@ def wait_for_unit_message(app_name: str, unit_name: str, unit_message: str) -> J
     return lambda status: (
         status.apps[app_name].units[unit_name].workload_status.message == unit_message
     )
+
+
+# PG helpers
+
+
+def get_user_password(juju: Juju, app_name: str, user: str) -> str | None:
+    """Get a system user's password."""
+    for secret in juju.secrets(owner=app_name):
+        if secret.label == f"{PEER}.{app_name}.app":
+            revealed_secret = juju.show_secret(secret.uri, reveal=True)
+            return revealed_secret.content.get(f"{user}-password")

From 1eb046e3baa09f592ef78d743bc97e1cb820e494 Mon Sep 17 00:00:00 2001
From: Dragomir Penev
Date: Tue, 7 Oct 2025 15:40:19 +0300
Subject: [PATCH 06/33] Tweaks

---
 .../high_availability_helpers_new.py |  2 +-
 .../test_async_replication.py        | 25 +++++++++----------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py
index 9737903df1..08f711af38 100644
--- a/tests/integration/high_availability/high_availability_helpers_new.py
+++ b/tests/integration/high_availability/high_availability_helpers_new.py
@@ -270,7 +270,7 @@ def wait_for_unit_message(app_name: str, unit_name: str, unit_message: str) -> J
 
 def get_user_password(juju: Juju, app_name: str, user: str) -> str | None:
     """Get a system user's password."""
-    for secret in juju.secrets(owner=app_name):
+    for secret in juju.secrets():
         if secret.label == f"{PEER}.{app_name}.app":
             revealed_secret = juju.show_secret(secret.uri, reveal=True)
             return revealed_secret.content.get(f"{user}-password")
diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py
index 365b2c6ef6..63d7ca7fd9 100644
--- a/tests/integration/high_availability/test_async_replication.py
+++ b/tests/integration/high_availability/test_async_replication.py
@@ -80,8 +80,8 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No
     model_1.deploy(
         charm=charm,
         app=POSTGRESQL_APP_1,
-        base="ubuntu@22.04",
- config={**configuration, "cluster-name": "lima"}, + base="ubuntu@24.04", + config=configuration, constraints=constraints, num_units=3, ) @@ -89,8 +89,8 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No model_2.deploy( charm=charm, app=POSTGRESQL_APP_2, - base="ubuntu@22.04", - config={**configuration, "cluster-name": "cuzco"}, + base="ubuntu@24.04", + config=configuration, constraints=constraints, num_units=3, ) @@ -127,17 +127,17 @@ def test_async_relate(first_model: str, second_model: str) -> None: logging.info("Waiting for the applications to settle") model_1.wait( ready=wait_for_apps_status(jubilant.any_blocked, POSTGRESQL_APP_1), - timeout=5 * MINUTE_SECS, + timeout=10 * MINUTE_SECS, ) model_2.wait( ready=wait_for_apps_status(jubilant.any_waiting, POSTGRESQL_APP_2), - timeout=5 * MINUTE_SECS, + timeout=10 * MINUTE_SECS, ) @juju3 @pytest.mark.abort_on_fail -def test_deploy_router_and_app(first_model: str) -> None: +def test_deploy_app(first_model: str) -> None: """Deploy the router and the test application.""" logging.info("Deploying test application") model_1 = Juju(model=first_model) @@ -180,11 +180,11 @@ def test_create_replication(first_model: str, second_model: str) -> None: logging.info("Waiting for the applications to settle") model_1.wait( ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), - timeout=5 * MINUTE_SECS, + timeout=10 * MINUTE_SECS, ) model_2.wait( ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_2), - timeout=5 * MINUTE_SECS, + timeout=10 * MINUTE_SECS, ) @@ -235,7 +235,7 @@ async def test_standby_promotion(first_model: str, second_model: str, continuous @pytest.mark.abort_on_fail def test_failover(first_model: str, second_model: str) -> None: """Test switchover on primary cluster fail.""" - logging.info("Freezing postgresqld on primary cluster units") + logging.info("Freezing postgres on primary cluster units") model_2 = Juju(model=second_model) model_2_postgresql_units = get_app_units(model_2, POSTGRESQL_APP_2) @@ -255,8 +255,8 @@ def test_failover(first_model: str, second_model: str) -> None: ) promotion_task.raise_on_failure() - # Restore postgresqld process - logging.info("Unfreezing postgresqld on primary cluster units") + # Restore postgres process + logging.info("Unfreezing postgres on primary cluster units") for unit_name in model_2_postgresql_units: model_2.exec("sudo pkill -x postgres --signal SIGCONT", unit=unit_name) @@ -287,7 +287,6 @@ async def test_rejoin_invalidated_cluster( task = model_1.run( unit=model_1_postgresql_leader, action="rejoin-cluster", - params={"cluster-name": "cuzco"}, wait=5 * MINUTE_SECS, ) task.raise_on_failure() From e70e67c7d4cf804e8ac3c5db9078808b580847c1 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 8 Oct 2025 04:47:35 +0300 Subject: [PATCH 07/33] Upgrade test --- tests/integration/ha_tests/test_upgrade.py | 226 ------------------ .../high_availability_helpers_new.py | 2 + .../test_async_replication.py | 10 +- .../high_availability/test_upgrade.py | 185 ++++++++++++++ tests/spread/test_upgrade.py/task.yaml | 2 +- 5 files changed, 192 insertions(+), 233 deletions(-) delete mode 100644 tests/integration/ha_tests/test_upgrade.py create mode 100644 tests/integration/high_availability/test_upgrade.py diff --git a/tests/integration/ha_tests/test_upgrade.py b/tests/integration/ha_tests/test_upgrade.py deleted file mode 100644 index c2714d7ac7..0000000000 --- a/tests/integration/ha_tests/test_upgrade.py +++ /dev/null @@ -1,226 +0,0 @@ -# Copyright 2023 
Canonical Ltd. -# See LICENSE file for licensing details. - -import logging -import platform -import shutil -import zipfile -from asyncio import gather -from pathlib import Path - -import pytest -import tomli -import tomli_w -from pytest_operator.plugin import OpsTest - -from ..helpers import ( - APPLICATION_NAME, - DATABASE_APP_NAME, - count_switchovers, - get_leader_unit, - get_primary, -) -from .helpers import ( - are_writes_increasing, - check_writes, - start_continuous_writes, -) - -logger = logging.getLogger(__name__) - -TIMEOUT = 30 * 60 - - -@pytest.mark.abort_on_fail -async def test_deploy_latest(ops_test: OpsTest) -> None: - """Simple test to ensure that the PostgreSQL and application charms get deployed.""" - await gather( - ops_test.model.deploy( - DATABASE_APP_NAME, num_units=3, channel="16/edge", config={"profile": "testing"} - ), - ops_test.model.deploy( - APPLICATION_NAME, - num_units=1, - channel="latest/edge", - config={"sleep_interval": 500}, - ), - ) - await ops_test.model.relate(DATABASE_APP_NAME, f"{APPLICATION_NAME}:database") - logger.info("Wait for applications to become active") - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active", timeout=1500 - ) - assert len(ops_test.model.applications[DATABASE_APP_NAME].units) == 3 - - -@pytest.mark.abort_on_fail -async def test_pre_refresh_check(ops_test: OpsTest) -> None: - """Test that the pre-refresh-check action runs successfully.""" - logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - assert leader_unit is not None, "No leader unit found" - - logger.info("Run pre-refresh-check action") - action = await leader_unit.run_action("pre-refresh-check") - await action.wait() - - -@pytest.mark.abort_on_fail -async def test_upgrade_from_edge(ops_test: OpsTest, continuous_writes, charm) -> None: - # Start an application that continuously writes data to the database. - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - # Check whether writes are increasing. 
- logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - primary_name = await get_primary(ops_test, f"{DATABASE_APP_NAME}/0") - initial_number_of_switchovers = count_switchovers(ops_test, primary_name) - - application = ops_test.model.applications[DATABASE_APP_NAME] - - logger.info("Refresh the charm") - await application.refresh(path=charm) - - logger.info("Wait for upgrade to start") - try: - # Blocked status is expected due to: - # (on PR) compatibility checks (on PR charm revision is '16/1.25.0+dirty...') - # (non-PR) the first unit upgraded and paused (pause-after-unit-refresh=first) - await ops_test.model.block_until(lambda: application.status == "blocked", timeout=60 * 3) - - logger.info("Wait for refresh to block as paused or incompatible") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT - ) - - # Highest to lowest unit number - refresh_order = sorted( - application.units, key=lambda unit: int(unit.name.split("/")[1]), reverse=True - ) - - if "Refresh incompatible" in application.status_message: - logger.info("Application refresh is blocked due to incompatibility") - - action = await refresh_order[0].run_action( - "force-refresh-start", **{"check-compatibility": False} - ) - await action.wait() - - logger.info("Wait for first incompatible unit to upgrade") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT - ) - - logger.info("Run resume-refresh action") - action = await refresh_order[1].run_action("resume-refresh") - await action.wait() - except TimeoutError: - # If the application didn't get into the blocked state, it should have upgraded only - # the charm code because the snap revision didn't change. - logger.info("Upgrade completed without snap refresh (charm.py upgrade only)") - assert application.status == "active", ( - "Application didn't reach blocked or active state after refresh attempt" - ) - - logger.info("Wait for upgrade to complete") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=30, timeout=TIMEOUT - ) - - # Check whether writes are increasing. - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("checking whether no writes were lost") - await check_writes(ops_test) - - logger.info("checking the number of switchovers") - final_number_of_switchovers = count_switchovers(ops_test, primary_name) - assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, ( - "Number of switchovers is greater than 2" - ) - - -@pytest.mark.abort_on_fail -async def test_fail_and_rollback(ops_test, charm, continuous_writes) -> None: - # Start an application that continuously writes data to the database. - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - # Check whether writes are increasing. 
- logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - logger.info("Get leader unit") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - assert leader_unit is not None, "No leader unit found" - - logger.info("Run pre-refresh-check action") - action = await leader_unit.run_action("pre-refresh-check") - await action.wait() - - filename = Path(charm).name - fault_charm = Path("/tmp", f"{filename}.fault.charm") - shutil.copy(charm, fault_charm) - - logger.info("Inject dependency fault") - await inject_dependency_fault(fault_charm) - - application = ops_test.model.applications[DATABASE_APP_NAME] - - logger.info("Refresh the charm") - await application.refresh(path=fault_charm) - - logger.info("Wait for upgrade to fail") - await ops_test.model.block_until( - lambda: application.status == "blocked" - and "incompatible" in application.status_message.lower(), - timeout=TIMEOUT, - ) - - logger.info("Ensure continuous_writes while in failure state on remaining units") - await are_writes_increasing(ops_test) - - logger.info("Re-refresh the charm") - await application.refresh(path=charm) - - logger.info("Wait for upgrade to start") - await ops_test.model.block_until(lambda: application.status == "blocked", timeout=TIMEOUT) - - logger.info("Wait for application to recover") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", timeout=TIMEOUT - ) - - logger.info("Ensure continuous_writes after rollback procedure") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). - logger.info("Checking whether no writes were lost") - await check_writes(ops_test) - - # Remove fault charm file. - fault_charm.unlink() - - -async def inject_dependency_fault(charm_file: str | Path) -> None: - """Inject a dependency fault into the PostgreSQL charm.""" - with Path("refresh_versions.toml").open("rb") as file: - versions = tomli.load(file) - - versions["charm"] = "16/0.0.0" - versions["snap"]["revisions"][platform.machine()] = "1" - - # Overwrite refresh_versions.toml with incompatible version. 
- with zipfile.ZipFile(charm_file, mode="a") as charm_zip: - charm_zip.writestr("refresh_versions.toml", tomli_w.dumps(versions)) diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py index 08f711af38..1236b405ff 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -209,6 +209,7 @@ async def get_postgresql_max_written_value(juju: Juju, app_name: str, unit_name: SERVER_CONFIG_USERNAME, password, ["SELECT MAX(number) FROM `continuous_writes`.`data`;"], + f"{app_name.replace('-', '_')}_database", ) return output[0] @@ -231,6 +232,7 @@ async def get_postgresql_variable_value( SERVER_CONFIG_USERNAME, password, [f"SELECT @@{variable_name};"], + f"{app_name.replace('-', '_')}_database", ) return output[0] diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index 63d7ca7fd9..0fae8aee51 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -98,11 +98,11 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No logging.info("Waiting for the applications to settle") model_1.wait( ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), - timeout=10 * MINUTE_SECS, + timeout=15 * MINUTE_SECS, ) model_2.wait( ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_2), - timeout=10 * MINUTE_SECS, + timeout=15 * MINUTE_SECS, ) @@ -211,7 +211,7 @@ async def test_standby_promotion(first_model: str, second_model: str, continuous promotion_task = model_2.run( unit=model_2_postgresql_leader, action="promote-to-primary", - params={"scope": "cluster"}, + params={"scope": "cluster", "force": "true"}, ) promotion_task.raise_on_failure() @@ -361,9 +361,7 @@ async def get_postgresql_max_written_values(first_model: str, second_model: str) logging.info("Stopping continuous writes") stopping_task = model_1.run( - unit=get_app_leader(model_1, POSTGRESQL_TEST_APP_NAME), - action="stop-continuous-writes", - params={}, + unit=get_app_leader(model_1, POSTGRESQL_TEST_APP_NAME), action="stop-continuous-writes" ) stopping_task.raise_on_failure() diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py new file mode 100644 index 0000000000..50306152b5 --- /dev/null +++ b/tests/integration/high_availability/test_upgrade.py @@ -0,0 +1,185 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +import logging +import platform +import shutil +import zipfile +from pathlib import Path + +import jubilant +import pytest +import tomli +import tomli_w +from jubilant import Juju + +from .high_availability_helpers_new import ( + check_postgresql_units_writes_increment, + get_app_leader, + get_app_units, + get_postgresql_primary_unit, + get_postgresql_variable_value, + wait_for_apps_status, +) + +POSTGRESQL_APP_NAME = "postgresql" +POSTGRESQL_TEST_APP_NAME = "postgresql-test-app" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +@pytest.mark.abort_on_fail +def test_deploy_latest(juju: Juju) -> None: + """Simple test to ensure that the PostgreSQL and application charms get deployed.""" + logging.info("Deploying PostgreSQL cluster") + juju.deploy( + charm=POSTGRESQL_APP_NAME, + app=POSTGRESQL_APP_NAME, + base="ubuntu@24.04", + channel="16/edge", + config={"profile": "testing"}, + num_units=3, + ) + juju.deploy( + charm=POSTGRESQL_TEST_APP_NAME, + app=POSTGRESQL_TEST_APP_NAME, + base="ubuntu@22.04", + channel="latest/edge", + num_units=1, + ) + + juju.integrate( + f"{POSTGRESQL_APP_NAME}:database", + f"{POSTGRESQL_TEST_APP_NAME}:database", + ) + + logging.info("Wait for applications to become active") + juju.wait( + ready=wait_for_apps_status( + jubilant.all_active, POSTGRESQL_APP_NAME, POSTGRESQL_TEST_APP_NAME + ), + timeout=20 * MINUTE_SECS, + ) + + +@pytest.mark.abort_on_fail +async def test_pre_refresh_check(juju: Juju) -> None: + """Test that the pre-refresh-check action runs successfully.""" + postgresql_leader = get_app_leader(juju, POSTGRESQL_APP_NAME) + postgresql_units = get_app_units(juju, POSTGRESQL_APP_NAME) + + logging.info("Run pre-refresh-check action") + task = juju.run(unit=postgresql_leader, action="pre-refresh-check") + task.raise_on_failure() + + logging.info("Assert slow shutdown is enabled") + for unit_name in postgresql_units: + value = await get_postgresql_variable_value( + juju, POSTGRESQL_APP_NAME, unit_name, "innodb_fast_shutdown" + ) + assert value == 0 + + logging.info("Assert primary is set to leader") + postgresql_primary = get_postgresql_primary_unit(juju, POSTGRESQL_APP_NAME) + assert postgresql_primary == postgresql_leader, "Primary unit not set to leader" + + +@pytest.mark.abort_on_fail +async def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: + """Update the second cluster.""" + logging.info("Ensure continuous writes are incrementing") + await check_postgresql_units_writes_increment(juju, POSTGRESQL_APP_NAME) + + logging.info("Refresh the charm") + juju.refresh(app=POSTGRESQL_APP_NAME, path=charm) + + logging.info("Wait for upgrade to start") + juju.wait( + ready=lambda status: jubilant.any_maintenance(status, POSTGRESQL_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=lambda status: jubilant.all_active(status, POSTGRESQL_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes are incrementing") + await check_postgresql_units_writes_increment(juju, POSTGRESQL_APP_NAME) + + +@pytest.mark.abort_on_fail +async def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None: + """Test an upgrade failure and its rollback.""" + postgresql_app_leader = get_app_leader(juju, POSTGRESQL_APP_NAME) + postgresql_app_units = get_app_units(juju, POSTGRESQL_APP_NAME) + + logging.info("Run pre-refresh-check action") + task = juju.run(unit=postgresql_app_leader, action="pre-refresh-check") + 
task.raise_on_failure() + + tmp_folder = Path("tmp") + tmp_folder.mkdir(exist_ok=True) + tmp_folder_charm = Path(tmp_folder, charm).absolute() + + shutil.copy(charm, tmp_folder_charm) + + logging.info("Inject dependency fault") + inject_dependency_fault(juju, POSTGRESQL_APP_NAME, tmp_folder_charm) + + logging.info("Refresh the charm") + juju.refresh(app=POSTGRESQL_APP_NAME, path=tmp_folder_charm) + + logging.info("Wait for upgrade to fail on leader") + juju.wait( + ready=wait_for_apps_status(jubilant.any_blocked, POSTGRESQL_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes on all units") + await check_postgresql_units_writes_increment( + juju, POSTGRESQL_APP_NAME, list(postgresql_app_units) + ) + + logging.info("Re-run pre-refresh-check action") + task = juju.run(unit=postgresql_app_leader, action="pre-refresh-check") + task.raise_on_failure() + + logging.info("Re-refresh the charm") + juju.refresh(app=POSTGRESQL_APP_NAME, path=charm) + + logging.info("Wait for upgrade to start") + juju.wait( + ready=lambda status: jubilant.any_maintenance(status, POSTGRESQL_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=lambda status: jubilant.all_active(status, POSTGRESQL_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes after rollback procedure") + await check_postgresql_units_writes_increment( + juju, POSTGRESQL_APP_NAME, list(postgresql_app_units) + ) + + # Remove fault charm file + tmp_folder_charm.unlink() + + +def inject_dependency_fault(juju: Juju, app_name: str, charm_file: str | Path) -> None: + """Inject a dependency fault into the PostgreSQL charm.""" + with Path("refresh_versions.toml").open("rb") as file: + versions = tomli.load(file) + + versions["charm"] = "16/0.0.0" + versions["snap"]["revisions"][platform.machine()] = "1" + + # Overwrite refresh_versions.toml with incompatible version. 
+ with zipfile.ZipFile(charm_file, mode="a") as charm_zip: + charm_zip.writestr("refresh_versions.toml", tomli_w.dumps(versions)) diff --git a/tests/spread/test_upgrade.py/task.yaml b/tests/spread/test_upgrade.py/task.yaml index b3be366921..f99ac69384 100644 --- a/tests/spread/test_upgrade.py/task.yaml +++ b/tests/spread/test_upgrade.py/task.yaml @@ -1,6 +1,6 @@ summary: test_upgrade.py environment: - TEST_MODULE: ha_tests/test_upgrade.py + TEST_MODULE: high_availability/test_upgrade.py execute: | tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" artifacts: From 2839d809320b89da7a2cb58ed336ced22674e3c7 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 8 Oct 2025 15:42:08 +0300 Subject: [PATCH 08/33] Back to backports --- poetry.lock | 8 +- pyproject.toml | 2 +- tests/integration/conftest.py | 6 +- .../high_availability_helpers_new.py | 53 ++++--------- .../test_async_replication.py | 54 ++++++------- .../high_availability/test_upgrade.py | 79 ++++++++----------- 6 files changed, 80 insertions(+), 122 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9b0d686104..e033a8fd88 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1202,15 +1202,15 @@ files = [ referencing = ">=0.31.0" [[package]] -name = "jubilant" +name = "jubilant-backports" version = "1.4.0" description = "Juju CLI wrapper for charm integration testing" optional = false python-versions = ">=3.8" groups = ["integration"] files = [ - {file = "jubilant-1.4.0-py3-none-any.whl", hash = "sha256:1df7eaf125fad8d0d3d35e6d83eca43bfbb7884debcd6c7f4b0822600e2a485c"}, - {file = "jubilant-1.4.0.tar.gz", hash = "sha256:aa377699a8811fea29bfe0febb6b552d4593c02e666f5ba8c3fba24258700199"}, + {file = "jubilant_backports-1.4.0-py3-none-any.whl", hash = "sha256:b5c2d9aca29b39543bbe45b3205e97dd22b60fca9f8e0e66885b71201f8127d5"}, + {file = "jubilant_backports-1.4.0.tar.gz", hash = "sha256:0961645e67a08e85b3371d6c386795d254527d0c107a355ca24bdbca3872b671"}, ] [package.dependencies] @@ -3086,4 +3086,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "cde347f0b635694de15820dbbae25b4952c5845777d3bb19f34e7eeb01993b1f" +content-hash = "222f14db8792819536fb7ee713cffd614305d03b7801cc6d65d60d7b7faa36d9" diff --git a/pyproject.toml b/pyproject.toml index 5bb1b13258..0ac6c5b26f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ pytest = "^8.4.2" pytest-operator = "^0.43.1" # renovate caret doesn't work: https://github.com/renovatebot/renovate/issues/26940 juju = "<=3.6.1.3" -jubilant = "^1.4.0" +jubilant-backports = "^1.4.0" boto3 = "*" tenacity = "*" landscape-api-py3 = "^0.9.0" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index df4ef3d637..5795fed3e2 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -5,7 +5,7 @@ import uuid import boto3 -import jubilant +import jubilant_backports import pytest from pytest_operator.plugin import OpsTest @@ -109,11 +109,11 @@ def juju(request: pytest.FixtureRequest): keep_models = bool(request.config.getoption("--keep-models")) if model: - juju = jubilant.Juju(model=model) # type: ignore + juju = jubilant_backports.Juju(model=model) yield juju log = juju.debug_log(limit=1000) else: - with jubilant.temp_model(keep=keep_models) as juju: + with jubilant_backports.temp_model(keep=keep_models) as juju: yield juju log = juju.debug_log(limit=1000) diff --git a/tests/integration/high_availability/high_availability_helpers_new.py 
b/tests/integration/high_availability/high_availability_helpers_new.py index 1236b405ff..dc2031ffc0 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -6,9 +6,9 @@ import subprocess from collections.abc import Callable -import jubilant -from jubilant import Juju -from jubilant.statustypes import Status, UnitStatus +import jubilant_backports +from jubilant_backports import Juju +from jubilant_backports.statustypes import Status, UnitStatus from tenacity import Retrying, stop_after_delay, wait_fixed from constants import PEER @@ -22,7 +22,7 @@ JujuAppsStatusFn = Callable[[Status, str], bool] -async def check_postgresql_units_writes_increment( +async def check_db_units_writes_increment( juju: Juju, app_name: str, app_units: list[str] | None = None ) -> None: """Ensure that continuous writes is incrementing on all units. @@ -33,8 +33,8 @@ async def check_postgresql_units_writes_increment( if not app_units: app_units = get_app_units(juju, app_name) - app_primary = get_postgresql_primary_unit(juju, app_name) - app_max_value = await get_postgresql_max_written_value(juju, app_name, app_primary) + app_primary = get_db_primary_unit(juju, app_name) + app_max_value = await get_db_max_written_value(juju, app_name, app_primary) juju.model_config({"update-status-hook-interval": "15s"}) for unit_name in app_units: @@ -44,7 +44,7 @@ async def check_postgresql_units_writes_increment( wait=wait_fixed(10), ): with attempt: - unit_max_value = await get_postgresql_max_written_value(juju, app_name, unit_name) + unit_max_value = await get_db_max_written_value(juju, app_name, unit_name) assert unit_max_value > app_max_value, "Writes not incrementing" app_max_value = unit_max_value @@ -154,7 +154,7 @@ def get_relation_data(juju: Juju, app_name: str, rel_name: str) -> list[dict]: return relation_data -def get_postgresql_cluster_status(juju: Juju, unit: str, cluster_set: bool = False) -> dict: +def get_db_cluster_status(juju: Juju, unit: str, cluster_set: bool = False) -> dict: """Get the cluster status by running the get-cluster-status action. Args: @@ -176,25 +176,25 @@ def get_postgresql_cluster_status(juju: Juju, unit: str, cluster_set: bool = Fal return task.results.get("status", {}) -def get_postgresql_unit_name(instance_label: str) -> str: +def get_db_unit_name(instance_label: str) -> str: """Builds a Juju unit name out of a MySQL instance label.""" return "/".join(instance_label.rsplit("-", 1)) -def get_postgresql_primary_unit(juju: Juju, app_name: str) -> str: +def get_db_primary_unit(juju: Juju, app_name: str) -> str: """Get the current primary node of the cluster.""" postgresql_primary = get_app_leader(juju, app_name) - postgresql_cluster_status = get_postgresql_cluster_status(juju, postgresql_primary) + postgresql_cluster_status = get_db_cluster_status(juju, postgresql_primary) postgresql_cluster_topology = postgresql_cluster_status["defaultreplicaset"]["topology"] for label, value in postgresql_cluster_topology.items(): if value["memberrole"] == "primary": - return get_postgresql_unit_name(label) + return get_db_unit_name(label) raise Exception("No MySQL primary node found") -async def get_postgresql_max_written_value(juju: Juju, app_name: str, unit_name: str) -> int: +async def get_db_max_written_value(juju: Juju, app_name: str, unit_name: str) -> int: """Retrieve the max written value in the MySQL database. 
Args: @@ -209,30 +209,7 @@ async def get_postgresql_max_written_value(juju: Juju, app_name: str, unit_name: SERVER_CONFIG_USERNAME, password, ["SELECT MAX(number) FROM `continuous_writes`.`data`;"], - f"{app_name.replace('-', '_')}_database", - ) - return output[0] - - -async def get_postgresql_variable_value( - juju: Juju, app_name: str, unit_name: str, variable_name: str -) -> str: - """Retrieve a database variable value as a string. - - Args: - juju: The Juju model. - app_name: The application name. - unit_name: The unit name. - variable_name: The variable name. - """ - password = get_user_password(juju, app_name, SERVER_CONFIG_USERNAME) - - output = await execute_queries_on_unit( - get_unit_ip(juju, app_name, unit_name), - SERVER_CONFIG_USERNAME, - password, - [f"SELECT @@{variable_name};"], - f"{app_name.replace('-', '_')}_database", + "postgresql_test_app_database", ) return output[0] @@ -248,7 +225,7 @@ def wait_for_apps_status(jubilant_status_func: JujuAppsStatusFn, *apps: str) -> Juju model status function. """ return lambda status: all(( - jubilant.all_agents_idle(status, *apps), + jubilant_backports.all_agents_idle(status, *apps), jubilant_status_func(status, *apps), )) diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index 0fae8aee51..a2c2808831 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -6,17 +6,17 @@ import time from collections.abc import Generator -import jubilant +import jubilant_backports import pytest -from jubilant import Juju +from jubilant_backports import Juju from .. import architecture from ..markers import juju3 from .high_availability_helpers_new import ( get_app_leader, get_app_units, - get_postgresql_cluster_status, - get_postgresql_max_written_value, + get_db_cluster_status, + get_db_max_written_value, wait_for_apps_status, ) @@ -97,11 +97,11 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), timeout=15 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_2), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_2), timeout=15 * MINUTE_SECS, ) @@ -126,11 +126,11 @@ def test_async_relate(first_model: str, second_model: str) -> None: logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.any_blocked, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant_backports.any_blocked, POSTGRESQL_APP_1), timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant.any_waiting, POSTGRESQL_APP_2), + ready=wait_for_apps_status(jubilant_backports.any_waiting, POSTGRESQL_APP_2), timeout=10 * MINUTE_SECS, ) @@ -157,7 +157,7 @@ def test_deploy_app(first_model: str) -> None: ) model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_TEST_APP_NAME), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_TEST_APP_NAME), timeout=10 * MINUTE_SECS, ) @@ -179,11 +179,11 @@ def test_create_replication(first_model: str, second_model: str) -> None: logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), + 
ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_2), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_2), timeout=10 * MINUTE_SECS, ) @@ -193,7 +193,7 @@ def test_create_replication(first_model: str, second_model: str) -> None: async def test_data_replication(first_model: str, second_model: str, continuous_writes) -> None: """Test to write to primary, and read the same data back from replicas.""" logging.info("Testing data replication") - results = await get_postgresql_max_written_values(first_model, second_model) + results = await get_db_max_written_values(first_model, second_model) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" @@ -215,12 +215,12 @@ async def test_standby_promotion(first_model: str, second_model: str, continuous ) promotion_task.raise_on_failure() - results = await get_postgresql_max_written_values(first_model, second_model) + results = await get_db_max_written_values(first_model, second_model) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" - cluster_set_status = get_postgresql_cluster_status( + cluster_set_status = get_db_cluster_status( juju=model_2, unit=model_2_postgresql_leader, cluster_set=True, @@ -261,7 +261,7 @@ def test_failover(first_model: str, second_model: str) -> None: model_2.exec("sudo pkill -x postgres --signal SIGCONT", unit=unit_name) logging.info("Checking clusters statuses") - cluster_set_status = get_postgresql_cluster_status( + cluster_set_status = get_db_cluster_status( juju=model_1, unit=model_1_postgresql_leader, cluster_set=True, @@ -291,7 +291,7 @@ async def test_rejoin_invalidated_cluster( ) task.raise_on_failure() - results = await get_postgresql_max_written_values(first_model, second_model) + results = await get_db_max_written_values(first_model, second_model) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" @@ -312,11 +312,11 @@ async def test_unrelate_and_relate(first_model: str, second_model: str, continuo logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_blocked, POSTGRESQL_APP_2), + ready=wait_for_apps_status(jubilant_backports.all_blocked, POSTGRESQL_APP_2), timeout=10 * MINUTE_SECS, ) @@ -326,7 +326,7 @@ async def test_unrelate_and_relate(first_model: str, second_model: str, continuo f"{POSTGRESQL_APP_2}:replication", ) model_1.wait( - ready=wait_for_apps_status(jubilant.any_blocked, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant_backports.any_blocked, POSTGRESQL_APP_1), timeout=5 * MINUTE_SECS, ) @@ -340,21 +340,21 @@ async def test_unrelate_and_relate(first_model: str, second_model: str, continuo logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_1), timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, 
POSTGRESQL_APP_2), + ready=wait_for_apps_status(jubilant_backports.all_active, POSTGRESQL_APP_2), timeout=10 * MINUTE_SECS, ) - results = await get_postgresql_max_written_values(first_model, second_model) + results = await get_db_max_written_values(first_model, second_model) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" -async def get_postgresql_max_written_values(first_model: str, second_model: str) -> list[int]: +async def get_db_max_written_values(first_model: str, second_model: str) -> list[int]: """Return list with max written value from all units.""" model_1 = Juju(model=first_model) model_2 = Juju(model=second_model) @@ -370,16 +370,12 @@ async def get_postgresql_max_written_values(first_model: str, second_model: str) logging.info(f"Querying max value on all {POSTGRESQL_APP_1} units") for unit_name in get_app_units(model_1, POSTGRESQL_APP_1): - unit_max_value = await get_postgresql_max_written_value( - model_1, POSTGRESQL_APP_1, unit_name - ) + unit_max_value = await get_db_max_written_value(model_1, POSTGRESQL_APP_1, unit_name) results.append(unit_max_value) logging.info(f"Querying max value on all {POSTGRESQL_APP_2} units") for unit_name in get_app_units(model_2, POSTGRESQL_APP_2): - unit_max_value = await get_postgresql_max_written_value( - model_2, POSTGRESQL_APP_2, unit_name - ) + unit_max_value = await get_db_max_written_value(model_2, POSTGRESQL_APP_2, unit_name) results.append(unit_max_value) return results diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py index 50306152b5..7f698abad3 100644 --- a/tests/integration/high_availability/test_upgrade.py +++ b/tests/integration/high_availability/test_upgrade.py @@ -7,23 +7,22 @@ import zipfile from pathlib import Path -import jubilant +import jubilant_backports import pytest import tomli import tomli_w -from jubilant import Juju +from jubilant_backports import Juju from .high_availability_helpers_new import ( - check_postgresql_units_writes_increment, + check_db_units_writes_increment, get_app_leader, get_app_units, - get_postgresql_primary_unit, - get_postgresql_variable_value, + get_db_primary_unit, wait_for_apps_status, ) -POSTGRESQL_APP_NAME = "postgresql" -POSTGRESQL_TEST_APP_NAME = "postgresql-test-app" +DB_APP_NAME = "postgresql" +DB_TEST_APP_NAME = "postgresql-test-app" MINUTE_SECS = 60 @@ -35,31 +34,29 @@ def test_deploy_latest(juju: Juju) -> None: """Simple test to ensure that the PostgreSQL and application charms get deployed.""" logging.info("Deploying PostgreSQL cluster") juju.deploy( - charm=POSTGRESQL_APP_NAME, - app=POSTGRESQL_APP_NAME, + charm=DB_APP_NAME, + app=DB_APP_NAME, base="ubuntu@24.04", channel="16/edge", config={"profile": "testing"}, num_units=3, ) juju.deploy( - charm=POSTGRESQL_TEST_APP_NAME, - app=POSTGRESQL_TEST_APP_NAME, + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_NAME, base="ubuntu@22.04", channel="latest/edge", num_units=1, ) juju.integrate( - f"{POSTGRESQL_APP_NAME}:database", - f"{POSTGRESQL_TEST_APP_NAME}:database", + f"{DB_APP_NAME}:database", + f"{DB_TEST_APP_NAME}:database", ) logging.info("Wait for applications to become active") juju.wait( - ready=wait_for_apps_status( - jubilant.all_active, POSTGRESQL_APP_NAME, POSTGRESQL_TEST_APP_NAME - ), + ready=wait_for_apps_status(jubilant_backports.all_active, DB_APP_NAME, DB_TEST_APP_NAME), timeout=20 * MINUTE_SECS, ) @@ -67,22 +64,14 @@ def 
test_deploy_latest(juju: Juju) -> None: @pytest.mark.abort_on_fail async def test_pre_refresh_check(juju: Juju) -> None: """Test that the pre-refresh-check action runs successfully.""" - postgresql_leader = get_app_leader(juju, POSTGRESQL_APP_NAME) - postgresql_units = get_app_units(juju, POSTGRESQL_APP_NAME) + postgresql_leader = get_app_leader(juju, DB_APP_NAME) logging.info("Run pre-refresh-check action") task = juju.run(unit=postgresql_leader, action="pre-refresh-check") task.raise_on_failure() - logging.info("Assert slow shutdown is enabled") - for unit_name in postgresql_units: - value = await get_postgresql_variable_value( - juju, POSTGRESQL_APP_NAME, unit_name, "innodb_fast_shutdown" - ) - assert value == 0 - logging.info("Assert primary is set to leader") - postgresql_primary = get_postgresql_primary_unit(juju, POSTGRESQL_APP_NAME) + postgresql_primary = get_db_primary_unit(juju, DB_APP_NAME) assert postgresql_primary == postgresql_leader, "Primary unit not set to leader" @@ -90,35 +79,35 @@ async def test_pre_refresh_check(juju: Juju) -> None: async def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: """Update the second cluster.""" logging.info("Ensure continuous writes are incrementing") - await check_postgresql_units_writes_increment(juju, POSTGRESQL_APP_NAME) + await check_db_units_writes_increment(juju, DB_APP_NAME) logging.info("Refresh the charm") - juju.refresh(app=POSTGRESQL_APP_NAME, path=charm) + juju.refresh(app=DB_APP_NAME, path=charm) logging.info("Wait for upgrade to start") juju.wait( - ready=lambda status: jubilant.any_maintenance(status, POSTGRESQL_APP_NAME), + ready=lambda status: jubilant_backports.any_maintenance(status, DB_APP_NAME), timeout=10 * MINUTE_SECS, ) logging.info("Wait for upgrade to complete") juju.wait( - ready=lambda status: jubilant.all_active(status, POSTGRESQL_APP_NAME), + ready=lambda status: jubilant_backports.all_active(status, DB_APP_NAME), timeout=20 * MINUTE_SECS, ) logging.info("Ensure continuous writes are incrementing") - await check_postgresql_units_writes_increment(juju, POSTGRESQL_APP_NAME) + await check_db_units_writes_increment(juju, DB_APP_NAME) @pytest.mark.abort_on_fail async def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None: """Test an upgrade failure and its rollback.""" - postgresql_app_leader = get_app_leader(juju, POSTGRESQL_APP_NAME) - postgresql_app_units = get_app_units(juju, POSTGRESQL_APP_NAME) + db_app_leader = get_app_leader(juju, DB_APP_NAME) + db_app_units = get_app_units(juju, DB_APP_NAME) logging.info("Run pre-refresh-check action") - task = juju.run(unit=postgresql_app_leader, action="pre-refresh-check") + task = juju.run(unit=db_app_leader, action="pre-refresh-check") task.raise_on_failure() tmp_folder = Path("tmp") @@ -128,45 +117,41 @@ async def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> N shutil.copy(charm, tmp_folder_charm) logging.info("Inject dependency fault") - inject_dependency_fault(juju, POSTGRESQL_APP_NAME, tmp_folder_charm) + inject_dependency_fault(juju, DB_APP_NAME, tmp_folder_charm) logging.info("Refresh the charm") - juju.refresh(app=POSTGRESQL_APP_NAME, path=tmp_folder_charm) + juju.refresh(app=DB_APP_NAME, path=tmp_folder_charm) logging.info("Wait for upgrade to fail on leader") juju.wait( - ready=wait_for_apps_status(jubilant.any_blocked, POSTGRESQL_APP_NAME), + ready=wait_for_apps_status(jubilant_backports.any_blocked, DB_APP_NAME), timeout=10 * MINUTE_SECS, ) logging.info("Ensure continuous writes on all 
units") - await check_postgresql_units_writes_increment( - juju, POSTGRESQL_APP_NAME, list(postgresql_app_units) - ) + await check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units)) logging.info("Re-run pre-refresh-check action") - task = juju.run(unit=postgresql_app_leader, action="pre-refresh-check") + task = juju.run(unit=db_app_leader, action="pre-refresh-check") task.raise_on_failure() logging.info("Re-refresh the charm") - juju.refresh(app=POSTGRESQL_APP_NAME, path=charm) + juju.refresh(app=DB_APP_NAME, path=charm) logging.info("Wait for upgrade to start") juju.wait( - ready=lambda status: jubilant.any_maintenance(status, POSTGRESQL_APP_NAME), + ready=lambda status: jubilant_backports.any_maintenance(status, DB_APP_NAME), timeout=10 * MINUTE_SECS, ) logging.info("Wait for upgrade to complete") juju.wait( - ready=lambda status: jubilant.all_active(status, POSTGRESQL_APP_NAME), + ready=lambda status: jubilant_backports.all_active(status, DB_APP_NAME), timeout=20 * MINUTE_SECS, ) logging.info("Ensure continuous writes after rollback procedure") - await check_postgresql_units_writes_increment( - juju, POSTGRESQL_APP_NAME, list(postgresql_app_units) - ) + await check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units)) # Remove fault charm file tmp_folder_charm.unlink() From fdf5a15c329a5141e4ff1a3549b3d8168a4f8ef0 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 8 Oct 2025 16:37:07 +0300 Subject: [PATCH 09/33] Switch to get primary action --- .../high_availability_helpers_new.py | 34 ++++--------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py index 589f495513..c85a9ee1b7 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -154,28 +154,6 @@ def get_relation_data(juju: Juju, app_name: str, rel_name: str) -> list[dict]: return relation_data -def get_db_cluster_status(juju: Juju, unit: str, cluster_set: bool = False) -> dict: - """Get the cluster status by running the get-cluster-status action. - - Args: - juju: The juju instance to use. 
- unit: The unit on which to execute the action on - cluster_set: Whether to get the cluster-set instead (optional) - - Returns: - A dictionary representing the cluster status - """ - task = juju.run( - unit=unit, - action="get-cluster-status", - params={"cluster-set": cluster_set}, - wait=5 * MINUTE_SECS, - ) - task.raise_on_failure() - - return task.results.get("status", {}) - - def get_db_unit_name(instance_label: str) -> str: """Builds a Juju unit name out of a MySQL instance label.""" return "/".join(instance_label.rsplit("-", 1)) @@ -184,14 +162,14 @@ def get_db_unit_name(instance_label: str) -> str: def get_db_primary_unit(juju: Juju, app_name: str) -> str: """Get the current primary node of the cluster.""" postgresql_primary = get_app_leader(juju, app_name) - postgresql_cluster_status = get_db_cluster_status(juju, postgresql_primary) - postgresql_cluster_topology = postgresql_cluster_status["defaultreplicaset"]["topology"] + task = juju.run(unit=postgresql_primary, action="get-primary", wait=5 * MINUTE_SECS) + task.raise_on_failure() - for label, value in postgresql_cluster_topology.items(): - if value["memberrole"] == "primary": - return get_db_unit_name(label) + primary = task.results.get("primary") + if primary != "None": + return primary - raise Exception("No MySQL primary node found") + raise Exception("No primary node found") async def get_db_max_written_value(juju: Juju, app_name: str, unit_name: str) -> int: From 1afc563f4c95986e7a4382b5e51cd1b40e30a798 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 8 Oct 2025 17:44:58 +0300 Subject: [PATCH 10/33] Add continuous writes fixture --- .../integration/high_availability/conftest.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 tests/integration/high_availability/conftest.py diff --git a/tests/integration/high_availability/conftest.py b/tests/integration/high_availability/conftest.py new file mode 100644 index 0000000000..8b33194914 --- /dev/null +++ b/tests/integration/high_availability/conftest.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. 
+
+import logging
+
+import pytest
+
+from .high_availability_helpers_new import get_app_leader
+
+logger = logging.getLogger(__name__)
+
+DB_TEST_APP_NAME = "postgresql-test-app"
+
+
+@pytest.fixture()
+def continuous_writes(juju):
+    """Starts continuous writes to the PostgreSQL cluster for a test and clears the writes at the end."""
+    application_unit = get_app_leader(juju, DB_TEST_APP_NAME)
+
+    logger.info("Clearing continuous writes")
+    juju.run(unit=application_unit, action="clear-continuous-writes", wait=120)
+
+    logger.info("Starting continuous writes")
+    juju.run(unit=application_unit, action="start-continuous-writes")
+
+    yield
+
+    logger.info("Clearing continuous writes")
+    juju.run(unit=application_unit, action="clear-continuous-writes", wait=120)

From 731f8b92f620f21e22da5ba97e69d238aa9a6993 Mon Sep 17 00:00:00 2001
From: Dragomir Penev
Date: Wed, 8 Oct 2025 18:17:39 +0300
Subject: [PATCH 11/33] add upgrade from stable

---
 .../ha_tests/test_upgrade_from_stable.py | 143 ------------------
 .../high_availability_helpers_new.py | 2 +-
 .../test_upgrade_from_stable.py | 94 ++++++++++++
 .../test_upgrade_from_stable.py/task.yaml | 2 +-
 4 files changed, 96 insertions(+), 145 deletions(-)
 delete mode 100644 tests/integration/ha_tests/test_upgrade_from_stable.py
 create mode 100644 tests/integration/high_availability/test_upgrade_from_stable.py

diff --git a/tests/integration/ha_tests/test_upgrade_from_stable.py b/tests/integration/ha_tests/test_upgrade_from_stable.py
deleted file mode 100644
index d95e4567be..0000000000
--- a/tests/integration/ha_tests/test_upgrade_from_stable.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# Copyright 2023 Canonical Ltd.
-# See LICENSE file for licensing details.
-import logging
-from asyncio import gather
-
-import pytest
-from pytest_operator.plugin import OpsTest
-
-from ..helpers import (
-    APPLICATION_NAME,
-    DATABASE_APP_NAME,
-    count_switchovers,
-    get_leader_unit,
-    get_primary,
-)
-from .helpers import (
-    are_writes_increasing,
-    check_writes,
-    start_continuous_writes,
-)
-
-logger = logging.getLogger(__name__)
-
-TIMEOUT = 25 * 60
-
-
-@pytest.mark.abort_on_fail
-async def test_deploy_stable(ops_test: OpsTest) -> None:
-    """Simple test to ensure that the PostgreSQL and application charms get deployed."""
-    await gather(
-        ops_test.model.deploy(
-            DATABASE_APP_NAME, num_units=3, channel="16/stable", config={"profile": "testing"}
-        ),
-        ops_test.model.deploy(
-            APPLICATION_NAME,
-            num_units=1,
-            channel="latest/edge",
-            config={"sleep_interval": 500},
-        ),
-    )
-    await ops_test.model.relate(DATABASE_APP_NAME, f"{APPLICATION_NAME}:database")
-    logger.info("Wait for applications to become active")
-    async with ops_test.fast_forward():
-        await ops_test.model.wait_for_idle(
-            apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active", timeout=(20 * 60)
-        )
-    assert len(ops_test.model.applications[DATABASE_APP_NAME].units) == 3
-
-
-@pytest.mark.abort_on_fail
-async def test_pre_refresh_check(ops_test: OpsTest) -> None:
-    """Test that the pre-refresh-check action runs successfully."""
-    logger.info("Get leader unit")
-    leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME)
-    assert leader_unit is not None, "No leader unit found"
-
-    logger.info("Run pre-refresh-check action")
-    action = await leader_unit.run_action("pre-refresh-check")
-    await action.wait()
-
-
-@pytest.mark.abort_on_fail
-async def test_upgrade_from_stable(ops_test: OpsTest, charm):
-    """Test updating from stable channel."""
-    # Start an application that continuously writes data to the
database. - logger.info("starting continuous writes to the database") - await start_continuous_writes(ops_test, DATABASE_APP_NAME) - - # Check whether writes are increasing. - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - primary_name = await get_primary(ops_test, f"{DATABASE_APP_NAME}/0") - initial_number_of_switchovers = count_switchovers(ops_test, primary_name) - - application = ops_test.model.applications[DATABASE_APP_NAME] - - logger.info("Refresh the charm") - await application.refresh(path=charm) - - logger.info("Wait for upgrade to start") - try: - # Blocked status is expected due to: - # (on PR) compatibility checks (on PR charm revision is '16/1.25.0+dirty...') - # (non-PR) the first unit upgraded and paused (pause-after-unit-refresh=first) - await ops_test.model.block_until(lambda: application.status == "blocked", timeout=60 * 3) - - logger.info("Wait for refresh to block as paused or incompatible") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT - ) - - # Highest to lowest unit number - refresh_order = sorted( - application.units, key=lambda unit: int(unit.name.split("/")[1]), reverse=True - ) - - if "Refresh incompatible" in application.status_message: - logger.info("Application refresh is blocked due to incompatibility") - - action = await refresh_order[0].run_action( - "force-refresh-start", **{"check-compatibility": False} - ) - await action.wait() - - logger.info("Wait for first incompatible unit to upgrade") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT - ) - - logger.info("Run resume-refresh action") - action = await refresh_order[1].run_action("resume-refresh") - await action.wait() - except TimeoutError: - # If the application didn't get into the blocked state, it should have upgraded only - # the charm code because the snap revision didn't change. - logger.info("Upgrade completed without snap refresh (charm.py upgrade only)") - assert application.status == "active", ( - "Application didn't reach blocked or active state after refresh attempt" - ) - - logger.info("Wait for upgrade to complete") - async with ops_test.fast_forward("60s"): - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=30, timeout=TIMEOUT - ) - - # Check whether writes are increasing. - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) - - # Verify that no writes to the database were missed after stopping the writes - # (check that all the units have all the writes). 
- logger.info("checking whether no writes were lost") - await check_writes(ops_test) - - logger.info("checking the number of switchovers") - final_number_of_switchovers = count_switchovers(ops_test, primary_name) - assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, ( - "Number of switchovers is greater than 2" - ) diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py index c85a9ee1b7..b2e164f146 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -186,7 +186,7 @@ async def get_db_max_written_value(juju: Juju, app_name: str, unit_name: str) -> get_unit_ip(juju, app_name, unit_name), SERVER_CONFIG_USERNAME, password, - ["SELECT MAX(number) FROM `continuous_writes`.`data`;"], + ["SELECT COUNT(number) FROM continuous_writes;"], "postgresql_test_app_database", ) return output[0] diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py new file mode 100644 index 0000000000..37711fd572 --- /dev/null +++ b/tests/integration/high_availability/test_upgrade_from_stable.py @@ -0,0 +1,94 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging + +import jubilant +import pytest +from jubilant import Juju + +from .high_availability_helpers_new import ( + check_db_units_writes_increment, + get_app_leader, + get_db_primary_unit, + wait_for_apps_status, +) + +DB_APP_NAME = "postgresql" +DB_TEST_APP_NAME = "postgresql-test-app" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +@pytest.mark.abort_on_fail +def test_deploy_stable(juju: Juju) -> None: + """Simple test to ensure that the MySQL and application charms get deployed.""" + logging.info("Deploying MySQL cluster") + juju.deploy( + charm=DB_APP_NAME, + app=DB_APP_NAME, + base="ubuntu@24.04", + channel="16/stable", + config={"profile": "testing"}, + num_units=3, + ) + juju.deploy( + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_NAME, + base="ubuntu@22.04", + channel="latest/edge", + num_units=1, + ) + + juju.integrate( + f"{DB_APP_NAME}:database", + f"{DB_TEST_APP_NAME}:database", + ) + + logging.info("Wait for applications to become active") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME, DB_TEST_APP_NAME), + error=jubilant.any_blocked, + timeout=20 * MINUTE_SECS, + ) + + +@pytest.mark.abort_on_fail +async def test_pre_upgrade_check(juju: Juju) -> None: + """Test that the pre-upgrade-check action runs successfully.""" + db_leader = get_app_leader(juju, DB_APP_NAME) + + logging.info("Run pre-refresh-check action") + task = juju.run(unit=db_leader, action="pre-refresh-check") + task.raise_on_failure() + + logging.info("Assert primary is set to leader") + db_primary = get_db_primary_unit(juju, DB_APP_NAME) + assert db_primary == db_leader, "Primary unit not set to leader" + + +@pytest.mark.abort_on_fail +async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> None: + """Update the second cluster.""" + logging.info("Ensure continuous writes are incrementing") + await check_db_units_writes_increment(juju, DB_APP_NAME) + + logging.info("Refresh the charm") + juju.refresh(app=DB_APP_NAME, path=charm) + + logging.info("Wait for upgrade to start") + juju.wait( + ready=lambda status: jubilant.any_maintenance(status, 
DB_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=lambda status: jubilant.all_active(status, DB_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes are incrementing") + await check_db_units_writes_increment(juju, DB_APP_NAME) diff --git a/tests/spread/test_upgrade_from_stable.py/task.yaml b/tests/spread/test_upgrade_from_stable.py/task.yaml index 047617ab39..ffdb002d25 100644 --- a/tests/spread/test_upgrade_from_stable.py/task.yaml +++ b/tests/spread/test_upgrade_from_stable.py/task.yaml @@ -1,6 +1,6 @@ summary: test_upgrade_from_stable.py environment: - TEST_MODULE: ha_tests/test_upgrade_from_stable.py + TEST_MODULE: high_availability/test_upgrade_from_stable.py execute: | tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" artifacts: From 1dcaaf4f5b73099db6eb21c7c54287adb89bb506 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 8 Oct 2025 23:29:35 +0300 Subject: [PATCH 12/33] Wait for idle --- tests/integration/high_availability/test_upgrade.py | 2 +- .../high_availability/test_upgrade_from_stable.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py index cd34f8e6e4..3d5ff88c04 100644 --- a/tests/integration/high_availability/test_upgrade.py +++ b/tests/integration/high_availability/test_upgrade.py @@ -92,7 +92,7 @@ async def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> N logging.info("Wait for upgrade to complete") juju.wait( - ready=lambda status: jubilant.all_active(status, DB_APP_NAME), + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), timeout=20 * MINUTE_SECS, ) diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py index 37711fd572..a657c646b3 100644 --- a/tests/integration/high_availability/test_upgrade_from_stable.py +++ b/tests/integration/high_availability/test_upgrade_from_stable.py @@ -23,9 +23,9 @@ @pytest.mark.abort_on_fail -def test_deploy_stable(juju: Juju) -> None: - """Simple test to ensure that the MySQL and application charms get deployed.""" - logging.info("Deploying MySQL cluster") +def test_deploy_latest(juju: Juju) -> None: + """Simple test to ensure that the PostgreSQL and application charms get deployed.""" + logging.info("Deploying PostgreSQL cluster") juju.deploy( charm=DB_APP_NAME, app=DB_APP_NAME, @@ -50,7 +50,6 @@ def test_deploy_stable(juju: Juju) -> None: logging.info("Wait for applications to become active") juju.wait( ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME, DB_TEST_APP_NAME), - error=jubilant.any_blocked, timeout=20 * MINUTE_SECS, ) @@ -86,7 +85,7 @@ async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> logging.info("Wait for upgrade to complete") juju.wait( - ready=lambda status: jubilant.all_active(status, DB_APP_NAME), + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), timeout=20 * MINUTE_SECS, ) From a623f79192739d161e806f5d0bf8cacddf26f2bd Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 9 Oct 2025 00:47:59 +0300 Subject: [PATCH 13/33] Try to handle force refresh --- .../high_availability/test_upgrade.py | 5 ---- .../test_upgrade_from_stable.py | 25 +++++++++++++------ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git 
a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py index 3d5ff88c04..e9f291d518 100644 --- a/tests/integration/high_availability/test_upgrade.py +++ b/tests/integration/high_availability/test_upgrade.py @@ -17,7 +17,6 @@ check_db_units_writes_increment, get_app_leader, get_app_units, - get_db_primary_unit, wait_for_apps_status, ) @@ -70,10 +69,6 @@ async def test_pre_refresh_check(juju: Juju) -> None: task = juju.run(unit=postgresql_leader, action="pre-refresh-check") task.raise_on_failure() - logging.info("Assert primary is set to leader") - postgresql_primary = get_db_primary_unit(juju, DB_APP_NAME) - assert postgresql_primary == postgresql_leader, "Primary unit not set to leader" - @pytest.mark.abort_on_fail async def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py index a657c646b3..686884d92c 100644 --- a/tests/integration/high_availability/test_upgrade_from_stable.py +++ b/tests/integration/high_availability/test_upgrade_from_stable.py @@ -10,7 +10,7 @@ from .high_availability_helpers_new import ( check_db_units_writes_increment, get_app_leader, - get_db_primary_unit, + get_app_units, wait_for_apps_status, ) @@ -55,18 +55,14 @@ def test_deploy_latest(juju: Juju) -> None: @pytest.mark.abort_on_fail -async def test_pre_upgrade_check(juju: Juju) -> None: - """Test that the pre-upgrade-check action runs successfully.""" +async def test_pre_refresh_check(juju: Juju) -> None: + """Test that the pre-refresh-check action runs successfully.""" db_leader = get_app_leader(juju, DB_APP_NAME) logging.info("Run pre-refresh-check action") task = juju.run(unit=db_leader, action="pre-refresh-check") task.raise_on_failure() - logging.info("Assert primary is set to leader") - db_primary = get_db_primary_unit(juju, DB_APP_NAME) - assert db_primary == db_leader, "Primary unit not set to leader" - @pytest.mark.abort_on_fail async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> None: @@ -83,6 +79,21 @@ async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> timeout=10 * MINUTE_SECS, ) + logging.info("Application refresh is blocked due to incompatibility") + juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked) + + if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: + db_leader = get_app_leader(juju, DB_APP_NAME) + juju.run( + unit=db_leader, action="force-refresh-start", params={"check-compatibility": "False"} + ) + + juju.wait(ready=jubilant.all_active) + + logging.info("Run resume-refresh action") + units = get_app_units(juju, DB_APP_NAME) + await juju.run(unit=units[sorted(units.keys())[1]], action="resume-refresh") + logging.info("Wait for upgrade to complete") juju.wait( ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), From 3acca97fee58f33d333aa71533a7fcd125ad7d34 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 9 Oct 2025 11:39:46 +0300 Subject: [PATCH 14/33] Don't wait for maintenance --- .../high_availability/test_upgrade_from_stable.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py index 686884d92c..144096433f 100644 --- a/tests/integration/high_availability/test_upgrade_from_stable.py +++ 
b/tests/integration/high_availability/test_upgrade_from_stable.py @@ -73,12 +73,6 @@ async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> logging.info("Refresh the charm") juju.refresh(app=DB_APP_NAME, path=charm) - logging.info("Wait for upgrade to start") - juju.wait( - ready=lambda status: jubilant.any_maintenance(status, DB_APP_NAME), - timeout=10 * MINUTE_SECS, - ) - logging.info("Application refresh is blocked due to incompatibility") juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked) From ec691cee36b2358f808cce6db6b230b00dce4b9f Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 9 Oct 2025 15:17:14 +0300 Subject: [PATCH 15/33] Typed params --- tests/integration/high_availability/test_upgrade_from_stable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py index 144096433f..4a9ecb98dc 100644 --- a/tests/integration/high_availability/test_upgrade_from_stable.py +++ b/tests/integration/high_availability/test_upgrade_from_stable.py @@ -79,7 +79,7 @@ async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: db_leader = get_app_leader(juju, DB_APP_NAME) juju.run( - unit=db_leader, action="force-refresh-start", params={"check-compatibility": "False"} + unit=db_leader, action="force-refresh-start", params={"check-compatibility": False} ) juju.wait(ready=jubilant.all_active) From b356cbb4e7ec70618fc4b2b375350ffd23465d76 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 9 Oct 2025 17:51:34 +0300 Subject: [PATCH 16/33] Tweak actions --- tests/integration/high_availability/conftest.py | 9 ++++++--- .../high_availability/test_upgrade_from_stable.py | 11 +++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/integration/high_availability/conftest.py b/tests/integration/high_availability/conftest.py index 8b33194914..4d72873430 100644 --- a/tests/integration/high_availability/conftest.py +++ b/tests/integration/high_availability/conftest.py @@ -19,12 +19,15 @@ def continuous_writes(juju): application_unit = get_app_leader(juju, DB_TEST_APP_NAME) logger.info("Clearing continuous writes") - juju.run(unit=application_unit, action="clear-continuous-writes", wait=120) + result = juju.run(unit=application_unit, action="clear-continuous-writes", wait=120) + result.raise_on_failure() logger.info("Starting continuous writes") - juju.run(unit=application_unit, action="start-continuous-writes") + result = juju.run(unit=application_unit, action="start-continuous-writes") + result.raise_on_failure() yield logger.info("Clearing continuous writes") - juju.run(unit=application_unit, action="clear-continuous-writes", wait=120) + result = juju.run(unit=application_unit, action="clear-continuous-writes", wait=120) + result.raise_on_failure() diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py index 4a9ecb98dc..3271a91f80 100644 --- a/tests/integration/high_availability/test_upgrade_from_stable.py +++ b/tests/integration/high_availability/test_upgrade_from_stable.py @@ -76,17 +76,20 @@ async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> logging.info("Application refresh is blocked due to incompatibility") juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked) + units = 
get_app_units(juju, DB_APP_NAME) + unit_names = sorted(units.keys()) + if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: - db_leader = get_app_leader(juju, DB_APP_NAME) juju.run( - unit=db_leader, action="force-refresh-start", params={"check-compatibility": False} + unit=units[unit_names[-1]], + action="force-refresh-start", + params={"check-compatibility": False}, ) juju.wait(ready=jubilant.all_active) logging.info("Run resume-refresh action") - units = get_app_units(juju, DB_APP_NAME) - await juju.run(unit=units[sorted(units.keys())[1]], action="resume-refresh") + juju.run(unit=units[unit_names[1]], action="resume-refresh") logging.info("Wait for upgrade to complete") juju.wait( From fa8c88438dbadb0a0b2958a5b0a0f4be23b0f9bd Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 9 Oct 2025 19:52:46 +0300 Subject: [PATCH 17/33] Tweaks --- tests/integration/helpers.py | 2 +- .../high_availability_helpers_new.py | 10 ++++----- .../test_async_replication.py | 22 +++++++++---------- .../high_availability/test_upgrade.py | 14 ++++++------ .../test_upgrade_from_stable.py | 19 ++++++++-------- 5 files changed, 34 insertions(+), 33 deletions(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 793313a470..eb39cac6bb 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -1436,7 +1436,7 @@ async def backup_operations( ### Ported Mysql jubilant helpers -async def execute_queries_on_unit( +def execute_queries_on_unit( unit_address: str, username: str, password: str, diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py index b2e164f146..6205f5ab6a 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -22,7 +22,7 @@ JujuAppsStatusFn = Callable[[Status, str], bool] -async def check_db_units_writes_increment( +def check_db_units_writes_increment( juju: Juju, app_name: str, app_units: list[str] | None = None ) -> None: """Ensure that continuous writes is incrementing on all units. @@ -34,7 +34,7 @@ async def check_db_units_writes_increment( app_units = get_app_units(juju, app_name) app_primary = get_db_primary_unit(juju, app_name) - app_max_value = await get_db_max_written_value(juju, app_name, app_primary) + app_max_value = get_db_max_written_value(juju, app_name, app_primary) juju.model_config({"update-status-hook-interval": "15s"}) for unit_name in app_units: @@ -44,7 +44,7 @@ async def check_db_units_writes_increment( wait=wait_fixed(10), ): with attempt: - unit_max_value = await get_db_max_written_value(juju, app_name, unit_name) + unit_max_value = get_db_max_written_value(juju, app_name, unit_name) assert unit_max_value > app_max_value, "Writes not incrementing" app_max_value = unit_max_value @@ -172,7 +172,7 @@ def get_db_primary_unit(juju: Juju, app_name: str) -> str: raise Exception("No primary node found") -async def get_db_max_written_value(juju: Juju, app_name: str, unit_name: str) -> int: +def get_db_max_written_value(juju: Juju, app_name: str, unit_name: str) -> int: """Retrieve the max written value in the MySQL database. 
Args: @@ -182,7 +182,7 @@ async def get_db_max_written_value(juju: Juju, app_name: str, unit_name: str) -> """ password = get_user_password(juju, app_name, SERVER_CONFIG_USERNAME) - output = await execute_queries_on_unit( + output = execute_queries_on_unit( get_unit_ip(juju, app_name, unit_name), SERVER_CONFIG_USERNAME, password, diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index f184b0aec4..6310b6946f 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -190,10 +190,10 @@ def test_create_replication(first_model: str, second_model: str) -> None: @juju3 @pytest.mark.abort_on_fail -async def test_data_replication(first_model: str, second_model: str, continuous_writes) -> None: +def test_data_replication(first_model: str, second_model: str, continuous_writes) -> None: """Test to write to primary, and read the same data back from replicas.""" logging.info("Testing data replication") - results = await get_db_max_written_values(first_model, second_model) + results = get_db_max_written_values(first_model, second_model) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" @@ -202,7 +202,7 @@ async def test_data_replication(first_model: str, second_model: str, continuous_ @juju3 @pytest.mark.abort_on_fail -async def test_standby_promotion(first_model: str, second_model: str, continuous_writes) -> None: +def test_standby_promotion(first_model: str, second_model: str, continuous_writes) -> None: """Test graceful promotion of a standby cluster to primary.""" model_2 = Juju(model=second_model) model_2_postgresql_leader = get_app_leader(model_2, POSTGRESQL_APP_2) @@ -215,7 +215,7 @@ async def test_standby_promotion(first_model: str, second_model: str, continuous ) promotion_task.raise_on_failure() - results = await get_db_max_written_values(first_model, second_model) + results = get_db_max_written_values(first_model, second_model) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" @@ -277,7 +277,7 @@ def test_failover(first_model: str, second_model: str) -> None: @juju3 @pytest.mark.abort_on_fail -async def test_rejoin_invalidated_cluster( +def test_rejoin_invalidated_cluster( first_model: str, second_model: str, continuous_writes ) -> None: """Test rejoin invalidated cluster with.""" @@ -291,7 +291,7 @@ async def test_rejoin_invalidated_cluster( ) task.raise_on_failure() - results = await get_db_max_written_values(first_model, second_model) + results = get_db_max_written_values(first_model, second_model) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" @@ -299,7 +299,7 @@ async def test_rejoin_invalidated_cluster( @juju3 @pytest.mark.abort_on_fail -async def test_unrelate_and_relate(first_model: str, second_model: str, continuous_writes) -> None: +def test_unrelate_and_relate(first_model: str, second_model: str, continuous_writes) -> None: """Test removing and re-relating the two postgresql clusters.""" model_1 = Juju(model=first_model) model_2 = Juju(model=second_model) @@ -348,13 +348,13 @@ async def test_unrelate_and_relate(first_model: str, second_model: str, continuo timeout=10 * MINUTE_SECS, ) - results = await 
get_db_max_written_values(first_model, second_model) + results = get_db_max_written_values(first_model, second_model) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" -async def get_db_max_written_values(first_model: str, second_model: str) -> list[int]: +def get_db_max_written_values(first_model: str, second_model: str) -> list[int]: """Return list with max written value from all units.""" model_1 = Juju(model=first_model) model_2 = Juju(model=second_model) @@ -370,12 +370,12 @@ async def get_db_max_written_values(first_model: str, second_model: str) -> list logging.info(f"Querying max value on all {POSTGRESQL_APP_1} units") for unit_name in get_app_units(model_1, POSTGRESQL_APP_1): - unit_max_value = await get_db_max_written_value(model_1, POSTGRESQL_APP_1, unit_name) + unit_max_value = get_db_max_written_value(model_1, POSTGRESQL_APP_1, unit_name) results.append(unit_max_value) logging.info(f"Querying max value on all {POSTGRESQL_APP_2} units") for unit_name in get_app_units(model_2, POSTGRESQL_APP_2): - unit_max_value = await get_db_max_written_value(model_2, POSTGRESQL_APP_2, unit_name) + unit_max_value = get_db_max_written_value(model_2, POSTGRESQL_APP_2, unit_name) results.append(unit_max_value) return results diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py index e9f291d518..b4252dcb4b 100644 --- a/tests/integration/high_availability/test_upgrade.py +++ b/tests/integration/high_availability/test_upgrade.py @@ -61,7 +61,7 @@ def test_deploy_latest(juju: Juju) -> None: @pytest.mark.abort_on_fail -async def test_pre_refresh_check(juju: Juju) -> None: +def test_pre_refresh_check(juju: Juju) -> None: """Test that the pre-refresh-check action runs successfully.""" postgresql_leader = get_app_leader(juju, DB_APP_NAME) @@ -71,10 +71,10 @@ async def test_pre_refresh_check(juju: Juju) -> None: @pytest.mark.abort_on_fail -async def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: +def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: """Update the second cluster.""" logging.info("Ensure continuous writes are incrementing") - await check_db_units_writes_increment(juju, DB_APP_NAME) + check_db_units_writes_increment(juju, DB_APP_NAME) logging.info("Refresh the charm") juju.refresh(app=DB_APP_NAME, path=charm) @@ -92,11 +92,11 @@ async def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> N ) logging.info("Ensure continuous writes are incrementing") - await check_db_units_writes_increment(juju, DB_APP_NAME) + check_db_units_writes_increment(juju, DB_APP_NAME) @pytest.mark.abort_on_fail -async def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None: +def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None: """Test an upgrade failure and its rollback.""" db_app_leader = get_app_leader(juju, DB_APP_NAME) db_app_units = get_app_units(juju, DB_APP_NAME) @@ -124,7 +124,7 @@ async def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> N ) logging.info("Ensure continuous writes on all units") - await check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units)) + check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units)) logging.info("Re-run pre-refresh-check action") task = juju.run(unit=db_app_leader, action="pre-refresh-check") @@ -146,7 +146,7 @@ async def 
test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> N ) logging.info("Ensure continuous writes after rollback procedure") - await check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units)) + check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units)) # Remove fault charm file tmp_folder_charm.unlink() diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py index 3271a91f80..1dc4cc7bd1 100644 --- a/tests/integration/high_availability/test_upgrade_from_stable.py +++ b/tests/integration/high_availability/test_upgrade_from_stable.py @@ -55,20 +55,21 @@ def test_deploy_latest(juju: Juju) -> None: @pytest.mark.abort_on_fail -async def test_pre_refresh_check(juju: Juju) -> None: +def test_pre_refresh_check(juju: Juju) -> None: """Test that the pre-refresh-check action runs successfully.""" db_leader = get_app_leader(juju, DB_APP_NAME) logging.info("Run pre-refresh-check action") - task = juju.run(unit=db_leader, action="pre-refresh-check") - task.raise_on_failure() + juju.run(unit=db_leader, action="pre-refresh-check") + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) @pytest.mark.abort_on_fail -async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> None: +def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> None: """Update the second cluster.""" logging.info("Ensure continuous writes are incrementing") - await check_db_units_writes_increment(juju, DB_APP_NAME) + check_db_units_writes_increment(juju, DB_APP_NAME) logging.info("Refresh the charm") juju.refresh(app=DB_APP_NAME, path=charm) @@ -81,15 +82,15 @@ async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: juju.run( - unit=units[unit_names[-1]], + unit=unit_names[-1], action="force-refresh-start", params={"check-compatibility": False}, ) - juju.wait(ready=jubilant.all_active) + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) logging.info("Run resume-refresh action") - juju.run(unit=units[unit_names[1]], action="resume-refresh") + juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) logging.info("Wait for upgrade to complete") juju.wait( @@ -98,4 +99,4 @@ async def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> ) logging.info("Ensure continuous writes are incrementing") - await check_db_units_writes_increment(juju, DB_APP_NAME) + check_db_units_writes_increment(juju, DB_APP_NAME) From e22958c1fe18a1f3ec80d667d25abd4c1dc435aa Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 10 Oct 2025 03:10:27 +0300 Subject: [PATCH 18/33] Tweak upgrade test --- .../high_availability/test_upgrade.py | 30 ++++++++++++++----- .../test_upgrade_from_stable.py | 3 +- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py index b4252dcb4b..773f8ee921 100644 --- a/tests/integration/high_availability/test_upgrade.py +++ b/tests/integration/high_availability/test_upgrade.py @@ -63,11 +63,12 @@ def test_deploy_latest(juju: Juju) -> None: @pytest.mark.abort_on_fail def test_pre_refresh_check(juju: Juju) -> None: """Test that the pre-refresh-check action runs successfully.""" - postgresql_leader = get_app_leader(juju, DB_APP_NAME) + db_leader = get_app_leader(juju, DB_APP_NAME) logging.info("Run 
pre-refresh-check action") - task = juju.run(unit=postgresql_leader, action="pre-refresh-check") - task.raise_on_failure() + juju.run(unit=db_leader, action="pre-refresh-check") + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) @pytest.mark.abort_on_fail @@ -79,11 +80,24 @@ def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: logging.info("Refresh the charm") juju.refresh(app=DB_APP_NAME, path=charm) - logging.info("Wait for upgrade to start") - juju.wait( - ready=lambda status: jubilant.any_maintenance(status, DB_APP_NAME), - timeout=10 * MINUTE_SECS, - ) + logging.info("Application refresh is blocked due to incompatibility") + juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked) + + units = get_app_units(juju, DB_APP_NAME) + unit_names = sorted(units.keys()) + + if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: + juju.run( + unit=unit_names[-1], + action="force-refresh-start", + params={"check-compatibility": False}, + wait=5 * MINUTE_SECS, + ) + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) + + logging.info("Run resume-refresh action") + juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) logging.info("Wait for upgrade to complete") juju.wait( diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py index 1dc4cc7bd1..ed9cc039c3 100644 --- a/tests/integration/high_availability/test_upgrade_from_stable.py +++ b/tests/integration/high_availability/test_upgrade_from_stable.py @@ -23,7 +23,7 @@ @pytest.mark.abort_on_fail -def test_deploy_latest(juju: Juju) -> None: +def test_deploy_stable(juju: Juju) -> None: """Simple test to ensure that the PostgreSQL and application charms get deployed.""" logging.info("Deploying PostgreSQL cluster") juju.deploy( @@ -85,6 +85,7 @@ def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> None: unit=unit_names[-1], action="force-refresh-start", params={"check-compatibility": False}, + wait=5 * MINUTE_SECS, ) juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) From 867fcee3f0e9515ab844973782f16c24ce8a4622 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 10 Oct 2025 16:56:03 +0300 Subject: [PATCH 19/33] Retry start in case of ro transaction --- tests/integration/high_availability/conftest.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/integration/high_availability/conftest.py b/tests/integration/high_availability/conftest.py index 4d72873430..1636863cc2 100644 --- a/tests/integration/high_availability/conftest.py +++ b/tests/integration/high_availability/conftest.py @@ -5,6 +5,10 @@ import logging import pytest +from tenacity import ( + Retrying, + stop_after_attempt, +) from .high_availability_helpers_new import get_app_leader @@ -23,8 +27,13 @@ def continuous_writes(juju): result.raise_on_failure() logger.info("Starting continuous writes") - result = juju.run(unit=application_unit, action="start-continuous-writes") - result.raise_on_failure() + + for attempt in Retrying(stop=stop_after_attempt(10), reraise=True): + with attempt: + result = juju.run(unit=application_unit, action="start-continuous-writes") + result.raise_on_failure() + + assert result.results["result"] == "True" yield From 67d2c7cd9387ee2c87a908814377937492bce311 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Sun, 12 Oct 2025 01:30:06 +0300 Subject: [PATCH 20/33] Tweak tests --- 
tests/integration/conftest.py | 5 ---- .../high_availability_helpers_new.py | 11 ++++++++- .../test_async_replication.py | 8 ------- .../high_availability/test_upgrade.py | 24 ++++++++++++------- .../test_upgrade_from_stable.py | 18 +++++++++----- 5 files changed, 37 insertions(+), 29 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index d31d275dba..b36aa192af 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -111,14 +111,9 @@ def juju(request: pytest.FixtureRequest): if model: juju = jubilant.Juju(model=model) yield juju - log = juju.debug_log(limit=1000) else: with jubilant.temp_model(keep=keep_models) as juju: yield juju - log = juju.debug_log(limit=1000) - - if request.session.testsfailed: - print(log, end="") @pytest.fixture(scope="module") diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py index 6205f5ab6a..4f0e2e7747 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -7,6 +7,7 @@ from collections.abc import Callable import jubilant +import requests from jubilant import Juju from jubilant.statustypes import Status, UnitStatus from tenacity import Retrying, stop_after_delay, wait_fixed @@ -186,7 +187,7 @@ def get_db_max_written_value(juju: Juju, app_name: str, unit_name: str) -> int: get_unit_ip(juju, app_name, unit_name), SERVER_CONFIG_USERNAME, password, - ["SELECT COUNT(number) FROM continuous_writes;"], + ["SELECT MAX(number) FROM continuous_writes;"], "postgresql_test_app_database", ) return output[0] @@ -231,3 +232,11 @@ def get_user_password(juju: Juju, app_name: str, user: str) -> str | None: if secret.label == f"{PEER}.{app_name}.app": revealed_secret = juju.show_secret(secret.uri, reveal=True) return revealed_secret.content.get(f"{user}-password") + + +def count_switchovers(juju: Juju, app_name: str) -> int: + """Return the number of performed switchovers.""" + app_primary = get_db_primary_unit(juju, app_name) + unit_address = get_unit_ip(juju, app_name, app_primary) + switchover_history_info = requests.get(f"https://{unit_address}:8008/history", verify=False) + return len(switchover_history_info.json()) diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index 6310b6946f..9b4f18d628 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -69,7 +69,6 @@ def continuous_writes(first_model: str) -> Generator: @juju3 -@pytest.mark.abort_on_fail def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> None: """Simple test to ensure that the MySQL application charms get deployed.""" configuration = {"profile": "testing"} @@ -107,7 +106,6 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No @juju3 -@pytest.mark.abort_on_fail def test_async_relate(first_model: str, second_model: str) -> None: """Relate the two MySQL clusters.""" logging.info("Creating offers in first model") @@ -136,7 +134,6 @@ def test_async_relate(first_model: str, second_model: str) -> None: @juju3 -@pytest.mark.abort_on_fail def test_deploy_app(first_model: str) -> None: """Deploy the router and the test application.""" logging.info("Deploying test application") @@ -163,7 +160,6 @@ def 
test_deploy_app(first_model: str) -> None: @juju3 -@pytest.mark.abort_on_fail def test_create_replication(first_model: str, second_model: str) -> None: """Run the create-replication action and wait for the applications to settle.""" model_1 = Juju(model=first_model) @@ -189,7 +185,6 @@ def test_create_replication(first_model: str, second_model: str) -> None: @juju3 -@pytest.mark.abort_on_fail def test_data_replication(first_model: str, second_model: str, continuous_writes) -> None: """Test to write to primary, and read the same data back from replicas.""" logging.info("Testing data replication") @@ -201,7 +196,6 @@ def test_data_replication(first_model: str, second_model: str, continuous_writes @juju3 -@pytest.mark.abort_on_fail def test_standby_promotion(first_model: str, second_model: str, continuous_writes) -> None: """Test graceful promotion of a standby cluster to primary.""" model_2 = Juju(model=second_model) @@ -232,7 +226,6 @@ def test_standby_promotion(first_model: str, second_model: str, continuous_write @juju3 -@pytest.mark.abort_on_fail def test_failover(first_model: str, second_model: str) -> None: """Test switchover on primary cluster fail.""" logging.info("Freezing postgres on primary cluster units") @@ -276,7 +269,6 @@ def test_failover(first_model: str, second_model: str) -> None: @juju3 -@pytest.mark.abort_on_fail def test_rejoin_invalidated_cluster( first_model: str, second_model: str, continuous_writes ) -> None: diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py index 773f8ee921..bfa7fdd195 100644 --- a/tests/integration/high_availability/test_upgrade.py +++ b/tests/integration/high_availability/test_upgrade.py @@ -8,13 +8,13 @@ from pathlib import Path import jubilant -import pytest import tomli import tomli_w from jubilant import Juju from .high_availability_helpers_new import ( check_db_units_writes_increment, + count_switchovers, get_app_leader, get_app_units, wait_for_apps_status, @@ -28,7 +28,6 @@ logging.getLogger("jubilant.wait").setLevel(logging.WARNING) -@pytest.mark.abort_on_fail def test_deploy_latest(juju: Juju) -> None: """Simple test to ensure that the PostgreSQL and application charms get deployed.""" logging.info("Deploying PostgreSQL cluster") @@ -60,7 +59,6 @@ def test_deploy_latest(juju: Juju) -> None: ) -@pytest.mark.abort_on_fail def test_pre_refresh_check(juju: Juju) -> None: """Test that the pre-refresh-check action runs successfully.""" db_leader = get_app_leader(juju, DB_APP_NAME) @@ -71,22 +69,24 @@ def test_pre_refresh_check(juju: Juju) -> None: juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) -@pytest.mark.abort_on_fail def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: """Update the second cluster.""" logging.info("Ensure continuous writes are incrementing") check_db_units_writes_increment(juju, DB_APP_NAME) + initial_number_of_switchovers = count_switchovers(juju, DB_APP_NAME) + logging.info("Refresh the charm") juju.refresh(app=DB_APP_NAME, path=charm) - logging.info("Application refresh is blocked due to incompatibility") - juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked) + logging.info("Waiting for refresh to block") + juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) units = get_app_units(juju, DB_APP_NAME) unit_names = sorted(units.keys()) if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: + logging.info("Application refresh is blocked due to 
incompatibility") juju.run( unit=unit_names[-1], action="force-refresh-start", @@ -108,16 +108,22 @@ def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: logging.info("Ensure continuous writes are incrementing") check_db_units_writes_increment(juju, DB_APP_NAME) + logging.info("checking the number of switchovers") + final_number_of_switchovers = count_switchovers(juju, DB_APP_NAME) + assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, ( + "Number of switchovers is greater than 2" + ) + -@pytest.mark.abort_on_fail def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None: """Test an upgrade failure and its rollback.""" db_app_leader = get_app_leader(juju, DB_APP_NAME) db_app_units = get_app_units(juju, DB_APP_NAME) logging.info("Run pre-refresh-check action") - task = juju.run(unit=db_app_leader, action="pre-refresh-check") - task.raise_on_failure() + juju.run(unit=db_app_leader, action="pre-refresh-check") + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) tmp_folder = Path("tmp") tmp_folder.mkdir(exist_ok=True) diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py index ed9cc039c3..bc40f2ca0a 100644 --- a/tests/integration/high_availability/test_upgrade_from_stable.py +++ b/tests/integration/high_availability/test_upgrade_from_stable.py @@ -4,11 +4,11 @@ import logging import jubilant -import pytest from jubilant import Juju from .high_availability_helpers_new import ( check_db_units_writes_increment, + count_switchovers, get_app_leader, get_app_units, wait_for_apps_status, @@ -22,7 +22,6 @@ logging.getLogger("jubilant.wait").setLevel(logging.WARNING) -@pytest.mark.abort_on_fail def test_deploy_stable(juju: Juju) -> None: """Simple test to ensure that the PostgreSQL and application charms get deployed.""" logging.info("Deploying PostgreSQL cluster") @@ -54,7 +53,6 @@ def test_deploy_stable(juju: Juju) -> None: ) -@pytest.mark.abort_on_fail def test_pre_refresh_check(juju: Juju) -> None: """Test that the pre-refresh-check action runs successfully.""" db_leader = get_app_leader(juju, DB_APP_NAME) @@ -65,22 +63,24 @@ def test_pre_refresh_check(juju: Juju) -> None: juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) -@pytest.mark.abort_on_fail def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> None: """Update the second cluster.""" logging.info("Ensure continuous writes are incrementing") check_db_units_writes_increment(juju, DB_APP_NAME) + initial_number_of_switchovers = count_switchovers(juju, DB_APP_NAME) + logging.info("Refresh the charm") juju.refresh(app=DB_APP_NAME, path=charm) - logging.info("Application refresh is blocked due to incompatibility") - juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked) + logging.info("Waiting for refresh to block") + juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) units = get_app_units(juju, DB_APP_NAME) unit_names = sorted(units.keys()) if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: + logging.info("Application refresh is blocked due to incompatibility") juju.run( unit=unit_names[-1], action="force-refresh-start", @@ -101,3 +101,9 @@ def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> None: logging.info("Ensure continuous writes are incrementing") check_db_units_writes_increment(juju, DB_APP_NAME) + + logging.info("checking the number of 
switchovers") + final_number_of_switchovers = count_switchovers(juju, DB_APP_NAME) + assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, ( + "Number of switchovers is greater than 2" + ) From 3ee013940bd4e1fd18b57f22676b8bad473008db Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Sun, 12 Oct 2025 01:30:40 +0300 Subject: [PATCH 21/33] Abort test run on first failure --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 01a7af6bda..eeeca91230 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,6 +98,7 @@ minversion = "6.0" log_cli_level = "INFO" asyncio_mode = "auto" markers = ["juju3", "juju_secrets"] +addopts = "--exitfirst" # Formatting tools configuration [tool.black] From 9b433f54d4f7a4febd66e3e47e6420044435f1e8 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Sun, 12 Oct 2025 08:22:21 +0300 Subject: [PATCH 22/33] Upgrade test --- .../high_availability/test_upgrade.py | 59 ++-- .../test_upgrade_from_stable.py | 42 +-- .../test_upgrade_rollback_incompat.py | 269 ++++++++++++++++++ .../test_upgrade_skip_pre_upgrade_check.py | 119 ++++++++ .../task.yaml | 7 + .../task.yaml | 7 + 6 files changed, 454 insertions(+), 49 deletions(-) create mode 100644 tests/integration/high_availability/test_upgrade_rollback_incompat.py create mode 100644 tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py create mode 100644 tests/spread/test_upgrade_rollback_incompat.py/task.yaml create mode 100644 tests/spread/test_upgrade_skip_pre_upgrade_check.py/task.yaml diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py index bfa7fdd195..ebea59537b 100644 --- a/tests/integration/high_availability/test_upgrade.py +++ b/tests/integration/high_availability/test_upgrade.py @@ -78,26 +78,35 @@ def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: logging.info("Refresh the charm") juju.refresh(app=DB_APP_NAME, path=charm) + logging.info("Wait for refresh to block as paused or incompatible") + try: + juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) + + units = get_app_units(juju, DB_APP_NAME) + unit_names = sorted(units.keys()) + + if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: + logging.info("Application refresh is blocked due to incompatibility") + juju.run( + unit=unit_names[-1], + action="force-refresh-start", + params={"check-compatibility": False}, + wait=5 * MINUTE_SECS, + ) + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) + + logging.info("Run resume-refresh action") + juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) + except TimeoutError: + logging.info("Upgrade completed without snap refresh (charm.py upgrade only)") + assert juju.status().apps[DB_APP_NAME].is_active - logging.info("Waiting for refresh to block") - juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) - - units = get_app_units(juju, DB_APP_NAME) - unit_names = sorted(units.keys()) - - if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: - logging.info("Application refresh is blocked due to incompatibility") - juju.run( - unit=unit_names[-1], - action="force-refresh-start", - params={"check-compatibility": False}, - wait=5 * MINUTE_SECS, - ) - - juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) - - logging.info("Run resume-refresh action") - 
juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) + logging.info("Wait for upgrade to complete") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), + timeout=20 * MINUTE_SECS, + ) logging.info("Wait for upgrade to complete") juju.wait( @@ -146,22 +155,12 @@ def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None: logging.info("Ensure continuous writes on all units") check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units)) - logging.info("Re-run pre-refresh-check action") - task = juju.run(unit=db_app_leader, action="pre-refresh-check") - task.raise_on_failure() - logging.info("Re-refresh the charm") juju.refresh(app=DB_APP_NAME, path=charm) - logging.info("Wait for upgrade to start") - juju.wait( - ready=lambda status: jubilant.any_maintenance(status, DB_APP_NAME), - timeout=10 * MINUTE_SECS, - ) - logging.info("Wait for upgrade to complete") juju.wait( - ready=lambda status: jubilant.all_active(status, DB_APP_NAME), + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), timeout=20 * MINUTE_SECS, ) diff --git a/tests/integration/high_availability/test_upgrade_from_stable.py b/tests/integration/high_availability/test_upgrade_from_stable.py index bc40f2ca0a..cc594d7fb3 100644 --- a/tests/integration/high_availability/test_upgrade_from_stable.py +++ b/tests/integration/high_availability/test_upgrade_from_stable.py @@ -73,25 +73,29 @@ def test_upgrade_from_stable(juju: Juju, charm: str, continuous_writes) -> None: logging.info("Refresh the charm") juju.refresh(app=DB_APP_NAME, path=charm) - logging.info("Waiting for refresh to block") - juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) - - units = get_app_units(juju, DB_APP_NAME) - unit_names = sorted(units.keys()) - - if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: - logging.info("Application refresh is blocked due to incompatibility") - juju.run( - unit=unit_names[-1], - action="force-refresh-start", - params={"check-compatibility": False}, - wait=5 * MINUTE_SECS, - ) - - juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) - - logging.info("Run resume-refresh action") - juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) + logging.info("Wait for refresh to block as paused or incompatible") + try: + juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) + + units = get_app_units(juju, DB_APP_NAME) + unit_names = sorted(units.keys()) + + if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: + logging.info("Application refresh is blocked due to incompatibility") + juju.run( + unit=unit_names[-1], + action="force-refresh-start", + params={"check-compatibility": False}, + wait=5 * MINUTE_SECS, + ) + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) + + logging.info("Run resume-refresh action") + juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) + except TimeoutError: + logging.info("Upgrade completed without snap refresh (charm.py upgrade only)") + assert juju.status().apps[DB_APP_NAME].is_active logging.info("Wait for upgrade to complete") juju.wait( diff --git a/tests/integration/high_availability/test_upgrade_rollback_incompat.py b/tests/integration/high_availability/test_upgrade_rollback_incompat.py new file mode 100644 index 0000000000..b9b66dd39a --- /dev/null +++ b/tests/integration/high_availability/test_upgrade_rollback_incompat.py @@ -0,0 
+1,269 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +import json +import logging +import shutil +import time +import zipfile +from ast import literal_eval +from collections.abc import Generator +from pathlib import Path + +import jubilant +import pytest +from jubilant import Juju + +from ..markers import amd64_only +from .high_availability_helpers_new import ( + check_mysql_units_writes_increment, + get_app_leader, + get_relation_data, + get_unit_by_number, + get_unit_status_log, + wait_for_apps_status, + wait_for_unit_status, +) + +MYSQL_APP_NAME = "mysql" +MYSQL_TEST_APP_NAME = "mysql-test-app" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +@pytest.fixture() +def continuous_writes(juju: Juju) -> Generator: + """Starts continuous writes to the MySQL cluster for a test and clear the writes at the end.""" + test_app_leader = get_app_leader(juju, MYSQL_TEST_APP_NAME) + + logging.info("Clearing continuous writes") + juju.run(test_app_leader, "clear-continuous-writes") + logging.info("Starting continuous writes") + juju.run(test_app_leader, "start-continuous-writes") + + yield + + logging.info("Clearing continuous writes") + juju.run(test_app_leader, "clear-continuous-writes") + + +# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap +# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) +@amd64_only +@pytest.mark.abort_on_fail +async def test_build_and_deploy(juju: Juju, charm: str) -> None: + """Simple test to ensure that the MySQL and application charms get deployed.""" + snap_revisions = Path("snap_revisions.json") + with snap_revisions.open("r") as file: + old_revisions = json.load(file) + + # TODO: support arm64 & s390x + new_revisions = old_revisions.copy() + new_revisions["x86_64"] = "69" + + with snap_revisions.open("w") as file: + json.dump(new_revisions, file) + + local_charm = get_locally_built_charm(charm) + + with snap_revisions.open("w") as file: + json.dump(old_revisions, file) + + juju.deploy( + charm=local_charm, + app=MYSQL_APP_NAME, + base="ubuntu@22.04", + config={"profile": "testing", "plugin-audit-enabled": False}, + num_units=3, + ) + juju.deploy( + charm=MYSQL_TEST_APP_NAME, + app=MYSQL_TEST_APP_NAME, + base="ubuntu@22.04", + channel="latest/edge", + config={"auto_start_writes": False, "sleep_interval": 500}, + num_units=1, + ) + + juju.integrate( + f"{MYSQL_APP_NAME}:database", + f"{MYSQL_TEST_APP_NAME}:database", + ) + + logging.info("Wait for applications to become active") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, MYSQL_APP_NAME, MYSQL_TEST_APP_NAME), + error=jubilant.any_blocked, + timeout=20 * MINUTE_SECS, + ) + + +# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap +# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) +@amd64_only +@pytest.mark.abort_on_fail +async def test_pre_upgrade_check(juju: Juju) -> None: + """Test that the pre-upgrade-check action runs successfully.""" + mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) + + logging.info("Run pre-upgrade-check action") + task = juju.run(unit=mysql_leader, action="pre-upgrade-check") + task.raise_on_failure() + + +# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap +# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) +@amd64_only +@pytest.mark.abort_on_fail +async def 
test_upgrade_to_failing(juju: Juju, charm: str, continuous_writes) -> None: + logging.info("Ensure continuous_writes") + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) + + with InjectFailure( + path="src/upgrade.py", + original_str="self.charm.recover_unit_after_restart()", + replace_str="raise Exception", + ): + logging.info("Build charm with failure injected") + new_charm = get_locally_built_charm(charm) + + logging.info("Refresh the charm") + juju.refresh(app=MYSQL_APP_NAME, path=new_charm) + + logging.info("Wait for upgrade to start") + juju.wait( + ready=lambda status: jubilant.any_maintenance(status, MYSQL_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Get first upgrading unit") + relation_data = get_relation_data(juju, MYSQL_APP_NAME, "upgrade") + upgrade_stack = relation_data[0]["application-data"]["upgrade-stack"] + upgrade_unit = get_unit_by_number(juju, MYSQL_APP_NAME, literal_eval(upgrade_stack)[-1]) + + logging.info("Wait for upgrade to fail on upgrading unit") + juju.wait( + ready=wait_for_unit_status(MYSQL_APP_NAME, upgrade_unit, "blocked"), + timeout=10 * MINUTE_SECS, + ) + + +# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap +# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) +@amd64_only +@pytest.mark.abort_on_fail +async def test_rollback(juju: Juju, charm: str, continuous_writes) -> None: + """Test upgrade rollback to a healthy revision.""" + relation_data = get_relation_data(juju, MYSQL_APP_NAME, "upgrade") + upgrade_stack = relation_data[0]["application-data"]["upgrade-stack"] + upgrade_unit = get_unit_by_number(juju, MYSQL_APP_NAME, literal_eval(upgrade_stack)[-1]) + + snap_revisions = Path("snap_revisions.json") + with snap_revisions.open("r") as file: + old_revisions = json.load(file) + + # TODO: support arm64 & s390x + new_revisions = old_revisions.copy() + new_revisions["x86_64"] = "69" + + with snap_revisions.open("w") as file: + json.dump(new_revisions, file) + + mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) + local_charm = get_locally_built_charm(charm) + + time.sleep(10) + + logging.info("Run pre-upgrade-check action") + task = juju.run(unit=mysql_leader, action="pre-upgrade-check") + task.raise_on_failure() + + time.sleep(20) + + logging.info("Refresh with previous charm") + juju.refresh(app=MYSQL_APP_NAME, path=local_charm) + + logging.info("Wait for upgrade to start") + juju.wait( + ready=lambda status: jubilant.any_maintenance(status, MYSQL_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + juju.wait( + ready=lambda status: jubilant.all_active(status, MYSQL_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Ensure rollback has taken place") + unit_status_logs = get_unit_status_log(juju, upgrade_unit, 100) + + upgrade_failed_index = get_unit_log_message( + status_logs=unit_status_logs[:], + unit_message="upgrade failed. 
Check logs for rollback instruction", + ) + assert upgrade_failed_index is not None + + upgrade_complete_index = get_unit_log_message( + status_logs=unit_status_logs[upgrade_failed_index:], + unit_message="upgrade completed", + ) + assert upgrade_complete_index is not None + + logging.info("Ensure continuous writes after rollback procedure") + await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) + + +class InjectFailure: + def __init__(self, path: str, original_str: str, replace_str: str): + self.path = path + self.original_str = original_str + self.replace_str = replace_str + with open(path) as file: + self.original_content = file.read() + + def __enter__(self): + """Inject failure context.""" + logging.info("Injecting failure") + assert self.original_str in self.original_content, "replace content not found" + new_content = self.original_content.replace(self.original_str, self.replace_str) + assert self.original_str not in new_content, "original string not replaced" + with open(self.path, "w") as file: + file.write(new_content) + + def __exit__(self, exc_type, exc_value, traceback): + """Inject failure context.""" + logging.info("Reverting failure") + with open(self.path, "w") as file: + file.write(self.original_content) + + +def get_unit_log_message(status_logs: list[dict], unit_message: str) -> int | None: + """Returns the index of a status log containing the desired message.""" + for index, status_log in enumerate(status_logs): + if status_log.get("message") == unit_message: + return index + + return None + + +def get_locally_built_charm(charm: str) -> str: + """Wrapper for a local charm build zip file updating.""" + local_charm_paths = Path().glob("local-*.charm") + + # Clean up local charms from previous runs + # to avoid pytest_operator_cache globbing them + for charm_path in local_charm_paths: + charm_path.unlink() + + # Create a copy of the charm to avoid modifying the original + local_charm_path = shutil.copy(charm, f"local-{Path(charm).stem}.charm") + local_charm_path = Path(local_charm_path) + + for path in ["snap_revisions.json", "src/upgrade.py"]: + with open(path) as f: + content = f.read() + with zipfile.ZipFile(local_charm_path, mode="a") as charm_zip: + charm_zip.writestr(path, content) + + return f"{local_charm_path.resolve()}" diff --git a/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py b/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py new file mode 100644 index 0000000000..978a8dd3dd --- /dev/null +++ b/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py @@ -0,0 +1,119 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +import logging + +import jubilant +from jubilant import Juju + +from .high_availability_helpers_new import ( + check_db_units_writes_increment, + count_switchovers, + get_app_units, + wait_for_apps_status, + wait_for_unit_status, +) + +DB_APP_NAME = "postgresql" +DB_TEST_APP_NAME = "postgresql-test-app" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +def test_deploy_stable(juju: Juju) -> None: + """Simple test to ensure that the PostgreSQL and application charms get deployed.""" + logging.info("Deploying PostgreSQL cluster") + juju.deploy( + charm=DB_APP_NAME, + app=DB_APP_NAME, + base="ubuntu@24.04", + channel="16/stable", + config={"profile": "testing"}, + num_units=3, + ) + juju.deploy( + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_NAME, + base="ubuntu@22.04", + channel="latest/edge", + num_units=1, + ) + + juju.integrate( + f"{DB_APP_NAME}:database", + f"{DB_TEST_APP_NAME}:database", + ) + + logging.info("Wait for applications to become active") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME, DB_TEST_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + +def test_refresh_without_pre_upgrade_check(juju: Juju, charm: str) -> None: + """Test updating from stable channel.""" + initial_number_of_switchovers = count_switchovers(juju, DB_APP_NAME) + + logging.info("Refresh the charm") + juju.refresh(app=DB_APP_NAME, path=charm) + + logging.info("Wait for refresh to block as paused or incompatible") + try: + juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) + + units = get_app_units(juju, DB_APP_NAME) + unit_names = sorted(units.keys()) + + if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: + logging.info("Application refresh is blocked due to incompatibility") + juju.run( + unit=unit_names[-1], + action="force-refresh-start", + params={"check-compatibility": False}, + wait=5 * MINUTE_SECS, + ) + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) + + logging.info("Run resume-refresh action") + juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) + except TimeoutError: + logging.info("Upgrade completed without snap refresh (charm.py upgrade only)") + assert juju.status().apps[DB_APP_NAME].is_active + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes are incrementing") + check_db_units_writes_increment(juju, DB_APP_NAME) + + logging.info("checking the number of switchovers") + final_number_of_switchovers = count_switchovers(juju, DB_APP_NAME) + assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, ( + "Number of switchovers is greater than 2" + ) + + +async def test_rollback_without_pre_upgrade_check(juju: Juju, charm: str) -> None: + """Test refresh back to stable channel.""" + # Early Jubilant 1.X.Y versions do not support the `switch` option + logging.info("Refresh the charm to stable channel") + juju.cli("refresh", "--channel=16/stable", f"--switch={DB_APP_NAME}", DB_APP_NAME) + + logging.info("Wait for rolling restart") + app_units = get_app_units(juju, DB_APP_NAME) + app_units_funcs = [wait_for_unit_status(DB_APP_NAME, unit, "error") for unit in app_units] + + juju.wait( + ready=lambda status: any(status_func(status) for status_func in app_units_funcs), + timeout=10 * MINUTE_SECS, + successes=1, + ) + + check_db_units_writes_increment(juju, DB_APP_NAME) diff 
--git a/tests/spread/test_upgrade_rollback_incompat.py/task.yaml b/tests/spread/test_upgrade_rollback_incompat.py/task.yaml new file mode 100644 index 0000000000..2c48fbee58 --- /dev/null +++ b/tests/spread/test_upgrade_rollback_incompat.py/task.yaml @@ -0,0 +1,7 @@ +summary: test_upgrade.py +environment: + TEST_MODULE: high_availability/test_upgrade_rollback_incompat.py +execute: | + tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" +artifacts: + - allure-results diff --git a/tests/spread/test_upgrade_skip_pre_upgrade_check.py/task.yaml b/tests/spread/test_upgrade_skip_pre_upgrade_check.py/task.yaml new file mode 100644 index 0000000000..79ed8357d0 --- /dev/null +++ b/tests/spread/test_upgrade_skip_pre_upgrade_check.py/task.yaml @@ -0,0 +1,7 @@ +summary: test_upgrade.py +environment: + TEST_MODULE: high_availability/test_upgrade_skip_pre_upgrade_check.py +execute: | + tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" +artifacts: + - allure-results From c99a5f5e7c73819b62331d121d41c1d2cc69509b Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Sun, 12 Oct 2025 17:44:10 +0300 Subject: [PATCH 23/33] Fix skip pre-refresh-check test --- .../test_upgrade_rollback_incompat.py | 269 ------------------ .../test_upgrade_skip_pre_upgrade_check.py | 39 ++- .../task.yaml | 7 - 3 files changed, 30 insertions(+), 285 deletions(-) delete mode 100644 tests/integration/high_availability/test_upgrade_rollback_incompat.py delete mode 100644 tests/spread/test_upgrade_rollback_incompat.py/task.yaml diff --git a/tests/integration/high_availability/test_upgrade_rollback_incompat.py b/tests/integration/high_availability/test_upgrade_rollback_incompat.py deleted file mode 100644 index b9b66dd39a..0000000000 --- a/tests/integration/high_availability/test_upgrade_rollback_incompat.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -import json -import logging -import shutil -import time -import zipfile -from ast import literal_eval -from collections.abc import Generator -from pathlib import Path - -import jubilant -import pytest -from jubilant import Juju - -from ..markers import amd64_only -from .high_availability_helpers_new import ( - check_mysql_units_writes_increment, - get_app_leader, - get_relation_data, - get_unit_by_number, - get_unit_status_log, - wait_for_apps_status, - wait_for_unit_status, -) - -MYSQL_APP_NAME = "mysql" -MYSQL_TEST_APP_NAME = "mysql-test-app" - -MINUTE_SECS = 60 - -logging.getLogger("jubilant.wait").setLevel(logging.WARNING) - - -@pytest.fixture() -def continuous_writes(juju: Juju) -> Generator: - """Starts continuous writes to the MySQL cluster for a test and clear the writes at the end.""" - test_app_leader = get_app_leader(juju, MYSQL_TEST_APP_NAME) - - logging.info("Clearing continuous writes") - juju.run(test_app_leader, "clear-continuous-writes") - logging.info("Starting continuous writes") - juju.run(test_app_leader, "start-continuous-writes") - - yield - - logging.info("Clearing continuous writes") - juju.run(test_app_leader, "clear-continuous-writes") - - -# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap -# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) -@amd64_only -@pytest.mark.abort_on_fail -async def test_build_and_deploy(juju: Juju, charm: str) -> None: - """Simple test to ensure that the MySQL and application charms get deployed.""" - snap_revisions = Path("snap_revisions.json") - with snap_revisions.open("r") as file: - old_revisions = json.load(file) - - # TODO: support arm64 & s390x - new_revisions = old_revisions.copy() - new_revisions["x86_64"] = "69" - - with snap_revisions.open("w") as file: - json.dump(new_revisions, file) - - local_charm = get_locally_built_charm(charm) - - with snap_revisions.open("w") as file: - json.dump(old_revisions, file) - - juju.deploy( - charm=local_charm, - app=MYSQL_APP_NAME, - base="ubuntu@22.04", - config={"profile": "testing", "plugin-audit-enabled": False}, - num_units=3, - ) - juju.deploy( - charm=MYSQL_TEST_APP_NAME, - app=MYSQL_TEST_APP_NAME, - base="ubuntu@22.04", - channel="latest/edge", - config={"auto_start_writes": False, "sleep_interval": 500}, - num_units=1, - ) - - juju.integrate( - f"{MYSQL_APP_NAME}:database", - f"{MYSQL_TEST_APP_NAME}:database", - ) - - logging.info("Wait for applications to become active") - juju.wait( - ready=wait_for_apps_status(jubilant.all_active, MYSQL_APP_NAME, MYSQL_TEST_APP_NAME), - error=jubilant.any_blocked, - timeout=20 * MINUTE_SECS, - ) - - -# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap -# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) -@amd64_only -@pytest.mark.abort_on_fail -async def test_pre_upgrade_check(juju: Juju) -> None: - """Test that the pre-upgrade-check action runs successfully.""" - mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) - - logging.info("Run pre-upgrade-check action") - task = juju.run(unit=mysql_leader, action="pre-upgrade-check") - task.raise_on_failure() - - -# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap -# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) -@amd64_only -@pytest.mark.abort_on_fail -async def test_upgrade_to_failing(juju: Juju, charm: str, continuous_writes) -> None: - 
logging.info("Ensure continuous_writes") - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) - - with InjectFailure( - path="src/upgrade.py", - original_str="self.charm.recover_unit_after_restart()", - replace_str="raise Exception", - ): - logging.info("Build charm with failure injected") - new_charm = get_locally_built_charm(charm) - - logging.info("Refresh the charm") - juju.refresh(app=MYSQL_APP_NAME, path=new_charm) - - logging.info("Wait for upgrade to start") - juju.wait( - ready=lambda status: jubilant.any_maintenance(status, MYSQL_APP_NAME), - timeout=10 * MINUTE_SECS, - ) - - logging.info("Get first upgrading unit") - relation_data = get_relation_data(juju, MYSQL_APP_NAME, "upgrade") - upgrade_stack = relation_data[0]["application-data"]["upgrade-stack"] - upgrade_unit = get_unit_by_number(juju, MYSQL_APP_NAME, literal_eval(upgrade_stack)[-1]) - - logging.info("Wait for upgrade to fail on upgrading unit") - juju.wait( - ready=wait_for_unit_status(MYSQL_APP_NAME, upgrade_unit, "blocked"), - timeout=10 * MINUTE_SECS, - ) - - -# TODO: remove AMD64 marker after next incompatible MySQL server version is released in our snap -# (details: https://github.com/canonical/mysql-operator/pull/472#discussion_r1659300069) -@amd64_only -@pytest.mark.abort_on_fail -async def test_rollback(juju: Juju, charm: str, continuous_writes) -> None: - """Test upgrade rollback to a healthy revision.""" - relation_data = get_relation_data(juju, MYSQL_APP_NAME, "upgrade") - upgrade_stack = relation_data[0]["application-data"]["upgrade-stack"] - upgrade_unit = get_unit_by_number(juju, MYSQL_APP_NAME, literal_eval(upgrade_stack)[-1]) - - snap_revisions = Path("snap_revisions.json") - with snap_revisions.open("r") as file: - old_revisions = json.load(file) - - # TODO: support arm64 & s390x - new_revisions = old_revisions.copy() - new_revisions["x86_64"] = "69" - - with snap_revisions.open("w") as file: - json.dump(new_revisions, file) - - mysql_leader = get_app_leader(juju, MYSQL_APP_NAME) - local_charm = get_locally_built_charm(charm) - - time.sleep(10) - - logging.info("Run pre-upgrade-check action") - task = juju.run(unit=mysql_leader, action="pre-upgrade-check") - task.raise_on_failure() - - time.sleep(20) - - logging.info("Refresh with previous charm") - juju.refresh(app=MYSQL_APP_NAME, path=local_charm) - - logging.info("Wait for upgrade to start") - juju.wait( - ready=lambda status: jubilant.any_maintenance(status, MYSQL_APP_NAME), - timeout=10 * MINUTE_SECS, - ) - juju.wait( - ready=lambda status: jubilant.all_active(status, MYSQL_APP_NAME), - timeout=20 * MINUTE_SECS, - ) - - logging.info("Ensure rollback has taken place") - unit_status_logs = get_unit_status_log(juju, upgrade_unit, 100) - - upgrade_failed_index = get_unit_log_message( - status_logs=unit_status_logs[:], - unit_message="upgrade failed. 
Check logs for rollback instruction", - ) - assert upgrade_failed_index is not None - - upgrade_complete_index = get_unit_log_message( - status_logs=unit_status_logs[upgrade_failed_index:], - unit_message="upgrade completed", - ) - assert upgrade_complete_index is not None - - logging.info("Ensure continuous writes after rollback procedure") - await check_mysql_units_writes_increment(juju, MYSQL_APP_NAME) - - -class InjectFailure: - def __init__(self, path: str, original_str: str, replace_str: str): - self.path = path - self.original_str = original_str - self.replace_str = replace_str - with open(path) as file: - self.original_content = file.read() - - def __enter__(self): - """Inject failure context.""" - logging.info("Injecting failure") - assert self.original_str in self.original_content, "replace content not found" - new_content = self.original_content.replace(self.original_str, self.replace_str) - assert self.original_str not in new_content, "original string not replaced" - with open(self.path, "w") as file: - file.write(new_content) - - def __exit__(self, exc_type, exc_value, traceback): - """Inject failure context.""" - logging.info("Reverting failure") - with open(self.path, "w") as file: - file.write(self.original_content) - - -def get_unit_log_message(status_logs: list[dict], unit_message: str) -> int | None: - """Returns the index of a status log containing the desired message.""" - for index, status_log in enumerate(status_logs): - if status_log.get("message") == unit_message: - return index - - return None - - -def get_locally_built_charm(charm: str) -> str: - """Wrapper for a local charm build zip file updating.""" - local_charm_paths = Path().glob("local-*.charm") - - # Clean up local charms from previous runs - # to avoid pytest_operator_cache globbing them - for charm_path in local_charm_paths: - charm_path.unlink() - - # Create a copy of the charm to avoid modifying the original - local_charm_path = shutil.copy(charm, f"local-{Path(charm).stem}.charm") - local_charm_path = Path(local_charm_path) - - for path in ["snap_revisions.json", "src/upgrade.py"]: - with open(path) as f: - content = f.read() - with zipfile.ZipFile(local_charm_path, mode="a") as charm_zip: - charm_zip.writestr(path, content) - - return f"{local_charm_path.resolve()}" diff --git a/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py b/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py index 978a8dd3dd..67806e3796 100644 --- a/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py +++ b/tests/integration/high_availability/test_upgrade_skip_pre_upgrade_check.py @@ -11,7 +11,6 @@ count_switchovers, get_app_units, wait_for_apps_status, - wait_for_unit_status, ) DB_APP_NAME = "postgresql" @@ -53,7 +52,7 @@ def test_deploy_stable(juju: Juju) -> None: ) -def test_refresh_without_pre_upgrade_check(juju: Juju, charm: str) -> None: +def test_refresh_without_pre_refresh_check(juju: Juju, charm: str, continuous_writes) -> None: """Test updating from stable channel.""" initial_number_of_switchovers = count_switchovers(juju, DB_APP_NAME) @@ -100,20 +99,42 @@ def test_refresh_without_pre_upgrade_check(juju: Juju, charm: str) -> None: ) -async def test_rollback_without_pre_upgrade_check(juju: Juju, charm: str) -> None: +async def test_rollback_without_pre_refresh_check( + juju: Juju, charm: str, continuous_writes +) -> None: """Test refresh back to stable channel.""" # Early Jubilant 1.X.Y versions do not support the `switch` option 
logging.info("Refresh the charm to stable channel") juju.cli("refresh", "--channel=16/stable", f"--switch={DB_APP_NAME}", DB_APP_NAME) - logging.info("Wait for rolling restart") - app_units = get_app_units(juju, DB_APP_NAME) - app_units_funcs = [wait_for_unit_status(DB_APP_NAME, unit, "error") for unit in app_units] + logging.info("Wait for refresh to block as paused or incompatible") + try: + juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) + + units = get_app_units(juju, DB_APP_NAME) + unit_names = sorted(units.keys()) + + if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: + logging.info("Application refresh is blocked due to incompatibility") + juju.run( + unit=unit_names[-1], + action="force-refresh-start", + params={"check-compatibility": False}, + wait=5 * MINUTE_SECS, + ) + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) + logging.info("Run resume-refresh action") + juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) + except TimeoutError: + logging.info("Upgrade completed without snap refresh (charm.py upgrade only)") + assert juju.status().apps[DB_APP_NAME].is_active + + logging.info("Wait for upgrade to complete") juju.wait( - ready=lambda status: any(status_func(status) for status_func in app_units_funcs), - timeout=10 * MINUTE_SECS, - successes=1, + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), + timeout=20 * MINUTE_SECS, ) check_db_units_writes_increment(juju, DB_APP_NAME) diff --git a/tests/spread/test_upgrade_rollback_incompat.py/task.yaml b/tests/spread/test_upgrade_rollback_incompat.py/task.yaml deleted file mode 100644 index 2c48fbee58..0000000000 --- a/tests/spread/test_upgrade_rollback_incompat.py/task.yaml +++ /dev/null @@ -1,7 +0,0 @@ -summary: test_upgrade.py -environment: - TEST_MODULE: high_availability/test_upgrade_rollback_incompat.py -execute: | - tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" -artifacts: - - allure-results From 18dc9065b04dc3389d45b7e8e297917590dce5eb Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Mon, 13 Oct 2025 16:05:52 +0300 Subject: [PATCH 24/33] Async replication test WIP --- .../integration/high_availability/conftest.py | 11 +- .../high_availability_helpers_new.py | 1 - .../test_async_replication.py | 209 ++++++------------ 3 files changed, 74 insertions(+), 147 deletions(-) diff --git a/tests/integration/high_availability/conftest.py b/tests/integration/high_availability/conftest.py index 1636863cc2..7d02b894fa 100644 --- a/tests/integration/high_availability/conftest.py +++ b/tests/integration/high_availability/conftest.py @@ -5,10 +5,7 @@ import logging import pytest -from tenacity import ( - Retrying, - stop_after_attempt, -) +from tenacity import Retrying, stop_after_attempt from .high_availability_helpers_new import get_app_leader @@ -23,8 +20,7 @@ def continuous_writes(juju): application_unit = get_app_leader(juju, DB_TEST_APP_NAME) logger.info("Clearing continuous writes") - result = juju.run(unit=application_unit, action="clear-continuous-writes", wait=120) - result.raise_on_failure() + juju.run(unit=application_unit, action="clear-continuous-writes", wait=120).raise_on_failure() logger.info("Starting continuous writes") @@ -38,5 +34,4 @@ def continuous_writes(juju): yield logger.info("Clearing continuous writes") - result = juju.run(unit=application_unit, action="clear-continuous-writes", wait=120) - result.raise_on_failure() + 
juju.run(unit=application_unit, action="clear-continuous-writes", wait=120).raise_on_failure() diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py index 4f0e2e7747..6a736278b3 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -37,7 +37,6 @@ def check_db_units_writes_increment( app_primary = get_db_primary_unit(juju, app_name) app_max_value = get_db_max_written_value(juju, app_name, app_primary) - juju.model_config({"update-status-hook-interval": "15s"}) for unit_name in app_units: for attempt in Retrying( reraise=True, diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index 9b4f18d628..147af1dffc 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -9,20 +9,20 @@ import jubilant import pytest from jubilant import Juju +from tenacity import Retrying, stop_after_attempt from .. import architecture -from ..markers import juju3 from .high_availability_helpers_new import ( get_app_leader, get_app_units, - get_db_cluster_status, get_db_max_written_value, wait_for_apps_status, ) -POSTGRESQL_APP_1 = "db1" -POSTGRESQL_APP_2 = "db2" -POSTGRESQL_TEST_APP_NAME = "postgresql-test-app" +DB_APP_1 = "db1" +DB_APP_2 = "db2" +DB_TEST_APP_NAME = "postgresql-test-app" +DB_TEST_APP_1 = "test-app1" MINUTE_SECS = 60 @@ -52,23 +52,33 @@ def second_model(juju: Juju, request: pytest.FixtureRequest) -> Generator: @pytest.fixture() -def continuous_writes(first_model: str) -> Generator: - """Starts continuous writes to the MySQL cluster for a test and clear the writes at the end.""" +def first_model_continuous_writes(first_model: str) -> Generator: + """Starts continuous writes to the cluster for a test and clear the writes at the end.""" model_1 = Juju(model=first_model) - model_1_test_app_leader = get_app_leader(model_1, POSTGRESQL_TEST_APP_NAME) + application_unit = get_app_leader(model_1, DB_TEST_APP_1) logging.info("Clearing continuous writes") - model_1.run(model_1_test_app_leader, "clear-continuous-writes") + model_1.run( + unit=application_unit, action="clear-continuous-writes", wait=120 + ).raise_on_failure() + logging.info("Starting continuous writes") - model_1.run(model_1_test_app_leader, "start-continuous-writes") + + for attempt in Retrying(stop=stop_after_attempt(10), reraise=True): + with attempt: + result = model_1.run(unit=application_unit, action="start-continuous-writes") + result.raise_on_failure() + + assert result.results["result"] == "True" yield logging.info("Clearing continuous writes") - model_1.run(model_1_test_app_leader, "clear-continuous-writes") + model_1.run( + unit=application_unit, action="clear-continuous-writes", wait=120 + ).raise_on_failure() -@juju3 def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> None: """Simple test to ensure that the MySQL application charms get deployed.""" configuration = {"profile": "testing"} @@ -78,7 +88,7 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No model_1 = Juju(model=first_model) model_1.deploy( charm=charm, - app=POSTGRESQL_APP_1, + app=DB_APP_1, base="ubuntu@24.04", config=configuration, constraints=constraints, @@ -87,7 +97,7 @@ def test_build_and_deploy(first_model: str, second_model: str, 
charm: str) -> No model_2 = Juju(model=second_model) model_2.deploy( charm=charm, - app=POSTGRESQL_APP_2, + app=DB_APP_2, base="ubuntu@24.04", config=configuration, constraints=constraints, @@ -96,51 +106,46 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), - timeout=15 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), + timeout=20 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_2), - timeout=15 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), + timeout=20 * MINUTE_SECS, ) -@juju3 def test_async_relate(first_model: str, second_model: str) -> None: """Relate the two MySQL clusters.""" logging.info("Creating offers in first model") model_1 = Juju(model=first_model) - model_1.offer(POSTGRESQL_APP_1, endpoint="replication-offer") + model_1.offer(f"{first_model}.{DB_APP_1}", endpoint="replication-offer") logging.info("Consuming offer in second model") model_2 = Juju(model=second_model) - model_2.consume(f"{first_model}.{POSTGRESQL_APP_1}") + model_2.consume(f"{first_model}.{DB_APP_1}") logging.info("Relating the two postgresql clusters") - model_2.integrate( - f"{POSTGRESQL_APP_1}", - f"{POSTGRESQL_APP_2}:replication", - ) + model_2.integrate(f"{DB_APP_1}", f"{DB_APP_2}:replication") logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.any_blocked, POSTGRESQL_APP_1), + ready=wait_for_apps_status(jubilant.any_active, DB_APP_1), timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant.any_waiting, POSTGRESQL_APP_2), + ready=wait_for_apps_status(jubilant.any_active, DB_APP_2), timeout=10 * MINUTE_SECS, ) -@juju3 def test_deploy_app(first_model: str) -> None: """Deploy the router and the test application.""" logging.info("Deploying test application") model_1 = Juju(model=first_model) model_1.deploy( - charm=POSTGRESQL_TEST_APP_NAME, - app=POSTGRESQL_TEST_APP_NAME, + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_1, base="ubuntu@22.04", channel="latest/edge", num_units=1, @@ -148,18 +153,13 @@ def test_deploy_app(first_model: str) -> None: ) logging.info("Relating test application") - model_1.integrate( - f"{POSTGRESQL_TEST_APP_NAME}:database", - f"{POSTGRESQL_APP_1}:database", - ) + model_1.integrate(f"{DB_TEST_APP_1}:database", f"{DB_APP_1}:database") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_TEST_APP_NAME), - timeout=10 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_active, DB_TEST_APP_1), timeout=10 * MINUTE_SECS ) -@juju3 def test_create_replication(first_model: str, second_model: str) -> None: """Run the create-replication action and wait for the applications to settle.""" model_1 = Juju(model=first_model) @@ -167,25 +167,22 @@ def test_create_replication(first_model: str, second_model: str) -> None: logging.info("Running create replication action") task = model_1.run( - unit=get_app_leader(model_1, POSTGRESQL_APP_1), - action="create-replication", - wait=5 * MINUTE_SECS, + unit=get_app_leader(model_1, DB_APP_1), action="create-replication", wait=5 * MINUTE_SECS ) task.raise_on_failure() logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), - timeout=10 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), 
timeout=20 * MINUTE_SECS ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_2), - timeout=10 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=20 * MINUTE_SECS ) -@juju3 -def test_data_replication(first_model: str, second_model: str, continuous_writes) -> None: +def test_data_replication( + first_model: str, second_model: str, first_model_continuous_writes +) -> None: """Test to write to primary, and read the same data back from replicas.""" logging.info("Testing data replication") results = get_db_max_written_values(first_model, second_model) @@ -195,17 +192,16 @@ def test_data_replication(first_model: str, second_model: str, continuous_writes assert results[0] > 1, "No data was written to the database" -@juju3 -def test_standby_promotion(first_model: str, second_model: str, continuous_writes) -> None: +def test_standby_promotion( + first_model: str, second_model: str, first_model_continuous_writes +) -> None: """Test graceful promotion of a standby cluster to primary.""" model_2 = Juju(model=second_model) - model_2_postgresql_leader = get_app_leader(model_2, POSTGRESQL_APP_2) + model_2_postgresql_leader = get_app_leader(model_2, DB_APP_2) logging.info("Promoting standby cluster to primary") promotion_task = model_2.run( - unit=model_2_postgresql_leader, - action="promote-to-primary", - params={"scope": "cluster", "force": "true"}, + unit=model_2_postgresql_leader, action="promote-to-primary", params={"scope": "cluster"} ) promotion_task.raise_on_failure() @@ -214,23 +210,12 @@ def test_standby_promotion(first_model: str, second_model: str, continuous_write assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" - cluster_set_status = get_db_cluster_status( - juju=model_2, - unit=model_2_postgresql_leader, - cluster_set=True, - ) - assert cluster_set_status["clusters"]["cuzco"]["clusterrole"] == "primary", ( - "standby not promoted to primary" - ) - - -@juju3 def test_failover(first_model: str, second_model: str) -> None: """Test switchover on primary cluster fail.""" logging.info("Freezing postgres on primary cluster units") model_2 = Juju(model=second_model) - model_2_postgresql_units = get_app_units(model_2, POSTGRESQL_APP_2) + model_2_postgresql_units = get_app_units(model_2, DB_APP_2) # Simulating a failure on the primary cluster for unit_name in model_2_postgresql_units: @@ -238,106 +223,54 @@ def test_failover(first_model: str, second_model: str) -> None: logging.info("Promoting standby cluster to primary with force flag") model_1 = Juju(model=first_model) - model_1_postgresql_leader = get_app_leader(model_1, POSTGRESQL_APP_1) + model_1_postgresql_leader = get_app_leader(model_1, DB_APP_1) - promotion_task = model_1.run( + model_1.run( unit=model_1_postgresql_leader, action="promote-to-primary", params={"scope": "cluster", "force": True}, wait=5 * MINUTE_SECS, - ) - promotion_task.raise_on_failure() + ).raise_on_failure() # Restore postgres process logging.info("Unfreezing postgres on primary cluster units") for unit_name in model_2_postgresql_units: model_2.exec("sudo pkill -x postgres --signal SIGCONT", unit=unit_name) - logging.info("Checking clusters statuses") - cluster_set_status = get_db_cluster_status( - juju=model_1, - unit=model_1_postgresql_leader, - cluster_set=True, - ) - - assert cluster_set_status["clusters"]["lima"]["clusterrole"] == "primary", ( - "standby not promoted to primary", - ) - assert 
cluster_set_status["clusters"]["cuzco"]["globalstatus"] == "invalidated", ( - "old primary not invalidated" - ) - -@juju3 -def test_rejoin_invalidated_cluster( - first_model: str, second_model: str, continuous_writes -) -> None: - """Test rejoin invalidated cluster with.""" - model_1 = Juju(model=first_model) - model_1_postgresql_leader = get_app_leader(model_1, POSTGRESQL_APP_1) - - task = model_1.run( - unit=model_1_postgresql_leader, - action="rejoin-cluster", - wait=5 * MINUTE_SECS, - ) - task.raise_on_failure() - - results = get_db_max_written_values(first_model, second_model) - assert len(results) == 6 - assert all(results[0] == x for x in results), "Data is not consistent across units" - assert results[0] > 1, "No data was written to the database" - - -@juju3 -@pytest.mark.abort_on_fail -def test_unrelate_and_relate(first_model: str, second_model: str, continuous_writes) -> None: +def test_unrelate_and_relate(first_model: str, second_model: str) -> None: """Test removing and re-relating the two postgresql clusters.""" model_1 = Juju(model=first_model) model_2 = Juju(model=second_model) logging.info("Remove async relation") - model_2.remove_relation( - f"{POSTGRESQL_APP_1}", - f"{POSTGRESQL_APP_2}:replication", - ) + model_2.remove_relation(f"{DB_APP_1}", f"{DB_APP_2}:replication") logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), - timeout=10 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=10 * MINUTE_SECS ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_blocked, POSTGRESQL_APP_2), - timeout=10 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_blocked, DB_APP_2), timeout=10 * MINUTE_SECS ) - logging.info("Re relating the two postgresql clusters") - model_2.integrate( - f"{POSTGRESQL_APP_1}", - f"{POSTGRESQL_APP_2}:replication", - ) + logging.info("Re-relating the two postgresql clusters") + model_2.integrate(f"{DB_APP_1}", f"{DB_APP_2}:replication") model_1.wait( - ready=wait_for_apps_status(jubilant.any_blocked, POSTGRESQL_APP_1), - timeout=5 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.any_blocked, DB_APP_1), timeout=5 * MINUTE_SECS ) logging.info("Running create replication action") - task = model_1.run( - unit=get_app_leader(model_1, POSTGRESQL_APP_1), - action="create-replication", - wait=5 * MINUTE_SECS, - ) - task.raise_on_failure() + model_1.run( + unit=get_app_leader(model_1, DB_APP_1), action="create-replication", wait=5 * MINUTE_SECS + ).raise_on_failure() logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_1), - timeout=10 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=20 * MINUTE_SECS ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, POSTGRESQL_APP_2), - timeout=10 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=20 * MINUTE_SECS ) results = get_db_max_written_values(first_model, second_model) @@ -353,21 +286,21 @@ def get_db_max_written_values(first_model: str, second_model: str) -> list[int]: logging.info("Stopping continuous writes") stopping_task = model_1.run( - unit=get_app_leader(model_1, POSTGRESQL_TEST_APP_NAME), action="stop-continuous-writes" + unit=get_app_leader(model_1, DB_TEST_APP_1), action="stop-continuous-writes" ) stopping_task.raise_on_failure() time.sleep(5) results = [] - logging.info(f"Querying max value on all {POSTGRESQL_APP_1} 
units")
-    for unit_name in get_app_units(model_1, POSTGRESQL_APP_1):
-        unit_max_value = get_db_max_written_value(model_1, POSTGRESQL_APP_1, unit_name)
+    logging.info(f"Querying max value on all {DB_APP_1} units")
+    for unit_name in get_app_units(model_1, DB_APP_1):
+        unit_max_value = get_db_max_written_value(model_1, DB_APP_1, unit_name)
         results.append(unit_max_value)
 
-    logging.info(f"Querying max value on all {POSTGRESQL_APP_2} units")
-    for unit_name in get_app_units(model_2, POSTGRESQL_APP_2):
-        unit_max_value = get_db_max_written_value(model_2, POSTGRESQL_APP_2, unit_name)
+    logging.info(f"Querying max value on all {DB_APP_2} units")
+    for unit_name in get_app_units(model_2, DB_APP_2):
+        unit_max_value = get_db_max_written_value(model_2, DB_APP_2, unit_name)
         results.append(unit_max_value)
 
     return results

From a4b4fa54f7adbf3529f5f186fc5a720decaa25bb Mon Sep 17 00:00:00 2001
From: Dragomir Penev
Date: Tue, 14 Oct 2025 02:41:47 +0300
Subject: [PATCH 25/33] Async replication tweaks

---
 .../high_availability_helpers_new.py         |  29 +++-
 .../test_async_replication.py                | 147 +++++++++++++-----
 2 files changed, 131 insertions(+), 45 deletions(-)

diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py
index 6a736278b3..0c09581fee 100644
--- a/tests/integration/high_availability/high_availability_helpers_new.py
+++ b/tests/integration/high_availability/high_availability_helpers_new.py
@@ -24,7 +24,10 @@
 def check_db_units_writes_increment(
-    juju: Juju, app_name: str, app_units: list[str] | None = None
+    juju: Juju,
+    app_name: str,
+    app_units: list[str] | None = None,
+    db_name: str = "postgresql_test_app_database",
 ) -> None:
     """Ensure that continuous writes is incrementing on all units.
 
@@ -35,7 +38,7 @@ def check_db_units_writes_increment(
         app_units = get_app_units(juju, app_name)
 
     app_primary = get_db_primary_unit(juju, app_name)
-    app_max_value = get_db_max_written_value(juju, app_name, app_primary)
+    app_max_value = get_db_max_written_value(juju, app_name, app_primary, db_name)
 
     for unit_name in app_units:
         for attempt in Retrying(
@@ -44,7 +47,7 @@ def check_db_units_writes_increment(
             wait=wait_fixed(10),
         ):
             with attempt:
-                unit_max_value = get_db_max_written_value(juju, app_name, unit_name)
+                unit_max_value = get_db_max_written_value(juju, app_name, unit_name, db_name)
                 assert unit_max_value > app_max_value, "Writes not incrementing"
 
                 app_max_value = unit_max_value
@@ -172,13 +175,29 @@ def get_db_primary_unit(juju: Juju, app_name: str) -> str:
     raise Exception("No primary node found")
 
 
-def get_db_max_written_value(juju: Juju, app_name: str, unit_name: str) -> int:
+def get_db_standby_leader_unit(juju: Juju, app_name: str) -> str:
+    """Get the current standby leader of the cluster."""
+    unit_address = get_unit_ip(juju, app_name, get_app_leader(juju, app_name))
+
+    for member in requests.get(f"https://{unit_address}:8008/cluster", verify=False).json()[
+        "members"
+    ]:
+        if member["role"] == "standby_leader":
+            return member["name"][::-1].replace("-", "/", 1)[::-1]
+
+    raise Exception("No standby leader found")
+
+
+def get_db_max_written_value(
+    juju: Juju, app_name: str, unit_name: str, db_name: str = "postgresql_test_app_database"
+) -> int:
     """Retrieve the max written value in the MySQL database.
 
     Args:
         juju: The Juju model.
         app_name: The application name.
         unit_name: The unit name.
+        db_name: The database to connect to.
""" password = get_user_password(juju, app_name, SERVER_CONFIG_USERNAME) @@ -187,7 +206,7 @@ def get_db_max_written_value(juju: Juju, app_name: str, unit_name: str) -> int: SERVER_CONFIG_USERNAME, password, ["SELECT MAX(number) FROM continuous_writes;"], - "postgresql_test_app_database", + db_name, ) return output[0] diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index 147af1dffc..7bd5026cf4 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -16,6 +16,8 @@ get_app_leader, get_app_units, get_db_max_written_value, + get_db_primary_unit, + get_db_standby_leader_unit, wait_for_apps_status, ) @@ -23,6 +25,7 @@ DB_APP_2 = "db2" DB_TEST_APP_NAME = "postgresql-test-app" DB_TEST_APP_1 = "test-app1" +DB_TEST_APP_2 = "test-app2" MINUTE_SECS = 60 @@ -79,8 +82,8 @@ def first_model_continuous_writes(first_model: str) -> Generator: ).raise_on_failure() -def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> None: - """Simple test to ensure that the MySQL application charms get deployed.""" +def test_deploy(first_model: str, second_model: str, charm: str) -> None: + """Simple test to ensure that the database application charms get deployed.""" configuration = {"profile": "testing"} constraints = {"arch": architecture.architecture} @@ -106,12 +109,10 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), - timeout=20 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=20 * MINUTE_SECS ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), - timeout=20 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=20 * MINUTE_SECS ) @@ -139,25 +140,39 @@ def test_async_relate(first_model: str, second_model: str) -> None: ) -def test_deploy_app(first_model: str) -> None: +def test_deploy_app(first_model: str, second_model: str) -> None: """Deploy the router and the test application.""" + constraints = {"arch": architecture.architecture} logging.info("Deploying test application") model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) model_1.deploy( charm=DB_TEST_APP_NAME, app=DB_TEST_APP_1, base="ubuntu@22.04", channel="latest/edge", num_units=1, - trust=False, + constraints=constraints, + ) + model_2.deploy( + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_2, + base="ubuntu@22.04", + channel="latest/edge", + num_units=1, + constraints=constraints, ) logging.info("Relating test application") model_1.integrate(f"{DB_TEST_APP_1}:database", f"{DB_APP_1}:database") + model_2.integrate(f"{DB_TEST_APP_2}:database", f"{DB_APP_2}:database") model_1.wait( ready=wait_for_apps_status(jubilant.all_active, DB_TEST_APP_1), timeout=10 * MINUTE_SECS ) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_TEST_APP_2), timeout=10 * MINUTE_SECS + ) def test_create_replication(first_model: str, second_model: str) -> None: @@ -185,16 +200,14 @@ def test_data_replication( ) -> None: """Test to write to primary, and read the same data back from replicas.""" logging.info("Testing data replication") - results = get_db_max_written_values(first_model, second_model) + results = get_db_max_written_values(first_model, second_model, first_model, DB_TEST_APP_1) assert 
len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" -def test_standby_promotion( - first_model: str, second_model: str, first_model_continuous_writes -) -> None: +def test_standby_promotion(first_model: str, second_model: str) -> None: """Test graceful promotion of a standby cluster to primary.""" model_2 = Juju(model=second_model) model_2_postgresql_leader = get_app_leader(model_2, DB_APP_2) @@ -205,37 +218,58 @@ def test_standby_promotion( ) promotion_task.raise_on_failure() - results = get_db_max_written_values(first_model, second_model) + rerelate_test_app(model_2, DB_APP_2, DB_TEST_APP_2) + + results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_2) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" -def test_failover(first_model: str, second_model: str) -> None: - """Test switchover on primary cluster fail.""" - logging.info("Freezing postgres on primary cluster units") +def test_failover_in_main_cluster(first_model: str, second_model: str) -> None: + """Test that async replication fails over correctly.""" model_2 = Juju(model=second_model) - model_2_postgresql_units = get_app_units(model_2, DB_APP_2) - # Simulating a failure on the primary cluster - for unit_name in model_2_postgresql_units: - model_2.exec("sudo pkill -x postgres --signal SIGSTOP", unit=unit_name) + rerelate_test_app(model_2, DB_APP_2, DB_TEST_APP_2) + + primary = get_db_primary_unit(model_2, DB_APP_2) + model_2.remove_unit(primary, force=True) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=10 * MINUTE_SECS + ) + + results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_2) + + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=10 * MINUTE_SECS + ) + assert len(results) == 5 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" - logging.info("Promoting standby cluster to primary with force flag") + assert primary != get_db_primary_unit(model_2, DB_APP_2) + + +def test_failover_in_standby_cluster(first_model: str, second_model: str) -> None: + """Test that async replication fails over correctly.""" model_1 = Juju(model=first_model) - model_1_postgresql_leader = get_app_leader(model_1, DB_APP_1) + model_2 = Juju(model=second_model) - model_1.run( - unit=model_1_postgresql_leader, - action="promote-to-primary", - params={"scope": "cluster", "force": True}, - wait=5 * MINUTE_SECS, - ).raise_on_failure() + rerelate_test_app(model_2, DB_APP_2, DB_TEST_APP_2) - # Restore postgres process - logging.info("Unfreezing postgres on primary cluster units") - for unit_name in model_2_postgresql_units: - model_2.exec("sudo pkill -x postgres --signal SIGCONT", unit=unit_name) + standby = get_db_standby_leader_unit(model_1, DB_APP_2) + model_1.remove_unit(standby, force=True) + + results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_2) + + model_1.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=10 * MINUTE_SECS + ) + assert len(results) == 4 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + assert standby != 
get_db_standby_leader_unit(model_1, DB_APP_2) def test_unrelate_and_relate(first_model: str, second_model: str) -> None: @@ -260,6 +294,8 @@ def test_unrelate_and_relate(first_model: str, second_model: str) -> None: ready=wait_for_apps_status(jubilant.any_blocked, DB_APP_1), timeout=5 * MINUTE_SECS ) + rerelate_test_app(model_1, DB_APP_1, DB_TEST_APP_1) + logging.info("Running create replication action") model_1.run( unit=get_app_leader(model_1, DB_APP_1), action="create-replication", wait=5 * MINUTE_SECS @@ -273,34 +309,65 @@ def test_unrelate_and_relate(first_model: str, second_model: str) -> None: ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=20 * MINUTE_SECS ) - results = get_db_max_written_values(first_model, second_model) + results = get_db_max_written_values(first_model, second_model, first_model, DB_TEST_APP_1) assert len(results) == 6 assert all(results[0] == x for x in results), "Data is not consistent across units" assert results[0] > 1, "No data was written to the database" -def get_db_max_written_values(first_model: str, second_model: str) -> list[int]: +def get_db_max_written_values( + first_model: str, second_model: str, test_model: str, test_app: str +) -> list[int]: """Return list with max written value from all units.""" + db_name = f"{test_app.replace('-', '_')}_database" model_1 = Juju(model=first_model) model_2 = Juju(model=second_model) + test_app_model = model_1 if test_model == first_model else model_2 logging.info("Stopping continuous writes") - stopping_task = model_1.run( - unit=get_app_leader(model_1, DB_TEST_APP_1), action="stop-continuous-writes" - ) - stopping_task.raise_on_failure() + test_app_model.run( + unit=get_app_leader(test_app_model, test_app), action="stop-continuous-writes" + ).raise_on_failure() time.sleep(5) results = [] logging.info(f"Querying max value on all {DB_APP_1} units") for unit_name in get_app_units(model_1, DB_APP_1): - unit_max_value = get_db_max_written_value(model_1, DB_APP_1, unit_name) + unit_max_value = get_db_max_written_value(model_1, DB_APP_1, unit_name, db_name) results.append(unit_max_value) logging.info(f"Querying max value on all {DB_APP_2} units") for unit_name in get_app_units(model_2, DB_APP_2): - unit_max_value = get_db_max_written_value(model_2, DB_APP_2, unit_name) + unit_max_value = get_db_max_written_value(model_2, DB_APP_2, unit_name, db_name) results.append(unit_max_value) return results + + +def rerelate_test_app(juju: Juju, db_name: str, test_app_name: str) -> None: + logging.info(f"Reintegrating {db_name} and {test_app_name}") + juju.remove_relation(db_name, f"{test_app_name}:database") + juju.wait( + ready=wait_for_apps_status(jubilant.all_blocked, test_app_name) + and wait_for_apps_status(jubilant.all_active, db_name), + timeout=10 * MINUTE_SECS, + ) + + juju.integrate(f"{db_name}:database", f"{test_app_name}:database") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, test_app_name, db_name), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Clearing continuous writes") + application_unit = get_app_leader(juju, test_app_name) + juju.run(unit=application_unit, action="clear-continuous-writes", wait=120).raise_on_failure() + + logging.info("Starting continuous writes") + for attempt in Retrying(stop=stop_after_attempt(10), reraise=True): + with attempt: + result = juju.run(unit=application_unit, action="start-continuous-writes") + result.raise_on_failure() + + assert result.results["result"] == "True" From cfb5740a0e7a6d14c9e849b053994b4a9b1ce977 Mon Sep 17 00:00:00 2001 
From: Dragomir Penev Date: Wed, 15 Oct 2025 00:37:19 +0300 Subject: [PATCH 26/33] Move rerelate test up --- .../test_async_replication.py | 174 +++++++++--------- 1 file changed, 83 insertions(+), 91 deletions(-) diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index 7bd5026cf4..bbf54ba36c 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -107,12 +107,38 @@ def test_deploy(first_model: str, second_model: str, charm: str) -> None: num_units=3, ) + logging.info("Deploying test application") + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + model_1.deploy( + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_1, + base="ubuntu@22.04", + channel="latest/edge", + num_units=1, + constraints=constraints, + ) + model_2.deploy( + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_2, + base="ubuntu@22.04", + channel="latest/edge", + num_units=1, + constraints=constraints, + ) + + logging.info("Relating test application") + model_1.integrate(f"{DB_TEST_APP_1}:database", f"{DB_APP_1}:database") + model_2.integrate(f"{DB_TEST_APP_2}:database", f"{DB_APP_2}:database") + logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=20 * MINUTE_SECS + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1, DB_TEST_APP_1), + timeout=20 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=20 * MINUTE_SECS + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2, DB_TEST_APP_2), + timeout=20 * MINUTE_SECS, ) @@ -140,51 +166,15 @@ def test_async_relate(first_model: str, second_model: str) -> None: ) -def test_deploy_app(first_model: str, second_model: str) -> None: - """Deploy the router and the test application.""" - constraints = {"arch": architecture.architecture} - logging.info("Deploying test application") - model_1 = Juju(model=first_model) - model_2 = Juju(model=second_model) - model_1.deploy( - charm=DB_TEST_APP_NAME, - app=DB_TEST_APP_1, - base="ubuntu@22.04", - channel="latest/edge", - num_units=1, - constraints=constraints, - ) - model_2.deploy( - charm=DB_TEST_APP_NAME, - app=DB_TEST_APP_2, - base="ubuntu@22.04", - channel="latest/edge", - num_units=1, - constraints=constraints, - ) - - logging.info("Relating test application") - model_1.integrate(f"{DB_TEST_APP_1}:database", f"{DB_APP_1}:database") - model_2.integrate(f"{DB_TEST_APP_2}:database", f"{DB_APP_2}:database") - - model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_TEST_APP_1), timeout=10 * MINUTE_SECS - ) - model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_TEST_APP_2), timeout=10 * MINUTE_SECS - ) - - def test_create_replication(first_model: str, second_model: str) -> None: """Run the create-replication action and wait for the applications to settle.""" model_1 = Juju(model=first_model) model_2 = Juju(model=second_model) logging.info("Running create replication action") - task = model_1.run( + model_1.run( unit=get_app_leader(model_1, DB_APP_1), action="create-replication", wait=5 * MINUTE_SECS - ) - task.raise_on_failure() + ).raise_on_failure() logging.info("Waiting for the applications to settle") model_1.wait( @@ -226,52 +216,6 @@ def test_standby_promotion(first_model: str, second_model: str) -> None: assert results[0] > 1, "No data was written to the database" -def 
test_failover_in_main_cluster(first_model: str, second_model: str) -> None: - """Test that async replication fails over correctly.""" - model_2 = Juju(model=second_model) - - rerelate_test_app(model_2, DB_APP_2, DB_TEST_APP_2) - - primary = get_db_primary_unit(model_2, DB_APP_2) - model_2.remove_unit(primary, force=True) - model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=10 * MINUTE_SECS - ) - - results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_2) - - model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=10 * MINUTE_SECS - ) - assert len(results) == 5 - assert all(results[0] == x for x in results), "Data is not consistent across units" - assert results[0] > 1, "No data was written to the database" - - assert primary != get_db_primary_unit(model_2, DB_APP_2) - - -def test_failover_in_standby_cluster(first_model: str, second_model: str) -> None: - """Test that async replication fails over correctly.""" - model_1 = Juju(model=first_model) - model_2 = Juju(model=second_model) - - rerelate_test_app(model_2, DB_APP_2, DB_TEST_APP_2) - - standby = get_db_standby_leader_unit(model_1, DB_APP_2) - model_1.remove_unit(standby, force=True) - - results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_2) - - model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=10 * MINUTE_SECS - ) - assert len(results) == 4 - assert all(results[0] == x for x in results), "Data is not consistent across units" - assert results[0] > 1, "No data was written to the database" - - assert standby != get_db_standby_leader_unit(model_1, DB_APP_2) - - def test_unrelate_and_relate(first_model: str, second_model: str) -> None: """Test removing and re-relating the two postgresql clusters.""" model_1 = Juju(model=first_model) @@ -282,25 +226,29 @@ def test_unrelate_and_relate(first_model: str, second_model: str) -> None: logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=10 * MINUTE_SECS + ready=wait_for_apps_status(jubilant.all_agents_idle, DB_APP_1), timeout=10 * MINUTE_SECS ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_blocked, DB_APP_2), timeout=10 * MINUTE_SECS + ready=wait_for_apps_status(jubilant.all_agents_idle, DB_APP_2), timeout=10 * MINUTE_SECS ) logging.info("Re-relating the two postgresql clusters") model_2.integrate(f"{DB_APP_1}", f"{DB_APP_2}:replication") + model_1.wait( - ready=wait_for_apps_status(jubilant.any_blocked, DB_APP_1), timeout=5 * MINUTE_SECS + ready=wait_for_apps_status(jubilant.all_agents_idle, DB_APP_1), timeout=10 * MINUTE_SECS + ) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_agents_idle, DB_APP_2), timeout=10 * MINUTE_SECS ) - - rerelate_test_app(model_1, DB_APP_1, DB_TEST_APP_1) logging.info("Running create replication action") model_1.run( unit=get_app_leader(model_1, DB_APP_1), action="create-replication", wait=5 * MINUTE_SECS ).raise_on_failure() + rerelate_test_app(model_1, DB_APP_1, DB_TEST_APP_1) + logging.info("Waiting for the applications to settle") model_1.wait( ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=20 * MINUTE_SECS @@ -315,6 +263,50 @@ def test_unrelate_and_relate(first_model: str, second_model: str) -> None: assert results[0] > 1, "No data was written to the database" +def test_failover_in_main_cluster(first_model: str, second_model: str) -> None: + """Test that async replication fails 
over correctly.""" + model_1 = Juju(model=second_model) + + rerelate_test_app(model_1, DB_APP_1, DB_TEST_APP_1) + + primary = get_db_primary_unit(model_1, DB_APP_1) + model_1.remove_unit(primary) + model_1.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=10 * MINUTE_SECS + ) + + results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_1) + + assert len(results) == 5 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + assert primary != get_db_primary_unit(model_1, DB_APP_1) + + +def test_failover_in_standby_cluster(first_model: str, second_model: str) -> None: + """Test that async replication fails over correctly.""" + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + + rerelate_test_app(model_1, DB_APP_1, DB_TEST_APP_1) + + standby = get_db_standby_leader_unit(model_2, DB_APP_2) + model_2.remove_unit(standby) + + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=10 * MINUTE_SECS + ) + + results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_1) + + assert len(results) == 4 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + assert standby != get_db_standby_leader_unit(model_2, DB_APP_2) + + def get_db_max_written_values( first_model: str, second_model: str, test_model: str, test_app: str ) -> list[int]: From 993a3418f39d0c5144878fde1a4fad1ab4034113 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 15 Oct 2025 04:30:31 +0300 Subject: [PATCH 27/33] Async replication tweaks --- poetry.lock | 246 ++++++++-------- pyproject.toml | 2 +- src/charm.py | 33 ++- .../high_availability_helpers_new.py | 2 +- .../test_async_replication.py | 6 +- .../test_async_replication_upgrade.py | 277 ++++++++++++++++++ .../task.yaml | 7 + .../task.yaml | 2 +- 8 files changed, 432 insertions(+), 143 deletions(-) create mode 100644 tests/integration/high_availability/test_async_replication_upgrade.py create mode 100644 tests/spread/test_async_replication_upgrade.py/task.yaml diff --git a/poetry.lock b/poetry.lock index b6c30150ca..cbfd03e134 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1971,19 +1971,19 @@ files = [ [[package]] name = "pydantic" -version = "2.12.1" +version = "2.12.2" description = "Data validation using Python type hints" optional = false python-versions = ">=3.9" groups = ["main", "charm-libs"] files = [ - {file = "pydantic-2.12.1-py3-none-any.whl", hash = "sha256:665931f5b4ab40c411439e66f99060d631d1acc58c3d481957b9123343d674d1"}, - {file = "pydantic-2.12.1.tar.gz", hash = "sha256:0af849d00e1879199babd468ec9db13b956f6608e9250500c1a9d69b6a62824e"}, + {file = "pydantic-2.12.2-py3-none-any.whl", hash = "sha256:25ff718ee909acd82f1ff9b1a4acfd781bb23ab3739adaa7144f19a6a4e231ae"}, + {file = "pydantic-2.12.2.tar.gz", hash = "sha256:7b8fa15b831a4bbde9d5b84028641ac3080a4ca2cbd4a621a661687e741624fd"}, ] [package.dependencies] annotated-types = ">=0.6.0" -pydantic-core = "2.41.3" +pydantic-core = "2.41.4" typing-extensions = ">=4.14.1" typing-inspection = ">=0.4.2" @@ -1993,129 +1993,129 @@ timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows [[package]] name = "pydantic-core" -version = "2.41.3" +version = "2.41.4" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = 
">=3.9" groups = ["main", "charm-libs"] files = [ - {file = "pydantic_core-2.41.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:1a572d7d06b9fa6efeec32fbcd18c73081af66942b345664669867cf8e69c7b0"}, - {file = "pydantic_core-2.41.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:63d787ea760052585c6bfc34310aa379346f2cec363fe178659664f80421804b"}, - {file = "pydantic_core-2.41.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa5a2327538f6b3c040604618cd36a960224ad7c22be96717b444c269f1a8b2"}, - {file = "pydantic_core-2.41.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:947e1c5e79c54e313742c9dc25a439d38c5dcfde14f6a9a9069b3295f190c444"}, - {file = "pydantic_core-2.41.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d0a1e90642dd6040cfcf509230fb1c3df257f7420d52b5401b3ce164acb0a342"}, - {file = "pydantic_core-2.41.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8f7d4504d7bdce582a2700615d52dbe5f9de4ffab4815431f6da7edf5acc1329"}, - {file = "pydantic_core-2.41.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7528ff51a26985072291c4170bd1f16f396a46ef845a428ae97bdb01ebaee7f4"}, - {file = "pydantic_core-2.41.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:21b3a07248e481c06c4f208c53402fc143e817ce652a114f0c5d2acfd97b8b91"}, - {file = "pydantic_core-2.41.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:45b445c09095df0d422e8ef01065f1c0a7424a17b37646b71d857ead6428b084"}, - {file = "pydantic_core-2.41.3-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:c32474bb2324b574dc57aea40cb415c8ca81b73bc103f5644a15095d5552df8f"}, - {file = "pydantic_core-2.41.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:91a38e48cdcc17763ac0abcb27c2b5fca47c2bc79ca0821b5211b2adeb06c4d0"}, - {file = "pydantic_core-2.41.3-cp310-cp310-win32.whl", hash = "sha256:b0947cd92f782cfc7bb595fd046a5a5c83e9f9524822f071f6b602f08d14b653"}, - {file = "pydantic_core-2.41.3-cp310-cp310-win_amd64.whl", hash = "sha256:6d972c97e91e294f1ce4c74034211b5c16d91b925c08704f5786e5e3743d8a20"}, - {file = "pydantic_core-2.41.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:91dfe6a6e02916fd1fb630f1ebe0c18f9fd9d3cbfe84bb2599f195ebbb0edb9b"}, - {file = "pydantic_core-2.41.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e301551c63d46122972ab5523a1438772cdde5d62d34040dac6f11017f18cc5d"}, - {file = "pydantic_core-2.41.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d986b1defbe27867812dc3d8b3401d72be14449b255081e505046c02687010a"}, - {file = "pydantic_core-2.41.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:351b2c5c073ae8caaa11e4336f8419d844c9b936e123e72dbe2c43fa97e54781"}, - {file = "pydantic_core-2.41.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7be34f5217ffc28404fc0ca6f07491a2a6a770faecfcf306384c142bccd2fdb4"}, - {file = "pydantic_core-2.41.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3cbcad992c281b4960cb5550e218ff39a679c730a59859faa0bc9b8d87efbe6a"}, - {file = "pydantic_core-2.41.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8741b0ab2acdd20c804432e08052791e66cf797afa5451e7e435367f88474b0b"}, - {file = "pydantic_core-2.41.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1ac3ba94f3be9437da4ad611dacd356f040120668c5b1733b8ae035a13663c48"}, - {file = 
"pydantic_core-2.41.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:971efe83bac3d5db781ee1b4836ac2cdd53cf7f727edfd4bb0a18029f9409ef2"}, - {file = "pydantic_core-2.41.3-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:98c54e5ad0399ac79c0b6b567693d0f8c44b5a0d67539826cc1dd495e47d1307"}, - {file = "pydantic_core-2.41.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60110fe616b599c6e057142f2d75873e213bc0cbdac88f58dda8afb27a82f978"}, - {file = "pydantic_core-2.41.3-cp311-cp311-win32.whl", hash = "sha256:75428ae73865ee366f159b68b9281c754df832494419b4eb46b7c3fbdb27756c"}, - {file = "pydantic_core-2.41.3-cp311-cp311-win_amd64.whl", hash = "sha256:c0178ad5e586d3e394f4b642f0bb7a434bcf34d1e9716cc4bd74e34e35283152"}, - {file = "pydantic_core-2.41.3-cp311-cp311-win_arm64.whl", hash = "sha256:5dd40bb57cdae2a35e20d06910b93b13e8f57ffff5a0b0a45927953bad563a03"}, - {file = "pydantic_core-2.41.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:7bdc8b70bc4b68e4d891b46d018012cac7bbfe3b981a7c874716dde09ff09fd5"}, - {file = "pydantic_core-2.41.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:446361e93f4ffe509edae5862fb89a0d24cbc8f2935f05c6584c2f2ca6e7b6df"}, - {file = "pydantic_core-2.41.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9af9a9ae24b866ce58462a7de61c33ff035e052b7a9c05c29cf496bd6a16a63f"}, - {file = "pydantic_core-2.41.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fc836eb8561f04fede7b73747463bd08715be0f55c427e0f0198aa2f1d92f913"}, - {file = "pydantic_core-2.41.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16f80f366472eb6a3744149289c263e5ef182c8b18422192166b67625fef3c50"}, - {file = "pydantic_core-2.41.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d699904cd13d0f509bdbb17f0784abb332d4aa42df4b0a8b65932096fcd4b21"}, - {file = "pydantic_core-2.41.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:485398dacc5dddb2be280fd3998367531eccae8631f4985d048c2406a5ee5ecc"}, - {file = "pydantic_core-2.41.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6dfe0898272bf675941cd1ea701677341357b77acadacabbd43d71e09763dceb"}, - {file = "pydantic_core-2.41.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:86ffbf5291c367a56b5718590dc3452890f2c1ac7b76d8f4a1e66df90bd717f6"}, - {file = "pydantic_core-2.41.3-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:c58c5acda77802eedde3aaf22be09e37cfec060696da64bf6e6ffb2480fdabd0"}, - {file = "pydantic_core-2.41.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:40db5705aec66371ca5792415c3e869137ae2bab48c48608db3f84986ccaf016"}, - {file = "pydantic_core-2.41.3-cp312-cp312-win32.whl", hash = "sha256:668fcb317a0b3c84781796891128111c32f83458d436b022014ed0ea07f66e1b"}, - {file = "pydantic_core-2.41.3-cp312-cp312-win_amd64.whl", hash = "sha256:248a5d1dac5382454927edf32660d0791d2df997b23b06a8cac6e3375bc79cee"}, - {file = "pydantic_core-2.41.3-cp312-cp312-win_arm64.whl", hash = "sha256:347a23094c98b7ea2ba6fff93b52bd2931a48c9c1790722d9e841f30e4b7afcd"}, - {file = "pydantic_core-2.41.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:a8596700fdd3ee12b0d9c1f2395f4c32557e7ebfbfacdc08055b0bcbe7d2827e"}, - {file = "pydantic_core-2.41.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:624503f918e472c0eed6935020c01b6a6b4bcdb7955a848da5c8805d40f15c0f"}, - {file = "pydantic_core-2.41.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:36388958d0c614df9f5de1a5f88f4b79359016b9ecdfc352037788a628616aa2"}, - {file = "pydantic_core-2.41.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c50eba144add9104cf43ef9a3d81c37ebf48bfd0924b584b78ec2e03ec91daf"}, - {file = "pydantic_core-2.41.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c6ea2102958eb5ad560d570c49996e215a6939d9bffd0e9fd3b9e808a55008cc"}, - {file = "pydantic_core-2.41.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd0d26f1e4335d5f84abfc880da0afa080c8222410482f9ee12043bb05f55ec8"}, - {file = "pydantic_core-2.41.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41c38700094045b12c0cff35c8585954de66cf6dd63909fed1c2e6b8f38e1e1e"}, - {file = "pydantic_core-2.41.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4061cc82d7177417fdb90e23e67b27425ecde2652cfd2053b5b4661a489ddc19"}, - {file = "pydantic_core-2.41.3-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:b1d9699a4dae10a7719951cca1e30b591ef1dd9cdda9fec39282a283576c0241"}, - {file = "pydantic_core-2.41.3-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:d5099f1b97e79f0e45cb6a236a5bd1a20078ed50b1b28f3d17f6c83ff3585baa"}, - {file = "pydantic_core-2.41.3-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:b5ff0467a8c1b6abb0ab9c9ea80e2e3a9788592e44c726c2db33fdaf1b5e7d0b"}, - {file = "pydantic_core-2.41.3-cp313-cp313-win32.whl", hash = "sha256:edfe9b4cee4a91da7247c25732f24504071f3e101c050694d18194b7d2d320bf"}, - {file = "pydantic_core-2.41.3-cp313-cp313-win_amd64.whl", hash = "sha256:44af3276c0c2c14efde6590523e4d7e04bcd0e46e0134f0dbef1be0b64b2d3e3"}, - {file = "pydantic_core-2.41.3-cp313-cp313-win_arm64.whl", hash = "sha256:59aeed341f92440d51fdcc82c8e930cfb234f1843ed1d4ae1074f5fb9789a64b"}, - {file = "pydantic_core-2.41.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ef37228238b3a280170ac43a010835c4a7005742bc8831c2c1a9560de4595dbe"}, - {file = "pydantic_core-2.41.3-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cb19f36253152c509abe76c1d1b185436e0c75f392a82934fe37f4a1264449"}, - {file = "pydantic_core-2.41.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91be4756e05367ce19a70e1db3b77f01f9e40ca70d26fb4cdfa993e53a08964a"}, - {file = "pydantic_core-2.41.3-cp313-cp313t-win_amd64.whl", hash = "sha256:ce7d8f4353f82259b55055bd162bbaf599f6c40cd0c098e989eeb95f9fdc022f"}, - {file = "pydantic_core-2.41.3-cp313-cp313t-win_arm64.whl", hash = "sha256:f06a9e81da60e5a0ef584f6f4790f925c203880ae391bf363d97126fd1790b21"}, - {file = "pydantic_core-2.41.3-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:0c77e8e72344e34052ea26905fa7551ecb75fc12795ca1a8e44f816918f4c718"}, - {file = "pydantic_core-2.41.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32be442a017e82a6c496a52ef5db5f5ac9abf31c3064f5240ee15a1d27cc599e"}, - {file = "pydantic_core-2.41.3-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af10c78f0e9086d2d883ddd5a6482a613ad435eb5739cf1467b1f86169e63d91"}, - {file = "pydantic_core-2.41.3-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6212874118704e27d177acee5b90b83556b14b2eb88aae01bae51cd9efe27019"}, - {file = "pydantic_core-2.41.3-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c6a24c82674a3a8e7f7306e57e98219e5c1cdfc0f57bc70986930dda136230b2"}, - {file = 
"pydantic_core-2.41.3-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8e0c81dc047c18059410c959a437540abcefea6a882d6e43b9bf45c291eaacd9"}, - {file = "pydantic_core-2.41.3-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0d7e1a9f80f00a8180b9194ecef66958eb03f3c3ae2d77195c9d665ac0a61e"}, - {file = "pydantic_core-2.41.3-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2868fabfc35ec0738539ce0d79aab37aeffdcb9682b9b91f0ac4b0ba31abb1eb"}, - {file = "pydantic_core-2.41.3-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:cb4f40c93307e1c50996e4edcddf338e1f3f1fb86fb69b654111c6050ae3b081"}, - {file = "pydantic_core-2.41.3-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:287cbcd3407a875eaf0b1efa2e5288493d5b79bfd3629459cf0b329ad8a9071a"}, - {file = "pydantic_core-2.41.3-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:5253835aa145049205a67056884555a936f9b3fea7c3ce860bff62be6a1ae4d1"}, - {file = "pydantic_core-2.41.3-cp314-cp314-win32.whl", hash = "sha256:69297795efe5349156d18eebea818b75d29a1d3d1d5f26a250f22ab4220aacd6"}, - {file = "pydantic_core-2.41.3-cp314-cp314-win_amd64.whl", hash = "sha256:e1c133e3447c2f6d95e47ede58fff0053370758112a1d39117d0af8c93584049"}, - {file = "pydantic_core-2.41.3-cp314-cp314-win_arm64.whl", hash = "sha256:54534eecbb7a331521f832e15fc307296f491ee1918dacfd4d5b900da6ee3332"}, - {file = "pydantic_core-2.41.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6b4be10152098b43c093a4b5e9e9da1ac7a1c954c1934d4438d07ba7b7bcf293"}, - {file = "pydantic_core-2.41.3-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe4ebd676c158a7994253161151b476dbbef2acbd2f547cfcfdf332cf67cc29"}, - {file = "pydantic_core-2.41.3-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:984ca0113b39dda1d7c358d6db03dd6539ef244d0558351806c1327239e035bf"}, - {file = "pydantic_core-2.41.3-cp314-cp314t-win_amd64.whl", hash = "sha256:2a7dd8a6f5a9a2f8c7f36e4fc0982a985dbc4ac7176ee3df9f63179b7295b626"}, - {file = "pydantic_core-2.41.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b387f08b378924fa82bd86e03c9d61d6daca1a73ffb3947bdcfe12ea14c41f68"}, - {file = "pydantic_core-2.41.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:267b64a4845471c33f12155140d7449643c0c190b5ae3be6a7a3c04461ac494b"}, - {file = "pydantic_core-2.41.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99b17a3ed3b8bf769815c782710e520b9b4efcede14eeea71ef57a2a16870ec9"}, - {file = "pydantic_core-2.41.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7f96e6fc3ab59e1ba1132f3105be9b8b7f80d071c73f7e8d2e1f594cbb64907"}, - {file = "pydantic_core-2.41.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:503923874b5496b0a7d6479f481e02342771c1561e96c1e28b97a5ad056e55e9"}, - {file = "pydantic_core-2.41.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:18dd9a88bc1017bea142a4936de1a32aec9723f13d6cb434bd2aeec23208143a"}, - {file = "pydantic_core-2.41.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:95da6803d101b5c35e4ea80f44da5ba5422f6695690570d7cc15f04a12ca4e33"}, - {file = "pydantic_core-2.41.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcc6bbcc83979b82fc1642dafd94b07c49f9b8e3b1df625f1c1aa676f952e48"}, - {file = "pydantic_core-2.41.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:70c01c179e1a786af804b93e3eb7506cd818744bff8cf9e3cda0d8bbb2d12204"}, - {file = 
"pydantic_core-2.41.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7c1010c4d2cc10703da089543c38909aa832656ffb85cd31dc3e3d73362e0249"}, - {file = "pydantic_core-2.41.3-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:cb13d215db8cb0f601227785f6d32c577387253ba3a47cbef72e7c6c93c13023"}, - {file = "pydantic_core-2.41.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92d96bb0abce0ce71f90845ad25b5521fbf8ce6e5589f4937cb047e4f5a36c76"}, - {file = "pydantic_core-2.41.3-cp39-cp39-win32.whl", hash = "sha256:8c8f7cae4451a7e83d781bd862c43b3591ede41b6d6adc5dead81300c3e0fbae"}, - {file = "pydantic_core-2.41.3-cp39-cp39-win_amd64.whl", hash = "sha256:2de13998e396d556c17065d7847e03f6c1ce6210eb1719a778a25425284f1a17"}, - {file = "pydantic_core-2.41.3-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:98ad9402d6cc194b21adb4626ead88fcce8bc287ef434502dbb4d5b71bdb9a47"}, - {file = "pydantic_core-2.41.3-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:539b1c01251fbc0789ad4e1dccf3e888062dd342b2796f403406855498afbc36"}, - {file = "pydantic_core-2.41.3-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:12019e3a4ded7c4e84b11a761be843dfa9837444a1d7f621888ad499f0f72643"}, - {file = "pydantic_core-2.41.3-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e01519c8322a489167abb1aceaab1a9e4c7d3e665dc3f7b0b1355910fcb698"}, - {file = "pydantic_core-2.41.3-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:a6ded5abbb7391c0db9e002aaa5f0e3a49a024b0a22e2ed09ab69087fd5ab8a8"}, - {file = "pydantic_core-2.41.3-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:43abc869cce9104ff35cb4eff3028e9a87346c95fe44e0173036bf4d782bdc3d"}, - {file = "pydantic_core-2.41.3-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb3c63f4014a603caee687cd5c3c63298d2c8951b7acb2ccd0befbf2e1c0b8ad"}, - {file = "pydantic_core-2.41.3-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88461e25f62e58db4d8b180e2612684f31b5844db0a8f8c1c421498c97bc197b"}, - {file = "pydantic_core-2.41.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:219a95d7638c6b3a50de749747afdf1c2bdf027653e4a3e1df2fefa1e238d8eb"}, - {file = "pydantic_core-2.41.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:21d4e730b75cfc62b3e24261030bd223ed5f867039f971027c551a7ab911f460"}, - {file = "pydantic_core-2.41.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:79d9a98a80309189a49cffcd507c85032a2df35d005bd12d655f425ca80eec3d"}, - {file = "pydantic_core-2.41.3-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:20f7d53153eb2a5c2f7a8cccf1a45022e2b75668cad274f998b43313da03053d"}, - {file = "pydantic_core-2.41.3-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e2135eff48d3b6a2abfe7b26395d350ea76a460d3de3cf2521fe2f15f222fa29"}, - {file = "pydantic_core-2.41.3-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:005bf20e48f6272803de8ba0be076e5bd7d015b7f02ebcc989bc24f85636d1d8"}, - {file = "pydantic_core-2.41.3-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d4ebfa1864046c44669cd789a613ec39ee194fe73842e369d129d716730216d9"}, - {file = "pydantic_core-2.41.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cb82cd643a2ad7ebf94bdb7fa6c339801b0fe8c7920610d6da7b691647ef5842"}, - {file = 
"pydantic_core-2.41.3-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5e67f86ffb40127851dba662b2d0ab400264ed37cfedeab6100515df41ccb325"}, - {file = "pydantic_core-2.41.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ecad4d7d264f6df23db68ca3024919a7aab34b4c44d9a9280952863a7a0c5e81"}, - {file = "pydantic_core-2.41.3-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fce6e6505b9807d3c20476fa016d0bd4d54a858fe648d6f5ef065286410c3da7"}, - {file = "pydantic_core-2.41.3-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:05974468cff84ea112ad4992823f1300d822ad51df0eba4c3af3c4a4cbe5eca0"}, - {file = "pydantic_core-2.41.3-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:091d3966dc2379e07b45b4fd9651fbab5b24ea3c62cc40637beaf691695e5f5a"}, - {file = "pydantic_core-2.41.3-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:16f216e4371a05ad3baa5aed152eae056c7e724663c2bcbb38edd607c17baa89"}, - {file = "pydantic_core-2.41.3-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:2e169371f88113c8e642f7ac42c798109f1270832b577b5144962a7a028bfb0c"}, - {file = "pydantic_core-2.41.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:83847aa6026fb7149b9ef06e10c73ff83ac1d2aa478b28caa4f050670c1c9a37"}, - {file = "pydantic_core-2.41.3.tar.gz", hash = "sha256:cdebb34b36ad05e8d77b4e797ad38a2a775c2a07a8fa386d4f6943b7778dcd39"}, + {file = "pydantic_core-2.41.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2442d9a4d38f3411f22eb9dd0912b7cbf4b7d5b6c92c4173b75d3e1ccd84e36e"}, + {file = "pydantic_core-2.41.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:30a9876226dda131a741afeab2702e2d127209bde3c65a2b8133f428bc5d006b"}, + {file = "pydantic_core-2.41.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d55bbac04711e2980645af68b97d445cdbcce70e5216de444a6c4b6943ebcccd"}, + {file = "pydantic_core-2.41.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e1d778fb7849a42d0ee5927ab0f7453bf9f85eef8887a546ec87db5ddb178945"}, + {file = "pydantic_core-2.41.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1b65077a4693a98b90ec5ad8f203ad65802a1b9b6d4a7e48066925a7e1606706"}, + {file = "pydantic_core-2.41.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:62637c769dee16eddb7686bf421be48dfc2fae93832c25e25bc7242e698361ba"}, + {file = "pydantic_core-2.41.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dfe3aa529c8f501babf6e502936b9e8d4698502b2cfab41e17a028d91b1ac7b"}, + {file = "pydantic_core-2.41.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ca2322da745bf2eeb581fc9ea3bbb31147702163ccbcbf12a3bb630e4bf05e1d"}, + {file = "pydantic_core-2.41.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e8cd3577c796be7231dcf80badcf2e0835a46665eaafd8ace124d886bab4d700"}, + {file = "pydantic_core-2.41.4-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:1cae8851e174c83633f0833e90636832857297900133705ee158cf79d40f03e6"}, + {file = "pydantic_core-2.41.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a26d950449aae348afe1ac8be5525a00ae4235309b729ad4d3399623125b43c9"}, + {file = "pydantic_core-2.41.4-cp310-cp310-win32.whl", hash = "sha256:0cf2a1f599efe57fa0051312774280ee0f650e11152325e41dfd3018ef2c1b57"}, + {file = "pydantic_core-2.41.4-cp310-cp310-win_amd64.whl", hash = "sha256:a8c2e340d7e454dc3340d3d2e8f23558ebe78c98aa8f68851b04dcb7bc37abdc"}, + {file = 
"pydantic_core-2.41.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:28ff11666443a1a8cf2a044d6a545ebffa8382b5f7973f22c36109205e65dc80"}, + {file = "pydantic_core-2.41.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:61760c3925d4633290292bad462e0f737b840508b4f722247d8729684f6539ae"}, + {file = "pydantic_core-2.41.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eae547b7315d055b0de2ec3965643b0ab82ad0106a7ffd29615ee9f266a02827"}, + {file = "pydantic_core-2.41.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ef9ee5471edd58d1fcce1c80ffc8783a650e3e3a193fe90d52e43bb4d87bff1f"}, + {file = "pydantic_core-2.41.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:15dd504af121caaf2c95cb90c0ebf71603c53de98305621b94da0f967e572def"}, + {file = "pydantic_core-2.41.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3a926768ea49a8af4d36abd6a8968b8790f7f76dd7cbd5a4c180db2b4ac9a3a2"}, + {file = "pydantic_core-2.41.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6916b9b7d134bff5440098a4deb80e4cb623e68974a87883299de9124126c2a8"}, + {file = "pydantic_core-2.41.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5cf90535979089df02e6f17ffd076f07237efa55b7343d98760bde8743c4b265"}, + {file = "pydantic_core-2.41.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7533c76fa647fade2d7ec75ac5cc079ab3f34879626dae5689b27790a6cf5a5c"}, + {file = "pydantic_core-2.41.4-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:37e516bca9264cbf29612539801ca3cd5d1be465f940417b002905e6ed79d38a"}, + {file = "pydantic_core-2.41.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0c19cb355224037c83642429b8ce261ae108e1c5fbf5c028bac63c77b0f8646e"}, + {file = "pydantic_core-2.41.4-cp311-cp311-win32.whl", hash = "sha256:09c2a60e55b357284b5f31f5ab275ba9f7f70b7525e18a132ec1f9160b4f1f03"}, + {file = "pydantic_core-2.41.4-cp311-cp311-win_amd64.whl", hash = "sha256:711156b6afb5cb1cb7c14a2cc2c4a8b4c717b69046f13c6b332d8a0a8f41ca3e"}, + {file = "pydantic_core-2.41.4-cp311-cp311-win_arm64.whl", hash = "sha256:6cb9cf7e761f4f8a8589a45e49ed3c0d92d1d696a45a6feaee8c904b26efc2db"}, + {file = "pydantic_core-2.41.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:ab06d77e053d660a6faaf04894446df7b0a7e7aba70c2797465a0a1af00fc887"}, + {file = "pydantic_core-2.41.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c53ff33e603a9c1179a9364b0a24694f183717b2e0da2b5ad43c316c956901b2"}, + {file = "pydantic_core-2.41.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:304c54176af2c143bd181d82e77c15c41cbacea8872a2225dd37e6544dce9999"}, + {file = "pydantic_core-2.41.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:025ba34a4cf4fb32f917d5d188ab5e702223d3ba603be4d8aca2f82bede432a4"}, + {file = "pydantic_core-2.41.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9f5f30c402ed58f90c70e12eff65547d3ab74685ffe8283c719e6bead8ef53f"}, + {file = "pydantic_core-2.41.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd96e5d15385d301733113bcaa324c8bcf111275b7675a9c6e88bfb19fc05e3b"}, + {file = "pydantic_core-2.41.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98f348cbb44fae6e9653c1055db7e29de67ea6a9ca03a5fa2c2e11a47cff0e47"}, + {file = "pydantic_core-2.41.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:ec22626a2d14620a83ca583c6f5a4080fa3155282718b6055c2ea48d3ef35970"}, + {file = "pydantic_core-2.41.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3a95d4590b1f1a43bf33ca6d647b990a88f4a3824a8c4572c708f0b45a5290ed"}, + {file = "pydantic_core-2.41.4-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:f9672ab4d398e1b602feadcffcdd3af44d5f5e6ddc15bc7d15d376d47e8e19f8"}, + {file = "pydantic_core-2.41.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:84d8854db5f55fead3b579f04bda9a36461dab0730c5d570e1526483e7bb8431"}, + {file = "pydantic_core-2.41.4-cp312-cp312-win32.whl", hash = "sha256:9be1c01adb2ecc4e464392c36d17f97e9110fbbc906bcbe1c943b5b87a74aabd"}, + {file = "pydantic_core-2.41.4-cp312-cp312-win_amd64.whl", hash = "sha256:d682cf1d22bab22a5be08539dca3d1593488a99998f9f412137bc323179067ff"}, + {file = "pydantic_core-2.41.4-cp312-cp312-win_arm64.whl", hash = "sha256:833eebfd75a26d17470b58768c1834dfc90141b7afc6eb0429c21fc5a21dcfb8"}, + {file = "pydantic_core-2.41.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:85e050ad9e5f6fe1004eec65c914332e52f429bc0ae12d6fa2092407a462c746"}, + {file = "pydantic_core-2.41.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e7393f1d64792763a48924ba31d1e44c2cfbc05e3b1c2c9abb4ceeadd912cced"}, + {file = "pydantic_core-2.41.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94dab0940b0d1fb28bcab847adf887c66a27a40291eedf0b473be58761c9799a"}, + {file = "pydantic_core-2.41.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:de7c42f897e689ee6f9e93c4bec72b99ae3b32a2ade1c7e4798e690ff5246e02"}, + {file = "pydantic_core-2.41.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:664b3199193262277b8b3cd1e754fb07f2c6023289c815a1e1e8fb415cb247b1"}, + {file = "pydantic_core-2.41.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d95b253b88f7d308b1c0b417c4624f44553ba4762816f94e6986819b9c273fb2"}, + {file = "pydantic_core-2.41.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1351f5bbdbbabc689727cb91649a00cb9ee7203e0a6e54e9f5ba9e22e384b84"}, + {file = "pydantic_core-2.41.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1affa4798520b148d7182da0615d648e752de4ab1a9566b7471bc803d88a062d"}, + {file = "pydantic_core-2.41.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7b74e18052fea4aa8dea2fb7dbc23d15439695da6cbe6cfc1b694af1115df09d"}, + {file = "pydantic_core-2.41.4-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:285b643d75c0e30abda9dc1077395624f314a37e3c09ca402d4015ef5979f1a2"}, + {file = "pydantic_core-2.41.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:f52679ff4218d713b3b33f88c89ccbf3a5c2c12ba665fb80ccc4192b4608dbab"}, + {file = "pydantic_core-2.41.4-cp313-cp313-win32.whl", hash = "sha256:ecde6dedd6fff127c273c76821bb754d793be1024bc33314a120f83a3c69460c"}, + {file = "pydantic_core-2.41.4-cp313-cp313-win_amd64.whl", hash = "sha256:d081a1f3800f05409ed868ebb2d74ac39dd0c1ff6c035b5162356d76030736d4"}, + {file = "pydantic_core-2.41.4-cp313-cp313-win_arm64.whl", hash = "sha256:f8e49c9c364a7edcbe2a310f12733aad95b022495ef2a8d653f645e5d20c1564"}, + {file = "pydantic_core-2.41.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ed97fd56a561f5eb5706cebe94f1ad7c13b84d98312a05546f2ad036bafe87f4"}, + {file = "pydantic_core-2.41.4-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a870c307bf1ee91fc58a9a61338ff780d01bfae45922624816878dce784095d2"}, + {file = 
"pydantic_core-2.41.4-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d25e97bc1f5f8f7985bdc2335ef9e73843bb561eb1fa6831fdfc295c1c2061cf"}, + {file = "pydantic_core-2.41.4-cp313-cp313t-win_amd64.whl", hash = "sha256:d405d14bea042f166512add3091c1af40437c2e7f86988f3915fabd27b1e9cd2"}, + {file = "pydantic_core-2.41.4-cp313-cp313t-win_arm64.whl", hash = "sha256:19f3684868309db5263a11bace3c45d93f6f24afa2ffe75a647583df22a2ff89"}, + {file = "pydantic_core-2.41.4-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:e9205d97ed08a82ebb9a307e92914bb30e18cdf6f6b12ca4bedadb1588a0bfe1"}, + {file = "pydantic_core-2.41.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:82df1f432b37d832709fbcc0e24394bba04a01b6ecf1ee87578145c19cde12ac"}, + {file = "pydantic_core-2.41.4-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc3b4cc4539e055cfa39a3763c939f9d409eb40e85813257dcd761985a108554"}, + {file = "pydantic_core-2.41.4-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b1eb1754fce47c63d2ff57fdb88c351a6c0150995890088b33767a10218eaa4e"}, + {file = "pydantic_core-2.41.4-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e6ab5ab30ef325b443f379ddb575a34969c333004fca5a1daa0133a6ffaad616"}, + {file = "pydantic_core-2.41.4-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:31a41030b1d9ca497634092b46481b937ff9397a86f9f51bd41c4767b6fc04af"}, + {file = "pydantic_core-2.41.4-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a44ac1738591472c3d020f61c6df1e4015180d6262ebd39bf2aeb52571b60f12"}, + {file = "pydantic_core-2.41.4-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d72f2b5e6e82ab8f94ea7d0d42f83c487dc159c5240d8f83beae684472864e2d"}, + {file = "pydantic_core-2.41.4-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:c4d1e854aaf044487d31143f541f7aafe7b482ae72a022c664b2de2e466ed0ad"}, + {file = "pydantic_core-2.41.4-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:b568af94267729d76e6ee5ececda4e283d07bbb28e8148bb17adad93d025d25a"}, + {file = "pydantic_core-2.41.4-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:6d55fb8b1e8929b341cc313a81a26e0d48aa3b519c1dbaadec3a6a2b4fcad025"}, + {file = "pydantic_core-2.41.4-cp314-cp314-win32.whl", hash = "sha256:5b66584e549e2e32a1398df11da2e0a7eff45d5c2d9db9d5667c5e6ac764d77e"}, + {file = "pydantic_core-2.41.4-cp314-cp314-win_amd64.whl", hash = "sha256:557a0aab88664cc552285316809cab897716a372afaf8efdbef756f8b890e894"}, + {file = "pydantic_core-2.41.4-cp314-cp314-win_arm64.whl", hash = "sha256:3f1ea6f48a045745d0d9f325989d8abd3f1eaf47dd00485912d1a3a63c623a8d"}, + {file = "pydantic_core-2.41.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6c1fe4c5404c448b13188dd8bd2ebc2bdd7e6727fa61ff481bcc2cca894018da"}, + {file = "pydantic_core-2.41.4-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:523e7da4d43b113bf8e7b49fa4ec0c35bf4fe66b2230bfc5c13cc498f12c6c3e"}, + {file = "pydantic_core-2.41.4-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5729225de81fb65b70fdb1907fcf08c75d498f4a6f15af005aabb1fdadc19dfa"}, + {file = "pydantic_core-2.41.4-cp314-cp314t-win_amd64.whl", hash = "sha256:de2cfbb09e88f0f795fd90cf955858fc2c691df65b1f21f0aa00b99f3fbc661d"}, + {file = "pydantic_core-2.41.4-cp314-cp314t-win_arm64.whl", hash = "sha256:d34f950ae05a83e0ede899c595f312ca976023ea1db100cd5aa188f7005e3ab0"}, + {file = 
"pydantic_core-2.41.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:646e76293345954acea6966149683047b7b2ace793011922208c8e9da12b0062"}, + {file = "pydantic_core-2.41.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cc8e85a63085a137d286e2791037f5fdfff0aabb8b899483ca9c496dd5797338"}, + {file = "pydantic_core-2.41.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:692c622c8f859a17c156492783902d8370ac7e121a611bd6fe92cc71acf9ee8d"}, + {file = "pydantic_core-2.41.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d1e2906efb1031a532600679b424ef1d95d9f9fb507f813951f23320903adbd7"}, + {file = "pydantic_core-2.41.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e04e2f7f8916ad3ddd417a7abdd295276a0bf216993d9318a5d61cc058209166"}, + {file = "pydantic_core-2.41.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df649916b81822543d1c8e0e1d079235f68acdc7d270c911e8425045a8cfc57e"}, + {file = "pydantic_core-2.41.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66c529f862fdba70558061bb936fe00ddbaaa0c647fd26e4a4356ef1d6561891"}, + {file = "pydantic_core-2.41.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fc3b4c5a1fd3a311563ed866c2c9b62da06cb6398bee186484ce95c820db71cb"}, + {file = "pydantic_core-2.41.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:6e0fc40d84448f941df9b3334c4b78fe42f36e3bf631ad54c3047a0cdddc2514"}, + {file = "pydantic_core-2.41.4-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:44e7625332683b6c1c8b980461475cde9595eff94447500e80716db89b0da005"}, + {file = "pydantic_core-2.41.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:170ee6835f6c71081d031ef1c3b4dc4a12b9efa6a9540f93f95b82f3c7571ae8"}, + {file = "pydantic_core-2.41.4-cp39-cp39-win32.whl", hash = "sha256:3adf61415efa6ce977041ba9745183c0e1f637ca849773afa93833e04b163feb"}, + {file = "pydantic_core-2.41.4-cp39-cp39-win_amd64.whl", hash = "sha256:a238dd3feee263eeaeb7dc44aea4ba1364682c4f9f9467e6af5596ba322c2332"}, + {file = "pydantic_core-2.41.4-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:a1b2cfec3879afb742a7b0bcfa53e4f22ba96571c9e54d6a3afe1052d17d843b"}, + {file = "pydantic_core-2.41.4-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:d175600d975b7c244af6eb9c9041f10059f20b8bbffec9e33fdd5ee3f67cdc42"}, + {file = "pydantic_core-2.41.4-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f184d657fa4947ae5ec9c47bd7e917730fa1cbb78195037e32dcbab50aca5ee"}, + {file = "pydantic_core-2.41.4-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ed810568aeffed3edc78910af32af911c835cc39ebbfacd1f0ab5dd53028e5c"}, + {file = "pydantic_core-2.41.4-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:4f5d640aeebb438517150fdeec097739614421900e4a08db4a3ef38898798537"}, + {file = "pydantic_core-2.41.4-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:4a9ab037b71927babc6d9e7fc01aea9e66dc2a4a34dff06ef0724a4049629f94"}, + {file = "pydantic_core-2.41.4-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4dab9484ec605c3016df9ad4fd4f9a390bc5d816a3b10c6550f8424bb80b18c"}, + {file = "pydantic_core-2.41.4-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8a5028425820731d8c6c098ab642d7b8b999758e24acae03ed38a66eca8335"}, + {file 
= "pydantic_core-2.41.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:1e5ab4fc177dd41536b3c32b2ea11380dd3d4619a385860621478ac2d25ceb00"}, + {file = "pydantic_core-2.41.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:3d88d0054d3fa11ce936184896bed3c1c5441d6fa483b498fac6a5d0dd6f64a9"}, + {file = "pydantic_core-2.41.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b2a054a8725f05b4b6503357e0ac1c4e8234ad3b0c2ac130d6ffc66f0e170e2"}, + {file = "pydantic_core-2.41.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b0d9db5a161c99375a0c68c058e227bee1d89303300802601d76a3d01f74e258"}, + {file = "pydantic_core-2.41.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:6273ea2c8ffdac7b7fda2653c49682db815aebf4a89243a6feccf5e36c18c347"}, + {file = "pydantic_core-2.41.4-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:4c973add636efc61de22530b2ef83a65f39b6d6f656df97f678720e20de26caa"}, + {file = "pydantic_core-2.41.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b69d1973354758007f46cf2d44a4f3d0933f10b6dc9bf15cf1356e037f6f731a"}, + {file = "pydantic_core-2.41.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:3619320641fd212aaf5997b6ca505e97540b7e16418f4a241f44cdf108ffb50d"}, + {file = "pydantic_core-2.41.4-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:491535d45cd7ad7e4a2af4a5169b0d07bebf1adfd164b0368da8aa41e19907a5"}, + {file = "pydantic_core-2.41.4-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:54d86c0cada6aba4ec4c047d0e348cbad7063b87ae0f005d9f8c9ad04d4a92a2"}, + {file = "pydantic_core-2.41.4-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eca1124aced216b2500dc2609eade086d718e8249cb9696660ab447d50a758bd"}, + {file = "pydantic_core-2.41.4-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6c9024169becccf0cb470ada03ee578d7348c119a0d42af3dcf9eda96e3a247c"}, + {file = "pydantic_core-2.41.4-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:26895a4268ae5a2849269f4991cdc97236e4b9c010e51137becf25182daac405"}, + {file = "pydantic_core-2.41.4-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:ca4df25762cf71308c446e33c9b1fdca2923a3f13de616e2a949f38bf21ff5a8"}, + {file = "pydantic_core-2.41.4-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:5a28fcedd762349519276c36634e71853b4541079cab4acaaac60c4421827308"}, + {file = "pydantic_core-2.41.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c173ddcd86afd2535e2b695217e82191580663a1d1928239f877f5a1649ef39f"}, + {file = "pydantic_core-2.41.4.tar.gz", hash = "sha256:70e47929a9d4a1905a67e4b687d5946026390568a8e952b92824118063cee4d5"}, ] [package.dependencies] @@ -3150,4 +3150,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "585c28e85f219855a851b33c4881fc8a594e75c962e687436572fe216c588486" +content-hash = "81c6f8a402bcd05a9e24cb1875f4f3f5d2e5e739d7fcab6eabadc93fbe82116b" diff --git a/pyproject.toml b/pyproject.toml index 5b16c17b82..60f25a94b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ pgconnstr = "^1.0.1" requests = "^2.32.5" tenacity = "^9.1.2" psycopg2 = "^2.9.11" -pydantic = "^2.11.4" +pydantic = "^2.12.2" jinja2 = "^3.1.6" pysyncobj = "^0.3.14" psutil = "^7.1.0" diff --git a/src/charm.py b/src/charm.py index abf20da132..75399a3754 100755 --- a/src/charm.py +++ b/src/charm.py @@ -2342,6 +2342,24 @@ def _can_connect_to_postgresql(self) -> bool: return False 
return True + def _api_update_config(self) -> None: + # Use config value if set, calculate otherwise + max_connections = ( + self.config.experimental_max_connections + if self.config.experimental_max_connections + else max(4 * self.cpu_count, 100) + ) + cfg_patch = { + "max_connections": max_connections, + "max_prepared_transactions": self.config.memory_max_prepared_transactions, + "max_replication_slots": 25, + "max_wal_senders": 25, + "wal_keep_size": self.config.durability_wal_keep_size, + } + if primary_endpoint := self.async_replication.get_primary_cluster_endpoint(): + cfg_patch["standby_cluster"] = {"host": primary_endpoint} + self._patroni.bulk_update_parameters_controller_by_patroni(cfg_patch) + def update_config( self, is_creating_backup: bool = False, @@ -2411,20 +2429,7 @@ def update_config( logger.warning("Early exit update_config: Cannot connect to Postgresql") return False - # Use config value if set, calculate otherwise - max_connections = ( - self.config.experimental_max_connections - if self.config.experimental_max_connections - else max(4 * self.cpu_count, 100) - ) - - self._patroni.bulk_update_parameters_controller_by_patroni({ - "max_connections": max_connections, - "max_prepared_transactions": self.config.memory_max_prepared_transactions, - "max_replication_slots": 25, - "max_wal_senders": 25, - "wal_keep_size": self.config.durability_wal_keep_size, - }) + self._api_update_config() self._patroni.ensure_slots_controller_by_patroni(replication_slots) diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py index 0c09581fee..e3adb90e9c 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -179,7 +179,7 @@ def get_db_standby_leader_unit(juju: Juju, app_name: str) -> str: """Get the current standby node of the cluster.""" unit_address = get_unit_ip(juju, app_name, get_app_leader(juju, app_name)) - for member in requests.get(f"https://{unit_address}:8008/history", verify=False).json()[ + for member in requests.get(f"https://{unit_address}:8008/cluster", verify=False).json()[ "members" ]: if member["role"] == "standby_leader": diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index bbf54ba36c..1352fe56e8 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -265,7 +265,7 @@ def test_unrelate_and_relate(first_model: str, second_model: str) -> None: def test_failover_in_main_cluster(first_model: str, second_model: str) -> None: """Test that async replication fails over correctly.""" - model_1 = Juju(model=second_model) + model_1 = Juju(model=first_model) rerelate_test_app(model_1, DB_APP_1, DB_TEST_APP_1) @@ -275,7 +275,7 @@ def test_failover_in_main_cluster(first_model: str, second_model: str) -> None: ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=10 * MINUTE_SECS ) - results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_1) + results = get_db_max_written_values(first_model, second_model, first_model, DB_TEST_APP_1) assert len(results) == 5 assert all(results[0] == x for x in results), "Data is not consistent across units" @@ -298,7 +298,7 @@ def test_failover_in_standby_cluster(first_model: str, second_model: str) -> Non 
ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=10 * MINUTE_SECS ) - results = get_db_max_written_values(first_model, second_model, second_model, DB_TEST_APP_1) + results = get_db_max_written_values(first_model, second_model, first_model, DB_TEST_APP_1) assert len(results) == 4 assert all(results[0] == x for x in results), "Data is not consistent across units" diff --git a/tests/integration/high_availability/test_async_replication_upgrade.py b/tests/integration/high_availability/test_async_replication_upgrade.py new file mode 100644 index 0000000000..0a078290cf --- /dev/null +++ b/tests/integration/high_availability/test_async_replication_upgrade.py @@ -0,0 +1,277 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging +import time +from collections.abc import Generator + +import jubilant +import pytest +from jubilant import Juju + +from .. import architecture +from .high_availability_helpers_new import ( + check_db_units_writes_increment, + get_app_leader, + get_app_units, + get_db_max_written_value, + wait_for_apps_status, ) + +DB_APP_NAME = "postgresql" +DB_APP_1 = "db1" +DB_APP_2 = "db2" +DB_TEST_APP_NAME = "postgresql-test-app" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +@pytest.fixture(scope="module") +def first_model(juju: Juju, request: pytest.FixtureRequest) -> Generator: + """Creates and returns the first model.""" + yield juju.model + + +@pytest.fixture(scope="module") +def second_model(juju: Juju, request: pytest.FixtureRequest) -> Generator: + """Creates and returns the second model.""" + model_name = f"{juju.model}-other" + + logging.info(f"Creating model: {model_name}") + juju.add_model(model_name) + + yield model_name + if request.config.getoption("--keep-models"): + return + + logging.info(f"Destroying model: {model_name}") + juju.destroy_model(model_name, destroy_storage=True, force=True) + + +@pytest.fixture() +def continuous_writes(first_model: str) -> Generator: + """Starts continuous writes to the MySQL cluster for a test and clear the writes at the end.""" + model_1 = Juju(model=first_model) + model_1_test_app_leader = get_app_leader(model_1, DB_TEST_APP_NAME) + + logging.info("Clearing continuous writes") + model_1.run(model_1_test_app_leader, "clear-continuous-writes") + logging.info("Starting continuous writes") + model_1.run(model_1_test_app_leader, "start-continuous-writes") + + yield + + logging.info("Clearing continuous writes") + model_1.run(model_1_test_app_leader, "clear-continuous-writes") + + +def test_deploy(first_model: str, second_model: str, charm: str) -> None: + """Simple test to ensure that the PostgreSQL application charms get deployed.""" + configuration = {"profile": "testing"} + constraints = {"arch": architecture.architecture} + + logging.info("Deploying postgresql clusters") + model_1 = Juju(model=first_model) + model_1.deploy( + charm=DB_APP_NAME, + app=DB_APP_1, + base="ubuntu@24.04", + channel="16/edge", + config=configuration, + constraints=constraints, + num_units=3, + ) + model_2 = Juju(model=second_model) + model_2.deploy( + charm=DB_APP_NAME, + app=DB_APP_2, + base="ubuntu@24.04", + channel="16/edge", + config=configuration, + constraints=constraints, + num_units=3, + ) + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=20 * MINUTE_SECS + ) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=20
* MINUTE_SECS + ) + + +def test_async_relate(first_model: str, second_model: str) -> None: + """Relate the two MySQL clusters.""" + logging.info("Creating offers in first model") + model_1 = Juju(model=first_model) + model_1.offer(DB_APP_1, endpoint="replication-offer") + + logging.info("Consuming offer in second model") + model_2 = Juju(model=second_model) + model_2.consume(f"{first_model}.{DB_APP_1}") + + logging.info("Relating the two mysql clusters") + model_2.integrate(f"{DB_APP_1}", f"{DB_APP_2}:replication") + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant.any_blocked, DB_APP_1), timeout=5 * MINUTE_SECS + ) + model_2.wait( + ready=wait_for_apps_status(jubilant.any_waiting, DB_APP_2), timeout=5 * MINUTE_SECS + ) + + +def test_deploy_test_app(first_model: str) -> None: + """Deploy the test application.""" + constraints = {"arch": architecture.architecture} + + logging.info("Deploying the test application") + model_1 = Juju(model=first_model) + model_1.deploy( + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_NAME, + base="ubuntu@22.04", + channel="latest/edge", + constraints=constraints, + num_units=1, + ) + + logging.info("Relating the test application") + model_1.integrate(f"{DB_APP_1}:database", f"{DB_TEST_APP_NAME}:database") + + model_1.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_TEST_APP_NAME), timeout=10 * MINUTE_SECS + ) + + +def test_create_replication(first_model: str, second_model: str) -> None: + """Run the create-replication action and wait for the applications to settle.""" + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + + logging.info("Running create replication action") + task = model_1.run( + unit=get_app_leader(model_1, DB_APP_1), + action="create-replication", + wait=5 * MINUTE_SECS, + ) + task.raise_on_failure() + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=5 * MINUTE_SECS + ) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=5 * MINUTE_SECS + ) + + +def test_upgrade_from_edge( + first_model: str, second_model: str, charm: str, continuous_writes +) -> None: + """Upgrade the two MySQL clusters.""" + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + + run_pre_refresh_checks(model_1, DB_APP_1) + run_upgrade_from_edge(model_1, DB_APP_1, charm) + + run_pre_refresh_checks(model_2, DB_APP_2) + run_upgrade_from_edge(model_2, DB_APP_2, charm) + + +def test_data_replication(first_model: str, second_model: str, continuous_writes) -> None: + """Test to write to primary, and read the same data back from replicas.""" + logging.info("Testing data replication") + results = get_mysql_max_written_values(first_model, second_model) + + assert len(results) == 6 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + +def get_mysql_max_written_values(first_model: str, second_model: str) -> list[int]: + """Return list with max written value from all units.""" + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + + logging.info("Stopping continuous writes") + stopping_task = model_1.run( + unit=get_app_leader(model_1, DB_TEST_APP_NAME), action="stop-continuous-writes", params={} + ) + stopping_task.raise_on_failure() + + time.sleep(5) + results = [] + + logging.info(f"Querying max value on all {DB_APP_1} units") +
for unit_name in get_app_units(model_1, DB_APP_1): + unit_max_value = get_db_max_written_value(model_1, DB_APP_1, unit_name) + results.append(unit_max_value) + + logging.info(f"Querying max value on all {DB_APP_2} units") + for unit_name in get_app_units(model_2, DB_APP_2): + unit_max_value = get_db_max_written_value(model_2, DB_APP_2, unit_name) + results.append(unit_max_value) + + return results + + +def run_pre_refresh_checks(juju: Juju, app_name: str) -> None: + """Run the pre-refresh-check actions.""" + app_leader = get_app_leader(juju, app_name) + + logging.info("Run pre-refresh-check action") + juju.run(unit=app_leader, action="pre-refresh-check").raise_on_failure() + + +def run_upgrade_from_edge(juju: Juju, app_name: str, charm: str) -> None: + """Refresh the given application to the locally built charm.""" + logging.info("Ensure continuous writes are incrementing") + check_db_units_writes_increment(juju, app_name) + + logging.info("Refresh the charm") + juju.refresh(app=app_name, path=charm) + logging.info("Wait for refresh to block as paused or incompatible") + try: + juju.wait(lambda status: status.apps[app_name].is_blocked, timeout=5 * MINUTE_SECS) + + units = get_app_units(juju, app_name) + unit_names = sorted(units.keys()) + + if "Refresh incompatible" in juju.status().apps[app_name].app_status.message: + logging.info("Application refresh is blocked due to incompatibility") + juju.run( + unit=unit_names[-1], + action="force-refresh-start", + params={"check-compatibility": False}, + wait=5 * MINUTE_SECS, + ) + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) + + logging.info("Run resume-refresh action") + juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) + except TimeoutError: + logging.info("Upgrade completed without snap refresh (charm.py upgrade only)") + assert juju.status().apps[app_name].is_active + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, app_name), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, app_name), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes are incrementing") + check_db_units_writes_increment(juju, app_name) diff --git a/tests/spread/test_async_replication_upgrade.py/task.yaml b/tests/spread/test_async_replication_upgrade.py/task.yaml new file mode 100644 index 0000000000..b5b034dcef --- /dev/null +++ b/tests/spread/test_async_replication_upgrade.py/task.yaml @@ -0,0 +1,7 @@ +summary: test_async_replication_upgrade.py +environment: + TEST_MODULE: high_availability/test_async_replication_upgrade.py +execute: | + tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" +artifacts: + - allure-results diff --git a/tests/spread/test_upgrade_skip_pre_upgrade_check.py/task.yaml b/tests/spread/test_upgrade_skip_pre_upgrade_check.py/task.yaml index 79ed8357d0..a57bec914b 100644 --- a/tests/spread/test_upgrade_skip_pre_upgrade_check.py/task.yaml +++ b/tests/spread/test_upgrade_skip_pre_upgrade_check.py/task.yaml @@ -1,4 +1,4 @@ -summary: test_upgrade.py +summary: test_upgrade_skip_pre_upgrade_check.py environment: TEST_MODULE: high_availability/test_upgrade_skip_pre_upgrade_check.py execute: | From 0d7bc5f1cc82e5525f286fccae097de00c6aff5b Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 15 Oct 2025 05:35:16 +0300 Subject: [PATCH 28/33] Scale up and more tweaks ---
.../test_async_replication.py | 23 ++++++++ .../test_async_replication_upgrade.py | 57 +++++++++++-------- 2 files changed, 57 insertions(+), 23 deletions(-) diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index 1352fe56e8..e3d8e4f240 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -307,6 +307,29 @@ def test_failover_in_standby_cluster(first_model: str, second_model: str) -> Non assert standby != get_db_standby_leader_unit(model_2, DB_APP_2) +def scale_up(first_model: str, second_model: str) -> None: + model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) + + rerelate_test_app(model_1, DB_APP_1, DB_TEST_APP_1) + model_1.add_unit(DB_APP_1) + model_2.add_unit(DB_APP_2) + + logging.info("Waiting for the applications to settle") + model_1.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=20 * MINUTE_SECS + ) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=20 * MINUTE_SECS + ) + + results = get_db_max_written_values(first_model, second_model, first_model, DB_TEST_APP_1) + + assert len(results) == 6 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" + + def get_db_max_written_values( first_model: str, second_model: str, test_model: str, test_app: str ) -> list[int]: diff --git a/tests/integration/high_availability/test_async_replication_upgrade.py b/tests/integration/high_availability/test_async_replication_upgrade.py index 0a078290cf..0371882a86 100644 --- a/tests/integration/high_availability/test_async_replication_upgrade.py +++ b/tests/integration/high_availability/test_async_replication_upgrade.py @@ -9,6 +9,7 @@ import jubilant import pytest from jubilant import Juju +from tenacity import Retrying, stop_after_attempt from .. 
import architecture from .high_availability_helpers_new import ( @@ -52,20 +53,31 @@ def second_model(juju: Juju, request: pytest.FixtureRequest) -> Generator: @pytest.fixture() -def continuous_writes(first_model: str) -> Generator: - """Starts continuous writes to the MySQL cluster for a test and clear the writes at the end.""" +def first_model_continuous_writes(first_model: str) -> Generator: + """Starts continuous writes to the cluster for a test and clear the writes at the end.""" model_1 = Juju(model=first_model) - model_1_test_app_leader = get_app_leader(model_1, DB_TEST_APP_NAME) + application_unit = get_app_leader(model_1, DB_TEST_APP_NAME) logging.info("Clearing continuous writes") - model_1.run(model_1_test_app_leader, "clear-continuous-writes") + model_1.run( + unit=application_unit, action="clear-continuous-writes", wait=120 + ).raise_on_failure() + logging.info("Starting continuous writes") - model_1.run(model_1_test_app_leader, "start-continuous-writes") + + for attempt in Retrying(stop=stop_after_attempt(10), reraise=True): + with attempt: + result = model_1.run(unit=application_unit, action="start-continuous-writes") + result.raise_on_failure() + + assert result.results["result"] == "True" yield logging.info("Clearing continuous writes") - model_1.run(model_1_test_app_leader, "clear-continuous-writes") + model_1.run( + unit=application_unit, action="clear-continuous-writes", wait=120 + ).raise_on_failure() def test_deploy(first_model: str, second_model: str, charm: str) -> None: @@ -108,21 +120,23 @@ def test_async_relate(first_model: str, second_model: str) -> None: """Relate the two MySQL clusters.""" logging.info("Creating offers in first model") model_1 = Juju(model=first_model) - model_1.offer(DB_APP_1, endpoint="replication-offer") + model_1.offer(f"{first_model}.{DB_APP_1}", endpoint="replication-offer") logging.info("Consuming offer in second model") model_2 = Juju(model=second_model) model_2.consume(f"{first_model}.{DB_APP_1}") - logging.info("Relating the two mysql clusters") + logging.info("Relating the two postgresql clusters") model_2.integrate(f"{DB_APP_1}", f"{DB_APP_2}:replication") logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.any_blocked, DB_APP_1), timeout=5 * MINUTE_SECS + ready=wait_for_apps_status(jubilant.any_active, DB_APP_1), + timeout=10 * MINUTE_SECS, ) model_2.wait( - ready=wait_for_apps_status(jubilant.any_waiting, DB_APP_2), timeout=5 * MINUTE_SECS + ready=wait_for_apps_status(jubilant.any_active, DB_APP_2), + timeout=10 * MINUTE_SECS, ) @@ -155,24 +169,21 @@ def test_create_replication(first_model: str, second_model: str) -> None: model_2 = Juju(model=second_model) logging.info("Running create replication action") - task = model_1.run( - unit=get_app_leader(model_1, DB_APP_1), - action="create-replication", - wait=5 * MINUTE_SECS, - ) - task.raise_on_failure() + model_1.run( + unit=get_app_leader(model_1, DB_APP_1), action="create-replication", wait=5 * MINUTE_SECS + ).raise_on_failure() logging.info("Waiting for the applications to settle") model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=5 * MINUTE_SECS + ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=20 * MINUTE_SECS ) model_2.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=5 * MINUTE_SECS + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=20 * MINUTE_SECS ) def test_upgrade_from_edge( - first_model: str, second_model: str, charm: 
str, continuous_writes
+    first_model: str, second_model: str, charm: str, first_continuous_writes
 ) -> None:
     """Upgrade the two MySQL clusters."""
     model_1 = Juju(model=first_model)
@@ -188,14 +199,14 @@ def test_upgrade_from_edge(
 def test_data_replication(first_model: str, second_model: str, continuous_writes) -> None:
     """Test to write to primary, and read the same data back from replicas."""
     logging.info("Testing data replication")
-    results = await get_mysql_max_written_values(first_model, second_model)
+    results = get_db_max_written_values(first_model, second_model)
 
     assert len(results) == 6
     assert all(results[0] == x for x in results), "Data is not consistent across units"
     assert results[0] > 1, "No data was written to the database"
 
 
-def get_mysql_max_written_values(first_model: str, second_model: str) -> list[int]:
+def get_db_max_written_values(first_model: str, second_model: str) -> list[int]:
     """Return list with max written value from all units."""
     model_1 = Juju(model=first_model)
     model_2 = Juju(model=second_model)
@@ -211,12 +222,12 @@ def get_mysql_max_written_values(first_model: str, second_model: str) -> list[in
 
     logging.info(f"Querying max value on all {DB_APP_1} units")
     for unit_name in get_app_units(model_1, DB_APP_1):
-        unit_max_value = await get_db_max_written_value(model_1, DB_APP_1, unit_name)
+        unit_max_value = get_db_max_written_value(model_1, DB_APP_1, unit_name)
         results.append(unit_max_value)
 
     logging.info(f"Querying max value on all {DB_APP_2} units")
     for unit_name in get_app_units(model_2, DB_APP_2):
-        unit_max_value = await get_db_max_written_value(model_2, DB_APP_2, unit_name)
+        unit_max_value = get_db_max_written_value(model_2, DB_APP_2, unit_name)
         results.append(unit_max_value)
 
     return results

From be9df499d60fff8e27819357c130f284c18155e7 Mon Sep 17 00:00:00 2001
From: Dragomir Penev
Date: Wed, 15 Oct 2025 17:40:47 +0300
Subject: [PATCH 29/33] Handle leadership name in async cluster

---
 src/charm.py                                       | 13 ++++--
 src/cluster.py                                     | 10 ++++-
 .../test_async_replication_upgrade.py              | 43 ++++++-------------
 .../high_availability/test_upgrade.py              |  3 +-
 4 files changed, 33 insertions(+), 36 deletions(-)

diff --git a/src/charm.py b/src/charm.py
index 75399a3754..482ce7bcc5 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -164,8 +164,10 @@ def run_pre_refresh_checks_after_1_unit_refreshed() -> None:
         pass
 
     def run_pre_refresh_checks_before_any_units_refreshed(self) -> None:
-        if not self._charm._patroni.are_all_members_ready():
-            raise charm_refresh.PrecheckFailed("PostgreSQL is not running on 1+ units")
+        for attempt in Retrying(stop=stop_after_attempt(2), wait=wait_fixed(1), reraise=True):
+            with attempt:
+                if not self._charm._patroni.are_all_members_ready():
+                    raise charm_refresh.PrecheckFailed("PostgreSQL is not running on 1+ units")
 
         if self._charm._patroni.is_creating_backup:
             raise charm_refresh.PrecheckFailed("Backup in progress")
@@ -189,7 +191,12 @@ def unit_number(unit_name: str):
                 )
             else:
                 try:
-                    self._charm._patroni.switchover(candidate=last_unit_to_refresh)
+                    self._charm._patroni.switchover(
+                        candidate=last_unit_to_refresh,
+                        async_cluster=bool(
+                            self._charm.async_replication.get_primary_cluster_endpoint()
+                        ),
+                    )
                 except SwitchoverFailedError as e:
                     logger.warning(f"switchover failed with reason: {e}")
                     raise charm_refresh.PrecheckFailed("Unable to switch primary")
diff --git a/src/cluster.py b/src/cluster.py
index 4b01f1053d..5a8f02452d 100644
--- a/src/cluster.py
+++ b/src/cluster.py
@@ -857,12 +857,18 @@ def stop_patroni(self) -> bool:
logger.exception(error_message, exc_info=e) return False - def switchover(self, candidate: str | None = None) -> None: + def switchover(self, candidate: str | None = None, async_cluster: bool = False) -> None: """Trigger a switchover.""" # Try to trigger the switchover. for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): with attempt: - current_primary = self.get_primary() + current_primary = ( + self.get_primary() if not async_cluster else self.get_standby_leader() + ) + if current_primary == candidate: + logger.info("Candidate and leader are the same") + return + body = {"leader": current_primary} if candidate: body["candidate"] = candidate diff --git a/tests/integration/high_availability/test_async_replication_upgrade.py b/tests/integration/high_availability/test_async_replication_upgrade.py index 0371882a86..c267d1ac55 100644 --- a/tests/integration/high_availability/test_async_replication_upgrade.py +++ b/tests/integration/high_availability/test_async_replication_upgrade.py @@ -13,7 +13,6 @@ from .. import architecture from .high_availability_helpers_new import ( - check_db_units_writes_increment, get_app_leader, get_app_units, get_db_max_written_value, @@ -85,23 +84,22 @@ def test_deploy(first_model: str, second_model: str, charm: str) -> None: configuration = {"profile": "testing"} constraints = {"arch": architecture.architecture} - logging.info("Deploying mysql clusters") + # TODO Deploy from edge + logging.info("Deploying postgresql clusters") model_1 = Juju(model=first_model) model_1.deploy( - charm=DB_APP_NAME, + charm=charm, app=DB_APP_1, base="ubuntu@24.04", - channel="16/edge", config=configuration, constraints=constraints, num_units=3, ) model_2 = Juju(model=second_model) model_2.deploy( - charm=DB_APP_NAME, + charm=charm, app=DB_APP_2, base="ubuntu@24.04", - channel="16/edge", config=configuration, constraints=constraints, num_units=3, @@ -183,7 +181,7 @@ def test_create_replication(first_model: str, second_model: str) -> None: def test_upgrade_from_edge( - first_model: str, second_model: str, charm: str, first_continuous_writes + first_model: str, second_model: str, charm: str, first_model_continuous_writes ) -> None: """Upgrade the two MySQL clusters.""" model_1 = Juju(model=first_model) @@ -196,7 +194,9 @@ def test_upgrade_from_edge( run_upgrade_from_edge(model_2, DB_APP_2, charm) -def test_data_replication(first_model: str, second_model: str, continuous_writes) -> None: +def test_data_replication( + first_model: str, second_model: str, first_model_continuous_writes +) -> None: """Test to write to primary, and read the same data back from replicas.""" logging.info("Testing data replication") results = get_db_max_written_values(first_model, second_model) @@ -243,19 +243,16 @@ def run_pre_refresh_checks(juju: Juju, app_name: str) -> None: def run_upgrade_from_edge(juju: Juju, app_name: str, charm: str) -> None: """Update the second cluster.""" - logging.info("Ensure continuous writes are incrementing") - check_db_units_writes_increment(juju, DB_APP_NAME) - logging.info("Refresh the charm") - juju.refresh(app=DB_APP_NAME, path=charm) + juju.refresh(app=app_name, path=charm) logging.info("Wait for refresh to block as paused or incompatible") try: - juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) + juju.wait(lambda status: status.apps[app_name].is_blocked, timeout=5 * MINUTE_SECS) - units = get_app_units(juju, DB_APP_NAME) + units = get_app_units(juju, app_name) unit_names = sorted(units.keys()) - if "Refresh 
incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: + if "Refresh incompatible" in juju.status().apps[app_name].app_status.message: logging.info("Application refresh is blocked due to incompatibility") juju.run( unit=unit_names[-1], @@ -270,19 +267,7 @@ def run_upgrade_from_edge(juju: Juju, app_name: str, charm: str) -> None: juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) except TimeoutError: logging.info("Upgrade completed without snap refresh (charm.py upgrade only)") - assert juju.status().apps[DB_APP_NAME].is_active + assert juju.status().apps[app_name].is_active logging.info("Wait for upgrade to complete") - juju.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), - timeout=20 * MINUTE_SECS, - ) - - logging.info("Wait for upgrade to complete") - juju.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), - timeout=20 * MINUTE_SECS, - ) - - logging.info("Ensure continuous writes are incrementing") - check_db_units_writes_increment(juju, DB_APP_NAME) + juju.wait(ready=wait_for_apps_status(jubilant.all_active, app_name), timeout=20 * MINUTE_SECS) diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py index ebea59537b..91979b2559 100644 --- a/tests/integration/high_availability/test_upgrade.py +++ b/tests/integration/high_availability/test_upgrade.py @@ -160,8 +160,7 @@ def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None: logging.info("Wait for upgrade to complete") juju.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), - timeout=20 * MINUTE_SECS, + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), timeout=20 * MINUTE_SECS ) logging.info("Ensure continuous writes after rollback procedure") From 0e4097152867b63856e73d897ade23f934447410 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 15 Oct 2025 18:02:59 +0300 Subject: [PATCH 30/33] Deploy in one go --- .../test_async_replication_upgrade.py | 37 +++++++------------ 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/tests/integration/high_availability/test_async_replication_upgrade.py b/tests/integration/high_availability/test_async_replication_upgrade.py index c267d1ac55..cb20ab1aa2 100644 --- a/tests/integration/high_availability/test_async_replication_upgrade.py +++ b/tests/integration/high_availability/test_async_replication_upgrade.py @@ -105,6 +105,20 @@ def test_deploy(first_model: str, second_model: str, charm: str) -> None: num_units=3, ) + logging.info("Deploying the test application") + model_1 = Juju(model=first_model) + model_1.deploy( + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_NAME, + base="ubuntu@22.04", + channel="latest/edge", + constraints=constraints, + num_units=1, + ) + + logging.info("Relating the test application") + model_1.integrate(f"{DB_APP_1}:database", f"{DB_TEST_APP_NAME}:database") + logging.info("Waiting for the applications to settle") model_1.wait( ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=20 * MINUTE_SECS @@ -138,29 +152,6 @@ def test_async_relate(first_model: str, second_model: str) -> None: ) -def test_deploy_test_app(first_model: str) -> None: - """Deploy the test application.""" - constraints = {"arch": architecture.architecture} - - logging.info("Deploying the test application") - model_1 = Juju(model=first_model) - model_1.deploy( - charm=DB_TEST_APP_NAME, - app=DB_TEST_APP_NAME, - base="ubuntu@22.04", - channel="latest/edge", - constraints=constraints, - 
num_units=1, - ) - - logging.info("Relating the test application") - model_1.integrate(f"{DB_APP_1}:database", f"{DB_TEST_APP_NAME}:database") - - model_1.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_TEST_APP_NAME), timeout=10 * MINUTE_SECS - ) - - def test_create_replication(first_model: str, second_model: str) -> None: """Run the create-replication action and wait for the applications to settle.""" model_1 = Juju(model=first_model) From c9f550136cf10a28f6b19bdcbdeaf968c59051b0 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 15 Oct 2025 23:38:47 +0300 Subject: [PATCH 31/33] Patch at base --- src/charm.py | 5 +++-- src/cluster.py | 9 +++++++-- .../high_availability/test_async_replication.py | 4 ++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/charm.py b/src/charm.py index 482ce7bcc5..ffb19c7add 100755 --- a/src/charm.py +++ b/src/charm.py @@ -2363,9 +2363,10 @@ def _api_update_config(self) -> None: "max_wal_senders": 25, "wal_keep_size": self.config.durability_wal_keep_size, } + base_patch = {} if primary_endpoint := self.async_replication.get_primary_cluster_endpoint(): - cfg_patch["standby_cluster"] = {"host": primary_endpoint} - self._patroni.bulk_update_parameters_controller_by_patroni(cfg_patch) + base_patch["standby_cluster"] = {"host": primary_endpoint} + self._patroni.bulk_update_parameters_controller_by_patroni(cfg_patch, base_patch) def update_config( self, diff --git a/src/cluster.py b/src/cluster.py index 5a8f02452d..06b423de01 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -1107,11 +1107,15 @@ def reinitialize_postgresql(self) -> None: logger.debug("API reinitialize_postgresql: %s (%s)", r, r.elapsed.total_seconds()) @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) - def bulk_update_parameters_controller_by_patroni(self, parameters: dict[str, Any]) -> None: + def bulk_update_parameters_controller_by_patroni( + self, parameters: dict[str, Any], base_parameters: dict[str, Any] | None + ) -> None: """Update the value of a parameter controller by Patroni. For more information, check https://patroni.readthedocs.io/en/latest/patroni_configuration.html#postgresql-parameters-controlled-by-patroni. 
""" + if not base_parameters: + base_parameters = {} r = requests.patch( f"{self._patroni_url}/config", verify=self.verify, @@ -1120,7 +1124,8 @@ def bulk_update_parameters_controller_by_patroni(self, parameters: dict[str, Any "remove_data_directory_on_rewind_failure": False, "remove_data_directory_on_diverged_timelines": False, "parameters": parameters, - } + }, + **base_parameters, }, auth=self._patroni_auth, timeout=PATRONI_TIMEOUT, diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index e3d8e4f240..8f001c58a1 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -266,6 +266,7 @@ def test_unrelate_and_relate(first_model: str, second_model: str) -> None: def test_failover_in_main_cluster(first_model: str, second_model: str) -> None: """Test that async replication fails over correctly.""" model_1 = Juju(model=first_model) + model_2 = Juju(model=second_model) rerelate_test_app(model_1, DB_APP_1, DB_TEST_APP_1) @@ -274,6 +275,9 @@ def test_failover_in_main_cluster(first_model: str, second_model: str) -> None: model_1.wait( ready=wait_for_apps_status(jubilant.all_active, DB_APP_1), timeout=10 * MINUTE_SECS ) + model_2.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=10 * MINUTE_SECS + ) results = get_db_max_written_values(first_model, second_model, first_model, DB_TEST_APP_1) From 3cc09bc9cdb33a5dc7d4d9c3ba854c1df3662109 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 16 Oct 2025 02:03:05 +0300 Subject: [PATCH 32/33] Enable scale up test --- tests/integration/high_availability/test_async_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index 8f001c58a1..8e93082fe8 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -311,7 +311,7 @@ def test_failover_in_standby_cluster(first_model: str, second_model: str) -> Non assert standby != get_db_standby_leader_unit(model_2, DB_APP_2) -def scale_up(first_model: str, second_model: str) -> None: +def test_scale_up(first_model: str, second_model: str) -> None: model_1 = Juju(model=first_model) model_2 = Juju(model=second_model) From 239f7f332b8bdf3ee1c75990c74df47312db8085 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 16 Oct 2025 02:52:15 +0300 Subject: [PATCH 33/33] Retry consistency check on main cluster scale down --- .../high_availability/test_async_replication.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/integration/high_availability/test_async_replication.py b/tests/integration/high_availability/test_async_replication.py index 8e93082fe8..f21218dd37 100644 --- a/tests/integration/high_availability/test_async_replication.py +++ b/tests/integration/high_availability/test_async_replication.py @@ -9,7 +9,7 @@ import jubilant import pytest from jubilant import Juju -from tenacity import Retrying, stop_after_attempt +from tenacity import Retrying, stop_after_attempt, wait_fixed from .. 
import architecture from .high_availability_helpers_new import ( @@ -279,13 +279,18 @@ def test_failover_in_main_cluster(first_model: str, second_model: str) -> None: ready=wait_for_apps_status(jubilant.all_active, DB_APP_2), timeout=10 * MINUTE_SECS ) - results = get_db_max_written_values(first_model, second_model, first_model, DB_TEST_APP_1) + for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(3), reraise=True): + with attempt: + results = get_db_max_written_values( + first_model, second_model, first_model, DB_TEST_APP_1 + ) + logging.info(f"Results: {results}") - assert len(results) == 5 - assert all(results[0] == x for x in results), "Data is not consistent across units" - assert results[0] > 1, "No data was written to the database" + assert len(results) == 5 + assert all(results[0] == x for x in results), "Data is not consistent across units" + assert results[0] > 1, "No data was written to the database" - assert primary != get_db_primary_unit(model_1, DB_APP_1) + assert primary != get_db_primary_unit(model_1, DB_APP_1) def test_failover_in_standby_cluster(first_model: str, second_model: str) -> None: