From a075bfcd0e2eab717e2accb7577f808eccb9d56d Mon Sep 17 00:00:00 2001 From: Judy Ng Date: Fri, 12 Jan 2024 12:59:34 -0500 Subject: [PATCH 1/2] Add resource leaks integ test Signed-off-by: Judy Ng --- tests/integration-tests/configs/develop.yaml | 7 ++++ .../resource_leaks/test_resource_leaks.py | 42 +++++++++++++++++++ .../test_resource_leaks/pcluster.config.yaml | 23 ++++++++++ 3 files changed, 72 insertions(+) create mode 100644 tests/integration-tests/tests/resource_leaks/test_resource_leaks.py create mode 100644 tests/integration-tests/tests/resource_leaks/test_resource_leaks/test_resource_leaks/pcluster.config.yaml diff --git a/tests/integration-tests/configs/develop.yaml b/tests/integration-tests/configs/develop.yaml index 2e03ad55f8..d2ec6ddee3 100644 --- a/tests/integration-tests/configs/develop.yaml +++ b/tests/integration-tests/configs/develop.yaml @@ -159,3 +159,10 @@ test-suites: - regions: ["eu-west-1"] oss: ["alinux2"] schedulers: ["slurm"] + resource_leaks: + test_resource_leaks.py::test_resource_leaks: + dimensions: + - regions: [ "us-east-1" ] + instances: [ "t2.micro" ] + oss: [ "alinux2" ] + schedulers: [ "slurm" ] diff --git a/tests/integration-tests/tests/resource_leaks/test_resource_leaks.py b/tests/integration-tests/tests/resource_leaks/test_resource_leaks.py new file mode 100644 index 0000000000..c2c3c8d539 --- /dev/null +++ b/tests/integration-tests/tests/resource_leaks/test_resource_leaks.py @@ -0,0 +1,42 @@ +import logging + +import pytest +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor +from utils import get_compute_nodes_instance_ips + +from tests.common.assertions import assert_head_node_is_running + + +@pytest.mark.usefixtures("instance", "os", "scheduler") +def test_resource_leaks( + region, + pcluster_config_reader, + s3_bucket_factory, + clusters_factory, + test_datadir, + scheduler_commands_factory, +): + total_sleep_time = 1800 # 30 minutes + loop_sleep_time = 300 # 5 minutes + + cluster_config = pcluster_config_reader() + cluster = clusters_factory(cluster_config) + assert_head_node_is_running(region, cluster) + remote_command_executor = RemoteCommandExecutor(cluster) + + compute_node_instance_ip = get_compute_nodes_instance_ips(cluster.name, region)[0] + lsof_cmd = f"ssh -q {compute_node_instance_ip} 'sudo lsof -p $(pgrep computemgtd) | wc -l'" + sleep_cmd = f"ssh -q {compute_node_instance_ip} 'sleep {loop_sleep_time}'" + + logging.info("Checking the number of file descriptors...") + initial_no_file_descs = remote_command_executor.run_remote_command(lsof_cmd).stdout + logging.info("Initial number of file descriptors: %s", initial_no_file_descs) + + curr_no_file_descs = initial_no_file_descs + for _ in range(total_sleep_time // loop_sleep_time): + remote_command_executor.run_remote_command(sleep_cmd) + curr_no_file_descs = remote_command_executor.run_remote_command(lsof_cmd).stdout + logging.info("Number of file descriptors after sleeping: %s", curr_no_file_descs) + + assert_that(initial_no_file_descs).is_equal_to(curr_no_file_descs) diff --git a/tests/integration-tests/tests/resource_leaks/test_resource_leaks/test_resource_leaks/pcluster.config.yaml b/tests/integration-tests/tests/resource_leaks/test_resource_leaks/test_resource_leaks/pcluster.config.yaml new file mode 100644 index 0000000000..6d352160b7 --- /dev/null +++ b/tests/integration-tests/tests/resource_leaks/test_resource_leaks/test_resource_leaks/pcluster.config.yaml @@ -0,0 +1,23 @@ +Image: + Os: {{ os }} +HeadNode: + InstanceType: {{ instance }} + Networking: + SubnetId: {{ public_subnet_id }} + Ssh: + KeyName: {{ key_name }} + Imds: + Secured: {{ imds_secured }} +Scheduling: + Scheduler: slurm + SlurmQueues: + - Name: queue-1 + ComputeResources: + - Name: compute-resource-1 + Instances: + - InstanceType: c5.large + MinCount: 1 + MaxCount: 1 + Networking: + SubnetIds: + - {{ private_subnet_id }} From 599feb1ffce67c0c89deed97620946f0303de78d Mon Sep 17 00:00:00 2001 From: Judy Ng Date: Fri, 12 Jan 2024 12:59:34 -0500 Subject: [PATCH 2/2] Add resource leak check into util, call checks in starccm test Signed-off-by: Judy Ng --- tests/integration-tests/configs/develop.yaml | 7 ---- tests/integration-tests/tests/common/utils.py | 39 +++++++++++++++++ .../tests/performance_tests/test_starccm.py | 8 +++- .../resource_leaks/test_resource_leaks.py | 42 ------------------- .../test_resource_leaks/pcluster.config.yaml | 23 ---------- 5 files changed, 46 insertions(+), 73 deletions(-) delete mode 100644 tests/integration-tests/tests/resource_leaks/test_resource_leaks.py delete mode 100644 tests/integration-tests/tests/resource_leaks/test_resource_leaks/test_resource_leaks/pcluster.config.yaml diff --git a/tests/integration-tests/configs/develop.yaml b/tests/integration-tests/configs/develop.yaml index d2ec6ddee3..2e03ad55f8 100644 --- a/tests/integration-tests/configs/develop.yaml +++ b/tests/integration-tests/configs/develop.yaml @@ -159,10 +159,3 @@ test-suites: - regions: ["eu-west-1"] oss: ["alinux2"] schedulers: ["slurm"] - resource_leaks: - test_resource_leaks.py::test_resource_leaks: - dimensions: - - regions: [ "us-east-1" ] - instances: [ "t2.micro" ] - oss: [ "alinux2" ] - schedulers: [ "slurm" ] diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index 1b92b430e3..b02a04ebf1 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -446,3 +446,42 @@ def get_ddb_item(region_name: str, table_name: str, item_key: dict): """ table = boto3.resource("dynamodb", region_name=region_name).Table(table_name) return table.get_item(Key=item_key).get("Item") + + +def get_compute_ip_to_num_files(remote_command_executor, slurm_commands): + """Gets a mapping of compute node instance ip to its current number of open files.""" + logging.info("Checking the number of file descriptors...") + + # Submit job to the test nodes + compute_node_names = slurm_commands.get_compute_nodes(all_nodes=True) + for name in compute_node_names: + slurm_commands.submit_command_and_assert_job_accepted( + submit_command_args={"command": "srun sleep 1", "host": name} + ) + # Wait for all jobs to be completed + slurm_commands.wait_job_queue_empty() + + # Get the number of open files on all the nodes + instance_ip_to_num_files = {} + for node_name in compute_node_names: + compute_node_instance_ip = slurm_commands.get_node_addr(node_name) + lsof_cmd = f"ssh -q {compute_node_instance_ip} 'sudo lsof -p $(pgrep computemgtd) | wc -l'" + num_files = remote_command_executor.run_remote_command(lsof_cmd).stdout + instance_ip_to_num_files[compute_node_instance_ip] = num_files + + logging.info(f"Mapping from instance ip to number of open files in computemgtd: {instance_ip_to_num_files}") + return instance_ip_to_num_files + + +def assert_no_file_handler_leak(init_compute_ip_to_num_files, remote_command_executor, slurm_commands): + """Asserts that the current number of open files for each compute node is the same as the given map""" + current_compute_ip_to_num_files = get_compute_ip_to_num_files(remote_command_executor, slurm_commands) + logging.info( + f"Asserting that the number of open files in computemgtd hasn't grown from " + f"{init_compute_ip_to_num_files} to {current_compute_ip_to_num_files}." + ) + for compute_ip in current_compute_ip_to_num_files: + if compute_ip in init_compute_ip_to_num_files: + assert_that(current_compute_ip_to_num_files[compute_ip]).is_equal_to( + init_compute_ip_to_num_files[compute_ip] + ) diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index 9b5e6d0271..200e1e0f28 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -5,6 +5,8 @@ import pytest from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor +from tests.common.utils import assert_no_file_handler_leak, get_compute_ip_to_num_files + # timeout in seconds STARCCM_INSTALLATION_TIMEOUT = 1800 STARCCM_JOB_TIMEOUT = 600 @@ -70,13 +72,15 @@ def test_starccm( cluster = clusters_factory(cluster_config) logging.info("Cluster Created") remote_command_executor = RemoteCommandExecutor(cluster) + scheduler_commands = scheduler_commands_factory(remote_command_executor) + init_num_files = get_compute_ip_to_num_files(remote_command_executor, scheduler_commands) + if not starccm_installed(remote_command_executor): logging.info("Installing StarCCM+") remote_command_executor.run_remote_script( str(test_datadir / "starccm.install.sh"), timeout=STARCCM_INSTALLATION_TIMEOUT, hide=False ) logging.info("StarCCM+ Installed") - scheduler_commands = scheduler_commands_factory(remote_command_executor) podkey, licpath = get_starccm_secrets(region) performance_degradation = {} for node in number_of_nodes: @@ -112,6 +116,8 @@ def test_starccm( f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" ) + assert_no_file_handler_leak(init_num_files, remote_command_executor, scheduler_commands) + if performance_degradation: pytest.fail(f"Performance degradation detected: {performance_degradation}") else: diff --git a/tests/integration-tests/tests/resource_leaks/test_resource_leaks.py b/tests/integration-tests/tests/resource_leaks/test_resource_leaks.py deleted file mode 100644 index c2c3c8d539..0000000000 --- a/tests/integration-tests/tests/resource_leaks/test_resource_leaks.py +++ /dev/null @@ -1,42 +0,0 @@ -import logging - -import pytest -from assertpy import assert_that -from remote_command_executor import RemoteCommandExecutor -from utils import get_compute_nodes_instance_ips - -from tests.common.assertions import assert_head_node_is_running - - -@pytest.mark.usefixtures("instance", "os", "scheduler") -def test_resource_leaks( - region, - pcluster_config_reader, - s3_bucket_factory, - clusters_factory, - test_datadir, - scheduler_commands_factory, -): - total_sleep_time = 1800 # 30 minutes - loop_sleep_time = 300 # 5 minutes - - cluster_config = pcluster_config_reader() - cluster = clusters_factory(cluster_config) - assert_head_node_is_running(region, cluster) - remote_command_executor = RemoteCommandExecutor(cluster) - - compute_node_instance_ip = get_compute_nodes_instance_ips(cluster.name, region)[0] - lsof_cmd = f"ssh -q {compute_node_instance_ip} 'sudo lsof -p $(pgrep computemgtd) | wc -l'" - sleep_cmd = f"ssh -q {compute_node_instance_ip} 'sleep {loop_sleep_time}'" - - logging.info("Checking the number of file descriptors...") - initial_no_file_descs = remote_command_executor.run_remote_command(lsof_cmd).stdout - logging.info("Initial number of file descriptors: %s", initial_no_file_descs) - - curr_no_file_descs = initial_no_file_descs - for _ in range(total_sleep_time // loop_sleep_time): - remote_command_executor.run_remote_command(sleep_cmd) - curr_no_file_descs = remote_command_executor.run_remote_command(lsof_cmd).stdout - logging.info("Number of file descriptors after sleeping: %s", curr_no_file_descs) - - assert_that(initial_no_file_descs).is_equal_to(curr_no_file_descs) diff --git a/tests/integration-tests/tests/resource_leaks/test_resource_leaks/test_resource_leaks/pcluster.config.yaml b/tests/integration-tests/tests/resource_leaks/test_resource_leaks/test_resource_leaks/pcluster.config.yaml deleted file mode 100644 index 6d352160b7..0000000000 --- a/tests/integration-tests/tests/resource_leaks/test_resource_leaks/test_resource_leaks/pcluster.config.yaml +++ /dev/null @@ -1,23 +0,0 @@ -Image: - Os: {{ os }} -HeadNode: - InstanceType: {{ instance }} - Networking: - SubnetId: {{ public_subnet_id }} - Ssh: - KeyName: {{ key_name }} - Imds: - Secured: {{ imds_secured }} -Scheduling: - Scheduler: slurm - SlurmQueues: - - Name: queue-1 - ComputeResources: - - Name: compute-resource-1 - Instances: - - InstanceType: c5.large - MinCount: 1 - MaxCount: 1 - Networking: - SubnetIds: - - {{ private_subnet_id }}