diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py
index 1b92b430e3..b02a04ebf1 100644
--- a/tests/integration-tests/tests/common/utils.py
+++ b/tests/integration-tests/tests/common/utils.py
@@ -446,3 +446,42 @@ def get_ddb_item(region_name: str, table_name: str, item_key: dict):
     """
     table = boto3.resource("dynamodb", region_name=region_name).Table(table_name)
     return table.get_item(Key=item_key).get("Item")
+
+
+def get_compute_ip_to_num_files(remote_command_executor, slurm_commands):
+    """Gets a mapping of compute node instance ip to its current number of open files."""
+    logging.info("Checking the number of file descriptors...")
+
+    # Submit job to the test nodes
+    compute_node_names = slurm_commands.get_compute_nodes(all_nodes=True)
+    for name in compute_node_names:
+        slurm_commands.submit_command_and_assert_job_accepted(
+            submit_command_args={"command": "srun sleep 1", "host": name}
+        )
+    # Wait for all jobs to be completed
+    slurm_commands.wait_job_queue_empty()
+
+    # Get the number of open files on all the nodes
+    instance_ip_to_num_files = {}
+    for node_name in compute_node_names:
+        compute_node_instance_ip = slurm_commands.get_node_addr(node_name)
+        lsof_cmd = f"ssh -q {compute_node_instance_ip} 'sudo lsof -p $(pgrep computemgtd) | wc -l'"
+        num_files = remote_command_executor.run_remote_command(lsof_cmd).stdout
+        instance_ip_to_num_files[compute_node_instance_ip] = num_files
+
+    logging.info(f"Mapping from instance ip to number of open files in computemgtd: {instance_ip_to_num_files}")
+    return instance_ip_to_num_files
+
+
+def assert_no_file_handler_leak(init_compute_ip_to_num_files, remote_command_executor, slurm_commands):
+    """Asserts that the current number of open files for each compute node is the same as the given map."""
+    current_compute_ip_to_num_files = get_compute_ip_to_num_files(remote_command_executor, slurm_commands)
+    logging.info(
+        f"Asserting that the number of open files in computemgtd hasn't grown from "
+        f"{init_compute_ip_to_num_files} to {current_compute_ip_to_num_files}."
+    )
+    for compute_ip in current_compute_ip_to_num_files:
+        if compute_ip in init_compute_ip_to_num_files:
+            assert_that(current_compute_ip_to_num_files[compute_ip]).is_equal_to(
+                init_compute_ip_to_num_files[compute_ip]
+            )
diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py
index 9b5e6d0271..200e1e0f28 100644
--- a/tests/integration-tests/tests/performance_tests/test_starccm.py
+++ b/tests/integration-tests/tests/performance_tests/test_starccm.py
@@ -5,6 +5,8 @@
 import pytest
 from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor
 
+from tests.common.utils import assert_no_file_handler_leak, get_compute_ip_to_num_files
+
 # timeout in seconds
 STARCCM_INSTALLATION_TIMEOUT = 1800
 STARCCM_JOB_TIMEOUT = 600
@@ -70,13 +72,15 @@ def test_starccm(
     cluster = clusters_factory(cluster_config)
     logging.info("Cluster Created")
     remote_command_executor = RemoteCommandExecutor(cluster)
+    scheduler_commands = scheduler_commands_factory(remote_command_executor)
+    init_num_files = get_compute_ip_to_num_files(remote_command_executor, scheduler_commands)
+
     if not starccm_installed(remote_command_executor):
         logging.info("Installing StarCCM+")
         remote_command_executor.run_remote_script(
             str(test_datadir / "starccm.install.sh"), timeout=STARCCM_INSTALLATION_TIMEOUT, hide=False
         )
         logging.info("StarCCM+ Installed")
-    scheduler_commands = scheduler_commands_factory(remote_command_executor)
     podkey, licpath = get_starccm_secrets(region)
     performance_degradation = {}
     for node in number_of_nodes:
@@ -112,6 +116,8 @@
             f"Percentage difference: {percentage_difference}%, Outcome: {outcome}"
         )
 
+    assert_no_file_handler_leak(init_num_files, remote_command_executor, scheduler_commands)
+
     if performance_degradation:
         pytest.fail(f"Performance degradation detected: {performance_degradation}")
    else:
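
For context, a minimal sketch of how another integration test could reuse these helpers follows the same pattern as the test_starccm.py change: capture the per-node baseline right after cluster creation, run the workload, then assert that computemgtd has not accumulated open file descriptors. This is illustrative only and not part of the patch; the test name is hypothetical, and the `clusters_factory`, `cluster_config`, and `scheduler_commands_factory` fixtures are assumed to be available as they are in `test_starccm.py`.

```python
import logging

from remote_command_executor import RemoteCommandExecutor

from tests.common.utils import assert_no_file_handler_leak, get_compute_ip_to_num_files


def test_computemgtd_fd_leak(clusters_factory, cluster_config, scheduler_commands_factory):
    """Hypothetical example: verify computemgtd does not leak file descriptors across a workload."""
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = scheduler_commands_factory(remote_command_executor)

    # Baseline: open-file count per compute node before the workload runs.
    init_num_files = get_compute_ip_to_num_files(remote_command_executor, scheduler_commands)

    # ... run the workload under test here ...
    logging.info("Workload finished, checking for file descriptor growth")

    # Fail if any compute node's computemgtd now holds more open files than the baseline.
    assert_no_file_handler_leak(init_num_files, remote_command_executor, scheduler_commands)
```

Taking the baseline immediately after cluster creation, before any installation or job-submission steps, mirrors the placement in test_starccm.py and keeps descriptors opened by the workload itself out of the comparison.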