Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions tests/integration-tests/tests/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,3 +446,42 @@ def get_ddb_item(region_name: str, table_name: str, item_key: dict):
"""
table = boto3.resource("dynamodb", region_name=region_name).Table(table_name)
return table.get_item(Key=item_key).get("Item")


def get_compute_ip_to_num_files(remote_command_executor, slurm_commands):
    """Gets a mapping of compute node instance ip to its current number of open files.

    :param remote_command_executor: executor used to run commands on the head node.
    :param slurm_commands: scheduler command helper for submitting jobs and resolving nodes.
    :return: dict mapping each compute node's ip to the `lsof | wc -l` output (as a string)
        for its computemgtd process.
    """
    logging.info("Checking the number of file descriptors...")

    # Run a trivial srun job on every compute node, then wait for the queue to
    # drain, so each node's computemgtd is exercised before we sample it.
    node_names = slurm_commands.get_compute_nodes(all_nodes=True)
    for node_name in node_names:
        slurm_commands.submit_command_and_assert_job_accepted(
            submit_command_args={"command": "srun sleep 1", "host": node_name}
        )
    slurm_commands.wait_job_queue_empty()

    # SSH from the head node into each compute node and count the files held
    # open by the computemgtd daemon.
    ip_to_open_files = {}
    for node_name in node_names:
        node_ip = slurm_commands.get_node_addr(node_name)
        count_cmd = f"ssh -q {node_ip} 'sudo lsof -p $(pgrep computemgtd) | wc -l'"
        ip_to_open_files[node_ip] = remote_command_executor.run_remote_command(count_cmd).stdout

    logging.info(f"Mapping from instance ip to number of open files in computemgtd: {ip_to_open_files}")
    return ip_to_open_files


def assert_no_file_handler_leak(init_compute_ip_to_num_files, remote_command_executor, slurm_commands):
    """Asserts that the current number of open files for each compute node is the same as the given map.

    :param init_compute_ip_to_num_files: baseline mapping of compute node ip to open-file count,
        as returned by an earlier call to get_compute_ip_to_num_files.
    :param remote_command_executor: executor used to run commands on the head node.
    :param slurm_commands: scheduler command helper for submitting jobs and resolving nodes.
    """
    current_compute_ip_to_num_files = get_compute_ip_to_num_files(remote_command_executor, slurm_commands)
    logging.info(
        f"Asserting that the number of open files in computemgtd hasn't grown from "
        f"{init_compute_ip_to_num_files} to {current_compute_ip_to_num_files}."
    )
    # Only nodes present in both snapshots are comparable; nodes replaced
    # between the two samples have no baseline and are skipped.
    for compute_ip, current_count in current_compute_ip_to_num_files.items():
        if compute_ip in init_compute_ip_to_num_files:
            assert_that(current_count).is_equal_to(init_compute_ip_to_num_files[compute_ip])
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import pytest
from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor

from tests.common.utils import assert_no_file_handler_leak, get_compute_ip_to_num_files

# timeout in seconds
STARCCM_INSTALLATION_TIMEOUT = 1800
STARCCM_JOB_TIMEOUT = 600
Expand Down Expand Up @@ -70,13 +72,15 @@ def test_starccm(
cluster = clusters_factory(cluster_config)
logging.info("Cluster Created")
remote_command_executor = RemoteCommandExecutor(cluster)
scheduler_commands = scheduler_commands_factory(remote_command_executor)
init_num_files = get_compute_ip_to_num_files(remote_command_executor, scheduler_commands)

if not starccm_installed(remote_command_executor):
logging.info("Installing StarCCM+")
remote_command_executor.run_remote_script(
str(test_datadir / "starccm.install.sh"), timeout=STARCCM_INSTALLATION_TIMEOUT, hide=False
)
logging.info("StarCCM+ Installed")
scheduler_commands = scheduler_commands_factory(remote_command_executor)
podkey, licpath = get_starccm_secrets(region)
performance_degradation = {}
for node in number_of_nodes:
Expand Down Expand Up @@ -112,6 +116,8 @@ def test_starccm(
f"Percentage difference: {percentage_difference}%, Outcome: {outcome}"
)

assert_no_file_handler_leak(init_num_files, remote_command_executor, scheduler_commands)

if performance_degradation:
pytest.fail(f"Performance degradation detected: {performance_degradation}")
else:
Expand Down