diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index c95bfbcbdf..e8a22454ef 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -1539,19 +1539,31 @@ def _wait_compute_cloudinit_done(remote_command_executor, compute_node): assert_that(compute_cloudinit_status_output).contains("status: done") -@retry(wait_fixed=seconds(10), stop_max_attempt_number=4) -def _check_mpi_process(remote_command_executor, slurm_commands, num_nodes, after_completion): - """Submit script and check for MPI processes.""" - # Clean up old datafiles - remote_command_executor.run_remote_command("rm -f /shared/check_proc.out") - result = slurm_commands.submit_command("ps aux | grep IMB | grep MPI >> /shared/check_proc.out", nodes=num_nodes) +def _assert_mpi_process_completion( + remote_command_executor, slurm_commands, num_nodes, after_completion, check_proc_file +): + result = slurm_commands.submit_command( + f'ps aux | grep "mpiexec.hydra.*sleep" | grep -v "grep" >> {check_proc_file}', nodes=num_nodes + ) job_id = slurm_commands.assert_job_submitted(result.stdout) slurm_commands.wait_job_completed(job_id) - proc_track_result = remote_command_executor.run_remote_command("cat /shared/check_proc.out") + proc_track_result = remote_command_executor.run_remote_command(f"cat {check_proc_file}") if after_completion: - assert_that(proc_track_result.stdout).does_not_contain("IMB-MPI1") + assert_that(proc_track_result.stdout).does_not_match(".*mpiexec.hydra.*sleep") else: - assert_that(proc_track_result.stdout).contains("IMB-MPI1") + assert_that(proc_track_result.stdout).matches(".*mpiexec.hydra.*sleep") + + +def _check_mpi_process(remote_command_executor, slurm_commands, num_nodes, after_completion): + """Submit script and check for MPI processes.""" + # Clean up old datafiles + check_proc_file = "/shared/check_proc.out" + + # Check completion status of MPI process using the shared datafile + remote_command_executor.run_remote_command(f"rm -f {check_proc_file}") + retry(wait_fixed=seconds(10), stop_max_attempt_number=4)(_assert_mpi_process_completion)( + remote_command_executor, slurm_commands, num_nodes, after_completion, check_proc_file + ) def _test_cluster_gpu_limits(slurm_commands, partition, instance_type, max_count, gpu_per_instance, gpu_type): diff --git a/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/mpi_job.sh b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/mpi_job.sh index 4c48e5948c..93f9d7c243 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/mpi_job.sh +++ b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/mpi_job.sh @@ -7,4 +7,4 @@ #SBATCH --output=runscript.out module load intelmpi -mpirun -n 6 IMB-MPI1 Alltoall -npmin 2 +mpirun -n 6 bash -c 'sleep 300' -npmin 2