From 5b5fac377901d1359644a491fca3737d34389e4a Mon Sep 17 00:00:00 2001 From: Allen Liu Date: Tue, 4 Apr 2023 12:54:08 -0700 Subject: [PATCH] fix: increase worker waiting time for ORTE proc (#178) * fix: increase worker waiting time for ORTE proc * remove todo tag for passing pylint check * pin sagemaker version to avoid a credential error --- src/sagemaker_training/mpi.py | 5 +++-- tox.ini | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/sagemaker_training/mpi.py b/src/sagemaker_training/mpi.py index 34ba9258..de331a17 100644 --- a/src/sagemaker_training/mpi.py +++ b/src/sagemaker_training/mpi.py @@ -171,8 +171,9 @@ def _wait_orted_process_to_finish(): # type: () -> None def _orted_process(): # pylint: disable=inconsistent-return-statements - """Wait a maximum of 5 minutes for orted process to start.""" - for _ in range(5 * 60): + """Wait a maximum of 20 minutes for orted process to start.""" + # the wait time here should be set to a dynamic value according to cluster size + for _ in range(20 * 60): procs = [p for p in psutil.process_iter(attrs=["name"]) if p.info["name"] == "orted"] if procs: logger.info("Process[es]: %s", procs) diff --git a/tox.ini b/tox.ini index da9997d8..a53d1887 100644 --- a/tox.ini +++ b/tox.ini @@ -49,7 +49,7 @@ deps = pytest-asyncio mock awslogs - sagemaker[local] + sagemaker[local]==2.136.0 numpy flask gunicorn