From 29edf67eb29967c66e808527e36ba280ecddd7f3 Mon Sep 17 00:00:00 2001
From: Sirut Buasai <73297481+sirutBuasai@users.noreply.github.com>
Date: Tue, 28 May 2024 16:14:08 -0700
Subject: [PATCH] EC2 Instance retirement for P2 and M4 instances (#3950)

* EC2 Instance retirement for P2 and M4 instances

* correct buildspec

* build test 2.2 ec2

* revert toml

* revert buildspec
---
 scheduler/job_requester/requester.py | 11 +++++++----
 scheduler/log_return/__init__.py | 2 +-
 test/dlc_tests/conftest.py | 13 +++++++++----
 test/dlc_tests/ec2/test_smdebug.py | 4 ++--
 test/dlc_tests/ecs/conftest.py | 13 +++++++++----
 .../ecs/mxnet/training/test_ecs_mxnet_training.py | 8 ++++----
 test/sagemaker_tests/mxnet/inference/conftest.py | 2 +-
 test/sagemaker_tests/mxnet/training/conftest.py | 2 +-
 .../inference/integration/sagemaker/test_mnist.py | 2 +-
 .../training/integration/sagemaker/test_dgl.py | 2 +-
 .../sagemaker/test_distributed_operations.py | 2 +-
 .../test/integration/sagemaker/conftest.py | 2 +-
 .../tensorflow1_training/integration/conftest.py | 2 +-
 .../tensorflow2_training/integration/conftest.py | 2 +-
 .../integration/sagemaker/test_mnist.py | 2 +-
 test/test_utils/__init__.py | 6 +++---
 16 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/scheduler/job_requester/requester.py b/scheduler/job_requester/requester.py
index c7c8412bbac1..08cc917f5115 100644
--- a/scheduler/job_requester/requester.py
+++ b/scheduler/job_requester/requester.py
@@ -113,10 +113,13 @@ def assign_sagemaker_instance_type(self, image):
         :param image: ECR URI
         :return: type of instance used by the image
         """
-        if "tensorflow" in image:
-            return "ml.p3.8xlarge" if "gpu" in image else "ml.c5.4xlarge"
-        else:
-            return "ml.p2.8xlarge" if "gpu" in image else "ml.c5.9xlarge"
+        return (
+            "ml.p3.8xlarge"
+            if "gpu" in image
+            else "ml.c5.4xlarge"
+            if "tensorflow" in image
+            else "ml.c5.9xlarge"
+        )

     def extract_timestamp(self, ticket_key):
         """
diff --git a/scheduler/log_return/__init__.py b/scheduler/log_return/__init__.py
index 436d39a69934..e63c58b3a2d4 100644
--- a/scheduler/log_return/__init__.py
+++ b/scheduler/log_return/__init__.py
@@ -50,7 +50,7 @@ def update_pool(status, instance_type, num_of_instances, job_type, report_path=N
     :param job_type: training/inference
     :param report_path: path to find the xml reports. Only set if status == completed/runtimeError
     :param status: status of the test job, options: preparing/running/completed/runtimeError
-    :param instance_type: ml.p3.8xlarge/ml.c5.4xlarge/ml.p2.8xlarge/ml.c5.9xlarge
+    :param instance_type: ml.p3.8xlarge/ml.c5.4xlarge/ml.c5.9xlarge
     :param num_of_instances: number of instances required
     """
     s3_client = boto3.client("s3")
diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py
index d58ad621bb50..4e9f5ca8797f 100644
--- a/test/dlc_tests/conftest.py
+++ b/test/dlc_tests/conftest.py
@@ -312,10 +312,15 @@ def _validate_p4de_usage(request, instance_type):


 def _restrict_instance_usage(instance_type):
-    if "c4." in instance_type:
-        raise RuntimeError(
-            "C4-family instances are no longer supported in our system. Please use a different instance type (i.e. C5, or another C series instance type)."
-        )
+    restricted_instances = {"c": ["c4"], "m": ["m4"], "p": ["p2"]}
+
+    for instance_serie, instance_list in restricted_instances.items():
+        for instance_family in instance_list:
+            if f"{instance_family}." in instance_type:
+                raise RuntimeError(
+                    f"{instance_family.upper()}-family instances are no longer supported in our system. "
+                    f"Please use a different instance type (i.e. another {instance_serie.upper()} series instance type)."
+                )
     return


diff --git a/test/dlc_tests/ec2/test_smdebug.py b/test/dlc_tests/ec2/test_smdebug.py
index 94e1d6b344b3..f61f4c2954d0 100644
--- a/test/dlc_tests/ec2/test_smdebug.py
+++ b/test/dlc_tests/ec2/test_smdebug.py
@@ -167,7 +167,7 @@ def run_smdebug_test(
     test_script=SMDEBUG_SCRIPT,
     timeout=2400,
 ):
-    large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
+    large_shm_instance_types = ("p3.8xlarge", "m5.16xlarge")
     shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
     framework = get_framework_from_image_uri(image_uri)
     container_test_local_dir = os.path.join("$HOME", "container_tests")
@@ -207,7 +207,7 @@ def run_smprofiler_test(
     test_script=SMPROFILER_SCRIPT,
     timeout=2400,
 ):
-    large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
+    large_shm_instance_types = ("p3.8xlarge", "m5.16xlarge")
     shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
     framework = get_framework_from_image_uri(image_uri)
     container_test_local_dir = os.path.join("$HOME", "container_tests")
diff --git a/test/dlc_tests/ecs/conftest.py b/test/dlc_tests/ecs/conftest.py
index 94718d6fa870..68dc50ccfd94 100644
--- a/test/dlc_tests/ecs/conftest.py
+++ b/test/dlc_tests/ecs/conftest.py
@@ -82,10 +82,15 @@ def ecs_instance_type(request):


 def _restrict_instance_usage(instance_type):
-    if "c4." in instance_type:
-        raise RuntimeError(
-            "C4-family instances are no longer supported in our system. Please use a different instance type (i.e. C5, or another C series instance type)."
-        )
+    restricted_instances = {"c": ["c4"], "m": ["m4"], "p": ["p2"]}
+
+    for instance_serie, instance_list in restricted_instances.items():
+        for instance_family in instance_list:
+            if f"{instance_family}." in instance_type:
+                raise RuntimeError(
+                    f"{instance_family.upper()}-family instances are no longer supported in our system. "
+                    f"Please use a different instance type (i.e. another {instance_serie.upper()} series instance type)."
+                )
     return


diff --git a/test/dlc_tests/ecs/mxnet/training/test_ecs_mxnet_training.py b/test/dlc_tests/ecs/mxnet/training/test_ecs_mxnet_training.py
index fec517a8f907..3462f0866799 100644
--- a/test/dlc_tests/ecs/mxnet/training/test_ecs_mxnet_training.py
+++ b/test/dlc_tests/ecs/mxnet/training/test_ecs_mxnet_training.py
@@ -48,7 +48,7 @@ def test_ecs_mxnet_training_mnist_gpu(
     """
     GPU mnist test for MXNet Training

-    Instance Type - p2.8xlarge
+    Instance Type - p3.8xlarge

     Given above parameters, registers a task with family named after this test, runs the task, and waits
     for the task to be stopped before doing teardown operations of instance and cluster.
@@ -96,7 +96,7 @@ def test_ecs_mxnet_training_dgl_cpu(
 @pytest.mark.integration("dgl")
 @pytest.mark.model("gcn")
 @pytest.mark.parametrize("training_script", [MX_DGL_TRAINING_SCRIPT], indirect=True)
-@pytest.mark.parametrize("ecs_instance_type", ["p2.8xlarge"], indirect=True)
+@pytest.mark.parametrize("ecs_instance_type", ["p3.8xlarge"], indirect=True)
 @pytest.mark.parametrize("ecs_ami", [ECS_AML2_GPU_USWEST2], indirect=True)
 @pytest.mark.team("dgl")
 def test_ecs_mxnet_training_dgl_gpu(
@@ -105,7 +105,7 @@ def test_ecs_mxnet_training_dgl_gpu(
     """
     GPU DGL test for MXNet Training

-    Instance Type - p2.xlarge
+    Instance Type - p3.8xlarge

     DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't
     run on this function.
@@ -165,7 +165,7 @@ def test_ecs_mxnet_training_gluonnlp_gpu(
     """
     GPU Gluon NLP test for MXNet Training

-    Instance Type - p2.16xlarge
+    Instance Type - p3.16xlarge

     DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't
     run on this function.
diff --git a/test/sagemaker_tests/mxnet/inference/conftest.py b/test/sagemaker_tests/mxnet/inference/conftest.py
index d6f4e35d759d..e102b15bb4c6 100644
--- a/test/sagemaker_tests/mxnet/inference/conftest.py
+++ b/test/sagemaker_tests/mxnet/inference/conftest.py
@@ -136,7 +136,7 @@ def tag(request, framework_version, processor, py_version):
 @pytest.fixture(scope="session")
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type


diff --git a/test/sagemaker_tests/mxnet/training/conftest.py b/test/sagemaker_tests/mxnet/training/conftest.py
index 59c874ef1846..48fb2a1790e2 100644
--- a/test/sagemaker_tests/mxnet/training/conftest.py
+++ b/test/sagemaker_tests/mxnet/training/conftest.py
@@ -153,7 +153,7 @@ def tag(request, framework_version, processor, py_version):
 @pytest.fixture(scope="session")
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type


diff --git a/test/sagemaker_tests/pytorch/inference/integration/sagemaker/test_mnist.py b/test/sagemaker_tests/pytorch/inference/integration/sagemaker/test_mnist.py
index 1cd8bd8661a7..046a2a88b7c5 100644
--- a/test/sagemaker_tests/pytorch/inference/integration/sagemaker/test_mnist.py
+++ b/test/sagemaker_tests/pytorch/inference/integration/sagemaker/test_mnist.py
@@ -65,7 +65,7 @@ def test_mnist_distributed_cpu(framework_version, ecr_image, instance_type, sage
 @pytest.mark.gpu_test
 @pytest.mark.team("conda")
 def test_mnist_distributed_gpu(framework_version, ecr_image, instance_type, sagemaker_regions):
-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     model_dir = os.path.join(model_cpu_dir, "model_mnist.tar.gz")
     function_args = {
         "framework_version": framework_version,
diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_dgl.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_dgl.py
index 5e6b8de6565c..f395ccef3385 100644
--- a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_dgl.py
+++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_dgl.py
@@ -73,7 +73,7 @@ def test_dgl_gcn_training_gpu(ecr_image, sagemaker_regions, instance_type):
     if Version(image_framework_version) == Version("1.6") and image_cuda_version == "cu110":
         pytest.skip("DGL does not support CUDA 11 for PyTorch 1.6")

-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     function_args = {
         "instance_type": instance_type,
     }
diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py
index 081e43b61407..86095e35e8ed 100644
--- a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py
+++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py
@@ -107,7 +107,7 @@ def test_dist_operations_gpu(
     """
     Test is run as multinode
     """
-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     function_args = {
         "framework_version": framework_version,
         "instance_type": instance_type,
diff --git a/test/sagemaker_tests/tensorflow/inference/test/integration/sagemaker/conftest.py b/test/sagemaker_tests/tensorflow/inference/test/integration/sagemaker/conftest.py
index 570865ef74c2..08936724be86 100644
--- a/test/sagemaker_tests/tensorflow/inference/test/integration/sagemaker/conftest.py
+++ b/test/sagemaker_tests/tensorflow/inference/test/integration/sagemaker/conftest.py
@@ -140,7 +140,7 @@ def pytest_configure(config):
     os.environ["TEST_REGION"] = config.getoption("--region")
     os.environ["TEST_VERSIONS"] = config.getoption("--versions") or "1.11.1,1.12.0,1.13.0"
     os.environ["TEST_INSTANCE_TYPES"] = (
-        config.getoption("--instance-types") or "ml.m5.xlarge,ml.p2.xlarge"
+        config.getoption("--instance-types") or "ml.m5.xlarge,ml.p3.xlarge"
     )
     os.environ["TEST_EI_VERSIONS"] = config.getoption("--versions") or "1.11,1.12"

diff --git a/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py b/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py
index 09ef6ad08d09..9bef451362f3 100644
--- a/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py
+++ b/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py
@@ -139,7 +139,7 @@ def account_id(request):
 @pytest.fixture
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type


diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py
index 8df72febcfcb..8b6438ec0d46 100755
--- a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py
+++ b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py
@@ -173,7 +173,7 @@ def account_id(request):
 @pytest.fixture
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type


diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py
index 96e9389bafd8..0b4ef274ee51 100755
--- a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py
+++ b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py
@@ -194,7 +194,7 @@ def test_hc_distributed_mnist_ps(ecr_image, sagemaker_regions, instance_type, fr
     validate_or_skip_test(ecr_image=ecr_image)

     print("ecr image used for training", ecr_image)
-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     training_group = InstanceGroup("train_group", instance_type, 2)
     invoke_sm_helper_function(
         ecr_image,
diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py
index 88cc5d1f58e5..303ef4819753 100644
--- a/test/test_utils/__init__.py
+++ b/test/test_utils/__init__.py
@@ -604,7 +604,7 @@ def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
     Check for all compatibility issues between DLC Image Types and EC2 Instance Types. Currently configured to fail on
     the following checks:
         1. p4d.24xlarge instance type is used with a cuda<11.0 image
-        2. p2.8xlarge instance type is used with a cuda=11.0 image for MXNET framework
+        2. p3.8xlarge instance type is used with a cuda=11.0 image for MXNET framework

     :param image_uri: ECR Image URI in valid DLC-format
     :param ec2_instance_type: EC2 Instance Type
@@ -624,7 +624,7 @@ def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
         framework == "mxnet"
         and get_processor_from_image_uri(image_uri) == "gpu"
        and get_cuda_version_from_tag(image_uri).startswith("cu11")
-        and ec2_instance_type in ["p2.8xlarge"]
+        and ec2_instance_type in ["p3.8xlarge"]
     )
     incompatible_conditions.append(image_is_cuda11_on_incompatible_p2_instance_mxnet)

@@ -632,7 +632,7 @@
         framework == "pytorch"
         and Version(framework_version) in SpecifierSet("==1.11.*")
        and get_processor_from_image_uri(image_uri) == "gpu"
-        and ec2_instance_type in ["p2.8xlarge"]
+        and ec2_instance_type in ["p3.8xlarge"]
     )
     incompatible_conditions.append(image_is_pytorch_1_11_on_incompatible_p2_instance_pytorch)