From 29edf67eb29967c66e808527e36ba280ecddd7f3 Mon Sep 17 00:00:00 2001
From: Sirut Buasai <73297481+sirutBuasai@users.noreply.github.com>
Date: Tue, 28 May 2024 16:14:08 -0700
Subject: [PATCH] EC2 Instance retirement for P2 and M4 instances (#3950)

* EC2 Instance retirement for P2 and M4 instances

* correct buildspec

* build test 2.2 ec2

* revert toml

* revert buildspec
---
 scheduler/job_requester/requester.py | 11 +++++++----
 scheduler/log_return/__init__.py | 2 +-
 test/dlc_tests/conftest.py | 13 +++++++++----
 test/dlc_tests/ec2/test_smdebug.py | 4 ++--
 test/dlc_tests/ecs/conftest.py | 13 +++++++++----
 .../ecs/mxnet/training/test_ecs_mxnet_training.py | 8 ++++----
 test/sagemaker_tests/mxnet/inference/conftest.py | 2 +-
 test/sagemaker_tests/mxnet/training/conftest.py | 2 +-
 .../inference/integration/sagemaker/test_mnist.py | 2 +-
 .../training/integration/sagemaker/test_dgl.py | 2 +-
 .../sagemaker/test_distributed_operations.py | 2 +-
 .../test/integration/sagemaker/conftest.py | 2 +-
 .../tensorflow1_training/integration/conftest.py | 2 +-
 .../tensorflow2_training/integration/conftest.py | 2 +-
 .../integration/sagemaker/test_mnist.py | 2 +-
 test/test_utils/__init__.py | 6 +++---
 16 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/scheduler/job_requester/requester.py b/scheduler/job_requester/requester.py
index c7c8412bbac1..08cc917f5115 100644
--- a/scheduler/job_requester/requester.py
+++ b/scheduler/job_requester/requester.py
@@ -113,10 +113,13 @@ def assign_sagemaker_instance_type(self, image):
         :param image: ECR URI
         :return: type of instance used by the image
         """
-        if "tensorflow" in image:
-            return "ml.p3.8xlarge" if "gpu" in image else "ml.c5.4xlarge"
-        else:
-            return "ml.p2.8xlarge" if "gpu" in image else "ml.c5.9xlarge"
+        return (
+            "ml.p3.8xlarge"
+            if "gpu" in image
+            else "ml.c5.4xlarge"
+            if "tensorflow" in image
+            else "ml.c5.9xlarge"
+        )

     def extract_timestamp(self, ticket_key):
         """
diff --git a/scheduler/log_return/__init__.py b/scheduler/log_return/__init__.py
index 436d39a69934..e63c58b3a2d4 100644
--- a/scheduler/log_return/__init__.py
+++ b/scheduler/log_return/__init__.py
@@ -50,7 +50,7 @@ def update_pool(status, instance_type, num_of_instances, job_type, report_path=N
     :param job_type: training/inference
     :param report_path: path to find the xml reports. Only set if status == completed/runtimeError
     :param status: status of the test job, options: preparing/running/completed/runtimeError
-    :param instance_type: ml.p3.8xlarge/ml.c5.4xlarge/ml.p2.8xlarge/ml.c5.9xlarge
+    :param instance_type: ml.p3.8xlarge/ml.c5.4xlarge/ml.c5.9xlarge
     :param num_of_instances: number of instances required
     """
     s3_client = boto3.client("s3")
diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py
index d58ad621bb50..4e9f5ca8797f 100644
--- a/test/dlc_tests/conftest.py
+++ b/test/dlc_tests/conftest.py
@@ -312,10 +312,15 @@ def _validate_p4de_usage(request, instance_type):


 def _restrict_instance_usage(instance_type):
-    if "c4." in instance_type:
-        raise RuntimeError(
-            "C4-family instances are no longer supported in our system. Please use a different instance type (i.e. C5, or another C series instance type)."
-        )
+    restricted_instances = {"c": ["c4"], "m": ["m4"], "p": ["p2"]}
+
+    for instance_serie, instance_list in restricted_instances.items():
+        for instance_family in instance_list:
+            if f"{instance_family}." in instance_type:
+                raise RuntimeError(
+                    f"{instance_family.upper()}-family instances are no longer supported in our system. "
+                    f"Please use a different instance type (i.e. another {instance_serie.upper()} series instance type)."
+                )
     return


diff --git a/test/dlc_tests/ec2/test_smdebug.py b/test/dlc_tests/ec2/test_smdebug.py
index 94e1d6b344b3..f61f4c2954d0 100644
--- a/test/dlc_tests/ec2/test_smdebug.py
+++ b/test/dlc_tests/ec2/test_smdebug.py
@@ -167,7 +167,7 @@ def run_smdebug_test(
     test_script=SMDEBUG_SCRIPT,
     timeout=2400,
 ):
-    large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
+    large_shm_instance_types = ("p3.8xlarge", "m5.16xlarge")
     shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
     framework = get_framework_from_image_uri(image_uri)
     container_test_local_dir = os.path.join("$HOME", "container_tests")
@@ -207,7 +207,7 @@ def run_smprofiler_test(
     test_script=SMPROFILER_SCRIPT,
     timeout=2400,
 ):
-    large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
+    large_shm_instance_types = ("p3.8xlarge", "m5.16xlarge")
     shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
     framework = get_framework_from_image_uri(image_uri)
     container_test_local_dir = os.path.join("$HOME", "container_tests")
diff --git a/test/dlc_tests/ecs/conftest.py b/test/dlc_tests/ecs/conftest.py
index 94718d6fa870..68dc50ccfd94 100644
--- a/test/dlc_tests/ecs/conftest.py
+++ b/test/dlc_tests/ecs/conftest.py
@@ -82,10 +82,15 @@ def ecs_instance_type(request):


 def _restrict_instance_usage(instance_type):
-    if "c4." in instance_type:
-        raise RuntimeError(
-            "C4-family instances are no longer supported in our system. Please use a different instance type (i.e. C5, or another C series instance type)."
-        )
+    restricted_instances = {"c": ["c4"], "m": ["m4"], "p": ["p2"]}
+
+    for instance_serie, instance_list in restricted_instances.items():
+        for instance_family in instance_list:
+            if f"{instance_family}." in instance_type:
+                raise RuntimeError(
+                    f"{instance_family.upper()}-family instances are no longer supported in our system. "
+                    f"Please use a different instance type (i.e. another {instance_serie.upper()} series instance type)."
+                )
     return


diff --git a/test/dlc_tests/ecs/mxnet/training/test_ecs_mxnet_training.py b/test/dlc_tests/ecs/mxnet/training/test_ecs_mxnet_training.py
index fec517a8f907..3462f0866799 100644
--- a/test/dlc_tests/ecs/mxnet/training/test_ecs_mxnet_training.py
+++ b/test/dlc_tests/ecs/mxnet/training/test_ecs_mxnet_training.py
@@ -48,7 +48,7 @@ def test_ecs_mxnet_training_mnist_gpu(
     """
     GPU mnist test for MXNet Training

-    Instance Type - p2.8xlarge
+    Instance Type - p3.8xlarge

     Given above parameters, registers a task with family named after this test, runs the task, and waits
     for the task to be stopped before doing teardown operations of instance and cluster.
@@ -96,7 +96,7 @@ def test_ecs_mxnet_training_dgl_cpu(
 @pytest.mark.integration("dgl")
 @pytest.mark.model("gcn")
 @pytest.mark.parametrize("training_script", [MX_DGL_TRAINING_SCRIPT], indirect=True)
-@pytest.mark.parametrize("ecs_instance_type", ["p2.8xlarge"], indirect=True)
+@pytest.mark.parametrize("ecs_instance_type", ["p3.8xlarge"], indirect=True)
 @pytest.mark.parametrize("ecs_ami", [ECS_AML2_GPU_USWEST2], indirect=True)
 @pytest.mark.team("dgl")
 def test_ecs_mxnet_training_dgl_gpu(
@@ -105,7 +105,7 @@ def test_ecs_mxnet_training_dgl_gpu(
     """
     GPU DGL test for MXNet Training

-    Instance Type - p2.xlarge
+    Instance Type - p3.8xlarge

     DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't
     run on this function.
@@ -165,7 +165,7 @@ def test_ecs_mxnet_training_gluonnlp_gpu(
     """
     GPU Gluon NLP test for MXNet Training

-    Instance Type - p2.16xlarge
+    Instance Type - p3.16xlarge

     DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't
     run on this function.
diff --git a/test/sagemaker_tests/mxnet/inference/conftest.py b/test/sagemaker_tests/mxnet/inference/conftest.py
index d6f4e35d759d..e102b15bb4c6 100644
--- a/test/sagemaker_tests/mxnet/inference/conftest.py
+++ b/test/sagemaker_tests/mxnet/inference/conftest.py
@@ -136,7 +136,7 @@ def tag(request, framework_version, processor, py_version):
 @pytest.fixture(scope="session")
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type


diff --git a/test/sagemaker_tests/mxnet/training/conftest.py b/test/sagemaker_tests/mxnet/training/conftest.py
index 59c874ef1846..48fb2a1790e2 100644
--- a/test/sagemaker_tests/mxnet/training/conftest.py
+++ b/test/sagemaker_tests/mxnet/training/conftest.py
@@ -153,7 +153,7 @@ def tag(request, framework_version, processor, py_version):
 @pytest.fixture(scope="session")
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type


diff --git a/test/sagemaker_tests/pytorch/inference/integration/sagemaker/test_mnist.py b/test/sagemaker_tests/pytorch/inference/integration/sagemaker/test_mnist.py
index 1cd8bd8661a7..046a2a88b7c5 100644
--- a/test/sagemaker_tests/pytorch/inference/integration/sagemaker/test_mnist.py
+++ b/test/sagemaker_tests/pytorch/inference/integration/sagemaker/test_mnist.py
@@ -65,7 +65,7 @@ def test_mnist_distributed_cpu(framework_version, ecr_image, instance_type, sage
 @pytest.mark.gpu_test
 @pytest.mark.team("conda")
 def test_mnist_distributed_gpu(framework_version, ecr_image, instance_type, sagemaker_regions):
-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     model_dir = os.path.join(model_cpu_dir, "model_mnist.tar.gz")
     function_args = {
         "framework_version": framework_version,
diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_dgl.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_dgl.py
index 5e6b8de6565c..f395ccef3385 100644
--- a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_dgl.py
+++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_dgl.py
@@ -73,7 +73,7 @@ def test_dgl_gcn_training_gpu(ecr_image, sagemaker_regions, instance_type):
     if Version(image_framework_version) == Version("1.6") and image_cuda_version == "cu110":
         pytest.skip("DGL does not support CUDA 11 for PyTorch 1.6")

-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     function_args = {
         "instance_type": instance_type,
     }
diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py
index 081e43b61407..86095e35e8ed 100644
--- a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py
+++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py
@@ -107,7 +107,7 @@ def test_dist_operations_gpu(
     """
     Test is run as multinode
     """
-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     function_args = {
         "framework_version": framework_version,
         "instance_type": instance_type,
diff --git a/test/sagemaker_tests/tensorflow/inference/test/integration/sagemaker/conftest.py b/test/sagemaker_tests/tensorflow/inference/test/integration/sagemaker/conftest.py
index 570865ef74c2..08936724be86 100644
--- a/test/sagemaker_tests/tensorflow/inference/test/integration/sagemaker/conftest.py
+++ b/test/sagemaker_tests/tensorflow/inference/test/integration/sagemaker/conftest.py
@@ -140,7 +140,7 @@ def pytest_configure(config):
     os.environ["TEST_REGION"] = config.getoption("--region")
     os.environ["TEST_VERSIONS"] = config.getoption("--versions") or "1.11.1,1.12.0,1.13.0"
     os.environ["TEST_INSTANCE_TYPES"] = (
-        config.getoption("--instance-types") or "ml.m5.xlarge,ml.p2.xlarge"
+        config.getoption("--instance-types") or "ml.m5.xlarge,ml.p3.xlarge"
     )
     os.environ["TEST_EI_VERSIONS"] = config.getoption("--versions") or "1.11,1.12"

diff --git a/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py b/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py
index 09ef6ad08d09..9bef451362f3 100644
--- a/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py
+++ b/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py
@@ -139,7 +139,7 @@ def account_id(request):
 @pytest.fixture
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type


diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py
index 8df72febcfcb..8b6438ec0d46 100755
--- a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py
+++ b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py
@@ -173,7 +173,7 @@ def account_id(request):
 @pytest.fixture
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type


diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py
index 96e9389bafd8..0b4ef274ee51 100755
--- a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py
+++ b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py
@@ -194,7 +194,7 @@ def test_hc_distributed_mnist_ps(ecr_image, sagemaker_regions, instance_type, fr
     validate_or_skip_test(ecr_image=ecr_image)

     print("ecr image used for training", ecr_image)
-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     training_group = InstanceGroup("train_group", instance_type, 2)
     invoke_sm_helper_function(
         ecr_image,
diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py
index 88cc5d1f58e5..303ef4819753 100644
--- a/test/test_utils/__init__.py
+++ b/test/test_utils/__init__.py
@@ -604,7 +604,7 @@ def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
     Check for all compatibility issues between DLC Image Types and EC2 Instance Types. Currently configured to fail on
     the following checks:
         1. p4d.24xlarge instance type is used with a cuda<11.0 image
-        2. p2.8xlarge instance type is used with a cuda=11.0 image for MXNET framework
+        2. p3.8xlarge instance type is used with a cuda=11.0 image for MXNET framework

     :param image_uri: ECR Image URI in valid DLC-format
     :param ec2_instance_type: EC2 Instance Type
@@ -624,7 +624,7 @@ def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
         framework == "mxnet"
         and get_processor_from_image_uri(image_uri) == "gpu"
        and get_cuda_version_from_tag(image_uri).startswith("cu11")
-        and ec2_instance_type in ["p2.8xlarge"]
+        and ec2_instance_type in ["p3.8xlarge"]
     )
     incompatible_conditions.append(image_is_cuda11_on_incompatible_p2_instance_mxnet)

@@ -632,7 +632,7 @@
         framework == "pytorch"
         and Version(framework_version) in SpecifierSet("==1.11.*")
        and get_processor_from_image_uri(image_uri) == "gpu"
-        and ec2_instance_type in ["p2.8xlarge"]
+        and ec2_instance_type in ["p3.8xlarge"]
     )
     incompatible_conditions.append(image_is_pytorch_1_11_on_incompatible_p2_instance_pytorch)