EC2 Instance retirement for P2 and M4 instances (#3950)

* EC2 Instance retirement for P2 and M4 instances * correct buildspec * build test 2.2 ec2 * revert toml * revert buildspec
aws · May 28, 2024 · 29edf67 · 29edf67
1 parent c268b66
commit 29edf67
Show file tree

Hide file tree

Showing 16 changed files with 44 additions and 31 deletions.
diff --git a/scheduler/job_requester/requester.py b/scheduler/job_requester/requester.py
@@ -113,10 +113,13 @@ def assign_sagemaker_instance_type(self, image):
         :param image: <string> ECR URI
         :return: <string> type of instance used by the image
         """
-        if "tensorflow" in image:
-            return "ml.p3.8xlarge" if "gpu" in image else "ml.c5.4xlarge"
-        else:
-            return "ml.p2.8xlarge" if "gpu" in image else "ml.c5.9xlarge"
+        return (
+            "ml.p3.8xlarge"
+            if "gpu" in image
+            else "ml.c5.4xlarge"
+            if "tensorflow" in image
+            else "ml.c5.9xlarge"
+        )
 
     def extract_timestamp(self, ticket_key):
         """

diff --git a/scheduler/log_return/__init__.py b/scheduler/log_return/__init__.py
@@ -50,7 +50,7 @@ def update_pool(status, instance_type, num_of_instances, job_type, report_path=N
     :param job_type: <string> training/inference
     :param report_path: <string> path to find the xml reports. Only set if status == completed/runtimeError
     :param status: status of the test job, options: preparing/running/completed/runtimeError
-    :param instance_type: ml.p3.8xlarge/ml.c5.4xlarge/ml.p2.8xlarge/ml.c5.9xlarge
+    :param instance_type: ml.p3.8xlarge/ml.c5.4xlarge/ml.c5.9xlarge
     :param num_of_instances: number of instances required
     """
     s3_client = boto3.client("s3")

diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py
@@ -312,10 +312,15 @@ def _validate_p4de_usage(request, instance_type):
 
 
 def _restrict_instance_usage(instance_type):
-    if "c4." in instance_type:
-        raise RuntimeError(
-            "C4-family instances are no longer supported in our system. Please use a different instance type (i.e. C5, or another C series instance type)."
-        )
+    restricted_instances = {"c": ["c4"], "m": ["m4"], "p": ["p2"]}  # series -> retired families
+
+    for instance_serie, instance_list in restricted_instances.items():
+        for instance_family in instance_list:
+            if f"{instance_family}." in instance_type:
+                raise RuntimeError(
+                    f"{instance_family.upper()}-family instances are no longer supported in our system. "
+                    f"Please use a different instance type (i.e. another {instance_serie.upper()} series instance type)."
+                )
     return
 
 

diff --git a/test/dlc_tests/ec2/test_smdebug.py b/test/dlc_tests/ec2/test_smdebug.py
@@ -167,7 +167,7 @@ def run_smdebug_test(
     test_script=SMDEBUG_SCRIPT,
     timeout=2400,
 ):
-    large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
+    large_shm_instance_types = ("p3.8xlarge", "m5.16xlarge")
     shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
     framework = get_framework_from_image_uri(image_uri)
     container_test_local_dir = os.path.join("$HOME", "container_tests")
@@ -207,7 +207,7 @@ def run_smprofiler_test(
     test_script=SMPROFILER_SCRIPT,
     timeout=2400,
 ):
-    large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
+    large_shm_instance_types = ("p3.8xlarge", "m5.16xlarge")
     shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
     framework = get_framework_from_image_uri(image_uri)
     container_test_local_dir = os.path.join("$HOME", "container_tests")

diff --git a/test/dlc_tests/ecs/conftest.py b/test/dlc_tests/ecs/conftest.py
@@ -82,10 +82,15 @@ def ecs_instance_type(request):
 
 
 def _restrict_instance_usage(instance_type):
-    if "c4." in instance_type:
-        raise RuntimeError(
-            "C4-family instances are no longer supported in our system. Please use a different instance type (i.e. C5, or another C series instance type)."
-        )
+    restricted_instances = {"c": ["c4"], "m": ["m4"], "p": ["p2"]}  # series -> retired families
+
+    for instance_serie, instance_list in restricted_instances.items():
+        for instance_family in instance_list:
+            if f"{instance_family}." in instance_type:
+                raise RuntimeError(
+                    f"{instance_family.upper()}-family instances are no longer supported in our system. "
+                    f"Please use a different instance type (i.e. another {instance_serie.upper()} series instance type)."
+                )
     return
 
 

diff --git a/test/dlc_tests/ecs/mxnet/training/test_ecs_mxnet_training.py b/test/dlc_tests/ecs/mxnet/training/test_ecs_mxnet_training.py
@@ -48,7 +48,7 @@ def test_ecs_mxnet_training_mnist_gpu(
     """
     GPU mnist test for MXNet Training
 
-    Instance Type - p2.8xlarge
+    Instance Type - p3.8xlarge
 
     Given above parameters, registers a task with family named after this test, runs the task, and waits for
     the task to be stopped before doing teardown operations of instance and cluster.
@@ -96,7 +96,7 @@ def test_ecs_mxnet_training_dgl_cpu(
 @pytest.mark.integration("dgl")
 @pytest.mark.model("gcn")
 @pytest.mark.parametrize("training_script", [MX_DGL_TRAINING_SCRIPT], indirect=True)
-@pytest.mark.parametrize("ecs_instance_type", ["p2.8xlarge"], indirect=True)
+@pytest.mark.parametrize("ecs_instance_type", ["p3.8xlarge"], indirect=True)
 @pytest.mark.parametrize("ecs_ami", [ECS_AML2_GPU_USWEST2], indirect=True)
 @pytest.mark.team("dgl")
 def test_ecs_mxnet_training_dgl_gpu(
@@ -105,7 +105,7 @@ def test_ecs_mxnet_training_dgl_gpu(
     """
     GPU DGL test for MXNet Training
 
-    Instance Type - p2.xlarge
+    Instance Type - p3.8xlarge
 
     DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't run
     on this function.
@@ -165,7 +165,7 @@ def test_ecs_mxnet_training_gluonnlp_gpu(
     """
     GPU Gluon NLP test for MXNet Training
 
-    Instance Type - p2.16xlarge
+    Instance Type - p3.16xlarge
 
     DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't run
     on this function.

diff --git a/test/sagemaker_tests/mxnet/inference/conftest.py b/test/sagemaker_tests/mxnet/inference/conftest.py
@@ -136,7 +136,7 @@ def tag(request, framework_version, processor, py_version):
 @pytest.fixture(scope="session")
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.2xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type
 
 

diff --git a/test/sagemaker_tests/mxnet/training/conftest.py b/test/sagemaker_tests/mxnet/training/conftest.py
@@ -153,7 +153,7 @@ def tag(request, framework_version, processor, py_version):
 @pytest.fixture(scope="session")
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.2xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type
 
 

diff --git a/test/sagemaker_tests/pytorch/inference/integration/sagemaker/test_mnist.py b/test/sagemaker_tests/pytorch/inference/integration/sagemaker/test_mnist.py
@@ -65,7 +65,7 @@ def test_mnist_distributed_cpu(framework_version, ecr_image, instance_type, sage
 @pytest.mark.gpu_test
 @pytest.mark.team("conda")
 def test_mnist_distributed_gpu(framework_version, ecr_image, instance_type, sagemaker_regions):
-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     model_dir = os.path.join(model_cpu_dir, "model_mnist.tar.gz")
     function_args = {
         "framework_version": framework_version,

diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_dgl.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_dgl.py
@@ -73,7 +73,7 @@ def test_dgl_gcn_training_gpu(ecr_image, sagemaker_regions, instance_type):
     if Version(image_framework_version) == Version("1.6") and image_cuda_version == "cu110":
         pytest.skip("DGL does not support CUDA 11 for PyTorch 1.6")
 
-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     function_args = {
         "instance_type": instance_type,
     }

diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py
@@ -107,7 +107,7 @@ def test_dist_operations_gpu(
     """
     Test is run as multinode
     """
-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     function_args = {
         "framework_version": framework_version,
         "instance_type": instance_type,

diff --git a/test/sagemaker_tests/tensorflow/inference/test/integration/sagemaker/conftest.py b/test/sagemaker_tests/tensorflow/inference/test/integration/sagemaker/conftest.py
@@ -140,7 +140,7 @@ def pytest_configure(config):
     os.environ["TEST_REGION"] = config.getoption("--region")
     os.environ["TEST_VERSIONS"] = config.getoption("--versions") or "1.11.1,1.12.0,1.13.0"
     os.environ["TEST_INSTANCE_TYPES"] = (
-        config.getoption("--instance-types") or "ml.m5.xlarge,ml.p2.xlarge"
+        config.getoption("--instance-types") or "ml.m5.xlarge,ml.p3.xlarge"
     )
 
     os.environ["TEST_EI_VERSIONS"] = config.getoption("--versions") or "1.11,1.12"

diff --git a/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py b/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py
@@ -139,7 +139,7 @@ def account_id(request):
 @pytest.fixture
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.2xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type
 
 

diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py
@@ -173,7 +173,7 @@ def account_id(request):
 @pytest.fixture
 def instance_type(request, processor):
     provided_instance_type = request.config.getoption("--instance-type")
-    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.2xlarge"
     return provided_instance_type if provided_instance_type is not None else default_instance_type
 
 

diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py
@@ -194,7 +194,7 @@ def test_hc_distributed_mnist_ps(ecr_image, sagemaker_regions, instance_type, fr
 
     validate_or_skip_test(ecr_image=ecr_image)
     print("ecr image used for training", ecr_image)
-    instance_type = instance_type or "ml.p2.xlarge"
+    instance_type = instance_type or "ml.p3.xlarge"
     training_group = InstanceGroup("train_group", instance_type, 2)
     invoke_sm_helper_function(
         ecr_image,

diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py
@@ -604,7 +604,7 @@ def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
     Check for all compatibility issues between DLC Image Types and EC2 Instance Types.
     Currently configured to fail on the following checks:
         1. p4d.24xlarge instance type is used with a cuda<11.0 image
-        2. p2.8xlarge instance type is used with a cuda=11.0 image for MXNET framework
+        2. p3.8xlarge instance type is used with a cuda=11.0 image for MXNET framework
 
     :param image_uri: ECR Image URI in valid DLC-format
     :param ec2_instance_type: EC2 Instance Type
@@ -624,15 +624,15 @@ def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
         framework == "mxnet"
         and get_processor_from_image_uri(image_uri) == "gpu"
         and get_cuda_version_from_tag(image_uri).startswith("cu11")
-        and ec2_instance_type in ["p2.8xlarge"]
+        and ec2_instance_type in ["p3.8xlarge"]
     )
     incompatible_conditions.append(image_is_cuda11_on_incompatible_p2_instance_mxnet)
 
     image_is_pytorch_1_11_on_incompatible_p2_instance_pytorch = (
         framework == "pytorch"
         and Version(framework_version) in SpecifierSet("==1.11.*")
         and get_processor_from_image_uri(image_uri) == "gpu"
-        and ec2_instance_type in ["p2.8xlarge"]
+        and ec2_instance_type in ["p3.8xlarge"]
     )
     incompatible_conditions.append(image_is_pytorch_1_11_on_incompatible_p2_instance_pytorch)