Skip to content

Commit

Permalink
EC2 Instance retirement for P2 and M4 instances (#3950)
Browse files Browse the repository at this point in the history
* EC2 Instance retirement for P2 and M4 instances

* correct buildspec

* build test 2.2 ec2

* revert toml

* revert buildspec
  • Loading branch information
sirutBuasai committed May 28, 2024
1 parent c268b66 commit 29edf67
Show file tree
Hide file tree
Showing 16 changed files with 44 additions and 31 deletions.
11 changes: 7 additions & 4 deletions scheduler/job_requester/requester.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,13 @@ def assign_sagemaker_instance_type(self, image):
:param image: <string> ECR URI
:return: <string> type of instance used by the image
"""
if "tensorflow" in image:
return "ml.p3.8xlarge" if "gpu" in image else "ml.c5.4xlarge"
else:
return "ml.p2.8xlarge" if "gpu" in image else "ml.c5.9xlarge"
return (
"ml.p3.8xlarge"
if "gpu" in image
else "ml.c5.4xlarge"
if "tensorflow" in image
else "ml.c5.9xlarge"
)

def extract_timestamp(self, ticket_key):
"""
Expand Down
2 changes: 1 addition & 1 deletion scheduler/log_return/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def update_pool(status, instance_type, num_of_instances, job_type, report_path=N
:param job_type: <string> training/inference
:param report_path: <string> path to find the xml reports. Only set if status == completed/runtimeError
:param status: status of the test job, options: preparing/running/completed/runtimeError
:param instance_type: ml.p3.8xlarge/ml.c5.4xlarge/ml.p2.8xlarge/ml.c5.9xlarge
:param instance_type: ml.p3.8xlarge/ml.c5.4xlarge/ml.c5.9xlarge
:param num_of_instances: number of instances required
"""
s3_client = boto3.client("s3")
Expand Down
13 changes: 9 additions & 4 deletions test/dlc_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,10 +312,15 @@ def _validate_p4de_usage(request, instance_type):


def _restrict_instance_usage(instance_type):
if "c4." in instance_type:
raise RuntimeError(
"C4-family instances are no longer supported in our system. Please use a different instance type (i.e. C5, or another C series instance type)."
)
restricted_instances = {"c": ["c4"], "m": ["m4"], "p": ["p2"]}

for instance_serie, instance_list in restricted_instances.items():
for instance_family in instance_list:
if f"{instance_family}." in instance_type:
raise RuntimeError(
f"{instance_family.upper()}-family instances are no longer supported in our system."
f"Please use a different instance type (i.e. another {instance_serie.upper()} series instance type)."
)
return


Expand Down
4 changes: 2 additions & 2 deletions test/dlc_tests/ec2/test_smdebug.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def run_smdebug_test(
test_script=SMDEBUG_SCRIPT,
timeout=2400,
):
large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
large_shm_instance_types = ("p3.8xlarge", "m5.16xlarge")
shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
framework = get_framework_from_image_uri(image_uri)
container_test_local_dir = os.path.join("$HOME", "container_tests")
Expand Down Expand Up @@ -207,7 +207,7 @@ def run_smprofiler_test(
test_script=SMPROFILER_SCRIPT,
timeout=2400,
):
large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
large_shm_instance_types = ("p3.8xlarge", "m5.16xlarge")
shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
framework = get_framework_from_image_uri(image_uri)
container_test_local_dir = os.path.join("$HOME", "container_tests")
Expand Down
13 changes: 9 additions & 4 deletions test/dlc_tests/ecs/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,15 @@ def ecs_instance_type(request):


def _restrict_instance_usage(instance_type):
if "c4." in instance_type:
raise RuntimeError(
"C4-family instances are no longer supported in our system. Please use a different instance type (i.e. C5, or another C series instance type)."
)
restricted_instances = {"c": ["c4"], "m": ["m4"], "p": ["p2"]}

for instance_serie, instance_list in restricted_instances.items():
for instance_family in instance_list:
if f"{instance_family}." in instance_type:
raise RuntimeError(
f"{instance_family.upper()}-family instances are no longer supported in our system."
f"Please use a different instance type (i.e. another {instance_serie.upper()} series instance type)."
)
return


Expand Down
8 changes: 4 additions & 4 deletions test/dlc_tests/ecs/mxnet/training/test_ecs_mxnet_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_ecs_mxnet_training_mnist_gpu(
"""
GPU mnist test for MXNet Training
Instance Type - p2.8xlarge
Instance Type - p3.8xlarge
Given above parameters, registers a task with family named after this test, runs the task, and waits for
the task to be stopped before doing teardown operations of instance and cluster.
Expand Down Expand Up @@ -96,7 +96,7 @@ def test_ecs_mxnet_training_dgl_cpu(
@pytest.mark.integration("dgl")
@pytest.mark.model("gcn")
@pytest.mark.parametrize("training_script", [MX_DGL_TRAINING_SCRIPT], indirect=True)
@pytest.mark.parametrize("ecs_instance_type", ["p2.8xlarge"], indirect=True)
@pytest.mark.parametrize("ecs_instance_type", ["p3.8xlarge"], indirect=True)
@pytest.mark.parametrize("ecs_ami", [ECS_AML2_GPU_USWEST2], indirect=True)
@pytest.mark.team("dgl")
def test_ecs_mxnet_training_dgl_gpu(
Expand All @@ -105,7 +105,7 @@ def test_ecs_mxnet_training_dgl_gpu(
"""
GPU DGL test for MXNet Training
Instance Type - p2.xlarge
Instance Type - p3.8xlarge
DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't run
on this function.
Expand Down Expand Up @@ -165,7 +165,7 @@ def test_ecs_mxnet_training_gluonnlp_gpu(
"""
GPU Gluon NLP test for MXNet Training
Instance Type - p2.16xlarge
Instance Type - p3.16xlarge
DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't run
on this function.
Expand Down
2 changes: 1 addition & 1 deletion test/sagemaker_tests/mxnet/inference/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def tag(request, framework_version, processor, py_version):
@pytest.fixture(scope="session")
def instance_type(request, processor):
    """
    Resolve the SageMaker instance type to use for the test session.

    :param request: pytest request object, used to read the --instance-type option
    :param processor: <string> "cpu" selects the CPU default, any other value selects the GPU default
    :return: <string> the --instance-type option when provided, otherwise the processor default
    """
    provided_instance_type = request.config.getoption("--instance-type")
    # P2 is retired. SageMaker has no "ml.p3.xlarge" size; ml.p3.2xlarge is the smallest P3.
    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.2xlarge"
    return provided_instance_type if provided_instance_type is not None else default_instance_type


Expand Down
2 changes: 1 addition & 1 deletion test/sagemaker_tests/mxnet/training/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def tag(request, framework_version, processor, py_version):
@pytest.fixture(scope="session")
def instance_type(request, processor):
    """
    Resolve the SageMaker instance type to use for the test session.

    :param request: pytest request object, used to read the --instance-type option
    :param processor: <string> "cpu" selects the CPU default, any other value selects the GPU default
    :return: <string> the --instance-type option when provided, otherwise the processor default
    """
    provided_instance_type = request.config.getoption("--instance-type")
    # P2 is retired. SageMaker has no "ml.p3.xlarge" size; ml.p3.2xlarge is the smallest P3.
    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.2xlarge"
    return provided_instance_type if provided_instance_type is not None else default_instance_type


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_mnist_distributed_cpu(framework_version, ecr_image, instance_type, sage
@pytest.mark.gpu_test
@pytest.mark.team("conda")
def test_mnist_distributed_gpu(framework_version, ecr_image, instance_type, sagemaker_regions):
instance_type = instance_type or "ml.p2.xlarge"
instance_type = instance_type or "ml.p3.xlarge"
model_dir = os.path.join(model_cpu_dir, "model_mnist.tar.gz")
function_args = {
"framework_version": framework_version,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def test_dgl_gcn_training_gpu(ecr_image, sagemaker_regions, instance_type):
if Version(image_framework_version) == Version("1.6") and image_cuda_version == "cu110":
pytest.skip("DGL does not support CUDA 11 for PyTorch 1.6")

instance_type = instance_type or "ml.p2.xlarge"
instance_type = instance_type or "ml.p3.xlarge"
function_args = {
"instance_type": instance_type,
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_dist_operations_gpu(
"""
Test is run as multinode
"""
instance_type = instance_type or "ml.p2.xlarge"
instance_type = instance_type or "ml.p3.xlarge"
function_args = {
"framework_version": framework_version,
"instance_type": instance_type,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def pytest_configure(config):
os.environ["TEST_REGION"] = config.getoption("--region")
os.environ["TEST_VERSIONS"] = config.getoption("--versions") or "1.11.1,1.12.0,1.13.0"
os.environ["TEST_INSTANCE_TYPES"] = (
config.getoption("--instance-types") or "ml.m5.xlarge,ml.p2.xlarge"
config.getoption("--instance-types") or "ml.m5.xlarge,ml.p3.xlarge"
)

os.environ["TEST_EI_VERSIONS"] = config.getoption("--versions") or "1.11,1.12"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def account_id(request):
@pytest.fixture
def instance_type(request, processor):
    """
    Resolve the SageMaker instance type to use for a test.

    :param request: pytest request object, used to read the --instance-type option
    :param processor: <string> "cpu" selects the CPU default, any other value selects the GPU default
    :return: <string> the --instance-type option when provided, otherwise the processor default
    """
    provided_instance_type = request.config.getoption("--instance-type")
    # P2 is retired. SageMaker has no "ml.p3.xlarge" size; ml.p3.2xlarge is the smallest P3.
    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.2xlarge"
    return provided_instance_type if provided_instance_type is not None else default_instance_type


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def account_id(request):
@pytest.fixture
def instance_type(request, processor):
    """
    Resolve the SageMaker instance type to use for a test.

    :param request: pytest request object, used to read the --instance-type option
    :param processor: <string> "cpu" selects the CPU default, any other value selects the GPU default
    :return: <string> the --instance-type option when provided, otherwise the processor default
    """
    provided_instance_type = request.config.getoption("--instance-type")
    # P2 is retired. SageMaker has no "ml.p3.xlarge" size; ml.p3.2xlarge is the smallest P3.
    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.2xlarge"
    return provided_instance_type if provided_instance_type is not None else default_instance_type


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def test_hc_distributed_mnist_ps(ecr_image, sagemaker_regions, instance_type, fr

validate_or_skip_test(ecr_image=ecr_image)
print("ecr image used for training", ecr_image)
instance_type = instance_type or "ml.p2.xlarge"
instance_type = instance_type or "ml.p3.xlarge"
training_group = InstanceGroup("train_group", instance_type, 2)
invoke_sm_helper_function(
ecr_image,
Expand Down
6 changes: 3 additions & 3 deletions test/test_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,7 @@ def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
Check for all compatibility issues between DLC Image Types and EC2 Instance Types.
Currently configured to fail on the following checks:
1. p4d.24xlarge instance type is used with a cuda<11.0 image
2. p2.8xlarge instance type is used with a cuda=11.0 image for MXNET framework
2. p3.8xlarge instance type is used with a cuda=11.0 image for MXNET framework
:param image_uri: ECR Image URI in valid DLC-format
:param ec2_instance_type: EC2 Instance Type
Expand All @@ -624,15 +624,15 @@ def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
framework == "mxnet"
and get_processor_from_image_uri(image_uri) == "gpu"
and get_cuda_version_from_tag(image_uri).startswith("cu11")
and ec2_instance_type in ["p2.8xlarge"]
and ec2_instance_type in ["p3.8xlarge"]
)
incompatible_conditions.append(image_is_cuda11_on_incompatible_p2_instance_mxnet)

image_is_pytorch_1_11_on_incompatible_p2_instance_pytorch = (
framework == "pytorch"
and Version(framework_version) in SpecifierSet("==1.11.*")
and get_processor_from_image_uri(image_uri) == "gpu"
and ec2_instance_type in ["p2.8xlarge"]
and ec2_instance_type in ["p3.8xlarge"]
)
incompatible_conditions.append(image_is_pytorch_1_11_on_incompatible_p2_instance_pytorch)

Expand Down

0 comments on commit 29edf67

Please sign in to comment.