Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EC2 Instance retirement for P2 and M4 instances #3950

Merged
merged 7 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions scheduler/job_requester/requester.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,13 @@ def assign_sagemaker_instance_type(self, image):
:param image: <string> ECR URI
:return: <string> type of instance used by the image
"""
if "tensorflow" in image:
return "ml.p3.8xlarge" if "gpu" in image else "ml.c5.4xlarge"
else:
return "ml.p2.8xlarge" if "gpu" in image else "ml.c5.9xlarge"
return (
"ml.p3.8xlarge"
if "gpu" in image
else "ml.c5.4xlarge"
if "tensorflow" in image
else "ml.c5.9xlarge"
)

def extract_timestamp(self, ticket_key):
"""
Expand Down
2 changes: 1 addition & 1 deletion scheduler/log_return/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def update_pool(status, instance_type, num_of_instances, job_type, report_path=N
:param job_type: <string> training/inference
:param report_path: <string> path to find the xml reports. Only set if status == completed/runtimeError
:param status: status of the test job, options: preparing/running/completed/runtimeError
:param instance_type: ml.p3.8xlarge/ml.c5.4xlarge/ml.p2.8xlarge/ml.c5.9xlarge
:param instance_type: ml.p3.8xlarge/ml.c5.4xlarge/ml.c5.9xlarge
:param num_of_instances: number of instances required
"""
s3_client = boto3.client("s3")
Expand Down
13 changes: 9 additions & 4 deletions test/dlc_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,10 +312,15 @@ def _validate_p4de_usage(request, instance_type):


def _restrict_instance_usage(instance_type):
if "c4." in instance_type:
raise RuntimeError(
"C4-family instances are no longer supported in our system. Please use a different instance type (i.e. C5, or another C series instance type)."
)
restricted_instances = {"c": ["c4"], "m": ["m4"], "p": ["p2"]}

for instance_serie, instance_list in restricted_instances.items():
for instance_family in instance_list:
if f"{instance_family}." in instance_type:
raise RuntimeError(
f"{instance_family.upper()}-family instances are no longer supported in our system."
f"Please use a different instance type (i.e. another {instance_serie.upper()} series instance type)."
)
return


Expand Down
4 changes: 2 additions & 2 deletions test/dlc_tests/ec2/test_smdebug.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def run_smdebug_test(
test_script=SMDEBUG_SCRIPT,
timeout=2400,
):
large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
large_shm_instance_types = ("p3.8xlarge", "m5.16xlarge")
shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
framework = get_framework_from_image_uri(image_uri)
container_test_local_dir = os.path.join("$HOME", "container_tests")
Expand Down Expand Up @@ -207,7 +207,7 @@ def run_smprofiler_test(
test_script=SMPROFILER_SCRIPT,
timeout=2400,
):
large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
large_shm_instance_types = ("p3.8xlarge", "m5.16xlarge")
shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
framework = get_framework_from_image_uri(image_uri)
container_test_local_dir = os.path.join("$HOME", "container_tests")
Expand Down
13 changes: 9 additions & 4 deletions test/dlc_tests/ecs/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,15 @@ def ecs_instance_type(request):


def _restrict_instance_usage(instance_type):
if "c4." in instance_type:
raise RuntimeError(
"C4-family instances are no longer supported in our system. Please use a different instance type (i.e. C5, or another C series instance type)."
)
restricted_instances = {"c": ["c4"], "m": ["m4"], "p": ["p2"]}

for instance_serie, instance_list in restricted_instances.items():
for instance_family in instance_list:
if f"{instance_family}." in instance_type:
raise RuntimeError(
f"{instance_family.upper()}-family instances are no longer supported in our system."
f"Please use a different instance type (i.e. another {instance_serie.upper()} series instance type)."
)
return


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_ecs_mxnet_training_mnist_gpu(
"""
GPU mnist test for MXNet Training

Instance Type - p2.8xlarge
Instance Type - p3.8xlarge

Given above parameters, registers a task with family named after this test, runs the task, and waits for
the task to be stopped before doing teardown operations of instance and cluster.
Expand Down Expand Up @@ -96,7 +96,7 @@ def test_ecs_mxnet_training_dgl_cpu(
@pytest.mark.integration("dgl")
@pytest.mark.model("gcn")
@pytest.mark.parametrize("training_script", [MX_DGL_TRAINING_SCRIPT], indirect=True)
@pytest.mark.parametrize("ecs_instance_type", ["p2.8xlarge"], indirect=True)
@pytest.mark.parametrize("ecs_instance_type", ["p3.8xlarge"], indirect=True)
@pytest.mark.parametrize("ecs_ami", [ECS_AML2_GPU_USWEST2], indirect=True)
@pytest.mark.team("dgl")
def test_ecs_mxnet_training_dgl_gpu(
Expand All @@ -105,7 +105,7 @@ def test_ecs_mxnet_training_dgl_gpu(
"""
GPU DGL test for MXNet Training

Instance Type - p2.xlarge
Instance Type - p3.8xlarge

DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't run
on this function.
Expand Down Expand Up @@ -165,7 +165,7 @@ def test_ecs_mxnet_training_gluonnlp_gpu(
"""
GPU Gluon NLP test for MXNet Training

Instance Type - p2.16xlarge
Instance Type - p3.16xlarge

DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't run
on this function.
Expand Down
2 changes: 1 addition & 1 deletion test/sagemaker_tests/mxnet/inference/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def tag(request, framework_version, processor, py_version):
@pytest.fixture(scope="session")
def instance_type(request, processor):
    """
    Resolve the SageMaker instance type for the test session.

    Honors an explicit ``--instance-type`` command-line option when provided;
    otherwise falls back to a default chosen by processor type.

    :param request: pytest request object exposing ``config.getoption``
    :param processor: <string> "cpu" or "gpu"
    :return: <string> SageMaker instance type, e.g. "ml.c5.xlarge"
    """
    provided_instance_type = request.config.getoption("--instance-type")
    # P2 instances are retired; GPU runs default to P3.
    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
    return provided_instance_type if provided_instance_type is not None else default_instance_type


Expand Down
2 changes: 1 addition & 1 deletion test/sagemaker_tests/mxnet/training/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def tag(request, framework_version, processor, py_version):
@pytest.fixture(scope="session")
def instance_type(request, processor):
    """
    Resolve the SageMaker instance type for the test session.

    Honors an explicit ``--instance-type`` command-line option when provided;
    otherwise falls back to a default chosen by processor type.

    :param request: pytest request object exposing ``config.getoption``
    :param processor: <string> "cpu" or "gpu"
    :return: <string> SageMaker instance type, e.g. "ml.c5.xlarge"
    """
    provided_instance_type = request.config.getoption("--instance-type")
    # P2 instances are retired; GPU runs default to P3.
    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
    return provided_instance_type if provided_instance_type is not None else default_instance_type


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_mnist_distributed_cpu(framework_version, ecr_image, instance_type, sage
@pytest.mark.gpu_test
@pytest.mark.team("conda")
def test_mnist_distributed_gpu(framework_version, ecr_image, instance_type, sagemaker_regions):
instance_type = instance_type or "ml.p2.xlarge"
instance_type = instance_type or "ml.p3.xlarge"
model_dir = os.path.join(model_cpu_dir, "model_mnist.tar.gz")
function_args = {
"framework_version": framework_version,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def test_dgl_gcn_training_gpu(ecr_image, sagemaker_regions, instance_type):
if Version(image_framework_version) == Version("1.6") and image_cuda_version == "cu110":
pytest.skip("DGL does not support CUDA 11 for PyTorch 1.6")

instance_type = instance_type or "ml.p2.xlarge"
instance_type = instance_type or "ml.p3.xlarge"
function_args = {
"instance_type": instance_type,
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_dist_operations_gpu(
"""
Test is run as multinode
"""
instance_type = instance_type or "ml.p2.xlarge"
instance_type = instance_type or "ml.p3.xlarge"
function_args = {
"framework_version": framework_version,
"instance_type": instance_type,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def pytest_configure(config):
os.environ["TEST_REGION"] = config.getoption("--region")
os.environ["TEST_VERSIONS"] = config.getoption("--versions") or "1.11.1,1.12.0,1.13.0"
os.environ["TEST_INSTANCE_TYPES"] = (
config.getoption("--instance-types") or "ml.m5.xlarge,ml.p2.xlarge"
config.getoption("--instance-types") or "ml.m5.xlarge,ml.p3.xlarge"
)

os.environ["TEST_EI_VERSIONS"] = config.getoption("--versions") or "1.11,1.12"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def account_id(request):
@pytest.fixture
def instance_type(request, processor):
    """
    Resolve the SageMaker instance type for a test.

    Honors an explicit ``--instance-type`` command-line option when provided;
    otherwise falls back to a default chosen by processor type.

    :param request: pytest request object exposing ``config.getoption``
    :param processor: <string> "cpu" or "gpu"
    :return: <string> SageMaker instance type, e.g. "ml.c5.xlarge"
    """
    provided_instance_type = request.config.getoption("--instance-type")
    # P2 instances are retired; GPU runs default to P3.
    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
    return provided_instance_type if provided_instance_type is not None else default_instance_type


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def account_id(request):
@pytest.fixture
def instance_type(request, processor):
    """
    Resolve the SageMaker instance type for a test.

    Honors an explicit ``--instance-type`` command-line option when provided;
    otherwise falls back to a default chosen by processor type.

    :param request: pytest request object exposing ``config.getoption``
    :param processor: <string> "cpu" or "gpu"
    :return: <string> SageMaker instance type, e.g. "ml.c5.xlarge"
    """
    provided_instance_type = request.config.getoption("--instance-type")
    # P2 instances are retired; GPU runs default to P3.
    default_instance_type = "ml.c5.xlarge" if processor == "cpu" else "ml.p3.xlarge"
    return provided_instance_type if provided_instance_type is not None else default_instance_type


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def test_hc_distributed_mnist_ps(ecr_image, sagemaker_regions, instance_type, fr

validate_or_skip_test(ecr_image=ecr_image)
print("ecr image used for training", ecr_image)
instance_type = instance_type or "ml.p2.xlarge"
instance_type = instance_type or "ml.p3.xlarge"
training_group = InstanceGroup("train_group", instance_type, 2)
invoke_sm_helper_function(
ecr_image,
Expand Down
6 changes: 3 additions & 3 deletions test/test_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,7 @@ def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
Check for all compatibility issues between DLC Image Types and EC2 Instance Types.
Currently configured to fail on the following checks:
1. p4d.24xlarge instance type is used with a cuda<11.0 image
2. p2.8xlarge instance type is used with a cuda=11.0 image for MXNET framework
2. p3.8xlarge instance type is used with a cuda=11.0 image for MXNET framework

:param image_uri: ECR Image URI in valid DLC-format
:param ec2_instance_type: EC2 Instance Type
Expand All @@ -624,15 +624,15 @@ def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
framework == "mxnet"
and get_processor_from_image_uri(image_uri) == "gpu"
and get_cuda_version_from_tag(image_uri).startswith("cu11")
and ec2_instance_type in ["p2.8xlarge"]
and ec2_instance_type in ["p3.8xlarge"]
)
incompatible_conditions.append(image_is_cuda11_on_incompatible_p2_instance_mxnet)

image_is_pytorch_1_11_on_incompatible_p2_instance_pytorch = (
framework == "pytorch"
and Version(framework_version) in SpecifierSet("==1.11.*")
and get_processor_from_image_uri(image_uri) == "gpu"
and ec2_instance_type in ["p2.8xlarge"]
and ec2_instance_type in ["p3.8xlarge"]
)
incompatible_conditions.append(image_is_pytorch_1_11_on_incompatible_p2_instance_pytorch)

Expand Down
Loading