Skip to content

Commit

Permalink
change: fix tests for new regions (#983)
Browse files Browse the repository at this point in the history
* fix test failures related to m4, c4 and t2 region availability.

* rename test_hvd_basic.py to hvd_basic.py

* use regional s3 endpoint in TF script mode tests
  • Loading branch information
lianyiding authored and chuyang-deng committed Aug 15, 2019
1 parent 75f3554 commit db6c55e
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 18 deletions.
14 changes: 13 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@

DEFAULT_REGION = "us-west-2"

NO_M4_REGIONS = ["eu-west-3", "eu-north-1", "ap-east-1"]
NO_M4_REGIONS = ["eu-west-3", "eu-north-1", "ap-east-1", "sa-east-1"]

NO_T2_REGIONS = ["eu-north-1", "ap-east-1"]


def pytest_addoption(parser):
Expand Down Expand Up @@ -262,6 +264,16 @@ def cpu_instance_type(sagemaker_session, request):
return "ml.m4.xlarge"


@pytest.fixture(scope="session")
def alternative_cpu_instance_type(sagemaker_session, request):
    """Pick a second CPU instance type based on the session's region.

    Returns "ml.t2.medium" unless the region of ``sagemaker_session`` is
    listed in ``NO_T2_REGIONS``, in which case "ml.c5.xlarge" is returned.
    """
    current_region = sagemaker_session.boto_session.region_name
    # T3 is not supported by hosting yet
    return "ml.c5.xlarge" if current_region in NO_T2_REGIONS else "ml.t2.medium"


@pytest.fixture(scope="session")
def cpu_instance_family(cpu_instance_type):
    """Return the instance family of ``cpu_instance_type``, joined by an underscore.

    Only the first two dot-separated parts of the instance type are kept, so
    "ml.m4.xlarge" becomes "ml_m4".
    """
    # The joined value used to be computed as a bare expression and discarded,
    # which made this fixture hand None to every test that requested it.
    return "_".join(cpu_instance_type.split(".")[0:2])
Expand Down
File renamed without changes.
4 changes: 2 additions & 2 deletions tests/integ/test_horovod.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdi
output_path = "file://%s" % tmpdir
job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
estimator = TensorFlow(
entry_point=os.path.join(horovod_dir, "test_hvd_basic.py"),
entry_point=os.path.join(horovod_dir, "hvd_basic.py"),
role="SageMakerRole",
train_instance_count=2,
train_instance_type="local",
Expand Down Expand Up @@ -100,7 +100,7 @@ def extract_files_from_s3(s3_url, tmpdir):
def __create_and_fit_estimator(sagemaker_session, instance_type, tmpdir):
job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
estimator = TensorFlow(
entry_point=os.path.join(horovod_dir, "test_hvd_basic.py"),
entry_point=os.path.join(horovod_dir, "hvd_basic.py"),
role="SageMakerRole",
train_instance_count=2,
train_instance_type=instance_type,
Expand Down
10 changes: 6 additions & 4 deletions tests/integ/test_inference_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,9 @@ def test_inference_pipeline_model_deploy(sagemaker_session, cpu_instance_type):
assert "Could not find model" in str(exception.value)


def test_inference_pipeline_model_deploy_with_update_endpoint(sagemaker_session):
def test_inference_pipeline_model_deploy_with_update_endpoint(
sagemaker_session, cpu_instance_type, alternative_cpu_instance_type
):
sparkml_data_path = os.path.join(DATA_DIR, "sparkml_model")
xgboost_data_path = os.path.join(DATA_DIR, "xgboost_model")
endpoint_name = "test-inference-pipeline-deploy-{}".format(sagemaker_timestamp())
Expand Down Expand Up @@ -179,13 +181,13 @@ def test_inference_pipeline_model_deploy_with_update_endpoint(sagemaker_session)
role="SageMakerRole",
sagemaker_session=sagemaker_session,
)
model.deploy(1, "ml.t2.medium", endpoint_name=endpoint_name)
model.deploy(1, alternative_cpu_instance_type, endpoint_name=endpoint_name)
old_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
EndpointName=endpoint_name
)
old_config_name = old_endpoint["EndpointConfigName"]

model.deploy(1, "ml.m4.xlarge", update_endpoint=True, endpoint_name=endpoint_name)
model.deploy(1, cpu_instance_type, update_endpoint=True, endpoint_name=endpoint_name)

# Wait for endpoint to finish updating
max_retry_count = 40 # Endpoint update takes ~7min. 40 retries * 30s sleeps = 20min timeout
Expand All @@ -207,7 +209,7 @@ def test_inference_pipeline_model_deploy_with_update_endpoint(sagemaker_session)
)

assert old_config_name != new_config_name
assert new_config["ProductionVariants"][0]["InstanceType"] == "ml.m4.xlarge"
assert new_config["ProductionVariants"][0]["InstanceType"] == cpu_instance_type
assert new_config["ProductionVariants"][0]["InitialInstanceCount"] == 1

model.delete_model()
Expand Down
20 changes: 14 additions & 6 deletions tests/integ/test_mxnet_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,11 @@ def test_deploy_model_with_tags_and_kms(


def test_deploy_model_with_update_endpoint(
mxnet_training_job, sagemaker_session, mxnet_full_version, cpu_instance_type
mxnet_training_job,
sagemaker_session,
mxnet_full_version,
cpu_instance_type,
alternative_cpu_instance_type,
):
endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())

Expand All @@ -169,13 +173,13 @@ def test_deploy_model_with_update_endpoint(
sagemaker_session=sagemaker_session,
framework_version=mxnet_full_version,
)
model.deploy(1, "ml.t2.medium", endpoint_name=endpoint_name)
model.deploy(1, alternative_cpu_instance_type, endpoint_name=endpoint_name)
old_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
EndpointName=endpoint_name
)
old_config_name = old_endpoint["EndpointConfigName"]

model.deploy(1, "ml.m4.xlarge", update_endpoint=True, endpoint_name=endpoint_name)
model.deploy(1, cpu_instance_type, update_endpoint=True, endpoint_name=endpoint_name)

# Wait for endpoint to finish updating
max_retry_count = 40 # Endpoint update takes ~7min. 40 retries * 30s sleeps = 20min timeout
Expand All @@ -197,12 +201,16 @@ def test_deploy_model_with_update_endpoint(
)

assert old_config_name != new_config_name
assert new_config["ProductionVariants"][0]["InstanceType"] == "ml.m4.xlarge"
assert new_config["ProductionVariants"][0]["InstanceType"] == cpu_instance_type
assert new_config["ProductionVariants"][0]["InitialInstanceCount"] == 1


def test_deploy_model_with_update_non_existing_endpoint(
mxnet_training_job, sagemaker_session, mxnet_full_version, cpu_instance_type
mxnet_training_job,
sagemaker_session,
mxnet_full_version,
cpu_instance_type,
alternative_cpu_instance_type,
):
endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())
expected_error_message = (
Expand All @@ -224,7 +232,7 @@ def test_deploy_model_with_update_non_existing_endpoint(
sagemaker_session=sagemaker_session,
framework_version=mxnet_full_version,
)
model.deploy(1, "ml.t2.medium", endpoint_name=endpoint_name)
model.deploy(1, alternative_cpu_instance_type, endpoint_name=endpoint_name)
sagemaker_session.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)

with pytest.raises(ValueError, message=expected_error_message):
Expand Down
2 changes: 1 addition & 1 deletion tests/integ/test_tf_keras.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_keras(sagemaker_session, cpu_instance_type):

endpoint_name = estimator.latest_training_job.name
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge")
predictor = estimator.deploy(initial_instance_count=1, instance_type=cpu_instance_type)

data = np.random.randn(32, 32, 3)
predict_response = predictor.predict(data)
Expand Down
12 changes: 8 additions & 4 deletions tests/integ/test_tf_script_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ def test_mnist(sagemaker_session, instance_type):
with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
estimator.fit(inputs=inputs, job_name=unique_name_from_base("test-tf-sm-mnist"))
_assert_s3_files_exist(
estimator.model_dir, ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
estimator.model_dir,
["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
sagemaker_session.boto_region_name,
)
df = estimator.training_job_analytics.dataframe()
assert df.size > 0
Expand Down Expand Up @@ -118,7 +120,9 @@ def test_mnist_distributed(sagemaker_session, instance_type):
with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
estimator.fit(inputs=inputs, job_name=unique_name_from_base("test-tf-sm-distributed"))
_assert_s3_files_exist(
estimator.model_dir, ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
estimator.model_dir,
["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
sagemaker_session.boto_region_name,
)


Expand Down Expand Up @@ -196,9 +200,9 @@ def test_deploy_with_input_handlers(sagemaker_session, instance_type):
assert expected_result == result


def _assert_s3_files_exist(s3_url, files):
def _assert_s3_files_exist(s3_url, files, region):
parsed_url = urlparse(s3_url)
s3 = boto3.client("s3")
s3 = boto3.client("s3", region_name=region)
contents = s3.list_objects_v2(Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip("/"))[
"Contents"
]
Expand Down

0 comments on commit db6c55e

Please sign in to comment.