change: fix tests for new regions (#983)

* fix test failures related to m4, c4 and t2 region availability. * rename test_hvd_basic.py to hvd_basic.py * use regional s3 endpoint in TF script mode tests
aws · Aug 15, 2019 · db6c55e · db6c55e
1 parent 75f3554
commit db6c55e
Show file tree

Hide file tree

Showing 7 changed files with 44 additions and 18 deletions.
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -31,7 +31,9 @@
 
 DEFAULT_REGION = "us-west-2"
 
-NO_M4_REGIONS = ["eu-west-3", "eu-north-1", "ap-east-1"]
+NO_M4_REGIONS = ["eu-west-3", "eu-north-1", "ap-east-1", "sa-east-1"]
+
+NO_T2_REGIONS = ["eu-north-1", "ap-east-1"]
 
 
 def pytest_addoption(parser):
@@ -262,6 +264,16 @@ def cpu_instance_type(sagemaker_session, request):
         return "ml.m4.xlarge"
 
 
+@pytest.fixture(scope="session")
+def alternative_cpu_instance_type(sagemaker_session, request):
+    region = sagemaker_session.boto_session.region_name
+    if region in NO_T2_REGIONS:
+        # T3 is not supported by hosting yet
+        return "ml.c5.xlarge"
+    else:
+        return "ml.t2.medium"
+
+
 @pytest.fixture(scope="session")
 def cpu_instance_family(cpu_instance_type):
     "_".join(cpu_instance_type.split(".")[0:2])

diff --git a/tests/data/horovod/test_hvd_basic.py → tests/data/horovod/hvd_basic.py b/tests/data/horovod/test_hvd_basic.py → tests/data/horovod/hvd_basic.py
diff --git a/tests/integ/test_horovod.py b/tests/integ/test_horovod.py
@@ -52,7 +52,7 @@ def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdi
     output_path = "file://%s" % tmpdir
     job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
     estimator = TensorFlow(
-        entry_point=os.path.join(horovod_dir, "test_hvd_basic.py"),
+        entry_point=os.path.join(horovod_dir, "hvd_basic.py"),
         role="SageMakerRole",
         train_instance_count=2,
         train_instance_type="local",
@@ -100,7 +100,7 @@ def extract_files_from_s3(s3_url, tmpdir):
 def __create_and_fit_estimator(sagemaker_session, instance_type, tmpdir):
     job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
     estimator = TensorFlow(
-        entry_point=os.path.join(horovod_dir, "test_hvd_basic.py"),
+        entry_point=os.path.join(horovod_dir, "hvd_basic.py"),
         role="SageMakerRole",
         train_instance_count=2,
         train_instance_type=instance_type,

diff --git a/tests/integ/test_inference_pipeline.py b/tests/integ/test_inference_pipeline.py
@@ -151,7 +151,9 @@ def test_inference_pipeline_model_deploy(sagemaker_session, cpu_instance_type):
         assert "Could not find model" in str(exception.value)
 
 
-def test_inference_pipeline_model_deploy_with_update_endpoint(sagemaker_session):
+def test_inference_pipeline_model_deploy_with_update_endpoint(
+    sagemaker_session, cpu_instance_type, alternative_cpu_instance_type
+):
     sparkml_data_path = os.path.join(DATA_DIR, "sparkml_model")
     xgboost_data_path = os.path.join(DATA_DIR, "xgboost_model")
     endpoint_name = "test-inference-pipeline-deploy-{}".format(sagemaker_timestamp())
@@ -179,13 +181,13 @@ def test_inference_pipeline_model_deploy_with_update_endpoint(sagemaker_session)
             role="SageMakerRole",
             sagemaker_session=sagemaker_session,
         )
-        model.deploy(1, "ml.t2.medium", endpoint_name=endpoint_name)
+        model.deploy(1, alternative_cpu_instance_type, endpoint_name=endpoint_name)
         old_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
             EndpointName=endpoint_name
         )
         old_config_name = old_endpoint["EndpointConfigName"]
 
-        model.deploy(1, "ml.m4.xlarge", update_endpoint=True, endpoint_name=endpoint_name)
+        model.deploy(1, cpu_instance_type, update_endpoint=True, endpoint_name=endpoint_name)
 
         # Wait for endpoint to finish updating
         max_retry_count = 40  # Endpoint update takes ~7min. 40 retries * 30s sleeps = 20min timeout
@@ -207,7 +209,7 @@ def test_inference_pipeline_model_deploy_with_update_endpoint(sagemaker_session)
         )
 
         assert old_config_name != new_config_name
-        assert new_config["ProductionVariants"][0]["InstanceType"] == "ml.m4.xlarge"
+        assert new_config["ProductionVariants"][0]["InstanceType"] == cpu_instance_type
         assert new_config["ProductionVariants"][0]["InitialInstanceCount"] == 1
 
     model.delete_model()

diff --git a/tests/integ/test_mxnet_train.py b/tests/integ/test_mxnet_train.py
@@ -151,7 +151,11 @@ def test_deploy_model_with_tags_and_kms(
 
 
 def test_deploy_model_with_update_endpoint(
-    mxnet_training_job, sagemaker_session, mxnet_full_version, cpu_instance_type
+    mxnet_training_job,
+    sagemaker_session,
+    mxnet_full_version,
+    cpu_instance_type,
+    alternative_cpu_instance_type,
 ):
     endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())
 
@@ -169,13 +173,13 @@ def test_deploy_model_with_update_endpoint(
             sagemaker_session=sagemaker_session,
             framework_version=mxnet_full_version,
         )
-        model.deploy(1, "ml.t2.medium", endpoint_name=endpoint_name)
+        model.deploy(1, alternative_cpu_instance_type, endpoint_name=endpoint_name)
         old_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
             EndpointName=endpoint_name
         )
         old_config_name = old_endpoint["EndpointConfigName"]
 
-        model.deploy(1, "ml.m4.xlarge", update_endpoint=True, endpoint_name=endpoint_name)
+        model.deploy(1, cpu_instance_type, update_endpoint=True, endpoint_name=endpoint_name)
 
         # Wait for endpoint to finish updating
         max_retry_count = 40  # Endpoint update takes ~7min. 40 retries * 30s sleeps = 20min timeout
@@ -197,12 +201,16 @@ def test_deploy_model_with_update_endpoint(
         )
 
         assert old_config_name != new_config_name
-        assert new_config["ProductionVariants"][0]["InstanceType"] == "ml.m4.xlarge"
+        assert new_config["ProductionVariants"][0]["InstanceType"] == cpu_instance_type
         assert new_config["ProductionVariants"][0]["InitialInstanceCount"] == 1
 
 
 def test_deploy_model_with_update_non_existing_endpoint(
-    mxnet_training_job, sagemaker_session, mxnet_full_version, cpu_instance_type
+    mxnet_training_job,
+    sagemaker_session,
+    mxnet_full_version,
+    cpu_instance_type,
+    alternative_cpu_instance_type,
 ):
     endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())
     expected_error_message = (
@@ -224,7 +232,7 @@ def test_deploy_model_with_update_non_existing_endpoint(
             sagemaker_session=sagemaker_session,
             framework_version=mxnet_full_version,
         )
-        model.deploy(1, "ml.t2.medium", endpoint_name=endpoint_name)
+        model.deploy(1, alternative_cpu_instance_type, endpoint_name=endpoint_name)
         sagemaker_session.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
 
         with pytest.raises(ValueError, message=expected_error_message):

diff --git a/tests/integ/test_tf_keras.py b/tests/integ/test_tf_keras.py
@@ -60,7 +60,7 @@ def test_keras(sagemaker_session, cpu_instance_type):
 
     endpoint_name = estimator.latest_training_job.name
     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
-        predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge")
+        predictor = estimator.deploy(initial_instance_count=1, instance_type=cpu_instance_type)
 
         data = np.random.randn(32, 32, 3)
         predict_response = predictor.predict(data)

diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py
@@ -57,7 +57,9 @@ def test_mnist(sagemaker_session, instance_type):
     with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
         estimator.fit(inputs=inputs, job_name=unique_name_from_base("test-tf-sm-mnist"))
     _assert_s3_files_exist(
-        estimator.model_dir, ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
+        estimator.model_dir,
+        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
+        sagemaker_session.boto_region_name,
     )
     df = estimator.training_job_analytics.dataframe()
     assert df.size > 0
@@ -118,7 +120,9 @@ def test_mnist_distributed(sagemaker_session, instance_type):
     with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
         estimator.fit(inputs=inputs, job_name=unique_name_from_base("test-tf-sm-distributed"))
     _assert_s3_files_exist(
-        estimator.model_dir, ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
+        estimator.model_dir,
+        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
+        sagemaker_session.boto_region_name,
     )
 
 
@@ -196,9 +200,9 @@ def test_deploy_with_input_handlers(sagemaker_session, instance_type):
         assert expected_result == result
 
 
-def _assert_s3_files_exist(s3_url, files):
+def _assert_s3_files_exist(s3_url, files, region):
     parsed_url = urlparse(s3_url)
-    s3 = boto3.client("s3")
+    s3 = boto3.client("s3", region_name=region)
     contents = s3.list_objects_v2(Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip("/"))[
         "Contents"
     ]