Skip to content

Commit

Permalink
fix: change AMI ids in tests to be dynamic based on regions (#1004)
Browse files Browse the repository at this point in the history
* fix: make the Amazon AMI IDs dynamic per region by searching for the latest Amazon Linux AMI
  • Loading branch information
caxiaohu authored and chuyang-deng committed Aug 28, 2019
1 parent af6f943 commit f233dc0
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 73 deletions.
2 changes: 0 additions & 2 deletions tests/integ/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,6 @@
NO_LDA_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"]
NO_MARKET_PLACE_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"]

EFS_TEST_ENABLED_REGION = ["us-west-2"]

logging.getLogger("boto3").setLevel(logging.INFO)
logging.getLogger("botocore").setLevel(logging.INFO)

Expand Down
37 changes: 28 additions & 9 deletions tests/integ/file_system_input_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import collections
import logging
from operator import itemgetter
import os
from os import path
import stat
Expand All @@ -27,13 +28,12 @@
from tests.integ.vpc_test_utils import check_or_create_vpc_resources_efs_fsx

VPC_NAME = "sagemaker-efs-fsx-vpc"
ALINUX_AMI_NAME_FILTER = "amzn-ami-hvm-????.??.?.????????-x86_64-gp2"
EFS_CREATION_TOKEN = str(uuid.uuid4())
PREFIX = "ec2_fs_key_"
KEY_NAME = PREFIX + str(uuid.uuid4().hex.upper()[0:8])
ROLE_NAME = "SageMakerRole"
REGION = "us-west-2"
EC2_INSTANCE_TYPE = "t2.micro"
AMI_ID = "ami-082b5a644766e0e6f"
MIN_COUNT = 1
MAX_COUNT = 1

Expand Down Expand Up @@ -69,12 +69,13 @@ def set_up_efs_fsx(sagemaker_session):
_check_or_create_key_pair(sagemaker_session)
_check_or_create_iam_profile_and_attach_role(sagemaker_session)
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
sagemaker_session, REGION, VPC_NAME
sagemaker_session, VPC_NAME
)

ami_id = _ami_id_for_region(sagemaker_session)
ec2_instance = _create_ec2_instance(
sagemaker_session,
AMI_ID,
ami_id,
EC2_INSTANCE_TYPE,
KEY_NAME,
MIN_COUNT,
Expand All @@ -100,16 +101,34 @@ def set_up_efs_fsx(sagemaker_session):
mount_efs_target_id,
)

region = sagemaker_session.boto_region_name
try:
connected_instance = _connect_ec2_instance(ec2_instance)
_upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id)
_upload_data_and_mount_fs(
connected_instance, file_system_efs_id, file_system_fsx_id, region
)
except Exception:
tear_down(sagemaker_session, fs_resources)
raise

return fs_resources


def _ami_id_for_region(sagemaker_session):
    """Return the image ID of the newest available Amazon Linux AMI.

    The lookup goes through an EC2 client created from
    ``sagemaker_session.boto_session``, so the result is whatever that
    client's region exposes.

    Args:
        sagemaker_session: Session object exposing a ``boto_session`` whose
            ``client("ec2")`` supports ``describe_images``.

    Returns:
        str: The ``ImageId`` of the matching image with the greatest
        ``CreationDate``.

    Raises:
        Exception: If no image matches the search criteria.
    """
    ec2_client = sagemaker_session.boto_session.client("ec2")
    filters = [
        {"Name": "name", "Values": [ALINUX_AMI_NAME_FILTER]},
        {"Name": "state", "Values": ["available"]},
    ]
    # Only trust images published by Amazon. A name filter alone also matches
    # public AMIs from arbitrary accounts that copy the naming pattern, and the
    # "newest wins" rule below would then launch an untrusted image.
    response = ec2_client.describe_images(Owners=["amazon"], Filters=filters)
    images = response["Images"]

    if not images:
        raise Exception("AMI was not found based on current search criteria: {}".format(filters))

    # CreationDate is compared as returned by the API (expected to be an
    # ISO 8601 string, which orders chronologically). max() keeps the first
    # of any tied entries, matching a stable descending sort.
    newest_image = max(images, key=itemgetter("CreationDate"))
    return newest_image["ImageId"]


def _connect_ec2_instance(ec2_instance):
public_ip_address = ec2_instance.public_ip_address
connected_instance = Connection(
Expand All @@ -118,7 +137,7 @@ def _connect_ec2_instance(ec2_instance):
return connected_instance


def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id):
def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id, region):
connected_instance.put(FS_MOUNT_SCRIPT, ".")
connected_instance.run("mkdir temp_tf; mkdir temp_one_p", in_stream=False)
for dir_name, subdir_list, file_list in os.walk(MNIST_LOCAL_DATA):
Expand All @@ -127,7 +146,7 @@ def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_syste
connected_instance.put(local_file, "temp_tf/")
connected_instance.put(ONE_P_LOCAL_DATA, "temp_one_p/")
connected_instance.run(
"sudo sh fs_mount_setup.sh {} {}".format(file_system_efs_id, file_system_fsx_id),
"sudo sh fs_mount_setup.sh {} {} {}".format(file_system_efs_id, file_system_fsx_id, region),
in_stream=False,
)

Expand Down Expand Up @@ -168,7 +187,7 @@ def _check_or_create_efs(sagemaker_session):

def _create_efs_mount(sagemaker_session, file_system_id):
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
sagemaker_session, REGION, VPC_NAME
sagemaker_session, VPC_NAME
)
efs_client = sagemaker_session.boto_session.client("efs")
mount_response = efs_client.create_mount_target(
Expand All @@ -188,7 +207,7 @@ def _create_efs_mount(sagemaker_session, file_system_id):
def _check_or_create_fsx(sagemaker_session):
fsx_client = sagemaker_session.boto_session.client("fsx")
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
sagemaker_session, REGION, VPC_NAME
sagemaker_session, VPC_NAME
)
create_response = fsx_client.create_file_system(
FileSystemType="LUSTRE",
Expand Down
34 changes: 8 additions & 26 deletions tests/integ/test_kmeans_efs_fsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

import pytest

import tests.integ
from sagemaker import KMeans
from sagemaker.amazon.amazon_estimator import FileSystemRecordSet
from sagemaker.parameter import IntegerParameter, CategoricalParameter
Expand All @@ -25,7 +24,6 @@
from tests.integ.s3_utils import assert_s3_files_exist
from tests.integ.timeout import timeout

TRAIN_INSTANCE_TYPE = "ml.c4.xlarge"
TRAIN_INSTANCE_COUNT = 1
OBJECTIVE_METRIC_NAME = "test:msd"
EFS_DIR_PATH = "/one_p_mnist"
Expand All @@ -46,19 +44,15 @@ def efs_fsx_setup(sagemaker_session):
tear_down(sagemaker_session, fs_resources)


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_kmeans_efs(efs_fsx_setup, sagemaker_session):
def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
role = efs_fsx_setup.role_name
kmeans = KMeans(
role=role,
train_instance_count=TRAIN_INSTANCE_COUNT,
train_instance_type=TRAIN_INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
k=K,
sagemaker_session=sagemaker_session,
subnets=subnets,
Expand All @@ -80,19 +74,15 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session):
assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session):
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
role = efs_fsx_setup.role_name
kmeans = KMeans(
role=role,
train_instance_count=TRAIN_INSTANCE_COUNT,
train_instance_type=TRAIN_INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
k=K,
sagemaker_session=sagemaker_session,
subnets=subnets,
Expand All @@ -114,18 +104,14 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session):
assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session):
def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
role = efs_fsx_setup.role_name
kmeans = KMeans(
role=role,
train_instance_count=TRAIN_INSTANCE_COUNT,
train_instance_type=TRAIN_INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
k=K,
sagemaker_session=sagemaker_session,
subnets=subnets,
Expand Down Expand Up @@ -174,18 +160,14 @@ def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session):
assert best_training_job


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session):
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
role = efs_fsx_setup.role_name
kmeans = KMeans(
role=role,
train_instance_count=TRAIN_INSTANCE_COUNT,
train_instance_type=TRAIN_INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
k=K,
sagemaker_session=sagemaker_session,
subnets=subnets,
Expand Down
34 changes: 8 additions & 26 deletions tests/integ/test_tf_efs_fsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

import pytest

import tests.integ
from sagemaker.inputs import FileSystemInput
from sagemaker.parameter import IntegerParameter
from sagemaker.tensorflow import TensorFlow
Expand All @@ -32,7 +31,6 @@
MNIST_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tensorflow_mnist")
SCRIPT = os.path.join(MNIST_RESOURCE_PATH, "mnist.py")
TFS_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tfs", "tfs-test-entrypoint-with-handler")
INSTANCE_TYPE = "ml.c4.xlarge"
EFS_DIR_PATH = "/tensorflow"
FSX_DIR_PATH = "/fsx/tensorflow"
MAX_JOBS = 2
Expand All @@ -49,11 +47,7 @@ def efs_fsx_setup(sagemaker_session):
tear_down(sagemaker_session, fs_resources)


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_mnist_efs(efs_fsx_setup, sagemaker_session):
def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
role = efs_fsx_setup.role_name
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
Expand All @@ -62,7 +56,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session):
entry_point=SCRIPT,
role=role,
train_instance_count=1,
train_instance_type=INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
Expand All @@ -85,11 +79,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session):
)


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
role = efs_fsx_setup.role_name
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
Expand All @@ -98,7 +88,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
entry_point=SCRIPT,
role=role,
train_instance_count=1,
train_instance_type=INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
Expand All @@ -121,11 +111,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
)


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
role = efs_fsx_setup.role_name
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
Expand All @@ -134,7 +120,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
entry_point=SCRIPT,
role=role,
train_instance_count=1,
train_instance_type=INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
script_mode=True,
sagemaker_session=sagemaker_session,
py_version=PY_VERSION,
Expand Down Expand Up @@ -169,11 +155,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
assert best_training_job


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session):
def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
role = efs_fsx_setup.role_name
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
Expand All @@ -182,7 +164,7 @@ def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session):
entry_point=SCRIPT,
role=role,
train_instance_count=1,
train_instance_type=INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
script_mode=True,
sagemaker_session=sagemaker_session,
py_version=PY_VERSION,
Expand Down
12 changes: 5 additions & 7 deletions tests/integ/vpc_test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def _route_table_id(ec2_client, vpc_id):
return desc["RouteTables"][0]["RouteTableId"]


def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NAME):
def check_or_create_vpc_resources_efs_fsx(sagemaker_session, name=VPC_NAME):
# use lock to prevent race condition when tests are running concurrently
with lock.lock(LOCK_PATH):
ec2_client = sagemaker_session.boto_session.client("ec2")
Expand All @@ -74,13 +74,11 @@ def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NA
_security_group_ids_by_vpc_id(sagemaker_session, vpc_id),
)
else:
return _create_vpc_with_name_efs_fsx(ec2_client, region, name)
return _create_vpc_with_name_efs_fsx(ec2_client, name)


def _create_vpc_with_name_efs_fsx(ec2_client, region, name):
vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(
ec2_client, region, name
)
def _create_vpc_with_name_efs_fsx(ec2_client, name):
vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(ec2_client, name)
ec2_client.modify_vpc_attribute(EnableDnsHostnames={"Value": True}, VpcId=vpc_id)

ig = ec2_client.create_internet_gateway()
Expand Down Expand Up @@ -121,7 +119,7 @@ def _create_vpc_with_name_efs_fsx(ec2_client, region, name):
return [subnet_id_a], [security_group_id]


def _create_vpc_resources(ec2_client, region, name):
def _create_vpc_resources(ec2_client, name):
vpc_id = ec2_client.create_vpc(CidrBlock="10.0.0.0/16")["Vpc"]["VpcId"]
print("created vpc: {}".format(vpc_id))

Expand Down
7 changes: 4 additions & 3 deletions tests/scripts/fs_mount_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,19 @@
# Mounting EFS and FSx for Lustre file systems for integration Tests
FILE_SYSTEM_EFS_ID=$1
FILE_SYSTEM_FSX_ID=$2
REGION=$3

echo "Mounting EFS File Systems"
sudo yum install -y amazon-efs-utils.noarch 0:1.10-1.amzn2
sudo yum install -y amazon-efs-utils
sudo mkdir efs
sudo mount -t efs "$FILE_SYSTEM_EFS_ID":/ efs
sudo mkdir efs/tensorflow
sudo mkdir efs/one_p_mnist

echo "Mounting FSx for Lustre File System"
sudo amazon-linux-extras install -y lustre2.10
sudo yum install -y lustre-client
sudo mkdir -p /mnt/fsx
sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx.us-west-2.amazonaws.com@tcp:/fsx /mnt/fsx
sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx."$REGION".amazonaws.com@tcp:/fsx /mnt/fsx
sudo mkdir /mnt/fsx/tensorflow
sudo mkdir /mnt/fsx/one_p_mnist

Expand Down

0 comments on commit f233dc0

Please sign in to comment.