Skip to content

Commit

Permalink
Use new repository names for Framework images (#114)
Browse files Browse the repository at this point in the history
* Use new repository names for Framework images

* Update changelog

* Fix flake8

* Fix framework_name_from_image to support both old and new image names

* Add unit test on tensorflow attach for new repo name

* Update docstring on legacy ecr repo naming
  • Loading branch information
winstonaws committed Mar 30, 2018
1 parent 0bbd9ba commit 4f92fbd
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 36 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
CHANGELOG
=========

1.1.dev4
========
* feature: Frameworks: Use more idiomatic ECR repository naming scheme

1.1.3
========

Expand Down
50 changes: 36 additions & 14 deletions src/sagemaker/fw_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,28 +28,40 @@
"""


def create_image_uri(region, framework, instance_type, framework_version, py_version, account='520713654638'):
def create_image_uri(region, framework, instance_type, framework_version, py_version, account='520713654638',
optimized_families=[]):
"""Return the ECR URI of an image.
Args:
region (str): AWS region where the image is uploaded.
framework (str): framework used by the image.
instance_type (str): EC2 instance type. Used to determine whether to use the CPU image or GPU image.
instance_type (str): SageMaker instance type. Used to determine device type (cpu/gpu/family-specific optimized).
framework_version (str): The version of the framework.
py_version (str): Python version. One of 'py2' or 'py3'.
account (str): AWS account that contains the image. (default: '520713654638')
optimized_families (str): Instance families for which there exist specific optimized images.
Returns:
str: The appropriate image URI based on the given parameters.
"""
device_type = 'cpu'
# Instance types that start with G, P are GPU powered: https://aws.amazon.com/sagemaker/pricing/instance-types/
if instance_type[3] in ['g', 'p']:

if not instance_type.startswith('ml.'):
raise ValueError('{} is not a valid SageMaker instance type. See: '
'https://aws.amazon.com/sagemaker/pricing/instance-types/'.format(instance_type))
family = instance_type.split('.')[1]

# For some frameworks, we have optimized images for specific families, e.g c5 or p3. In those cases,
# we use the family name in the image tag. In other cases, we use 'cpu' or 'gpu'.
if family in optimized_families:
device_type = family
elif family[0] in ['g', 'p']:
device_type = 'gpu'
else:
device_type = 'cpu'

tag = "{}-{}-{}".format(framework_version, device_type, py_version)
return "{}.dkr.ecr.{}.amazonaws.com/sagemaker-{}-{}-{}:{}" \
.format(account, region, framework, py_version, device_type, tag)
return "{}.dkr.ecr.{}.amazonaws.com/sagemaker-{}:{}" \
.format(account, region, framework, tag)


def tar_and_upload_dir(session, bucket, s3_key_prefix, script, directory):
Expand Down Expand Up @@ -107,8 +119,13 @@ def framework_name_from_image(image_name):
"""Extract the framework and Python version from the image name.
Args:
image_name (str): Image URI, which should take the form
'<account>.dkr.ecr.<region>.amazonaws.com/sagemaker-<framework>-<py_ver>-<device>:<tag>'
image_name (str): Image URI, which should be one of the following forms:
legacy:
'<account>.dkr.ecr.<region>.amazonaws.com/sagemaker-<fw>-<py_ver>-<device>:<container_version>'
legacy:
'<account>.dkr.ecr.<region>.amazonaws.com/sagemaker-<fw>-<py_ver>-<device>:<fw_version>-<device>-<py_ver>'
current:
'<account>.dkr.ecr.<region>.amazonaws.com/sagemaker-<fw>:<fw_version>-<device>-<py_ver>'
Returns:
tuple: A tuple containing:
Expand All @@ -123,14 +140,19 @@ def framework_name_from_image(image_name):
return None, None, None
else:
# extract framework, python version and image tag
name_pattern = re.compile('^sagemaker-(tensorflow|mxnet)-(py2|py3)-(cpu|gpu):(.*)$')

# We must support both the legacy and current image name format.
name_pattern = re.compile('^sagemaker-(tensorflow|mxnet):(.*?)-(.*?)-(py2|py3)$')
legacy_name_pattern = re.compile('^sagemaker-(tensorflow|mxnet)-(py2|py3)-(cpu|gpu):(.*)$')
name_match = name_pattern.match(sagemaker_match.group(8))
legacy_match = legacy_name_pattern.match(sagemaker_match.group(8))

if name_match is None:
return None, None, None
if name_match is not None:
fw, ver, device, py = name_match.group(1), name_match.group(2), name_match.group(3), name_match.group(4)
return fw, py, '{}-{}-{}'.format(ver, device, py)
elif legacy_match is not None:
return legacy_match.group(1), legacy_match.group(2), legacy_match.group(4)
else:
return name_match.group(1), name_match.group(2), name_match.group(4)
return None, None, None


def framework_version_from_tag(image_tag):
Expand Down
52 changes: 43 additions & 9 deletions tests/unit/test_fw_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,18 +37,42 @@ def sagemaker_session():


def test_create_image_uri_cpu():
image_uri = create_image_uri('mars-south-3', 'mlfw', 'any-non-gpu-device', '1.0rc', 'py2', '23')
assert image_uri == '23.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mlfw-py2-cpu:1.0rc-cpu-py2'
image_uri = create_image_uri('mars-south-3', 'mlfw', 'ml.c4.large', '1.0rc', 'py2', '23')
assert image_uri == '23.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mlfw:1.0rc-cpu-py2'


def test_create_image_uri_gpu():
image_uri = create_image_uri('mars-south-3', 'mlfw', 'ml.p3.2xlarge', '1.0rc', 'py3', '23')
assert image_uri == '23.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mlfw-py3-gpu:1.0rc-gpu-py3'
assert image_uri == '23.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mlfw:1.0rc-gpu-py3'


def test_create_image_uri_default_account():
image_uri = create_image_uri('mars-south-3', 'mlfw', 'ml.p3.2xlarge', '1.0rc', 'py3')
assert image_uri == '520713654638.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mlfw-py3-gpu:1.0rc-gpu-py3'
assert image_uri == '520713654638.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mlfw:1.0rc-gpu-py3'


def test_invalid_instance_type():
# instance type is missing 'ml.' prefix
with pytest.raises(ValueError):
create_image_uri('mars-south-3', 'mlfw', 'p3.2xlarge', '1.0.0', 'py3')


def test_optimized_family():
image_uri = create_image_uri('mars-south-3', 'mlfw', 'ml.p3.2xlarge', '1.0.0', 'py3',
optimized_families=['c5', 'p3'])
assert image_uri == '520713654638.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mlfw:1.0.0-p3-py3'


def test_unoptimized_cpu_family():
image_uri = create_image_uri('mars-south-3', 'mlfw', 'ml.m4.xlarge', '1.0.0', 'py3',
optimized_families=['c5', 'p3'])
assert image_uri == '520713654638.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mlfw:1.0.0-cpu-py3'


def test_unoptimized_gpu_family():
image_uri = create_image_uri('mars-south-3', 'mlfw', 'ml.p2.xlarge', '1.0.0', 'py3',
optimized_families=['c5', 'p3'])
assert image_uri == '520713654638.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mlfw:1.0.0-gpu-py3'


def test_tar_and_upload_dir_s3(sagemaker_session):
Expand Down Expand Up @@ -99,36 +123,46 @@ def test_tar_and_upload_dir_not_s3(sagemaker_session):
assert result == UploadedCode('s3://{}/{}/sourcedir.tar.gz'.format(bucket, s3_key_prefix), script)


def test_framework_name_from_framework_image():
def test_framework_name_from_image_mxnet():
image_name = '123.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:1.1-gpu-py3'
assert ('mxnet', 'py3', '1.1-gpu-py3') == framework_name_from_image(image_name)


def test_framework_name_from_image_tf():
image_name = '123.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow:1.6-cpu-py2'
assert ('tensorflow', 'py2', '1.6-cpu-py2') == framework_name_from_image(image_name)


def test_legacy_name_from_framework_image():
image_name = '123.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py3-gpu:2.5.6-gpu-py2'
framework, py_ver, tag = framework_name_from_image(image_name)
assert framework == 'mxnet'
assert py_ver == 'py3'
assert tag == '2.5.6-gpu-py2'


def test_framework_name_from_wrong_framework():
def test_legacy_name_from_wrong_framework():
framework, py_ver, tag = framework_name_from_image('123.dkr.ecr.us-west-2.amazonaws.com/sagemaker-myown-py2-gpu:1')
assert framework is None
assert py_ver is None
assert tag is None


def test_framework_name_from_wrong_python():
def test_legacy_name_from_wrong_python():
framework, py_ver, tag = framework_name_from_image('123.dkr.ecr.us-west-2.amazonaws.com/sagemaker-myown-py4-gpu:1')
assert framework is None
assert py_ver is None
assert tag is None


def test_framework_name_from_wrong_device():
def test_legacy_name_from_wrong_device():
framework, py_ver, tag = framework_name_from_image('123.dkr.ecr.us-west-2.amazonaws.com/sagemaker-myown-py4-gpu:1')
assert framework is None
assert py_ver is None
assert tag is None


def test_framework_name_from_image_any_tag():
def test_legacy_name_from_image_any_tag():
image_name = '123.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py2-cpu:any-tag'
framework, py_ver, tag = framework_name_from_image(image_name)
assert framework == 'tensorflow'
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/test_mxnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
BUCKET_NAME = 'mybucket'
INSTANCE_COUNT = 1
INSTANCE_TYPE = 'ml.c4.4xlarge'
IMAGE_CPU_NAME = 'sagemaker-mxnet-py2-cpu'
IMAGE_CPU_NAME = 'sagemaker-mxnet'
JOB_NAME = '{}-{}'.format(IMAGE_CPU_NAME, TIMESTAMP)
FULL_IMAGE_URI = '520713654638.dkr.ecr.us-west-2.amazonaws.com/{}:{}-cpu-py2'
ROLE = 'Dummy'
Expand Down Expand Up @@ -138,10 +138,10 @@ def test_mxnet(strftime, sagemaker_session, mxnet_version):

model = mx.create_model()

expected_image_base = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-gpu:{}-gpu-py2'
expected_image_base = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:{}-gpu-py2'
assert {'Environment':
{'SAGEMAKER_SUBMIT_DIRECTORY':
's3://mybucket/sagemaker-mxnet-py2-cpu-{}/sourcedir.tar.gz'.format(TIMESTAMP),
's3://mybucket/sagemaker-mxnet-{}/sourcedir.tar.gz'.format(TIMESTAMP),
'SAGEMAKER_PROGRAM': 'dummy_script.py',
'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
'SAGEMAKER_REGION': 'us-west-2',
Expand Down
67 changes: 57 additions & 10 deletions tests/unit/test_tf_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@
BUCKET_NAME = 'mybucket'
INSTANCE_COUNT = 1
INSTANCE_TYPE = 'ml.c4.4xlarge'
CPU_IMAGE_NAME = 'sagemaker-tensorflow-py2-cpu'
GPU_IMAGE_NAME = 'sagemaker-tensorflow-py2-gpu'
JOB_NAME = '{}-{}'.format(CPU_IMAGE_NAME, TIMESTAMP)
IMAGE_REPO_NAME = 'sagemaker-tensorflow'
JOB_NAME = '{}-{}'.format(IMAGE_REPO_NAME, TIMESTAMP)
ROLE = 'Dummy'
REGION = 'us-west-2'
DOCKER_TAG = '1.0'
Expand All @@ -53,11 +52,11 @@ def sagemaker_session():


def _get_full_cpu_image_uri(version):
return IMAGE_URI_FORMAT_STRING.format(REGION, CPU_IMAGE_NAME, version, 'cpu', 'py2')
return IMAGE_URI_FORMAT_STRING.format(REGION, IMAGE_REPO_NAME, version, 'cpu', 'py2')


def _get_full_gpu_image_uri(version):
return IMAGE_URI_FORMAT_STRING.format(REGION, GPU_IMAGE_NAME, version, 'gpu', 'py2')
return IMAGE_URI_FORMAT_STRING.format(REGION, IMAGE_REPO_NAME, version, 'gpu', 'py2')


def _create_train_job(tf_version):
Expand Down Expand Up @@ -231,11 +230,11 @@ def test_tf(time, strftime, sagemaker_session, tf_version):
'SAGEMAKER_REGION': 'us-west-2',
'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
},
'Image': create_image_uri('us-west-2', "tensorflow", GPU_IMAGE_NAME, tf_version, "py2"),
'ModelDataUrl': 's3://m/m.tar.gz'} == model.prepare_container_def(GPU_IMAGE_NAME)
'Image': create_image_uri('us-west-2', "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
'ModelDataUrl': 's3://m/m.tar.gz'} == model.prepare_container_def(INSTANCE_TYPE)

assert 'cpu' in model.prepare_container_def(CPU_IMAGE_NAME)['Image']
predictor = tf.deploy(1, GPU_IMAGE_NAME)
assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']
predictor = tf.deploy(1, INSTANCE_TYPE)
assert isinstance(predictor, TensorFlowPredictor)


Expand All @@ -257,7 +256,7 @@ def test_run_tensorboard_locally_without_tensorboard_binary(time, strftime, pope
def test_model(sagemaker_session, tf_version):
model = TensorFlowModel("s3://some/data.tar.gz", role=ROLE, entry_point=SCRIPT_PATH,
sagemaker_session=sagemaker_session)
predictor = model.deploy(1, GPU_IMAGE_NAME)
predictor = model.deploy(1, INSTANCE_TYPE)
assert isinstance(predictor, TensorFlowPredictor)


Expand Down Expand Up @@ -410,6 +409,54 @@ def test_attach(sagemaker_session, tf_version):
assert estimator.checkpoint_path == 's3://other/1508872349'


def test_attach_new_repo_name(sagemaker_session, tf_version):
training_image = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow:{}-cpu-py2'.format(tf_version)
rjd = {'AlgorithmSpecification':
{'TrainingInputMode': 'File',
'TrainingImage': training_image},
'HyperParameters':
{'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
'checkpoint_path': '"s3://other/1508872349"',
'sagemaker_program': '"iris-dnn-classifier.py"',
'sagemaker_enable_cloudwatch_metrics': 'false',
'sagemaker_container_log_level': '"logging.INFO"',
'sagemaker_job_name': '"neo"',
'training_steps': '100',
'evaluation_steps': '10'},
'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
'ResourceConfig':
{'VolumeSizeInGB': 30,
'InstanceCount': 1,
'InstanceType': 'ml.c4.xlarge'},
'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
'TrainingJobName': 'neo',
'TrainingJobStatus': 'Completed',
'OutputDataConfig': {'KmsKeyId': '',
'S3OutputPath': 's3://place/output/neo'},
'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job', return_value=rjd)

estimator = TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
assert estimator.latest_training_job.job_name == 'neo'
assert estimator.py_version == 'py2'
assert estimator.framework_version == tf_version
assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
assert estimator.train_instance_count == 1
assert estimator.train_max_run == 24 * 60 * 60
assert estimator.input_mode == 'File'
assert estimator.training_steps == 100
assert estimator.evaluation_steps == 10
assert estimator.input_mode == 'File'
assert estimator.base_job_name == 'neo'
assert estimator.output_path == 's3://place/output/neo'
assert estimator.output_kms_key == ''
assert estimator.hyperparameters()['training_steps'] == '100'
assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
assert estimator.entry_point == 'iris-dnn-classifier.py'
assert estimator.checkpoint_path == 's3://other/1508872349'
assert estimator.train_image() == training_image


def test_attach_old_container(sagemaker_session):
training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py2-cpu:1.0'
rjd = {'AlgorithmSpecification':
Expand Down

0 comments on commit 4f92fbd

Please sign in to comment.