Skip to content
Permalink
Browse files

Refactoring sagemaker deployment experience (#328)

* modify func name to be more clear

* refactor sagemaker apply operation

* Add spinner for waiting action complete

* update deploy to sagemaker notebook

* remove unused imports

* update ensure_api_name check for deployments

* add comment for delete operation

* update ensure api name function signature

* updat error message copy

* update list deployment command all-namespaces option
  • Loading branch information...
yubozhao committed Oct 9, 2019
1 parent cd8a0ba commit a4448da9246f88dddc3e6fcbd7b746253f9b508a
@@ -47,6 +47,7 @@
from bentoml.exceptions import BentoMLDeploymentException, BentoMLException
from bentoml.deployment.store import ALL_NAMESPACE_TAG
from bentoml import config
from bentoml.cli.utils import Spinner

# pylint: disable=unused-variable

@@ -83,16 +84,17 @@ def get_state_after_await_action_complete(
yatai_service, name, namespace, message, timeout_limit=600, wait_time=5
):
start_time = time.time()
while (time.time() - start_time) < timeout_limit:
result = yatai_service.DescribeDeployment(
DescribeDeploymentRequest(deployment_name=name, namespace=namespace)
)
if result.state.state is DeploymentState.PENDING:
time.sleep(wait_time)
_echo(message)
continue
else:
break

with Spinner(message):
while (time.time() - start_time) < timeout_limit:
result = yatai_service.DescribeDeployment(
DescribeDeploymentRequest(deployment_name=name, namespace=namespace)
)
if result.state.state is DeploymentState.PENDING:
time.sleep(wait_time)
continue
else:
break
return result


@@ -293,7 +295,7 @@ def create(
)
if result.status.status_code != Status.OK:
_echo(
'Failed to create deployment {name}. code: {error_code}, message: '
'Failed to create deployment {name}. {error_code}: '
'{error_message}'.format(
name=name,
error_code=Status.Code.Name(result.status.status_code),
@@ -307,7 +309,7 @@ def create(
yatai_service=yatai_service,
name=name,
namespace=namespace,
message='Creating deployment...',
message='Creating deployment ',
)
result.deployment.state.CopyFrom(result_state.state)

@@ -354,7 +356,7 @@ def apply(deployment_yaml, output, wait):
yatai_service=yatai_service,
name=deployment_pb.name,
namespace=deployment_pb.namespace,
message='Applying deployment...',
message='Applying deployment',
)
result.deployment.state.CopyFrom(result_state.state)

@@ -476,7 +478,7 @@ def describe(name, output, namespace):
help='Deployment namespace managed by BentoML, default value is "default" which'
'can be changed in BentoML configuration file',
)
@click.option('--all-namespace', type=click.BOOL, default=False)
@click.option('--all-namespaces', is_flag=True)
@click.option(
'--limit', type=click.INT, help='Limit how many deployments will be retrieved'
)
@@ -492,10 +494,10 @@ def describe(name, output, namespace):
help='List deployments matching the giving labels',
)
@click.option('-o', '--output', type=click.Choice(['json', 'yaml']), default='json')
def list_deployments(output, limit, filters, labels, namespace, all_namespace):
def list_deployments(output, limit, filters, labels, namespace, all_namespaces):
track_cli('deploy-list')

if all_namespace:
if all_namespaces:
if namespace is not None:
logger.warning(
'Ignoring `namespace=%s` due to the --all-namespace flag presented',
@@ -0,0 +1,70 @@
# Copyright 2019 Atalaya Tech, Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import threading
import itertools
import time


class Spinner:
def __init__(self, message, delay=0.1):
self.spinner = itertools.cycle(['-', '/', '|', '\\'])
self.delay = delay
self.busy = False
self._screen_lock = None
self.thread = None
self.spinner_visible = False
sys.stdout.write(message)

def write_next(self):
with self._screen_lock:
if not self.spinner_visible:
sys.stdout.write(next(self.spinner))
self.spinner_visible = True
sys.stdout.flush()

def remove_spinner(self, cleanup=False):
with self._screen_lock:
if self.spinner_visible:
sys.stdout.write('\b')
self.spinner_visible = False
if cleanup:
sys.stdout.write(' ') # overwrite spinner with blank
sys.stdout.write('\r') # move to next line
sys.stdout.flush()

def spinner_task(self):
while self.busy:
self.write_next()
time.sleep(self.delay)
self.remove_spinner()

def __enter__(self):
if sys.stdout.isatty():
self._screen_lock = threading.Lock()
self.busy = True
self.thread = threading.Thread(target=self.spinner_task)
self.thread.start()

def __exit__(self, exception, value, tb):
if sys.stdout.isatty():
self.busy = False
self.remove_spinner(cleanup=True)
else:
sys.stdout.write('\r')
@@ -34,7 +34,9 @@
process_docker_api_line,
ensure_docker_available_or_raise,
exception_to_return_status,
ensure_deploy_api_name_exists_in_bento,
)
from bentoml.proto.repository_pb2 import GetBentoRequest, BentoUri
from bentoml.yatai.status import Status
from bentoml.utils.tempdir import TempDirectory
from bentoml.exceptions import BentoMLDeploymentException, BentoMLException
@@ -51,16 +53,10 @@
DescribeDeploymentResponse,
DeploymentState,
)
from bentoml.archive.loader import load_bento_service_metadata

logger = logging.getLogger(__name__)


DEFAULT_REGION = "us-west-2"
DEFAULT_INSTANCE_TYPE = "ml.m4.xlarge"
DEFAULT_INSTANCE_COUNT = 1


def strip_scheme(url):
""" Stripe url's schema
e.g. http://some.url/path -> some.url/path
@@ -85,11 +81,13 @@ def create_sagemaker_model_name(bento_name, bento_version):

def create_sagemaker_endpoint_config_name(bento_name, bento_version):
return generate_aws_compatible_string(
'{name}-{version}-configuration'.format(name=bento_name, version=bento_version)
'bentoml-{name}-{version}-configuration'.format(
name=bento_name, version=bento_version
)
)


def get_arn_role_from_current_user():
def get_arn_role_from_current_aws_user():
sts_client = boto3.client("sts")
identity = sts_client.get_caller_identity()
sts_arn = identity["Arn"]
@@ -118,7 +116,7 @@ def get_arn_role_from_current_user():
return role_response["Role"]["Arn"]


def create_push_image_to_ecr(bento_name, bento_version, snapshot_path):
def create_push_docker_image_to_ecr(bento_name, bento_version, snapshot_path):
"""Create BentoService sagemaker image and push to AWS ECR
Example: https://github.com/awslabs/amazon-sagemaker-examples/blob/\
@@ -175,47 +173,6 @@ def create_push_image_to_ecr(bento_name, bento_version, snapshot_path):
return ecr_tag


class TemporarySageMakerContent(object):
def __init__(self, archive_path, bento_name, bento_version, _cleanup=True):
self.archive_path = archive_path
self.bento_name = bento_name
self.bento_version = bento_version
self.temp_directory = TempDirectory()
self._cleanup = _cleanup
self.path = None

def __enter__(self):
self.generate()
return self.path

def generate(self):
self.temp_directory.create()
tempdir = self.temp_directory.path
saved_path = os.path.join(tempdir, self.bento_name, self.bento_version)
shutil.copytree(self.archive_path, saved_path)

with open(os.path.join(saved_path, "nginx.conf"), "w") as f:
f.write(DEFAULT_NGINX_CONFIG)
with open(os.path.join(saved_path, "wsgi.py"), "w") as f:
f.write(DEFAULT_WSGI_PY)
with open(os.path.join(saved_path, "serve"), "w") as f:
f.write(DEFAULT_SERVE_SCRIPT)

# permission 755 is required for entry script 'serve'
permission = "755"
octal_permission = int(permission, 8)
os.chmod(os.path.join(saved_path, "serve"), octal_permission)
self.path = saved_path

def cleanup(self):
self.temp_directory.cleanup()
self.path = None

def __exit__(self, exc_type, exc_val, exc_tb):
if self._cleanup:
self.cleanup()


# Sagemaker response status: 'OutOfService'|'Creating'|'Updating'|
# 'SystemUpdating'|'RollingBack'|'InService'|
# 'Deleting'|'Failed'
@@ -251,10 +208,10 @@ def _parse_aws_client_exception_or_raise(e):
operation=e.operation_name, code=error_code, message=error_message
)
if error_code == 'ValidationException':
logger.info(error_log_message)
logger.error(error_log_message)
return Status.NOT_FOUND(error_response.get('Message', 'Unknown'))
elif error_code == 'InvalidSignatureException':
logger.info(error_log_message)
logger.error(error_log_message)
return Status.UNAUTHENTICATED(error_response.get('Message', 'Unknown'))
else:
logger.error(error_log_message)
@@ -286,30 +243,64 @@ def _cleanup_sagemaker_endpoint_config(client, name, version):
return


def init_sagemaker_project(sagemaker_project_dir, bento_path, bento_name):
shutil.copytree(bento_path, sagemaker_project_dir)

with open(os.path.join(sagemaker_project_dir, "nginx.conf"), "w") as f:
f.write(DEFAULT_NGINX_CONFIG)
with open(os.path.join(sagemaker_project_dir, "wsgi.py"), "w") as f:
f.write(DEFAULT_WSGI_PY)
with open(os.path.join(sagemaker_project_dir, "serve"), "w") as f:
f.write(DEFAULT_SERVE_SCRIPT)

# permission 755 is required for entry script 'serve'
permission = "755"
octal_permission = int(permission, 8)
os.chmod(os.path.join(sagemaker_project_dir, "serve"), octal_permission)
return sagemaker_project_dir


class SageMakerDeploymentOperator(DeploymentOperatorBase):
def apply(self, deployment_pb, repo, prev_deployment=None):
def apply(self, deployment_pb, yatai_service, prev_deployment=None):
try:
ensure_docker_available_or_raise()
deployment_spec = deployment_pb.spec
sagemaker_config = deployment_spec.sagemaker_operator_config
if sagemaker_config is None:
raise BentoMLDeploymentException('Sagemaker configuration is missing.')

archive_path = repo.get(
deployment_spec.bento_name, deployment_spec.bento_version
bento_pb = yatai_service.GetBento(
GetBentoRequest(
bento_name=deployment_spec.bento_name,
bento_version=deployment_spec.bento_version,
)
)
if bento_pb.bento.uri.type != BentoUri.LOCAL:
raise BentoMLException(
'BentoML currently only support local repository'
)
else:
bento_path = bento_pb.bento.uri.uri

ensure_deploy_api_name_exists_in_bento(
[api.name for api in bento_pb.bento.bento_service_metadata.apis],
[sagemaker_config.api_name],
)
bento_config = load_bento_service_metadata(archive_path)

sagemaker_client = boto3.client('sagemaker', sagemaker_config.region)

with TemporarySageMakerContent(
archive_path, deployment_spec.bento_name, deployment_spec.bento_version
) as temp_path:
ecr_image_path = create_push_image_to_ecr(
deployment_spec.bento_name, deployment_spec.bento_version, temp_path
with TempDirectory() as temp_dir:
sagemaker_project_dir = os.path.jon(
temp_dir, deployment_spec.bento_name
)
init_sagemaker_project(sagemaker_project_dir, bento_path)
ecr_image_path = create_push_docker_image_to_ecr(
deployment_spec.bento_name,
deployment_spec.bento_version,
sagemaker_project_dir,
)

execution_role_arn = get_arn_role_from_current_user()
execution_role_arn = get_arn_role_from_current_aws_user()
model_name = create_sagemaker_model_name(
deployment_spec.bento_name, deployment_spec.bento_version
)
@@ -372,6 +363,7 @@ def apply(self, deployment_pb, repo, prev_deployment=None):
"AWS create endpoint config response: %s", create_config_response
)
except ClientError as e:
# create endpoint failed, will remove previously created model
cleanup_model_error = _cleanup_sagemaker_model(
sagemaker_client,
deployment_spec.bento_name,
@@ -417,6 +409,8 @@ def apply(self, deployment_pb, repo, prev_deployment=None):
"AWS create endpoint response: %s", create_endpoint_response
)
except ClientError as e:
# create/update endpoint failed, will remove previously created config
# and then remove the model
cleanup_endpoint_config_error = _cleanup_sagemaker_endpoint_config(
client=sagemaker_client,
name=deployment_spec.bento_name,
@@ -463,7 +457,7 @@ def apply(self, deployment_pb, repo, prev_deployment=None):
except BentoMLException as error:
return ApplyDeploymentResponse(status=exception_to_return_status(error))

def delete(self, deployment_pb, repo=None):
def delete(self, deployment_pb, yatai_service=None):
try:
deployment_spec = deployment_pb.spec
sagemaker_config = deployment_spec.sagemaker_operator_config
@@ -483,9 +477,8 @@ def delete(self, deployment_pb, repo=None):
)
except ClientError as e:
status = _parse_aws_client_exception_or_raise(e)
status.error_message = (
'Failed to delete SageMaker deployment: %s',
status.error_message,
status.error_message = 'Failed to delete SageMaker endpoint: {}'.format(
status.error_message
)
return DeleteDeploymentResponse(status=status)

@@ -517,7 +510,7 @@ def delete(self, deployment_pb, repo=None):
except BentoMLException as error:
return DeleteDeploymentResponse(status=exception_to_return_status(error))

def describe(self, deployment_pb, repo=None):
def describe(self, deployment_pb, yatai_service=None):
try:
deployment_spec = deployment_pb.spec
sagemaker_config = deployment_spec.sagemaker_operator_config

0 comments on commit a4448da

Please sign in to comment.
You can’t perform that action at this time.