Commit
Dump + cleanup endpoint logs during integ tests (#102)
* Dump + cleanup endpoint logs during integ tests

* Fix flake8 errors

* Add aws logs to tox as a test dependency

* Specify region for AWSLogs
winstonaws committed Mar 21, 2018
1 parent 6a74d07 commit 6e0047b
Showing 5 changed files with 34 additions and 25 deletions.
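
In effect, every integ test now names its endpoint after its training job, hands that name to the new timeout_and_delete_endpoint_by_name context manager, and lets the context manager delete the endpoint and then dump and remove its CloudWatch logs on exit. A minimal sketch of the pattern (estimator and sagemaker_session are assumed to be set up as in the tests below):

    from tests.integ.timeout import timeout_and_delete_endpoint_by_name

    # Fixing the name up front means cleanup needs only the name, so it
    # still works even if deploy() fails partway through.
    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
                                     endpoint_name=endpoint_name)
        # ... exercise the endpoint ...
    # On exit (success, failure, or timeout) the endpoint is deleted, its
    # CloudWatch logs are printed for debugging, and the log group is removed.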
2 changes: 1 addition & 1 deletion setup.py
@@ -36,7 +36,7 @@ def read(fname):

     extras_require={
         'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist',
-                 'mock', 'tensorflow>=1.3.0', 'contextlib2']},
+                 'mock', 'tensorflow>=1.3.0', 'contextlib2', 'awslogs']},
 
     entry_points={
         'console_scripts': ['sagemaker=sagemaker.cli.main:main'],
16 changes: 10 additions & 6 deletions tests/integ/test_tf.py
@@ -18,7 +18,7 @@
 from sagemaker import Session
 from sagemaker.tensorflow import TensorFlow
 from tests.integ import DATA_DIR, REGION
-from tests.integ.timeout import timeout_and_delete_endpoint, timeout
+from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
 
 DATA_PATH = os.path.join(DATA_DIR, 'iris', 'data')

@@ -43,12 +43,14 @@ def test_tf(sagemaker_session, tf_full_version):
                            sagemaker_session=sagemaker_session,
                            base_job_name='test-tf')
 
-    inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
+    inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
     estimator.fit(inputs)
     print('job succeeded: {}'.format(estimator.latest_training_job.name))
 
-    with timeout_and_delete_endpoint(estimator=estimator, minutes=20):
-        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
+    endpoint_name = estimator.latest_training_job.name
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
+        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
+                                          endpoint_name=endpoint_name)
 
         features = [6.4, 3.2, 4.5, 1.5]
         dict_result = json_predictor.predict({'inputs': features})
@@ -81,9 +83,11 @@ def test_tf_async(sagemaker_session, tf_full_version):
     training_job_name = estimator.latest_training_job.name
     time.sleep(20)
 
-    with timeout_and_delete_endpoint(estimator=estimator, minutes=35):
+    endpoint_name = training_job_name
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
         estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
-        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
+        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
+                                          endpoint_name=endpoint_name)
 
         result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
         print('predict result: {}'.format(result))
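
Naming the endpoint after the training job is what makes the log cleanup possible: SageMaker writes an endpoint's container logs to a CloudWatch log group derived from the endpoint name, so a deterministic name lets the helper in tests/integ/timeout.py (below) find the right group. A small illustration of the convention; the endpoint name here is hypothetical, and the log-group format is the one used by _cleanup_endpoint_logs:

    # Hypothetical endpoint name, inherited from the training job.
    endpoint_name = 'test-tf-2018-03-21-10-30-45-123'
    log_group = '/aws/sagemaker/Endpoints/{}'.format(endpoint_name)
    # log_group == '/aws/sagemaker/Endpoints/test-tf-2018-03-21-10-30-45-123'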
5 changes: 3 additions & 2 deletions tests/integ/test_tf_cifar.py
@@ -20,7 +20,7 @@
 from sagemaker import Session
 from sagemaker.tensorflow import TensorFlow
 from tests.integ import DATA_DIR, REGION
-from tests.integ.timeout import timeout_and_delete_endpoint, timeout
+from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
 
 PICKLE_CONTENT_TYPE = 'application/python-pickle'

@@ -54,7 +54,8 @@ def test_cifar(sagemaker_session, tf_full_version):
     estimator.fit(inputs, logs=False)
     print('job succeeded: {}'.format(estimator.latest_training_job.name))
 
-    with timeout_and_delete_endpoint(estimator=estimator, minutes=20):
+    endpoint_name = estimator.latest_training_job.name
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
         predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
         predictor.serializer = PickleSerializer()
         predictor.content_type = PICKLE_CONTENT_TYPE
35 changes: 19 additions & 16 deletions tests/integ/timeout.py
@@ -13,7 +13,7 @@
 import signal
 from contextlib import contextmanager
 import logging
-
+from awslogs.core import AWSLogs
 from botocore.exceptions import ClientError
 
 LOGGER = logging.getLogger('timeout')
@@ -55,21 +55,6 @@ def handler(signum, frame):
         signal.alarm(0)
 
 
-@contextmanager
-def timeout_and_delete_endpoint(estimator, seconds=0, minutes=0, hours=0):
-    with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
-        try:
-            yield [t]
-        finally:
-            try:
-                estimator.delete_endpoint()
-                LOGGER.info('deleted endpoint')
-            except ClientError as ce:
-                if ce.response['Error']['Code'] == 'ValidationException':
-                    # avoids the inner exception to be overwritten
-                    pass
-
-
 @contextmanager
 def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0):
     with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
@@ -79,7 +64,25 @@ def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0):
             try:
                 sagemaker_session.delete_endpoint(endpoint_name)
                 LOGGER.info('deleted endpoint {}'.format(endpoint_name))
+                _cleanup_endpoint_logs(endpoint_name, sagemaker_session)
             except ClientError as ce:
                 if ce.response['Error']['Code'] == 'ValidationException':
                     # avoids the inner exception to be overwritten
                     pass
+
+
+def _cleanup_endpoint_logs(endpoint_name, sagemaker_session):
+    log_group = '/aws/sagemaker/Endpoints/{}'.format(endpoint_name)
+    try:
+        # print out logs before deletion for debuggability
+        LOGGER.info('cloudwatch logs for log group {}:'.format(log_group))
+        logs = AWSLogs(log_group_name=log_group, log_stream_name='ALL', start='1d',
+                       aws_region=sagemaker_session.boto_session.region_name)
+        logs.list_logs()
+
+        cwl_client = sagemaker_session.boto_session.client('logs')
+        cwl_client.delete_log_group(logGroupName=log_group)
+        LOGGER.info('deleted cloudwatch log group: {}'.format(log_group))
+    except Exception:
+        LOGGER.exception('Failure occurred while cleaning up cloudwatch log group %s. ' +
+                         'Swallowing exception but printing stacktrace for debugging.', log_group)
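
The helper above uses the awslogs package to print the streams. For reference, here is a hedged boto3-only sketch of the same dump-then-delete idea; this is not the commit's implementation, and the function name is ours:

    import boto3

    def dump_and_delete_log_group(log_group, region):
        # Hypothetical boto3-only stand-in for _cleanup_endpoint_logs above.
        cwl = boto3.client('logs', region_name=region)
        try:
            # Print every event before deleting the group, for debuggability.
            pages = cwl.get_paginator('filter_log_events').paginate(logGroupName=log_group)
            for page in pages:
                for event in page['events']:
                    print('{}: {}'.format(event['logStreamName'], event['message']))
            cwl.delete_log_group(logGroupName=log_group)
        except cwl.exceptions.ResourceNotFoundException:
            # The endpoint may never have produced logs; nothing to clean up.
            pass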
1 change: 1 addition & 0 deletions tox.ini
@@ -45,6 +45,7 @@ deps =
     mock
     contextlib2
     teamcity-messages
+    awslogs
 
 [testenv:flake8]
 basepython = python
