Skip to content

Commit

Permalink
Local mode improvements (#117)
Browse files Browse the repository at this point in the history
* Change local mode GPU docker engine

Switch local mode to use nvidia-docker2 + docker-compose
instead of nvidia-docker + nvidia-docker-compose.

Also get rid of the log output for billable seconds in local mode as it
doesn't make sense.

Finally, let the users know if we can't clean up a container directory,
but do not fail the training job because of it.
  • Loading branch information
iquintero committed Apr 2, 2018
1 parent 6184b22 commit 7ff630f
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 7 deletions.
33 changes: 26 additions & 7 deletions src/sagemaker/local/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,16 @@ def train(self, input_data_config, hyperparameters):
# free up the training data directory as it may contain
# lots of data downloaded from S3. This doesn't delete any local
# data that was just mounted to the container.
shutil.rmtree(data_dir)
_delete_tree(data_dir)
# Also free the container config files.
for host in self.hosts:
shutil.rmtree(os.path.join(self.container_root, host))
container_config_path = os.path.join(self.container_root, host)
_delete_tree(container_config_path)

self._cleanup()
# Print our Job Complete line to have a similar experience to training on SageMaker, where you
# see this line at the end.
print('===== Job Complete =====')
return s3_model_artifacts

def serve(self, primary_container):
Expand Down Expand Up @@ -162,7 +166,7 @@ def stop_serving(self):
self.container.down()
self._cleanup()
# for serving we can delete everything in the container root.
shutil.rmtree(self.container_root)
_delete_tree(self.container_root)

def retrieve_model_artifacts(self, compose_data):
"""Get the model artifacts from all the container nodes.
Expand All @@ -185,9 +189,9 @@ def retrieve_model_artifacts(self, compose_data):
volumes = compose_data['services'][str(host)]['volumes']

for volume in volumes:
container_dir, host_dir = volume.split(':')
if host_dir == '/opt/ml/model':
self._recursive_copy(container_dir, s3_model_artifacts)
host_dir, container_dir = volume.split(':')
if container_dir == '/opt/ml/model':
self._recursive_copy(host_dir, s3_model_artifacts)

return s3_model_artifacts

Expand Down Expand Up @@ -304,7 +308,7 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
return content

def _compose(self, detached=False):
compose_cmd = 'nvidia-docker-compose' if self.instance_type == "local_gpu" else 'docker-compose'
compose_cmd = 'docker-compose'

command = [
compose_cmd,
Expand Down Expand Up @@ -480,6 +484,21 @@ def _create_config_file_directories(root, host):
os.makedirs(os.path.join(root, host, d))


def _delete_tree(path):
    """Recursively delete the directory tree at ``path``, tolerating permission errors.

    A failure with ``errno.EACCES`` is only logged as a warning so that the
    caller can carry on; any other ``OSError`` is logged and re-raised.

    Args:
        path (str): directory to remove with ``shutil.rmtree``.

    Raises:
        OSError: if the removal fails for any reason other than ``errno.EACCES``.
    """
    try:
        shutil.rmtree(path)
    except OSError as exc:
        # On Linux, when docker writes to any mounted volume, it uses the container's user. In most
        # cases this is root. When the container exits and we try to delete those files we can't,
        # because root owns them. We expect this to happen, so we handle EACCES. Any other error is
        # raised up to the caller.
        if exc.errno == errno.EACCES:
            # Pass the path as a logger argument so formatting is deferred to the logging framework.
            logger.warning("Failed to delete: %s Please remove it manually.", path)
        else:
            logger.error("Failed to delete: %s", path)
            raise


def _aws_credentials(session):
try:
creds = session.get_credentials()
Expand Down
5 changes: 5 additions & 0 deletions src/sagemaker/local/local_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,8 @@ def __init__(self, boto_session=None):
logger.warning("Windows Support for Local Mode is Experimental")
self.sagemaker_client = LocalSagemakerClient(self)
self.sagemaker_runtime_client = LocalSagemakerRuntimeClient(self.config)

def logs_for_job(self, job_name, wait=False, poll=5):
    """No-op override of ``logs_for_job()`` for local mode.

    Local mode does not need to perform any action here, so every argument
    is accepted and ignored.

    Args:
        job_name: unused.
        wait: unused.
        poll: unused.

    Returns:
        None
    """
    return None

0 comments on commit 7ff630f

Please sign in to comment.