diff --git a/.gitignore b/.gitignore index 8667e326..4f89b30d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ __pycache__/ .DS_Store **/*.pyc **.pyc +**/*.tar.gz diff --git a/setup.py b/setup.py index 695413db..548e0720 100644 --- a/setup.py +++ b/setup.py @@ -47,13 +47,14 @@ def read(fname): 'Programming Language :: Python :: 3.6', ], - install_requires=['numpy==1.16.4', 'Pillow>=6.2.0', 'retrying==1.3.3', 'sagemaker-containers>=2.5.4', - 'six==1.12.0', 'requests_mock==1.6.0', 'sagemaker-inference>=1.2.2', - 'retrying==1.3.3'], + # We don't declare our dependency on torch here because we build with + # different packages for different variants + install_requires=['numpy', 'retrying', 'sagemaker-inference>=1.2.2'], extras_require={ 'test': ['boto3==1.10.32', 'coverage==4.5.3', 'docker-compose==1.23.2', 'flake8==3.7.7', 'Flask==1.1.1', 'mock==2.0.0', 'pytest==4.4.0', 'pytest-cov==2.7.1', 'pytest-xdist==1.28.0', 'PyYAML==3.10', - 'sagemaker==1.48.0', 'requests==2.20.0', 'torchvision==0.5.0', 'tox==3.7.0', 'requests_mock==1.6.0'] + 'sagemaker==1.48.0', 'sagemaker-containers>=2.5.4', 'six==1.12.0', 'requests==2.20.0', + 'requests_mock==1.6.0', 'torch==1.4.0', 'torchvision==0.5.0', 'tox==3.7.0'] }, entry_points={ diff --git a/test-toolkit/integration/__init__.py b/test-toolkit/integration/__init__.py index 7904d0fd..d3109e80 100644 --- a/test-toolkit/integration/__init__.py +++ b/test-toolkit/integration/__init__.py @@ -14,6 +14,8 @@ import os +from utils import file_utils + resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'resources')) mnist_path = os.path.join(resources_path, 'mnist') data_dir = os.path.join(mnist_path, 'data') @@ -24,14 +26,33 @@ model_cpu_dir = os.path.join(mnist_path, cpu_sub_dir) mnist_cpu_script = os.path.join(model_cpu_dir, 'mnist.py') +model_cpu_tar = file_utils.make_tarfile(mnist_cpu_script, + os.path.join(model_cpu_dir, "model.pth"), + model_cpu_dir) + model_cpu_1d_dir = os.path.join(model_cpu_dir, '1d') mnist_1d_script = os.path.join(model_cpu_1d_dir, 'mnist_1d.py') +model_cpu_1d_tar = file_utils.make_tarfile(mnist_1d_script, + os.path.join(model_cpu_1d_dir, "model.pth"), + model_cpu_1d_dir) + model_gpu_dir = os.path.join(mnist_path, gpu_sub_dir) mnist_gpu_script = os.path.join(model_gpu_dir, 'mnist.py') -model_gpu_1d_dir = os.path.join(model_gpu_dir, '1d') +model_gpu_tar = file_utils.make_tarfile(mnist_gpu_script, + os.path.join(model_gpu_dir, "model.pth"), + model_gpu_dir) + model_eia_dir = os.path.join(mnist_path, eia_sub_dir) mnist_eia_script = os.path.join(model_eia_dir, 'mnist.py') +model_eia_tar = file_utils.make_tarfile(mnist_eia_script, + os.path.join(model_eia_dir, "model.pth"), + model_eia_dir) + call_model_fn_once_script = os.path.join(model_cpu_dir, 'call_model_fn_once.py') +call_model_fn_once_tar = file_utils.make_tarfile(call_model_fn_once_script, + os.path.join(model_cpu_dir, "model.pth"), + model_cpu_dir, + "model_call_model_fn_once.tar.gz") ROLE = 'dummy/unused-role' DEFAULT_TIMEOUT = 20 diff --git a/test-toolkit/integration/local/test_serving.py b/test-toolkit/integration/local/test_serving.py index f54c675f..dbaf8eaa 100644 --- a/test-toolkit/integration/local/test_serving.py +++ b/test-toolkit/integration/local/test_serving.py @@ -25,8 +25,9 @@ from sagemaker_containers.beta.framework import content_types from torchvision import datasets, transforms -from integration import training_dir, mnist_1d_script, model_cpu_dir, mnist_cpu_script, \ - model_gpu_dir, mnist_gpu_script, model_cpu_1d_dir, 
call_model_fn_once_script, ROLE +from integration import training_dir, mnist_1d_script, model_cpu_tar, mnist_cpu_script, \ + model_gpu_tar, mnist_gpu_script, model_cpu_1d_tar, call_model_fn_once_script, ROLE, \ + call_model_fn_once_tar from utils import local_mode_utils CONTENT_TYPE_TO_SERIALIZER_MAP = { @@ -49,9 +50,9 @@ def fixture_test_loader(): def test_serve_json_npy(test_loader, use_gpu, image_uri, sagemaker_local_session, instance_type): - model_dir = model_gpu_dir if use_gpu else model_cpu_dir + model_tar = model_gpu_tar if use_gpu else model_cpu_tar mnist_script = mnist_gpu_script if use_gpu else mnist_cpu_script - with _predictor(model_dir, mnist_script, image_uri, sagemaker_local_session, + with _predictor(model_tar, mnist_script, image_uri, sagemaker_local_session, instance_type) as predictor: for content_type in (content_types.JSON, content_types.NPY): for accept in (content_types.JSON, content_types.CSV, content_types.NPY): @@ -59,7 +60,7 @@ def test_serve_json_npy(test_loader, use_gpu, image_uri, sagemaker_local_session def test_serve_csv(test_loader, use_gpu, image_uri, sagemaker_local_session, instance_type): - with _predictor(model_cpu_1d_dir, mnist_1d_script, image_uri, sagemaker_local_session, + with _predictor(model_cpu_1d_tar, mnist_1d_script, image_uri, sagemaker_local_session, instance_type) as predictor: for accept in (content_types.JSON, content_types.CSV, content_types.NPY): _assert_prediction_csv(predictor, test_loader, accept) @@ -67,13 +68,13 @@ def test_serve_csv(test_loader, use_gpu, image_uri, sagemaker_local_session, ins @pytest.mark.skip_cpu def test_serve_cpu_model_on_gpu(test_loader, image_uri, sagemaker_local_session, instance_type): - with _predictor(model_cpu_1d_dir, mnist_1d_script, image_uri, sagemaker_local_session, + with _predictor(model_cpu_1d_tar, mnist_1d_script, image_uri, sagemaker_local_session, instance_type) as predictor: _assert_prediction_npy_json(predictor, test_loader, content_types.NPY, content_types.JSON) def test_serving_calls_model_fn_once(image_uri, sagemaker_local_session, instance_type): - with _predictor(model_cpu_dir, call_model_fn_once_script, image_uri, sagemaker_local_session, + with _predictor(call_model_fn_once_tar, call_model_fn_once_script, image_uri, sagemaker_local_session, instance_type, model_server_workers=2) as predictor: predictor.accept = None predictor.deserializer = BytesDeserializer() @@ -86,9 +87,9 @@ def test_serving_calls_model_fn_once(image_uri, sagemaker_local_session, instanc @contextmanager -def _predictor(model_dir, script, image, sagemaker_local_session, instance_type, +def _predictor(model_tar, script, image, sagemaker_local_session, instance_type, model_server_workers=None): - model = PyTorchModel('file://{}'.format(model_dir), + model = PyTorchModel('file://{}'.format(model_tar), ROLE, script, image=image, diff --git a/test-toolkit/integration/sagemaker/test_mnist.py b/test-toolkit/integration/sagemaker/test_mnist.py index 9efb28c8..ba89c373 100644 --- a/test-toolkit/integration/sagemaker/test_mnist.py +++ b/test-toolkit/integration/sagemaker/test_mnist.py @@ -12,29 +12,26 @@ # language governing permissions and limitations under the License. 
from __future__ import absolute_import -import os - import numpy as np import pytest import sagemaker from sagemaker.pytorch import PyTorchModel -from integration import model_cpu_dir, mnist_cpu_script, mnist_gpu_script, model_eia_dir, mnist_eia_script +from integration import model_cpu_tar, model_gpu_tar, mnist_cpu_script, mnist_gpu_script, \ + model_eia_tar, mnist_eia_script from integration.sagemaker.timeout import timeout_and_delete_endpoint @pytest.mark.cpu_test def test_mnist_cpu(sagemaker_session, image_uri, instance_type): instance_type = instance_type or 'ml.c4.xlarge' - model_dir = os.path.join(model_cpu_dir, 'model_mnist.tar.gz') - _test_mnist_distributed(sagemaker_session, image_uri, instance_type, model_dir, mnist_cpu_script) + _test_mnist_distributed(sagemaker_session, image_uri, instance_type, model_cpu_tar, mnist_cpu_script) @pytest.mark.gpu_test def test_mnist_gpu(sagemaker_session, image_uri, instance_type): instance_type = instance_type or 'ml.p2.xlarge' - model_dir = os.path.join(model_cpu_dir, 'model_mnist.tar.gz') - _test_mnist_distributed(sagemaker_session, image_uri, instance_type, model_dir, mnist_gpu_script) + _test_mnist_distributed(sagemaker_session, image_uri, instance_type, model_gpu_tar, mnist_gpu_script) @pytest.mark.eia_test @@ -42,17 +39,16 @@ def test_mnist_eia(sagemaker_session, image_uri, instance_type, accelerator_type instance_type = instance_type or 'ml.c4.xlarge' # Scripted model is serialized with torch.jit.save(). # Inference test for EIA doesn't need to instantiate model definition then load state_dict - model_dir = os.path.join(model_eia_dir, 'model_mnist.tar.gz') - _test_mnist_distributed(sagemaker_session, image_uri, instance_type, model_dir, mnist_eia_script, + _test_mnist_distributed(sagemaker_session, image_uri, instance_type, model_eia_tar, mnist_eia_script, accelerator_type=accelerator_type) -def _test_mnist_distributed(sagemaker_session, image_uri, instance_type, model_dir, mnist_script, +def _test_mnist_distributed(sagemaker_session, image_uri, instance_type, model_tar, mnist_script, accelerator_type=None): endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-pytorch-serving") model_data = sagemaker_session.upload_data( - path=model_dir, + path=model_tar, key_prefix="sagemaker-pytorch-serving/models", ) diff --git a/test-toolkit/resources/mnist/model_cpu/1d/model.tar.gz b/test-toolkit/resources/mnist/model_cpu/1d/model.tar.gz deleted file mode 100644 index 4ca07729..00000000 Binary files a/test-toolkit/resources/mnist/model_cpu/1d/model.tar.gz and /dev/null differ diff --git a/test-toolkit/resources/mnist/model_cpu/model.tar.gz b/test-toolkit/resources/mnist/model_cpu/model.tar.gz deleted file mode 100644 index f075782e..00000000 Binary files a/test-toolkit/resources/mnist/model_cpu/model.tar.gz and /dev/null differ diff --git a/test-toolkit/resources/mnist/model_cpu/model_mnist.tar.gz b/test-toolkit/resources/mnist/model_cpu/model_mnist.tar.gz deleted file mode 100644 index 3d741a70..00000000 Binary files a/test-toolkit/resources/mnist/model_cpu/model_mnist.tar.gz and /dev/null differ diff --git a/test-toolkit/resources/mnist/model_eia/mnist.py b/test-toolkit/resources/mnist/model_eia/mnist.py index d151a3f3..ebc0bff0 100644 --- a/test-toolkit/resources/mnist/model_eia/mnist.py +++ b/test-toolkit/resources/mnist/model_eia/mnist.py @@ -10,4 +10,39 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
See the License for the specific # language governing permissions and limitations under the License. -# This file is intentionally left blank to utilize default_model_fn and default_predict_fn +from __future__ import absolute_import +import logging +import os +import sys + +import torch + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +logger.addHandler(logging.StreamHandler(sys.stdout)) + + +def predict_fn(input_data, model): + logger.info('Performing EIA inference in Torch JIT context with input of size {}'.format(input_data.shape)) + # With EI, the client instance should be CPU-only for cost efficiency; subgraphs with unsupported arguments run locally, and the server runs with CUDA + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model = model.to(device) + input_data = input_data.to(device) + with torch.no_grad(): + # Set the target device to the accelerator ordinal + with torch.jit.optimized_execution(True, {'target_device': 'eia:0'}): + return model(input_data) + + +def model_fn(model_dir): + logger.info('model_fn: Loading model with TorchScript from {}'.format(model_dir)) + # Scripted model is serialized with torch.jit.save(). + # No need to instantiate model definition then load state_dict + model = torch.jit.load(os.path.join(model_dir, 'model.pth')) + return model + + +def save_model(model, model_dir): + logger.info("Saving the model to {}.".format(model_dir)) + path = os.path.join(model_dir, 'model.pth') + torch.jit.save(model, path) diff --git a/test-toolkit/resources/mnist/model_eia/model.pth b/test-toolkit/resources/mnist/model_eia/model.pth new file mode 100644 index 00000000..d7b4691d Binary files /dev/null and b/test-toolkit/resources/mnist/model_eia/model.pth differ diff --git a/test-toolkit/resources/mnist/model_eia/model.tar.gz b/test-toolkit/resources/mnist/model_eia/model.tar.gz deleted file mode 100644 index a992cf5e..00000000 Binary files a/test-toolkit/resources/mnist/model_eia/model.tar.gz and /dev/null differ diff --git a/test-toolkit/resources/mnist/model_eia/model_mnist.tar.gz b/test-toolkit/resources/mnist/model_eia/model_mnist.tar.gz deleted file mode 100644 index 6afd83a4..00000000 Binary files a/test-toolkit/resources/mnist/model_eia/model_mnist.tar.gz and /dev/null differ diff --git a/test-toolkit/resources/mnist/model_gpu/model.pth b/test-toolkit/resources/mnist/model_gpu/model.pth new file mode 100644 index 00000000..ca97f76d Binary files /dev/null and b/test-toolkit/resources/mnist/model_gpu/model.pth differ diff --git a/test-toolkit/resources/mnist/model_gpu/model.tar.gz b/test-toolkit/resources/mnist/model_gpu/model.tar.gz deleted file mode 100644 index ed39a7a8..00000000 Binary files a/test-toolkit/resources/mnist/model_gpu/model.tar.gz and /dev/null differ diff --git a/test/resources/call_model_fn_once.py b/test-toolkit/utils/file_utils.py similarity index 50% rename from test/resources/call_model_fn_once.py rename to test-toolkit/utils/file_utils.py index 1bbd3e27..f81a20b5 100644 --- a/test/resources/call_model_fn_once.py +++ b/test-toolkit/utils/file_utils.py @@ -1,4 +1,4 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License.
A copy of @@ -13,25 +13,12 @@ from __future__ import absolute_import import os +import tarfile -def model_fn(model_dir): - lock_file = os.path.join(model_dir, 'model_fn.lock.{}'.format(os.getpid())) - if os.path.exists(lock_file): - raise RuntimeError('model_fn called more than once (lock: {})'.format(lock_file)) - - open(lock_file, 'a').close() - - return 'model' - - -def input_fn(data, content_type): - return data - - -def predict_fn(data, model): - return b'output' - - -def output_fn(prediction, accept): - return prediction +def make_tarfile(script, model, output_path, filename="model.tar.gz"): + output_filename = os.path.join(output_path, filename) + with tarfile.open(output_filename, "w:gz") as tar: + tar.add(script, arcname=os.path.basename(script)) + tar.add(model, arcname=os.path.basename(model)) + return output_filename diff --git a/test/integration/__init__.py b/test/integration/__init__.py index 6500428d..e4d472e4 100644 --- a/test/integration/__init__.py +++ b/test/integration/__init__.py @@ -14,9 +14,10 @@ import os +from test.utils import file_utils + resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'resources')) mnist_path = os.path.join(resources_path, 'mnist') -mnist_script = os.path.join(mnist_path, 'mnist.py') data_dir = os.path.join(mnist_path, 'data') training_dir = os.path.join(data_dir, 'training') cpu_sub_dir = 'model_cpu' @@ -25,14 +26,33 @@ model_cpu_dir = os.path.join(mnist_path, cpu_sub_dir) mnist_cpu_script = os.path.join(model_cpu_dir, 'mnist.py') +model_cpu_tar = file_utils.make_tarfile(mnist_cpu_script, + os.path.join(model_cpu_dir, "model.pth"), + model_cpu_dir) + model_cpu_1d_dir = os.path.join(model_cpu_dir, '1d') mnist_1d_script = os.path.join(model_cpu_1d_dir, 'mnist_1d.py') +model_cpu_1d_tar = file_utils.make_tarfile(mnist_1d_script, + os.path.join(model_cpu_1d_dir, "model.pth"), + model_cpu_1d_dir) + model_gpu_dir = os.path.join(mnist_path, gpu_sub_dir) mnist_gpu_script = os.path.join(model_gpu_dir, 'mnist.py') -model_gpu_1d_dir = os.path.join(model_gpu_dir, '1d') +model_gpu_tar = file_utils.make_tarfile(mnist_gpu_script, + os.path.join(model_gpu_dir, "model.pth"), + model_gpu_dir) + model_eia_dir = os.path.join(mnist_path, eia_sub_dir) mnist_eia_script = os.path.join(model_eia_dir, 'mnist.py') -call_model_fn_once_script = os.path.join(resources_path, 'call_model_fn_once.py') +model_eia_tar = file_utils.make_tarfile(mnist_eia_script, + os.path.join(model_eia_dir, "model.pth"), + model_eia_dir) + +call_model_fn_once_script = os.path.join(model_cpu_dir, 'call_model_fn_once.py') +call_model_fn_once_tar = file_utils.make_tarfile(call_model_fn_once_script, + os.path.join(model_cpu_dir, "model.pth"), + model_cpu_dir, + "model_call_model_fn_once.tar.gz") ROLE = 'dummy/unused-role' DEFAULT_TIMEOUT = 20 diff --git a/test/integration/local/test_serving.py b/test/integration/local/test_serving.py index c124a9c8..10e6a8a2 100644 --- a/test/integration/local/test_serving.py +++ b/test/integration/local/test_serving.py @@ -25,8 +25,9 @@ from sagemaker_containers.beta.framework import content_types from torchvision import datasets, transforms -from test.integration import training_dir, mnist_script, mnist_1d_script, model_cpu_dir, \ - model_gpu_dir, model_cpu_1d_dir, call_model_fn_once_script, ROLE +from test.integration import training_dir, mnist_1d_script, model_cpu_tar, mnist_cpu_script, \ + model_gpu_tar, mnist_gpu_script, model_cpu_1d_tar, call_model_fn_once_script, ROLE, \ + call_model_fn_once_tar from test.utils import 
local_mode_utils CONTENT_TYPE_TO_SERIALIZER_MAP = { @@ -49,7 +50,8 @@ def fixture_test_loader(): def test_serve_json_npy(test_loader, use_gpu, docker_image, sagemaker_local_session, instance_type): - model_dir = model_gpu_dir if use_gpu else model_cpu_dir + model_dir = model_gpu_tar if use_gpu else model_cpu_tar + mnist_script = mnist_gpu_script if use_gpu else mnist_cpu_script with _predictor(model_dir, mnist_script, docker_image, sagemaker_local_session, instance_type) as predictor: for content_type in (content_types.JSON, content_types.NPY): @@ -58,7 +60,7 @@ def test_serve_json_npy(test_loader, use_gpu, docker_image, sagemaker_local_sess def test_serve_csv(test_loader, use_gpu, docker_image, sagemaker_local_session, instance_type): - with _predictor(model_cpu_1d_dir, mnist_1d_script, docker_image, sagemaker_local_session, + with _predictor(model_cpu_1d_tar, mnist_1d_script, docker_image, sagemaker_local_session, instance_type) as predictor: for accept in (content_types.JSON, content_types.CSV, content_types.NPY): _assert_prediction_csv(predictor, test_loader, accept) @@ -66,14 +68,14 @@ def test_serve_csv(test_loader, use_gpu, docker_image, sagemaker_local_session, @pytest.mark.skip_cpu def test_serve_cpu_model_on_gpu(test_loader, docker_image, sagemaker_local_session, instance_type): - with _predictor(model_cpu_1d_dir, mnist_1d_script, docker_image, sagemaker_local_session, + with _predictor(model_cpu_1d_tar, mnist_1d_script, docker_image, sagemaker_local_session, instance_type) as predictor: _assert_prediction_npy_json(predictor, test_loader, content_types.NPY, content_types.JSON) @pytest.mark.skip_gpu_py2 def test_serving_calls_model_fn_once(docker_image, sagemaker_local_session, instance_type): - with _predictor(model_cpu_dir, call_model_fn_once_script, docker_image, sagemaker_local_session, + with _predictor(call_model_fn_once_tar, call_model_fn_once_script, docker_image, sagemaker_local_session, instance_type, model_server_workers=2) as predictor: predictor.accept = None predictor.deserializer = BytesDeserializer() diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py index c8bc1c0e..1fcb9d15 100644 --- a/test/integration/sagemaker/test_mnist.py +++ b/test/integration/sagemaker/test_mnist.py @@ -12,29 +12,26 @@ # language governing permissions and limitations under the License. 
from __future__ import absolute_import -import os - import numpy as np import pytest import sagemaker from sagemaker.pytorch import PyTorchModel -from test.integration import model_cpu_dir, mnist_cpu_script, mnist_gpu_script, model_eia_dir, mnist_eia_script +from test.integration import model_cpu_tar, mnist_cpu_script, model_gpu_tar, mnist_gpu_script, \ + model_eia_tar, mnist_eia_script from test.integration.sagemaker.timeout import timeout_and_delete_endpoint @pytest.mark.cpu_test def test_mnist_distributed_cpu(sagemaker_session, ecr_image, instance_type): instance_type = instance_type or 'ml.c4.xlarge' - model_dir = os.path.join(model_cpu_dir, 'model_mnist.tar.gz') - _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_dir, mnist_cpu_script) + _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_cpu_tar, mnist_cpu_script) @pytest.mark.gpu_test def test_mnist_distributed_gpu(sagemaker_session, ecr_image, instance_type): instance_type = instance_type or 'ml.p2.xlarge' - model_dir = os.path.join(model_cpu_dir, 'model_mnist.tar.gz') - _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_dir, mnist_gpu_script) + _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_gpu_tar, mnist_gpu_script) @pytest.mark.eia_test @@ -42,17 +39,16 @@ def test_mnist_eia(sagemaker_session, ecr_image, instance_type, accelerator_type instance_type = instance_type or 'ml.c4.xlarge' # Scripted model is serialized with torch.jit.save(). # Inference test for EIA doesn't need to instantiate model definition then load state_dict - model_dir = os.path.join(model_eia_dir, 'model_mnist.tar.gz') - _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_dir, mnist_eia_script, + _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_eia_tar, mnist_eia_script, accelerator_type=accelerator_type) -def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_dir, mnist_script, +def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_tar, mnist_script, accelerator_type=None): endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-pytorch-serving") model_data = sagemaker_session.upload_data( - path=model_dir, + path=model_tar, key_prefix="sagemaker-pytorch-serving/models", ) diff --git a/test/resources/mnist/.DS_Store b/test/resources/mnist/.DS_Store deleted file mode 100644 index d61e19ff..00000000 Binary files a/test/resources/mnist/.DS_Store and /dev/null differ diff --git a/test/resources/mnist/mnist.py b/test/resources/mnist/mnist.py deleted file mode 100644 index 5d50b30a..00000000 --- a/test/resources/mnist/mnist.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import -import argparse -import logging -import os -import sys - -import cv2 as cv -import sagemaker_containers -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torch.utils.data -import torch.utils.data.distributed -from torchvision import datasets, transforms - -logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) -logger.addHandler(logging.StreamHandler(sys.stdout)) - - -# Based on https://github.com/pytorch/examples/blob/master/mnist/main.py -class Net(nn.Module): - def __init__(self): - logger.info("Create neural network module") - - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x, dim=1) - - -def _get_train_data_loader(batch_size, training_dir, is_distributed, **kwargs): - logger.info("Get train data loader") - dataset = datasets.MNIST(training_dir, train=True, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if is_distributed else None - return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=train_sampler is None, - sampler=train_sampler, **kwargs) - - -def _get_test_data_loader(test_batch_size, training_dir, **kwargs): - logger.info("Get test data loader") - return torch.utils.data.DataLoader( - datasets.MNIST(training_dir, train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=test_batch_size, shuffle=True, **kwargs) - - -def _average_gradients(model): - # Gradient averaging. - size = float(dist.get_world_size()) - for param in model.parameters(): - dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM) - param.grad.data /= size - - -def train(args): - is_distributed = len(args.hosts) > 1 and args.backend is not None - logger.debug("Distributed training - {}".format(is_distributed)) - use_cuda = (args.processor == 'gpu') or (args.num_gpus > 0) - logger.debug("Number of gpus available - {}".format(args.num_gpus)) - kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} - device = torch.device("cuda" if use_cuda else "cpu") - - if is_distributed: - # Initialize the distributed environment. - world_size = len(args.hosts) - os.environ['WORLD_SIZE'] = str(world_size) - host_rank = args.hosts.index(args.current_host) - os.environ['RANK'] = str(host_rank) - dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size) - logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format( - args.backend, dist.get_world_size()) + 'Current host rank is {}. 
Number of gpus: {}'.format( - dist.get_rank(), args.num_gpus)) - - # set the seed for generating random numbers - torch.manual_seed(args.seed) - if use_cuda: - torch.cuda.manual_seed(args.seed) - - train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed, **kwargs) - test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs) - - # TODO: assert the logs when we move to the SDK local mode - logger.debug("Processes {}/{} ({:.0f}%) of train data".format( - len(train_loader.sampler), len(train_loader.dataset), - 100. * len(train_loader.sampler) / len(train_loader.dataset) - )) - - logger.debug("Processes {}/{} ({:.0f}%) of test data".format( - len(test_loader.sampler), len(test_loader.dataset), - 100. * len(test_loader.sampler) / len(test_loader.dataset) - )) - - model = Net().to(device) - if is_distributed and use_cuda: - # multi-machine multi-gpu case - logger.debug("Multi-machine multi-gpu: using DistributedDataParallel.") - model = torch.nn.parallel.DistributedDataParallel(model) - elif use_cuda: - # single-machine multi-gpu case - logger.debug("Single-machine multi-gpu: using DataParallel().cuda().") - model = torch.nn.DataParallel(model).to(device) - else: - # single-machine or multi-machine cpu case - logger.debug("Single-machine/multi-machine cpu: using DataParallel.") - model = torch.nn.DataParallel(model) - - optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) - - for epoch in range(1, args.epochs + 1): - model.train() - for batch_idx, (data, target) in enumerate(train_loader, 1): - data, target = data.to(device), target.to(device) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - if is_distributed and not use_cuda: - # average gradients manually for multi-machine cpu case only - _average_gradients(model) - optimizer.step() - if batch_idx % args.log_interval == 0: - logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.sampler), - 100. * batch_idx / len(train_loader), loss.item())) - test(model, test_loader, device) - save_model(model, args.model_dir) - - -def test(model, test_loader, device): - model.eval() - test_loss = 0 - correct = 0 - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - test_loss += F.nll_loss(output, target, size_average=None).item() # sum up batch loss - pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability - correct += pred.eq(target.view_as(pred)).sum().item() - - test_loss /= len(test_loader.dataset) - logger.debug('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. 
* correct / len(test_loader.dataset))) - - -def model_fn(model_dir): - logger.info('model_fn') - model = torch.nn.DataParallel(Net()) - with open(os.path.join(model_dir, 'model.pth'), 'rb') as f: - model.load_state_dict(torch.load(f)) - return model - - -def save_model(model, model_dir): - logger.info("Saving the model.") - path = os.path.join(model_dir, 'model.pth') - # recommended way from http://pytorch.org/docs/master/notes/serialization.html - torch.save(model.state_dict(), path) - - -if __name__ == '__main__': - # test opencv - print(cv.__version__) - - parser = argparse.ArgumentParser() - - # Data and model checkpoints directories - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=1, metavar='N', - help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=100, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--backend', type=str, default=None, - help='backend for distributed training') - parser.add_argument('--processor', type=str, default='cpu', - help='backend for distributed training') - - # Container environment - env = sagemaker_containers.training_env() - parser.add_argument('--hosts', type=list, default=env.hosts) - parser.add_argument('--current-host', type=str, default=env.current_host) - parser.add_argument('--model-dir', type=str, default=env.model_dir) - parser.add_argument('--data-dir', type=str, default=env.channel_input_dirs['training']) - parser.add_argument('--num-gpus', type=int, default=env.num_gpus) - - train(parser.parse_args()) diff --git a/test/resources/mnist/model_cpu/1d/model.pth b/test/resources/mnist/model_cpu/1d/model.pth new file mode 100644 index 00000000..126a0093 Binary files /dev/null and b/test/resources/mnist/model_cpu/1d/model.pth differ diff --git a/test/resources/mnist/model_cpu/1d/model.tar.gz b/test/resources/mnist/model_cpu/1d/model.tar.gz deleted file mode 100644 index 4ca07729..00000000 Binary files a/test/resources/mnist/model_cpu/1d/model.tar.gz and /dev/null differ diff --git a/test/resources/mnist/model_cpu/mnist.py b/test/resources/mnist/model_cpu/mnist.py index 5d50b30a..34956c97 100644 --- a/test/resources/mnist/model_cpu/mnist.py +++ b/test/resources/mnist/model_cpu/mnist.py @@ -11,26 +11,25 @@ # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. 
from __future__ import absolute_import -import argparse + import logging import os import sys import cv2 as cv -import sagemaker_containers import torch -import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F -import torch.optim as optim import torch.utils.data import torch.utils.data.distributed -from torchvision import datasets, transforms logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) logger.addHandler(logging.StreamHandler(sys.stdout)) +# test opencv +print(cv.__version__) + # Based on https://github.com/pytorch/examples/blob/master/mnist/main.py class Net(nn.Module): @@ -54,174 +53,9 @@ def forward(self, x): return F.log_softmax(x, dim=1) -def _get_train_data_loader(batch_size, training_dir, is_distributed, **kwargs): - logger.info("Get train data loader") - dataset = datasets.MNIST(training_dir, train=True, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if is_distributed else None - return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=train_sampler is None, - sampler=train_sampler, **kwargs) - - -def _get_test_data_loader(test_batch_size, training_dir, **kwargs): - logger.info("Get test data loader") - return torch.utils.data.DataLoader( - datasets.MNIST(training_dir, train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=test_batch_size, shuffle=True, **kwargs) - - -def _average_gradients(model): - # Gradient averaging. - size = float(dist.get_world_size()) - for param in model.parameters(): - dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM) - param.grad.data /= size - - -def train(args): - is_distributed = len(args.hosts) > 1 and args.backend is not None - logger.debug("Distributed training - {}".format(is_distributed)) - use_cuda = (args.processor == 'gpu') or (args.num_gpus > 0) - logger.debug("Number of gpus available - {}".format(args.num_gpus)) - kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} - device = torch.device("cuda" if use_cuda else "cpu") - - if is_distributed: - # Initialize the distributed environment. - world_size = len(args.hosts) - os.environ['WORLD_SIZE'] = str(world_size) - host_rank = args.hosts.index(args.current_host) - os.environ['RANK'] = str(host_rank) - dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size) - logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format( - args.backend, dist.get_world_size()) + 'Current host rank is {}. Number of gpus: {}'.format( - dist.get_rank(), args.num_gpus)) - - # set the seed for generating random numbers - torch.manual_seed(args.seed) - if use_cuda: - torch.cuda.manual_seed(args.seed) - - train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed, **kwargs) - test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs) - - # TODO: assert the logs when we move to the SDK local mode - logger.debug("Processes {}/{} ({:.0f}%) of train data".format( - len(train_loader.sampler), len(train_loader.dataset), - 100. * len(train_loader.sampler) / len(train_loader.dataset) - )) - - logger.debug("Processes {}/{} ({:.0f}%) of test data".format( - len(test_loader.sampler), len(test_loader.dataset), - 100. 
* len(test_loader.sampler) / len(test_loader.dataset) - )) - - model = Net().to(device) - if is_distributed and use_cuda: - # multi-machine multi-gpu case - logger.debug("Multi-machine multi-gpu: using DistributedDataParallel.") - model = torch.nn.parallel.DistributedDataParallel(model) - elif use_cuda: - # single-machine multi-gpu case - logger.debug("Single-machine multi-gpu: using DataParallel().cuda().") - model = torch.nn.DataParallel(model).to(device) - else: - # single-machine or multi-machine cpu case - logger.debug("Single-machine/multi-machine cpu: using DataParallel.") - model = torch.nn.DataParallel(model) - - optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) - - for epoch in range(1, args.epochs + 1): - model.train() - for batch_idx, (data, target) in enumerate(train_loader, 1): - data, target = data.to(device), target.to(device) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - if is_distributed and not use_cuda: - # average gradients manually for multi-machine cpu case only - _average_gradients(model) - optimizer.step() - if batch_idx % args.log_interval == 0: - logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.sampler), - 100. * batch_idx / len(train_loader), loss.item())) - test(model, test_loader, device) - save_model(model, args.model_dir) - - -def test(model, test_loader, device): - model.eval() - test_loss = 0 - correct = 0 - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - test_loss += F.nll_loss(output, target, size_average=None).item() # sum up batch loss - pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability - correct += pred.eq(target.view_as(pred)).sum().item() - - test_loss /= len(test_loader.dataset) - logger.debug('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. 
* correct / len(test_loader.dataset))) - - def model_fn(model_dir): logger.info('model_fn') model = torch.nn.DataParallel(Net()) with open(os.path.join(model_dir, 'model.pth'), 'rb') as f: model.load_state_dict(torch.load(f)) return model - - -def save_model(model, model_dir): - logger.info("Saving the model.") - path = os.path.join(model_dir, 'model.pth') - # recommended way from http://pytorch.org/docs/master/notes/serialization.html - torch.save(model.state_dict(), path) - - -if __name__ == '__main__': - # test opencv - print(cv.__version__) - - parser = argparse.ArgumentParser() - - # Data and model checkpoints directories - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=1, metavar='N', - help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=100, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--backend', type=str, default=None, - help='backend for distributed training') - parser.add_argument('--processor', type=str, default='cpu', - help='backend for distributed training') - - # Container environment - env = sagemaker_containers.training_env() - parser.add_argument('--hosts', type=list, default=env.hosts) - parser.add_argument('--current-host', type=str, default=env.current_host) - parser.add_argument('--model-dir', type=str, default=env.model_dir) - parser.add_argument('--data-dir', type=str, default=env.channel_input_dirs['training']) - parser.add_argument('--num-gpus', type=int, default=env.num_gpus) - - train(parser.parse_args()) diff --git a/test/resources/mnist/model_cpu/model.pth b/test/resources/mnist/model_cpu/model.pth new file mode 100644 index 00000000..393ea140 Binary files /dev/null and b/test/resources/mnist/model_cpu/model.pth differ diff --git a/test/resources/mnist/model_cpu/model.tar.gz b/test/resources/mnist/model_cpu/model.tar.gz deleted file mode 100644 index f075782e..00000000 Binary files a/test/resources/mnist/model_cpu/model.tar.gz and /dev/null differ diff --git a/test/resources/mnist/model_cpu/model_mnist.tar.gz b/test/resources/mnist/model_cpu/model_mnist.tar.gz deleted file mode 100644 index 3d741a70..00000000 Binary files a/test/resources/mnist/model_cpu/model_mnist.tar.gz and /dev/null differ diff --git a/test/resources/mnist/model_eia/mnist.py b/test/resources/mnist/model_eia/mnist.py index d151a3f3..ebc0bff0 100644 --- a/test/resources/mnist/model_eia/mnist.py +++ b/test/resources/mnist/model_eia/mnist.py @@ -10,4 +10,39 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. 
-# This file is intentionally left blank to utilize default_model_fn and default_predict_fn +from __future__ import absolute_import +import logging +import os +import sys + +import torch + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +logger.addHandler(logging.StreamHandler(sys.stdout)) + + +def predict_fn(input_data, model): + logger.info('Performing EIA inference in Torch JIT context with input of size {}'.format(input_data.shape)) + # With EI, the client instance should be CPU-only for cost efficiency; subgraphs with unsupported arguments run locally, and the server runs with CUDA + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model = model.to(device) + input_data = input_data.to(device) + with torch.no_grad(): + # Set the target device to the accelerator ordinal + with torch.jit.optimized_execution(True, {'target_device': 'eia:0'}): + return model(input_data) + + +def model_fn(model_dir): + logger.info('model_fn: Loading model with TorchScript from {}'.format(model_dir)) + # Scripted model is serialized with torch.jit.save(). + # No need to instantiate model definition then load state_dict + model = torch.jit.load(os.path.join(model_dir, 'model.pth')) + return model + + +def save_model(model, model_dir): + logger.info("Saving the model to {}.".format(model_dir)) + path = os.path.join(model_dir, 'model.pth') + torch.jit.save(model, path) diff --git a/test/resources/mnist/model_eia/model.pth b/test/resources/mnist/model_eia/model.pth new file mode 100644 index 00000000..d7b4691d Binary files /dev/null and b/test/resources/mnist/model_eia/model.pth differ diff --git a/test/resources/mnist/model_eia/model.tar.gz b/test/resources/mnist/model_eia/model.tar.gz deleted file mode 100644 index a992cf5e..00000000 Binary files a/test/resources/mnist/model_eia/model.tar.gz and /dev/null differ diff --git a/test/resources/mnist/model_eia/model_mnist.tar.gz b/test/resources/mnist/model_eia/model_mnist.tar.gz deleted file mode 100644 index 6afd83a4..00000000 Binary files a/test/resources/mnist/model_eia/model_mnist.tar.gz and /dev/null differ diff --git a/test/resources/mnist/model_gpu/mnist.py b/test/resources/mnist/model_gpu/mnist.py index 5d50b30a..34956c97 100644 --- a/test/resources/mnist/model_gpu/mnist.py +++ b/test/resources/mnist/model_gpu/mnist.py @@ -11,26 +11,25 @@ # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License.
from __future__ import absolute_import -import argparse + import logging import os import sys import cv2 as cv -import sagemaker_containers import torch -import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F -import torch.optim as optim import torch.utils.data import torch.utils.data.distributed -from torchvision import datasets, transforms logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) logger.addHandler(logging.StreamHandler(sys.stdout)) +# test opencv +print(cv.__version__) + # Based on https://github.com/pytorch/examples/blob/master/mnist/main.py class Net(nn.Module): @@ -54,174 +53,9 @@ def forward(self, x): return F.log_softmax(x, dim=1) -def _get_train_data_loader(batch_size, training_dir, is_distributed, **kwargs): - logger.info("Get train data loader") - dataset = datasets.MNIST(training_dir, train=True, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if is_distributed else None - return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=train_sampler is None, - sampler=train_sampler, **kwargs) - - -def _get_test_data_loader(test_batch_size, training_dir, **kwargs): - logger.info("Get test data loader") - return torch.utils.data.DataLoader( - datasets.MNIST(training_dir, train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=test_batch_size, shuffle=True, **kwargs) - - -def _average_gradients(model): - # Gradient averaging. - size = float(dist.get_world_size()) - for param in model.parameters(): - dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM) - param.grad.data /= size - - -def train(args): - is_distributed = len(args.hosts) > 1 and args.backend is not None - logger.debug("Distributed training - {}".format(is_distributed)) - use_cuda = (args.processor == 'gpu') or (args.num_gpus > 0) - logger.debug("Number of gpus available - {}".format(args.num_gpus)) - kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} - device = torch.device("cuda" if use_cuda else "cpu") - - if is_distributed: - # Initialize the distributed environment. - world_size = len(args.hosts) - os.environ['WORLD_SIZE'] = str(world_size) - host_rank = args.hosts.index(args.current_host) - os.environ['RANK'] = str(host_rank) - dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size) - logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format( - args.backend, dist.get_world_size()) + 'Current host rank is {}. Number of gpus: {}'.format( - dist.get_rank(), args.num_gpus)) - - # set the seed for generating random numbers - torch.manual_seed(args.seed) - if use_cuda: - torch.cuda.manual_seed(args.seed) - - train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed, **kwargs) - test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs) - - # TODO: assert the logs when we move to the SDK local mode - logger.debug("Processes {}/{} ({:.0f}%) of train data".format( - len(train_loader.sampler), len(train_loader.dataset), - 100. * len(train_loader.sampler) / len(train_loader.dataset) - )) - - logger.debug("Processes {}/{} ({:.0f}%) of test data".format( - len(test_loader.sampler), len(test_loader.dataset), - 100. 
* len(test_loader.sampler) / len(test_loader.dataset) - )) - - model = Net().to(device) - if is_distributed and use_cuda: - # multi-machine multi-gpu case - logger.debug("Multi-machine multi-gpu: using DistributedDataParallel.") - model = torch.nn.parallel.DistributedDataParallel(model) - elif use_cuda: - # single-machine multi-gpu case - logger.debug("Single-machine multi-gpu: using DataParallel().cuda().") - model = torch.nn.DataParallel(model).to(device) - else: - # single-machine or multi-machine cpu case - logger.debug("Single-machine/multi-machine cpu: using DataParallel.") - model = torch.nn.DataParallel(model) - - optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) - - for epoch in range(1, args.epochs + 1): - model.train() - for batch_idx, (data, target) in enumerate(train_loader, 1): - data, target = data.to(device), target.to(device) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - if is_distributed and not use_cuda: - # average gradients manually for multi-machine cpu case only - _average_gradients(model) - optimizer.step() - if batch_idx % args.log_interval == 0: - logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.sampler), - 100. * batch_idx / len(train_loader), loss.item())) - test(model, test_loader, device) - save_model(model, args.model_dir) - - -def test(model, test_loader, device): - model.eval() - test_loss = 0 - correct = 0 - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - test_loss += F.nll_loss(output, target, size_average=None).item() # sum up batch loss - pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability - correct += pred.eq(target.view_as(pred)).sum().item() - - test_loss /= len(test_loader.dataset) - logger.debug('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. 
* correct / len(test_loader.dataset))) - - def model_fn(model_dir): logger.info('model_fn') model = torch.nn.DataParallel(Net()) with open(os.path.join(model_dir, 'model.pth'), 'rb') as f: model.load_state_dict(torch.load(f)) return model - - -def save_model(model, model_dir): - logger.info("Saving the model.") - path = os.path.join(model_dir, 'model.pth') - # recommended way from http://pytorch.org/docs/master/notes/serialization.html - torch.save(model.state_dict(), path) - - -if __name__ == '__main__': - # test opencv - print(cv.__version__) - - parser = argparse.ArgumentParser() - - # Data and model checkpoints directories - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=1, metavar='N', - help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=100, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--backend', type=str, default=None, - help='backend for distributed training') - parser.add_argument('--processor', type=str, default='cpu', - help='backend for distributed training') - - # Container environment - env = sagemaker_containers.training_env() - parser.add_argument('--hosts', type=list, default=env.hosts) - parser.add_argument('--current-host', type=str, default=env.current_host) - parser.add_argument('--model-dir', type=str, default=env.model_dir) - parser.add_argument('--data-dir', type=str, default=env.channel_input_dirs['training']) - parser.add_argument('--num-gpus', type=int, default=env.num_gpus) - - train(parser.parse_args()) diff --git a/test/resources/mnist/model_gpu/model.pth b/test/resources/mnist/model_gpu/model.pth new file mode 100644 index 00000000..ca97f76d Binary files /dev/null and b/test/resources/mnist/model_gpu/model.pth differ diff --git a/test/resources/mnist/model_gpu/model.tar.gz b/test/resources/mnist/model_gpu/model.tar.gz deleted file mode 100644 index ed39a7a8..00000000 Binary files a/test/resources/mnist/model_gpu/model.tar.gz and /dev/null differ diff --git a/test/utils/file_utils.py b/test/utils/file_utils.py new file mode 100644 index 00000000..f81a20b5 --- /dev/null +++ b/test/utils/file_utils.py @@ -0,0 +1,24 @@ +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import + +import os +import tarfile + + +def make_tarfile(script, model, output_path, filename="model.tar.gz"): + output_filename = os.path.join(output_path, filename) + with tarfile.open(output_filename, "w:gz") as tar: + tar.add(script, arcname=os.path.basename(script)) + tar.add(model, arcname=os.path.basename(model)) + return output_filename
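For reference, a minimal usage sketch of the new make_tarfile helper, showing the archive layout the integration tests upload as PyTorchModel model_data. The paths below are illustrative only; the real ones are built from resources_path in the integration __init__ modules at import time:

from __future__ import absolute_import

import os
import tarfile

from test.utils import file_utils

# Hypothetical paths for illustration; adjust to the actual resources layout.
model_dir = os.path.join('test', 'resources', 'mnist', 'model_cpu')
script = os.path.join(model_dir, 'mnist.py')
model = os.path.join(model_dir, 'model.pth')

# Bundle the inference script and the serialized weights into model.tar.gz
# inside model_dir; the returned path is passed to upload_data()/PyTorchModel.
tar_path = file_utils.make_tarfile(script, model, model_dir)

# Both members are added with basename arcnames, so they sit at the tar root.
with tarfile.open(tar_path, 'r:gz') as tar:
    assert sorted(tar.getnames()) == ['mnist.py', 'model.pth']

Because the archives are now generated on the fly (rather than checked in as the deleted model.tar.gz fixtures were), the new **/*.tar.gz entry in .gitignore keeps them out of version control.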