diff --git a/README.rst b/README.rst
index 69f8c03e..54a8b77a 100644
--- a/README.rst
+++ b/README.rst
@@ -1,8 +1,16 @@
-
 ==================================
 SageMaker PyTorch Training Toolkit
 ==================================
+.. image:: https://img.shields.io/pypi/v/sagemaker-pytorch-training.svg
+   :target: https://pypi.python.org/pypi/sagemaker-pytorch-training
+   :alt: Latest Version
+
+.. image:: https://img.shields.io/pypi/pyversions/sagemaker-pytorch-training.svg
+   :target: https://pypi.python.org/pypi/sagemaker-pytorch-training
+   :alt: Supported Python Versions
+
+
 
 SageMaker PyTorch Training Toolkit is an open-source library for using
 PyTorch to train models on Amazon SageMaker. This toolkit depends and extends
 the base `SageMaker Training Toolkit `__ with PyTorch specific support.
diff --git a/VERSION b/VERSION
index 063f85e5..4a36342f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.9.1.dev0
+3.0.0
diff --git a/setup.py b/setup.py
index 84bbe341..1e4a9c75 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ def read(fname):
 
 
 test_dependencies = ['boto3', 'coverage==6.5.0', 'flake8', 'future', 'mock', 'pytest', 'pytest-cov',
-                     'pytest-xdist', 'sagemaker[local]<2', 'torch', 'torchvision', 'tox']
+                     'pytest-xdist', 'sagemaker[local]', 'torch', 'torchvision', 'tox']
 
 setup(
     name='sagemaker_pytorch_training',
@@ -48,12 +48,11 @@ def read(fname):
         "Natural Language :: English",
         "License :: OSI Approved :: Apache Software License",
         "Programming Language :: Python",
-        'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
     ],
 
-    install_requires=['retrying', 'sagemaker-training>=4.3.0,<=4.8.3', 'six>=1.12.0'],
+    install_requires=['retrying', 'sagemaker-training>=5.0.0,<6.0.0', 'six>=1.12.0'],
     extras_require={
         'test': test_dependencies
     },
diff --git a/test/integration/local/test_distributed_training.py b/test/integration/local/test_distributed_training.py
index 553110a8..bd09a2bb 100644
--- a/test/integration/local/test_distributed_training.py
+++ b/test/integration/local/test_distributed_training.py
@@ -35,9 +35,9 @@ def fixture_dist_gpu_backend(request):
 def test_dist_operations_path_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir):
     estimator = PyTorch(entry_point=dist_operations_path,
                         role=ROLE,
-                        image_name=image_uri,
-                        train_instance_count=2,
-                        train_instance_type='local',
+                        image_uri=image_uri,
+                        instance_count=2,
+                        instance_type='local',
                         sagemaker_session=sagemaker_local_session,
                         hyperparameters={'backend': dist_cpu_backend},
                         output_path='file://{}'.format(tmpdir))
@@ -49,9 +49,9 @@ def test_dist_operations_path_cpu(image_uri, dist_cpu_backend, sagemaker_local_s
 def test_dist_operations_path_gpu_nccl(image_uri, sagemaker_local_session, tmpdir):
     estimator = PyTorch(entry_point=dist_operations_path,
                         role=ROLE,
-                        image_name=image_uri,
-                        train_instance_count=1,
-                        train_instance_type='local_gpu',
+                        image_uri=image_uri,
+                        instance_count=1,
+                        instance_type='local_gpu',
                         sagemaker_session=sagemaker_local_session,
                         hyperparameters={'backend': 'nccl'},
                         output_path='file://{}'.format(tmpdir))
@@ -63,9 +63,9 @@ def test_dist_operations_path_gpu_nccl(image_uri, sagemaker_local_session, tmpdi
 def test_cpu_nccl(image_uri, sagemaker_local_session, tmpdir):
     estimator = PyTorch(entry_point=mnist_script,
                         role=ROLE,
-                        image_name=image_uri,
-                        train_instance_count=2,
-                        train_instance_type='local',
+                        image_uri=image_uri,
+                        instance_count=2,
+                        instance_type='local',
                         sagemaker_session=sagemaker_local_session,
                         hyperparameters={'backend': 'nccl'},
                         output_path='file://{}'.format(tmpdir))
@@ -81,9 +81,9 @@ def test_cpu_nccl(image_uri, sagemaker_local_session, tmpdir):
 def test_mnist_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir):
     estimator = PyTorch(entry_point=mnist_script,
                         role=ROLE,
-                        image_name=image_uri,
-                        train_instance_count=2,
-                        train_instance_type='local',
+                        image_uri=image_uri,
+                        instance_count=2,
+                        instance_type='local',
                         sagemaker_session=sagemaker_local_session,
                         hyperparameters={'backend': dist_cpu_backend},
                         output_path='file://{}'.format(tmpdir))
diff --git a/test/integration/local/test_horovod.py b/test/integration/local/test_horovod.py
index 1e87135a..63fe85e5 100644
--- a/test/integration/local/test_horovod.py
+++ b/test/integration/local/test_horovod.py
@@ -31,10 +31,10 @@ def test_horovod_simple(sagemaker_local_session, image_uri, framework_version, t
     estimator = PyTorch(
         entry_point=os.path.join(resources_path, 'horovod', 'simple.py'),
         role='SageMakerRole',
-        train_instance_type="local_gpu",
+        instance_type="local_gpu",
         sagemaker_session=sagemaker_local_session,
-        train_instance_count=instances,
-        image_name=image_uri,
+        instance_count=instances,
+        image_uri=image_uri,
         output_path=output_path,
         framework_version=framework_version,
         hyperparameters={'sagemaker_mpi_enabled': True,
@@ -66,10 +66,10 @@ def test_horovod_training(sagemaker_local_session, image_uri, framework_version,
     estimator = PyTorch(
         entry_point=os.path.join(resources_path, 'horovod', 'train.py'),
         role='SageMakerRole',
-        train_instance_type="local_gpu",
+        instance_type="local_gpu",
         sagemaker_session=sagemaker_local_session,
-        train_instance_count=1,
-        image_name=image_uri,
+        instance_count=1,
+        image_uri=image_uri,
         framework_version=framework_version,
         hyperparameters={'sagemaker_mpi_enabled': True,
                          'sagemaker_mpi_num_of_processes_per_host': 2,
diff --git a/test/integration/local/test_requirements.py b/test/integration/local/test_requirements.py
index 02deac8d..f470dac6 100644
--- a/test/integration/local/test_requirements.py
+++ b/test/integration/local/test_requirements.py
@@ -23,9 +23,9 @@ def test_requirements_file(image_uri, instance_type, sagemaker_local_session, tm
         entry_point=requirements_script,
         source_dir=requirements_dir,
         role=ROLE,
-        image_name=image_uri,
-        train_instance_count=1,
-        train_instance_type=instance_type,
+        image_uri=image_uri,
+        instance_count=1,
+        instance_type=instance_type,
         sagemaker_session=sagemaker_local_session,
         output_path='file://{}'.format(tmpdir)
     )
diff --git a/test/integration/local/test_single_machine_training.py b/test/integration/local/test_single_machine_training.py
index 68f99859..6b9a771d 100644
--- a/test/integration/local/test_single_machine_training.py
+++ b/test/integration/local/test_single_machine_training.py
@@ -23,9 +23,9 @@
 def test_mnist(image_uri, processor, instance_type, sagemaker_local_session, tmpdir):
     estimator = PyTorch(entry_point=mnist_script,
                         role=ROLE,
-                        image_name=image_uri,
-                        train_instance_count=1,
-                        train_instance_type=instance_type,
+                        image_uri=image_uri,
+                        instance_count=1,
+                        instance_type=instance_type,
                         sagemaker_session=sagemaker_local_session,
                         hyperparameters={'processor': processor},
                         output_path='file://{}'.format(tmpdir))
diff --git a/test/integration/sagemaker/test_distributed_operations.py b/test/integration/sagemaker/test_distributed_operations.py
index eff50afe..526703a0 100644
--- a/test/integration/sagemaker/test_distributed_operations.py
+++ b/test/integration/sagemaker/test_distributed_operations.py
@@ -18,6 +18,7 @@
 import pytest
 from sagemaker import utils
 from sagemaker.pytorch import PyTorch
+from sagemaker.local import LocalSession
 from six.moves.urllib.parse import urlparse
 
 from integration import data_dir, dist_operations_path, mnist_script, DEFAULT_TIMEOUT
@@ -31,6 +32,8 @@
 @pytest.mark.skip_test_in_region
 def test_dist_operations_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend):
     instance_type = instance_type or 'ml.c4.xlarge'
+    if "local" in instance_type:
+        sagemaker_session = LocalSession()
     _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_cpu_backend)
 
 
@@ -38,6 +41,8 @@
 @pytest.mark.deploy_test
 def test_dist_operations_gpu(sagemaker_session, instance_type, image_uri, dist_gpu_backend):
     instance_type = instance_type or 'ml.p2.xlarge'
+    if "local" in instance_type:
+        sagemaker_session = LocalSession()
     _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_gpu_backend)
 
 
@@ -52,9 +57,9 @@ def test_mnist_gpu(sagemaker_session, image_uri, dist_gpu_backend):
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(entry_point=mnist_script,
                           role='SageMakerRole',
-                          train_instance_count=2,
-                          image_name=image_uri,
-                          train_instance_type=MULTI_GPU_INSTANCE,
+                          instance_count=2,
+                          image_uri=image_uri,
+                          instance_type=MULTI_GPU_INSTANCE,
                           sagemaker_session=sagemaker_session,
                           debugger_hook_config=False,
                           hyperparameters={'backend': dist_gpu_backend})
@@ -70,10 +75,10 @@ def _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_back
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(entry_point=dist_operations_path,
                           role='SageMakerRole',
-                          train_instance_count=train_instance_count,
-                          train_instance_type=instance_type,
+                          instance_count=train_instance_count,
+                          instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
-                          image_name=image_uri,
+                          image_uri=image_uri,
                           debugger_hook_config=False,
                           hyperparameters={'backend': dist_backend})
 
diff --git a/test/integration/sagemaker/test_horovod.py b/test/integration/sagemaker/test_horovod.py
index 09276e44..9dbb2478 100644
--- a/test/integration/sagemaker/test_horovod.py
+++ b/test/integration/sagemaker/test_horovod.py
@@ -44,10 +44,10 @@ def test_horovod_simple(
     estimator = PyTorch(
         entry_point=os.path.join(resources_path, "horovod", "simple.py"),
         role="SageMakerRole",
-        train_instance_type=train_instance_type,
+        instance_type=train_instance_type,
         sagemaker_session=sagemaker_session,
-        train_instance_count=instances,
-        image_name=image_uri,
+        instance_count=instances,
+        image_uri=image_uri,
         output_path=output_path,
         framework_version=framework_version,
         hyperparameters={
@@ -100,10 +100,10 @@ def test_horovod_training(
     estimator = PyTorch(
         entry_point=os.path.join(resources_path, "horovod", "train.py"),
         role="SageMakerRole",
-        train_instance_type=train_instance_type,
+        instance_type=train_instance_type,
         sagemaker_session=sagemaker_session,
-        train_instance_count=instances,
-        image_name=image_uri,
+        instance_count=instances,
+        image_uri=image_uri,
         framework_version=framework_version,
         hyperparameters={
             "sagemaker_mpi_enabled": True,
diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py
index d0f4a048..23f4a037 100644
--- a/test/integration/sagemaker/test_mnist.py
+++ b/test/integration/sagemaker/test_mnist.py
@@ -15,6 +15,7 @@
 import pytest
 from sagemaker import utils
 from sagemaker.pytorch import PyTorch
+from sagemaker.local import LocalSession
 
 from integration import training_dir, mnist_script, DEFAULT_TIMEOUT
 from integration.sagemaker.timeout import timeout
@@ -23,12 +24,16 @@
 @pytest.mark.skip_gpu
 def test_mnist_distributed_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend):
     instance_type = instance_type or 'ml.c4.xlarge'
+    if "local" in instance_type:
+        sagemaker_session = LocalSession()
     _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_cpu_backend)
 
 
 @pytest.mark.skip_cpu
 def test_mnist_distributed_gpu(sagemaker_session, image_uri, instance_type, dist_gpu_backend):
     instance_type = instance_type or 'ml.p2.xlarge'
+    if "local" in instance_type:
+        sagemaker_session = LocalSession()
     _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_gpu_backend)
 
 
@@ -36,10 +41,10 @@ def _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_ba
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(entry_point=mnist_script,
                           role='SageMakerRole',
-                          train_instance_count=2,
-                          train_instance_type=instance_type,
+                          instance_count=2,
+                          instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
-                          image_name=image_uri,
+                          image_uri=image_uri,
                           debugger_hook_config=False,
                           hyperparameters={'backend': dist_backend, 'epochs': 2})
         training_input = pytorch.sagemaker_session.upload_data(path=training_dir,
diff --git a/test/integration/sagemaker/test_smdataparallel.py b/test/integration/sagemaker/test_smdataparallel.py
index 86184d5e..657f0515 100644
--- a/test/integration/sagemaker/test_smdataparallel.py
+++ b/test/integration/sagemaker/test_smdataparallel.py
@@ -36,10 +36,10 @@ def test_smdataparallel_training(
     estimator = PyTorch(
         entry_point=os.path.join(resources_path, "mnist", "smdataparallel_mnist.py"),
         role="SageMakerRole",
-        train_instance_type=train_instance_type,
+        instance_type=train_instance_type,
         sagemaker_session=sagemaker_session,
-        train_instance_count=instances,
-        image_name=image_uri,
+        instance_count=instances,
+        image_uri=image_uri,
         output_path=output_path,
         framework_version=framework_version,
         hyperparameters={
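
Across these hunks the estimator arguments are renamed from train_instance_type, train_instance_count, and image_name to instance_type, instance_count, and image_uri, matching the SageMaker Python SDK v2 naming that the updated sagemaker dependency requires. As a rough illustration only, a local-mode estimator built with the renamed arguments looks like the sketch below; the entry point, image URI, role, and data path are placeholders for this example, not values taken from this change.

```python
# Minimal sketch of the SDK v2 argument names used throughout the diff above.
# The entry point, image URI, role, and data path are placeholders.
from sagemaker.local import LocalSession
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    entry_point='mnist.py',                # placeholder training script
    role='SageMakerRole',
    image_uri='my-account.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:latest',  # was image_name
    instance_count=1,                      # was train_instance_count
    instance_type='local',                 # was train_instance_type
    sagemaker_session=LocalSession(),      # local mode, as in the integration tests
    hyperparameters={'backend': 'gloo'},
)
estimator.fit('file:///tmp/pytorch-training-data')  # placeholder local input channel
```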