10 changes: 9 additions & 1 deletion README.rst
@@ -1,8 +1,16 @@

==================================
SageMaker PyTorch Training Toolkit
==================================

.. image:: https://img.shields.io/pypi/v/sagemaker-pytorch-training.svg
:target: https://pypi.python.org/pypi/sagemaker-pytorch-training
:alt: Latest Version

.. image:: https://img.shields.io/pypi/pyversions/sagemaker-pytorch-training.svg
:target: https://pypi.python.org/pypi/sagemaker-pytorch-training
:alt: Supported Python Versions


SageMaker PyTorch Training Toolkit is an open-source library for using PyTorch to train models on Amazon SageMaker.

This toolkit depends on and extends the base `SageMaker Training Toolkit <https://github.com/aws/sagemaker-training-toolkit>`__ with PyTorch-specific support.
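For orientation, a minimal sketch of launching a training job with the SDK v2 parameter names this PR migrates the tests to (image_uri, instance_count, instance_type). The role ARN, image URI, bucket, and entry-point script below are placeholder assumptions, not values from this PR:

from sagemaker.pytorch import PyTorch

# Placeholder role, image, and script -- substitute your own.
# With image_uri given explicitly, framework_version may be omitted.
estimator = PyTorch(
    entry_point='train.py',
    role='arn:aws:iam::123456789012:role/SageMakerRole',
    image_uri='123456789012.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:latest',
    instance_count=1,               # SDK v2 name (formerly train_instance_count)
    instance_type='ml.c5.xlarge',   # SDK v2 name (formerly train_instance_type)
)
estimator.fit('s3://my-bucket/training-data')   # starts the training job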
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
-2.9.1.dev0
+3.0.0
5 changes: 2 additions & 3 deletions setup.py
@@ -25,7 +25,7 @@ def read(fname):


test_dependencies = ['boto3', 'coverage==6.5.0', 'flake8', 'future', 'mock', 'pytest', 'pytest-cov',
-'pytest-xdist', 'sagemaker[local]<2', 'torch', 'torchvision', 'tox']
+'pytest-xdist', 'sagemaker[local]', 'torch', 'torchvision', 'tox']

setup(
name='sagemaker_pytorch_training',
@@ -48,12 +48,11 @@ def read(fname):
"Natural Language :: English",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python",
-'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
],

-install_requires=['retrying', 'sagemaker-training>=4.3.0,<=4.8.3', 'six>=1.12.0'],
Reviewer:
Do we need <6.0.0? If we keep versions backward compatible, wouldn't it be good to always try to pull the latest?

Contributor Author:
Given that the base toolkit is now at v5, I don't think it's guaranteed that we would never have a v6.

Reviewer:
Yes, I mean: shouldn't we keep the base toolkit backward compatible, so there's no restriction on upgrading to the latest version even if we publish a v6?

Contributor Author:

I know what you mean, but the v5 breaking release was not due to a change in the base toolkit itself; it came from a breaking dependency. Major version releases are for breaking, non-backwards-compatible changes; a backwards-compatible change should ship as a minor or patch release within the v5 major version.

+install_requires=['retrying', 'sagemaker-training>=5.0.0,<6.0.0', 'six>=1.12.0'],
extras_require={
'test': test_dependencies
},
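As a side note on the >=5.0.0,<6.0.0 pin adopted above, a small sketch (using the third-party packaging library, an assumption not used by this repo) of which releases the specifier admits: any 5.x release is accepted, so backward-compatible base-toolkit updates are pulled automatically, while a hypothetical breaking v6 is excluded.

from packaging.specifiers import SpecifierSet

# The range adopted in install_requires above.
spec = SpecifierSet('>=5.0.0,<6.0.0')
for candidate in ['5.0.0', '5.9.3', '6.0.0']:
    print(candidate, candidate in spec)   # 5.0.0 True, 5.9.3 True, 6.0.0 False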
24 changes: 12 additions & 12 deletions test/integration/local/test_distributed_training.py
@@ -35,9 +35,9 @@ def fixture_dist_gpu_backend(request):
def test_dist_operations_path_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir):
estimator = PyTorch(entry_point=dist_operations_path,
role=ROLE,
-image_name=image_uri,
-train_instance_count=2,
-train_instance_type='local',
+image_uri=image_uri,
+instance_count=2,
+instance_type='local',
sagemaker_session=sagemaker_local_session,
hyperparameters={'backend': dist_cpu_backend},
output_path='file://{}'.format(tmpdir))
@@ -49,9 +49,9 @@ def test_dist_operations_path_cpu(image_uri, dist_cpu_backend, sagemaker_local_s
def test_dist_operations_path_gpu_nccl(image_uri, sagemaker_local_session, tmpdir):
estimator = PyTorch(entry_point=dist_operations_path,
role=ROLE,
-image_name=image_uri,
-train_instance_count=1,
-train_instance_type='local_gpu',
+image_uri=image_uri,
+instance_count=1,
+instance_type='local_gpu',
sagemaker_session=sagemaker_local_session,
hyperparameters={'backend': 'nccl'},
output_path='file://{}'.format(tmpdir))
@@ -63,9 +63,9 @@ def test_dist_operations_path_gpu_nccl(image_uri, sagemaker_local_session, tmpdi
def test_cpu_nccl(image_uri, sagemaker_local_session, tmpdir):
estimator = PyTorch(entry_point=mnist_script,
role=ROLE,
-image_name=image_uri,
-train_instance_count=2,
-train_instance_type='local',
+image_uri=image_uri,
+instance_count=2,
+instance_type='local',
sagemaker_session=sagemaker_local_session,
hyperparameters={'backend': 'nccl'},
output_path='file://{}'.format(tmpdir))
@@ -81,9 +81,9 @@ def test_cpu_nccl(image_uri, sagemaker_local_session, tmpdir):
def test_mnist_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir):
estimator = PyTorch(entry_point=mnist_script,
role=ROLE,
-image_name=image_uri,
-train_instance_count=2,
-train_instance_type='local',
+image_uri=image_uri,
+instance_count=2,
+instance_type='local',
sagemaker_session=sagemaker_local_session,
hyperparameters={'backend': dist_cpu_backend},
output_path='file://{}'.format(tmpdir))
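The dist_cpu_backend and 'nccl' hyperparameters in the tests above select the torch.distributed backend inside the training script. A minimal sketch of the initialization such a script typically performs; it assumes the env:// rendezvous variables (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE) are already set in the container, which is not shown here:

import torch.distributed as dist

# 'gloo' for CPU runs, 'nccl' for GPU runs; rank and world size
# are read from the environment via the env:// init method.
dist.init_process_group(backend='gloo', init_method='env://')
print('rank', dist.get_rank(), 'of', dist.get_world_size())
dist.destroy_process_group()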
12 changes: 6 additions & 6 deletions test/integration/local/test_horovod.py
@@ -31,10 +31,10 @@ def test_horovod_simple(sagemaker_local_session, image_uri, framework_version, t
estimator = PyTorch(
entry_point=os.path.join(resources_path, 'horovod', 'simple.py'),
role='SageMakerRole',
-train_instance_type="local_gpu",
+instance_type="local_gpu",
sagemaker_session=sagemaker_local_session,
-train_instance_count=instances,
-image_name=image_uri,
+instance_count=instances,
+image_uri=image_uri,
output_path=output_path,
framework_version=framework_version,
hyperparameters={'sagemaker_mpi_enabled': True,
@@ -66,10 +66,10 @@ def test_horovod_training(sagemaker_local_session, image_uri, framework_version,
estimator = PyTorch(
entry_point=os.path.join(resources_path, 'horovod', 'train.py'),
role='SageMakerRole',
-train_instance_type="local_gpu",
+instance_type="local_gpu",
sagemaker_session=sagemaker_local_session,
-train_instance_count=1,
-image_name=image_uri,
+instance_count=1,
+image_uri=image_uri,
framework_version=framework_version,
hyperparameters={'sagemaker_mpi_enabled': True,
'sagemaker_mpi_num_of_processes_per_host': 2,
6 changes: 3 additions & 3 deletions test/integration/local/test_requirements.py
@@ -23,9 +23,9 @@ def test_requirements_file(image_uri, instance_type, sagemaker_local_session, tm
entry_point=requirements_script,
source_dir=requirements_dir,
role=ROLE,
-image_name=image_uri,
-train_instance_count=1,
-train_instance_type=instance_type,
+image_uri=image_uri,
+instance_count=1,
+instance_type=instance_type,
sagemaker_session=sagemaker_local_session,
output_path='file://{}'.format(tmpdir)
)
6 changes: 3 additions & 3 deletions test/integration/local/test_single_machine_training.py
@@ -23,9 +23,9 @@
def test_mnist(image_uri, processor, instance_type, sagemaker_local_session, tmpdir):
estimator = PyTorch(entry_point=mnist_script,
role=ROLE,
-image_name=image_uri,
-train_instance_count=1,
-train_instance_type=instance_type,
+image_uri=image_uri,
+instance_count=1,
+instance_type=instance_type,
sagemaker_session=sagemaker_local_session,
hyperparameters={'processor': processor},
output_path='file://{}'.format(tmpdir))
17 changes: 11 additions & 6 deletions test/integration/sagemaker/test_distributed_operations.py
@@ -18,6 +18,7 @@
import pytest
from sagemaker import utils
from sagemaker.pytorch import PyTorch
+from sagemaker.local import LocalSession
from six.moves.urllib.parse import urlparse

from integration import data_dir, dist_operations_path, mnist_script, DEFAULT_TIMEOUT
@@ -31,13 +31,17 @@
@pytest.mark.skip_test_in_region
def test_dist_operations_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend):
instance_type = instance_type or 'ml.c4.xlarge'
if "local" in instance_type:
sagemaker_session = LocalSession()
_test_dist_operations(sagemaker_session, image_uri, instance_type, dist_cpu_backend)


@pytest.mark.skip_cpu
@pytest.mark.deploy_test
def test_dist_operations_gpu(sagemaker_session, instance_type, image_uri, dist_gpu_backend):
instance_type = instance_type or 'ml.p2.xlarge'
if "local" in instance_type:
sagemaker_session = LocalSession()
_test_dist_operations(sagemaker_session, image_uri, instance_type, dist_gpu_backend)


@@ -52,9 +57,9 @@ def test_mnist_gpu(sagemaker_session, image_uri, dist_gpu_backend):
with timeout(minutes=DEFAULT_TIMEOUT):
pytorch = PyTorch(entry_point=mnist_script,
role='SageMakerRole',
-train_instance_count=2,
-image_name=image_uri,
-train_instance_type=MULTI_GPU_INSTANCE,
+instance_count=2,
+image_uri=image_uri,
+instance_type=MULTI_GPU_INSTANCE,
sagemaker_session=sagemaker_session,
debugger_hook_config=False,
hyperparameters={'backend': dist_gpu_backend})
@@ -70,10 +75,10 @@ def _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_back
with timeout(minutes=DEFAULT_TIMEOUT):
pytorch = PyTorch(entry_point=dist_operations_path,
role='SageMakerRole',
-train_instance_count=train_instance_count,
-train_instance_type=instance_type,
+instance_count=train_instance_count,
+instance_type=instance_type,
sagemaker_session=sagemaker_session,
-image_name=image_uri,
+image_uri=image_uri,
debugger_hook_config=False,
hyperparameters={'backend': dist_backend})

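For context on the LocalSession fallback added above: when the requested instance type is 'local' or 'local_gpu', the test swaps in a LocalSession so the job runs in a Docker container on the test host instead of on managed SageMaker instances. A minimal sketch; the local_code flag is an optional assumption, not something this PR sets:

from sagemaker.local import LocalSession

session = LocalSession()
session.config = {'local': {'local_code': True}}   # run from local source, skipping S3 upload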
12 changes: 6 additions & 6 deletions test/integration/sagemaker/test_horovod.py
@@ -44,10 +44,10 @@ def test_horovod_simple(
estimator = PyTorch(
entry_point=os.path.join(resources_path, "horovod", "simple.py"),
role="SageMakerRole",
-train_instance_type=train_instance_type,
+instance_type=train_instance_type,
sagemaker_session=sagemaker_session,
-train_instance_count=instances,
-image_name=image_uri,
+instance_count=instances,
+image_uri=image_uri,
output_path=output_path,
framework_version=framework_version,
hyperparameters={
@@ -100,10 +100,10 @@ def test_horovod_training(
estimator = PyTorch(
entry_point=os.path.join(resources_path, "horovod", "train.py"),
role="SageMakerRole",
-train_instance_type=train_instance_type,
+instance_type=train_instance_type,
sagemaker_session=sagemaker_session,
-train_instance_count=instances,
-image_name=image_uri,
+instance_count=instances,
+image_uri=image_uri,
framework_version=framework_version,
hyperparameters={
"sagemaker_mpi_enabled": True,
11 changes: 8 additions & 3 deletions test/integration/sagemaker/test_mnist.py
@@ -15,6 +15,7 @@
import pytest
from sagemaker import utils
from sagemaker.pytorch import PyTorch
+from sagemaker.local import LocalSession

from integration import training_dir, mnist_script, DEFAULT_TIMEOUT
from integration.sagemaker.timeout import timeout
@@ -23,23 +24,27 @@
@pytest.mark.skip_gpu
def test_mnist_distributed_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend):
instance_type = instance_type or 'ml.c4.xlarge'
if "local" in instance_type:
sagemaker_session = LocalSession()
_test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_cpu_backend)


@pytest.mark.skip_cpu
def test_mnist_distributed_gpu(sagemaker_session, image_uri, instance_type, dist_gpu_backend):
instance_type = instance_type or 'ml.p2.xlarge'
if "local" in instance_type:
sagemaker_session = LocalSession()
_test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_gpu_backend)


def _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_backend):
with timeout(minutes=DEFAULT_TIMEOUT):
pytorch = PyTorch(entry_point=mnist_script,
role='SageMakerRole',
-train_instance_count=2,
-train_instance_type=instance_type,
+instance_count=2,
+instance_type=instance_type,
sagemaker_session=sagemaker_session,
-image_name=image_uri,
+image_uri=image_uri,
debugger_hook_config=False,
hyperparameters={'backend': dist_backend, 'epochs': 2})
training_input = pytorch.sagemaker_session.upload_data(path=training_dir,
6 changes: 3 additions & 3 deletions test/integration/sagemaker/test_smdataparallel.py
@@ -36,10 +36,10 @@ def test_smdataparallel_training(
estimator = PyTorch(
entry_point=os.path.join(resources_path, "mnist", "smdataparallel_mnist.py"),
role="SageMakerRole",
-train_instance_type=train_instance_type,
+instance_type=train_instance_type,
sagemaker_session=sagemaker_session,
-train_instance_count=instances,
-image_name=image_uri,
+instance_count=instances,
+image_uri=image_uri,
output_path=output_path,
framework_version=framework_version,
hyperparameters={