From 28d28e1b56aaf2dbd120ca1c06cefc5b48d2d75e Mon Sep 17 00:00:00 2001
From: Erick Benitez-Ramos
Date: Wed, 13 Aug 2025 20:36:55 +0000
Subject: [PATCH 1/5] Breaking: Update sagemaker-training version >=5.0.0

---
 VERSION  | 2 +-
 setup.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/VERSION b/VERSION
index 063f85e..4a36342 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.9.1.dev0
+3.0.0
diff --git a/setup.py b/setup.py
index 84bbe34..bb65e7e 100644
--- a/setup.py
+++ b/setup.py
@@ -48,12 +48,11 @@ def read(fname):
         "Natural Language :: English",
         "License :: OSI Approved :: Apache Software License",
         "Programming Language :: Python",
-        'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
     ],
 
-    install_requires=['retrying', 'sagemaker-training>=4.3.0,<=4.8.3', 'six>=1.12.0'],
+    install_requires=['retrying', 'sagemaker-training>=5.0.0,<6.0.0', 'six>=1.12.0'],
     extras_require={
         'test': test_dependencies
     },

From eba23ba5ee40b061f94987d3141cfb017abf079c Mon Sep 17 00:00:00 2001
From: Erick Benitez-Ramos
Date: Thu, 14 Aug 2025 00:18:50 +0000
Subject: [PATCH 2/5] update test dependency

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index bb65e7e..1e4a9c7 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ def read(fname):
 
 
 test_dependencies = ['boto3', 'coverage==6.5.0', 'flake8', 'future', 'mock', 'pytest', 'pytest-cov',
-                     'pytest-xdist', 'sagemaker[local]<2', 'torch', 'torchvision', 'tox']
+                     'pytest-xdist', 'sagemaker[local]', 'torch', 'torchvision', 'tox']
 
 setup(
     name='sagemaker_pytorch_training',

From abdd2d1c3a8d32f539c124b1d6980cc3e2f80bac Mon Sep 17 00:00:00 2001
From: Erick Benitez-Ramos
Date: Thu, 14 Aug 2025 17:07:50 +0000
Subject: [PATCH 3/5] Add latest version and python version to README

---
 README.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.rst b/README.rst
index 69f8c03..c996e35 100644
--- a/README.rst
+++ b/README.rst
@@ -1,3 +1,4 @@
+[![Latest Version](https://img.shields.io/pypi/v/sagemaker-pytorch-training.svg)](https://pypi.python.org/pypi/sagemaker-pytorch-training) [![Supported Python Versions](https://img.shields.io/pypi/pyversions/sagemaker-pytorch-training.svg)](https://pypi.python.org/pypi/sagemaker-pytorch-training)
 ==================================
 SageMaker PyTorch Training Toolkit
 ==================================

From 45fa1672e71c1f6ea2a12dec2f63849d45dc967b Mon Sep 17 00:00:00 2001
From: Erick Benitez-Ramos
Date: Thu, 14 Aug 2025 17:12:08 +0000
Subject: [PATCH 4/5] fix GitHub README shield badges

---
 README.rst | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index c996e35..54a8b77 100644
--- a/README.rst
+++ b/README.rst
@@ -1,9 +1,16 @@
-[![Latest Version](https://img.shields.io/pypi/v/sagemaker-pytorch-training.svg)](https://pypi.python.org/pypi/sagemaker-pytorch-training) [![Supported Python Versions](https://img.shields.io/pypi/pyversions/sagemaker-pytorch-training.svg)](https://pypi.python.org/pypi/sagemaker-pytorch-training)
-
 ==================================
 SageMaker PyTorch Training Toolkit
 ==================================
 
+.. image:: https://img.shields.io/pypi/v/sagemaker-pytorch-training.svg
+   :target: https://pypi.python.org/pypi/sagemaker-pytorch-training
+   :alt: Latest Version
+
+.. image:: https://img.shields.io/pypi/pyversions/sagemaker-pytorch-training.svg
+   :target: https://pypi.python.org/pypi/sagemaker-pytorch-training
+   :alt: Supported Python Versions
+
+
 SageMaker PyTorch Training Toolkit is an open-source library for using PyTorch to
 train models on Amazon SageMaker. This toolkit depends and extends the base
 `SageMaker Training Toolkit <https://github.com/aws/sagemaker-training-toolkit>`__ with PyTorch specific support.

From 7f44b2666c42d0b1a19df375ab3104e3a7978e9f Mon Sep 17 00:00:00 2001
From: Erick Benitez-Ramos
Date: Mon, 25 Aug 2025 18:35:21 +0000
Subject: [PATCH 5/5] fix tests

---
 .../local/test_distributed_training.py       | 24 +++++++++----------
 test/integration/local/test_horovod.py       | 12 +++++-----
 test/integration/local/test_requirements.py  |  6 ++---
 .../local/test_single_machine_training.py    |  6 ++---
 .../sagemaker/test_distributed_operations.py | 17 ++++++++-----
 test/integration/sagemaker/test_horovod.py   | 12 +++++-----
 test/integration/sagemaker/test_mnist.py     | 11 ++++++---
 .../sagemaker/test_smdataparallel.py         |  6 ++---
 8 files changed, 52 insertions(+), 42 deletions(-)

diff --git a/test/integration/local/test_distributed_training.py b/test/integration/local/test_distributed_training.py
index 553110a..bd09a2b 100644
--- a/test/integration/local/test_distributed_training.py
+++ b/test/integration/local/test_distributed_training.py
@@ -35,9 +35,9 @@ def fixture_dist_gpu_backend(request):
 def test_dist_operations_path_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir):
     estimator = PyTorch(entry_point=dist_operations_path,
                         role=ROLE,
-                        image_name=image_uri,
-                        train_instance_count=2,
-                        train_instance_type='local',
+                        image_uri=image_uri,
+                        instance_count=2,
+                        instance_type='local',
                         sagemaker_session=sagemaker_local_session,
                         hyperparameters={'backend': dist_cpu_backend},
                         output_path='file://{}'.format(tmpdir))
@@ -49,9 +49,9 @@
 def test_dist_operations_path_gpu_nccl(image_uri, sagemaker_local_session, tmpdir):
     estimator = PyTorch(entry_point=dist_operations_path,
                         role=ROLE,
-                        image_name=image_uri,
-                        train_instance_count=1,
-                        train_instance_type='local_gpu',
+                        image_uri=image_uri,
+                        instance_count=1,
+                        instance_type='local_gpu',
                         sagemaker_session=sagemaker_local_session,
                         hyperparameters={'backend': 'nccl'},
                         output_path='file://{}'.format(tmpdir))
@@ -63,9 +63,9 @@
 def test_cpu_nccl(image_uri, sagemaker_local_session, tmpdir):
     estimator = PyTorch(entry_point=mnist_script,
                         role=ROLE,
-                        image_name=image_uri,
-                        train_instance_count=2,
-                        train_instance_type='local',
+                        image_uri=image_uri,
+                        instance_count=2,
+                        instance_type='local',
                         sagemaker_session=sagemaker_local_session,
                         hyperparameters={'backend': 'nccl'},
                         output_path='file://{}'.format(tmpdir))
@@ -81,9 +81,9 @@
 def test_mnist_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir):
     estimator = PyTorch(entry_point=mnist_script,
                         role=ROLE,
-                        image_name=image_uri,
-                        train_instance_count=2,
-                        train_instance_type='local',
+                        image_uri=image_uri,
+                        instance_count=2,
+                        instance_type='local',
                         sagemaker_session=sagemaker_local_session,
                         hyperparameters={'backend': dist_cpu_backend},
                         output_path='file://{}'.format(tmpdir))
diff --git a/test/integration/local/test_horovod.py b/test/integration/local/test_horovod.py
index 1e87135..63fe85e 100644
--- a/test/integration/local/test_horovod.py
+++ b/test/integration/local/test_horovod.py
@@ -31,10 +31,10 @@ def test_horovod_simple(sagemaker_local_session, image_uri, framework_version, t
     estimator = PyTorch(
         entry_point=os.path.join(resources_path, 'horovod', 'simple.py'),
         role='SageMakerRole',
-        train_instance_type="local_gpu",
+        instance_type="local_gpu",
         sagemaker_session=sagemaker_local_session,
-        train_instance_count=instances,
-        image_name=image_uri,
+        instance_count=instances,
+        image_uri=image_uri,
         output_path=output_path,
         framework_version=framework_version,
         hyperparameters={'sagemaker_mpi_enabled': True,
@@ -66,10 +66,10 @@ def test_horovod_training(sagemaker_local_session, image_uri, framework_version,
     estimator = PyTorch(
         entry_point=os.path.join(resources_path, 'horovod', 'train.py'),
         role='SageMakerRole',
-        train_instance_type="local_gpu",
+        instance_type="local_gpu",
         sagemaker_session=sagemaker_local_session,
-        train_instance_count=1,
-        image_name=image_uri,
+        instance_count=1,
+        image_uri=image_uri,
         framework_version=framework_version,
         hyperparameters={'sagemaker_mpi_enabled': True,
                          'sagemaker_mpi_num_of_processes_per_host': 2,
diff --git a/test/integration/local/test_requirements.py b/test/integration/local/test_requirements.py
index 02deac8..f470dac 100644
--- a/test/integration/local/test_requirements.py
+++ b/test/integration/local/test_requirements.py
@@ -23,9 +23,9 @@ def test_requirements_file(image_uri, instance_type, sagemaker_local_session, tm
         entry_point=requirements_script,
         source_dir=requirements_dir,
         role=ROLE,
-        image_name=image_uri,
-        train_instance_count=1,
-        train_instance_type=instance_type,
+        image_uri=image_uri,
+        instance_count=1,
+        instance_type=instance_type,
         sagemaker_session=sagemaker_local_session,
         output_path='file://{}'.format(tmpdir)
     )
diff --git a/test/integration/local/test_single_machine_training.py b/test/integration/local/test_single_machine_training.py
index 68f9985..6b9a771 100644
--- a/test/integration/local/test_single_machine_training.py
+++ b/test/integration/local/test_single_machine_training.py
@@ -23,9 +23,9 @@
 def test_mnist(image_uri, processor, instance_type, sagemaker_local_session, tmpdir):
     estimator = PyTorch(entry_point=mnist_script,
                         role=ROLE,
-                        image_name=image_uri,
-                        train_instance_count=1,
-                        train_instance_type=instance_type,
+                        image_uri=image_uri,
+                        instance_count=1,
+                        instance_type=instance_type,
                         sagemaker_session=sagemaker_local_session,
                         hyperparameters={'processor': processor},
                         output_path='file://{}'.format(tmpdir))
diff --git a/test/integration/sagemaker/test_distributed_operations.py b/test/integration/sagemaker/test_distributed_operations.py
index eff50af..526703a 100644
--- a/test/integration/sagemaker/test_distributed_operations.py
+++ b/test/integration/sagemaker/test_distributed_operations.py
@@ -18,6 +18,7 @@
 import pytest
 from sagemaker import utils
 from sagemaker.pytorch import PyTorch
+from sagemaker.local import LocalSession
 from six.moves.urllib.parse import urlparse
 
 from integration import data_dir, dist_operations_path, mnist_script, DEFAULT_TIMEOUT
@@ -31,6 +32,8 @@
 @pytest.mark.skip_test_in_region
 def test_dist_operations_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend):
     instance_type = instance_type or 'ml.c4.xlarge'
+    if "local" in instance_type:
+        sagemaker_session = LocalSession()
     _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_cpu_backend)
 
 
@@ -38,6 +41,8 @@
 @pytest.mark.deploy_test
 def test_dist_operations_gpu(sagemaker_session, instance_type, image_uri, dist_gpu_backend):
     instance_type = instance_type or 'ml.p2.xlarge'
+    if "local" in instance_type:
+        sagemaker_session = LocalSession()
     _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_gpu_backend)
 
 
@@ -52,9 +57,9 @@ def test_mnist_gpu(sagemaker_session, image_uri, dist_gpu_backend):
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(entry_point=mnist_script,
                           role='SageMakerRole',
-                          train_instance_count=2,
-                          image_name=image_uri,
-                          train_instance_type=MULTI_GPU_INSTANCE,
+                          instance_count=2,
+                          image_uri=image_uri,
+                          instance_type=MULTI_GPU_INSTANCE,
                           sagemaker_session=sagemaker_session,
                           debugger_hook_config=False,
                           hyperparameters={'backend': dist_gpu_backend})
@@ -70,10 +75,10 @@ def _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_back
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(entry_point=dist_operations_path,
                           role='SageMakerRole',
-                          train_instance_count=train_instance_count,
-                          train_instance_type=instance_type,
+                          instance_count=train_instance_count,
+                          instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
-                          image_name=image_uri,
+                          image_uri=image_uri,
                           debugger_hook_config=False,
                           hyperparameters={'backend': dist_backend})
 
diff --git a/test/integration/sagemaker/test_horovod.py b/test/integration/sagemaker/test_horovod.py
index 09276e4..9dbb247 100644
--- a/test/integration/sagemaker/test_horovod.py
+++ b/test/integration/sagemaker/test_horovod.py
@@ -44,10 +44,10 @@ def test_horovod_simple(
     estimator = PyTorch(
         entry_point=os.path.join(resources_path, "horovod", "simple.py"),
         role="SageMakerRole",
-        train_instance_type=train_instance_type,
+        instance_type=train_instance_type,
         sagemaker_session=sagemaker_session,
-        train_instance_count=instances,
-        image_name=image_uri,
+        instance_count=instances,
+        image_uri=image_uri,
         output_path=output_path,
         framework_version=framework_version,
         hyperparameters={
             "sagemaker_mpi_enabled": True,
@@ -100,10 +100,10 @@ def test_horovod_training(
     estimator = PyTorch(
         entry_point=os.path.join(resources_path, "horovod", "train.py"),
         role="SageMakerRole",
-        train_instance_type=train_instance_type,
+        instance_type=train_instance_type,
         sagemaker_session=sagemaker_session,
-        train_instance_count=instances,
-        image_name=image_uri,
+        instance_count=instances,
+        image_uri=image_uri,
         framework_version=framework_version,
         hyperparameters={
             "sagemaker_mpi_enabled": True,
diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py
index d0f4a04..23f4a03 100644
--- a/test/integration/sagemaker/test_mnist.py
+++ b/test/integration/sagemaker/test_mnist.py
@@ -15,6 +15,7 @@
 import pytest
 from sagemaker import utils
 from sagemaker.pytorch import PyTorch
+from sagemaker.local import LocalSession
 
 from integration import training_dir, mnist_script, DEFAULT_TIMEOUT
 from integration.sagemaker.timeout import timeout
@@ -23,12 +24,16 @@
 @pytest.mark.skip_gpu
 def test_mnist_distributed_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend):
     instance_type = instance_type or 'ml.c4.xlarge'
+    if "local" in instance_type:
+        sagemaker_session = LocalSession()
     _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_cpu_backend)
 
 
 @pytest.mark.skip_cpu
 def test_mnist_distributed_gpu(sagemaker_session, image_uri, instance_type, dist_gpu_backend):
     instance_type = instance_type or 'ml.p2.xlarge'
+    if "local" in instance_type:
+        sagemaker_session = LocalSession()
     _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_gpu_backend)
 
 
@@ -36,10 +41,10 @@ def _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_ba
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(entry_point=mnist_script,
                           role='SageMakerRole',
-                          train_instance_count=2,
-                          train_instance_type=instance_type,
+                          instance_count=2,
+                          instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
-                          image_name=image_uri,
+                          image_uri=image_uri,
                           debugger_hook_config=False,
                           hyperparameters={'backend': dist_backend, 'epochs': 2})
         training_input = pytorch.sagemaker_session.upload_data(path=training_dir,
diff --git a/test/integration/sagemaker/test_smdataparallel.py b/test/integration/sagemaker/test_smdataparallel.py
index 86184d5..657f051 100644
--- a/test/integration/sagemaker/test_smdataparallel.py
+++ b/test/integration/sagemaker/test_smdataparallel.py
@@ -36,10 +36,10 @@ def test_smdataparallel_training(
     estimator = PyTorch(
         entry_point=os.path.join(resources_path, "mnist", "smdataparallel_mnist.py"),
         role="SageMakerRole",
-        train_instance_type=train_instance_type,
+        instance_type=train_instance_type,
         sagemaker_session=sagemaker_session,
-        train_instance_count=instances,
-        image_name=image_uri,
+        instance_count=instances,
+        image_uri=image_uri,
         output_path=output_path,
         framework_version=framework_version,
         hyperparameters={