In [1]:
import os
import boto3
import time
import numpy as np
import sagemaker
import torchvision
from sagemaker.pytorch import PyTorch

sess = boto3.Session()
sm   = sess.client('sagemaker')
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

bucket_name    = sagemaker_session.default_bucket()
jobs_folder    = 'jobs'
dataset_folder = 'datasets'

In [2]:
# Parameters
backend = 'gloo' #'smddp'
instance_type = 'local' #'ml.p4d.24xlarge', # 'ml.p3.16xlarge', 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge'
instance_count = 2

In [3]:
cifar10_dataset = torchvision.datasets.CIFAR10('cifar10-dataset',
                                               train=True,
                                               download=True)

Files already downloaded and verified


In [4]:
datasets = sagemaker_session.upload_data(path='cifar10-dataset',
                                         key_prefix=f'{dataset_folder}/cifar10-dataset')

In [5]:
job_name   = f'pytorch-smddp-dist-{time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime())}'
output_path = f's3://{bucket_name}/{jobs_folder}'

hyperparameters = {'epochs'         : 15,
                   'seed'           : 32,
                   'batch_size'     : 32,
                   'scheduler'      : None,
                   'optimizer'      : 'sgd',
                   'momentum'       : 0.9,
                   'lr'             : 0.001,
                   'criterion'      : 'cross_entropy',
                   'metric'         : None,
                   'model_dir'      : 'model_output',
                   'custom_function': True,
                   'backend'        : backend
                   }

In [6]:
distribution = { "smdistributed": {
                                    "dataparallel": { "enabled": True }
                                  }
                }

In [7]:
estimator = PyTorch(entry_point          = 'main.py',
                    source_dir           = '',
                    output_path          = output_path + '/',
                    code_location        = output_path,
                    role                 = role,
                    instance_count       = instance_count,
                    instance_type        = instance_type,
                    framework_version    = '1.11.0',
                    py_version           = 'py38',
                    distribution         = distribution,
                    hyperparameters      = hyperparameters)

ValueError: Provided instance_type local is not supported by smdataparallel.
Please specify one of the supported instance types:('ml.p3.16xlarge', 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p4de.24xlarge', 'local_gpu')


In [None]:
estimator.fit({'train': datasets},
              job_name=job_name,
              wait=True)