In [1]:
import os
import subprocess
from datetime import datetime

from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow
from sagemaker.inputs import FileSystemInput

In [2]:
# ---------------------------------------------------------------------------
# Job configuration: everything a reader might want to tune lives in this cell.
# ---------------------------------------------------------------------------

# Export the default region through the environment before asking SageMaker
# for the execution role, so the call below runs with the region already set.
region = "us-east-1"
os.environ['AWS_DEFAULT_REGION'] = region
role = get_execution_role()

# Pieces of the training job name. The timestamp carries the date as well as
# the time of day: a time-only stamp can repeat on a later day and produce a
# duplicate job name, which SageMaker rejects.
user_id = "muziy"
time_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# Cluster shape.
instance_type = "ml.p4d.24xlarge"
instance_count = 8

# Distributed-training backend. Supported values: 'smddp' and 'horovod'.
dist = 'smddp'
if dist == 'smddp':
    # SageMaker distributed data parallel library.
    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
elif dist == 'horovod':
    # MPI launch; the extra options are joined into one space-separated string.
    custom_mpi_options = ['-x FI_EFA_USE_DEVICE_RDMA=1']
    distribution = {"mpi": {"enabled": True, "custom_mpi_options": " ".join(custom_mpi_options)}}
else:
    # Fail here rather than with a NameError on `distribution` in a later cell.
    raise ValueError(
        f"Unsupported distribution backend {dist!r}; expected 'smddp' or 'horovod'"
    )

job_name = f'{user_id}-resnet101-p4d-{instance_count}-{dist}-{time_str}'

# Training code: the script `entry_point` inside `source_dir`.
source_dir = "."
entry_point = "resnet_101_imagenet.py"

# Custom training container image (ECR URI).
docker_image = "570106654206.dkr.ecr.us-east-1.amazonaws.com/muziy-keras:tf251-maskrcnn-indu-train-begin"

In [3]:
# Build the TensorFlow estimator from the configuration defined in the
# previous cell. Arguments are grouped by concern; all are passed by keyword,
# so their order has no effect on the resulting estimator.
estimator = TensorFlow(
    # What to run: custom container image plus the training script.
    image_uri=docker_image,
    source_dir=source_dir,
    entry_point=entry_point,
    # Who runs it and on what hardware.
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    distribution=distribution,
    # Network placement of the training instances.
    # NOTE(review): presumably required so the instances can reach the FSx
    # file system used as the training input -- confirm.
    subnets=['subnet-0fe4a69647f7b95f3'],
    security_group_ids=['sg-07306d8b4a69268b1'],
    # Profiler and debugger hook are both switched off for this job.
    disable_profiler=True,
    debugger_hook_config=False,
)

In [4]:
# Training data is read from an FSx for Lustre file system, mounted read-only.
file_system_access_mode = 'ro'
file_system_directory_path = '/fsx/dataset/imagenet/tfrecords'

train_fs = FileSystemInput(
    file_system_type='FSxLustre',
    file_system_id='fs-0a1be54e499ac3d67',
    directory_path=file_system_directory_path,
    file_system_access_mode=file_system_access_mode,
)

# Single input channel, named "train".
data = dict(train=train_fs)

# Launch the training job under the name built in the configuration cell.
estimator.fit(job_name=job_name, inputs=data)

2021-10-19 20:39:22 Starting - Starting the training job...
2021-10-19 20:39:25 Starting - Launching requested ML instances.....................
2021-10-19 20:43:21 Starting - Preparing the instances for training....................................
2021-10-19 20:49:07 Downloading - Downloading input data
2021-10-19 20:49:07 Training - Downloading the training image.....................
2021-10-19 20:52:54 Training - Training image download completed. Training in progress.[36m2021-10-19 20:52:47.502928: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[36m2021-10-19 20:52:47.510074: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[36m2021-10-19 20:52:47.695733: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0[0m
[36m20