In [None]:
# Download the COCO dataset and weights, convert the data to TFRecords, and upload them to S3.
# Guide for preparing the data and weights: https://github.com/HerringForks/DeepLearningExamples/tree/master/TensorFlow2/Segmentation/MaskRCNN#quick-start-guide

In [None]:
# You can create an FSx drive for your input data, which can save 3 mins of data download time at training start-up and yield similar throughput.
# 1. Download and prepare your training dataset on S3.
# 2. Follow the steps listed here to create an FSx file system linked with the S3 bucket that holds your training data - https://docs.aws.amazon.com/fsx/latest/LustreGuide/create-fs-linked-data-repo.html. Make sure to add an endpoint to your VPC allowing S3 access.
# 3. Follow the steps listed here to configure your SageMaker training job to use FSx https://aws.amazon.com/blogs/machine-learning/speed-up-training-on-amazon-sagemaker-using-amazon-efs-or-amazon-fsx-for-lustre-file-systems/
#
# Important Caveats
# 1. You need to use the same subnet, VPC, and security group used with FSx when launching the SageMaker notebook instance. The same configuration will be used by your SageMaker training job.
# 2. Make sure you set appropriate inbound/outbound rules in the security group. Specifically, opening up these ports is necessary for SageMaker to access the FSx file system in the training job: https://docs.aws.amazon.com/fsx/latest/LustreGuide/limit-access-security-groups.html
# 3. Make sure the SageMaker IAM role used to launch this SageMaker training job has access to Amazon FSx.

In [None]:
import os
import subprocess
from datetime import datetime

from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow
from sagemaker.inputs import FileSystemInput

In [None]:
# --- Job configuration -------------------------------------------------------
# Set the default region and export it so AWS clients created later pick it up.
region = "us-west-2"
os.environ['AWS_DEFAULT_REGION'] = region
role = get_execution_role()

user_id = "johndoe"
# Use a full date + time stamp: SageMaker training job names must be unique
# within an account and region, and a time-of-day-only stamp would repeat
# (and make `fit` fail) if the notebook is re-run at the same clock time on
# another day. Only digits and hyphens are used so the name stays valid.
time_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

instance_type = "ml.p4d.24xlarge"
instance_count = 1

# Launch single node training with total batch size of 64.
# Parameters are only for demo purposes; users may need to tune them to fit
# their workload.
config_file = "configs/mrcnn_bs64.yaml"
hyperparameters = {"config": config_file}

# Enable SMDDP (SageMaker distributed data parallel)
distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

# NOTE: keep user_id short -- SageMaker limits training job names to 63 characters.
job_name = f'{user_id}-maskrcnn-keras-p4d-{instance_count}-{time_str}'

source_dir = "."
entry_point = "train_keras.py"

# The shared SMDDP Keras docker image.
# NOTE(review): this URI is pinned to the us-west-2 ECR registry, matching
# `region` above -- confirm an image is available before changing the region.
docker_image = "570106654206.dkr.ecr.us-west-2.amazonaws.com/smddp-keras-preview:tf251-maskrcnn"

In [None]:
# FSx-related placeholders: every "<...>" value below is a template slot that
# has to be filled in with your own resource identifiers before running.
subnets = ["<subnet-id-of-fsx>"]
security_group_ids = ["<security-group-id-of-fsx>"]
file_system_id = "<fsx-id>"
file_system_directory_path = "/fsx/<path-to-dataset-under-imported-s3-bucket>"

# Build the TensorFlow estimator from the settings defined in the earlier cells.
estimator = TensorFlow(
    # Training code and container image
    source_dir=source_dir,
    entry_point=entry_point,
    image_uri=docker_image,
    role=role,
    hyperparameters=hyperparameters,
    # Cluster shape and distributed-training settings
    instance_type=instance_type,
    instance_count=instance_count,
    distribution=distribution,
    # Networking: subnet and security group associated with the FSx file system
    subnets=subnets,
    security_group_ids=security_group_ids,
    # Profiler and debugger hook are switched off for this job
    disable_profiler=True,
    debugger_hook_config=False,
)

In [None]:
# Describe the FSx for Lustre directory that holds the dataset; it is
# requested in read-only ('ro') access mode.
train_fs = FileSystemInput(
    file_system_type="FSxLustre",
    file_system_id=file_system_id,
    directory_path=file_system_directory_path,
    file_system_access_mode="ro",
)

# Pass the file system input under the "train" key and launch the training
# job under the name built in the configuration cell.
data = {"train": train_fs}
estimator.fit(job_name=job_name, inputs=data)