In [1]:
import os
import posixpath
import subprocess
from contextlib import redirect_stdout
from datetime import datetime

import boto3
import yaml
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.tensorflow import TensorFlow

from configs import cfg

In [2]:
# S3 bucket/prefix and local data directory for the tutorial.
S3_BUCKET = 'sagemaker-smcv-tensorflow-tutorial'  # Don't include s3:// in your bucket name
S3_DIR = 'smcv-tensorflow-tutorial'
# For reasons detailed in Distributed Training, do not put this dir in your source dir
LOCAL_DATA_DIR = '/root/smcv-tensorflow-tutorial'
# S3 URIs always use "/" separators, so join them with posixpath rather than the
# host-dependent os.path (which would emit backslashes on Windows).
S3_SRC = posixpath.join("s3://", S3_BUCKET, S3_DIR)

In [3]:
# Resolve the AWS region from the default boto3 session and export it for later AWS calls.
boto_session = boto3.session.Session()
region = boto_session.region_name
if region is None:
    # os.environ only accepts strings; fail with an actionable message instead of
    # the bare TypeError that assigning None would raise.
    raise RuntimeError(
        "boto3 could not determine an AWS region; configure one (for example via "
        "AWS_DEFAULT_REGION or `aws configure`) before running this notebook."
    )
os.environ['AWS_DEFAULT_REGION'] = region  # This is the region we set at the beginning, when creating the S3 bucket for our data

# this is all for naming
user_id = "jbsnyder-smcv-tutorial"  # This is used for naming your training job, and organizing your results on S3. It can be anything you like.
# Take one timestamp and format it twice so date_str and time_str can never
# disagree (two separate now() calls could straddle midnight).
launch_time = datetime.now()
date_str = launch_time.strftime("%d-%m-%Y")
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")

In [4]:
# Training job settings: instance type, node count, IAM role and the code to run.
# instance_type can be any of 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge',
# 'ml.p3.8xlarge', 'ml.p3.2xlarge', 'ml.g4dn.12xlarge'
instance_type = "ml.p3.16xlarge"
nodes = 1

# Give SageMaker permission to launch nodes on our behalf.
role = get_execution_role()

# Source directory and training script handed to the estimator.
source_dir, entry_point = '.', 'train.py'

In [5]:
# Logging interval, RoI selection counts around NMS, RoI head and loss type,
# written onto the cfg object imported from configs.
cfg.LOG_INTERVAL = 50 # Number of training steps between log messages
cfg.MODEL.DENSE.PRE_NMS_TOP_N_TRAIN = 2000 # Top regions of interest to select before NMS
cfg.MODEL.DENSE.POST_NMS_TOP_N_TRAIN = 1000 # Top regions of interest to select after NMS
cfg.MODEL.RCNN.ROI_HEAD = "StandardRoIHead" # RoI head, selected by name
cfg.MODEL.FRCNN.LOSS_TYPE = "giou" # presumably the generalized-IoU box loss -- TODO confirm against the model code

In [6]:
# Batch sizes, optimizer/schedule and numeric-precision settings for the run.
cfg.INPUT.TRAIN_BATCH_SIZE = 32 # Training batch size
cfg.INPUT.EVAL_BATCH_SIZE = 32 # Evaluation batch size
cfg.SOLVER.SCHEDULE = "CosineDecay" # Learning rate schedule, either CosineDecay or PiecewiseConstantDecay
cfg.SOLVER.OPTIMIZER = "NovoGrad" # Optimizer type NovoGrad or Momentum
cfg.SOLVER.LR = .01 # Base learning rate after warmup
cfg.SOLVER.BETA_1 = 0.9 # NovoGrad beta 1 value
cfg.SOLVER.BETA_2 = 0.4 # NovoGrad beta 2 value
cfg.SOLVER.MAX_ITERS = 18000 # Total training steps
cfg.SOLVER.WARMUP_STEPS = 500 # warmup steps
cfg.SOLVER.XLA = True # Train with XLA
cfg.SOLVER.FP16 = True # Train with mixed precision enabled
cfg.SOLVER.TF32 = False # Train with TF32 data type enabled, only available on Ampere GPUs and TF 2.4 and up

In [7]:
# Hooks to enable for the run, referenced by name.
cfg.HOOKS = [
    "CheckpointHook",
    "IterTimerHook",
    "TextLoggerHook",
    "CocoEvaluator",
]

In [8]:
# Request smdistributed data parallel only for multi-node jobs on the listed
# instance types; every other combination runs under MPI.
distribution = (
    {"smdistributed": {"dataparallel": {"enabled": True}}}
    if nodes > 1 and instance_type in ('ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge')
    else {"mpi": {"enabled": True}}
)

In [9]:
# Name the job after the user id and launch timestamp, then derive per-job S3
# prefixes for the training output and the uploaded source code.
job_name = f'{user_id}-{time_str}'
# posixpath keeps "/" separators in the S3 URIs regardless of the host OS.
output_path = posixpath.join(S3_SRC, "sagemaker-output", date_str, job_name)
code_location = posixpath.join(S3_SRC, "sagemaker-code", date_str, job_name)

In [10]:
# Input channels for the training job: channel name -> S3 prefix.
# posixpath keeps "/" separators in the S3 URIs regardless of the host OS.
channels = {
    'val2017': posixpath.join(S3_SRC, 'data', 'coco', 'tfrecord', 'val2017'),
    'annotations': posixpath.join(S3_SRC, 'data', 'coco', 'annotations'),
    'weights': posixpath.join(S3_SRC, 'data', 'weights', 'resnet'),
}

In [11]:
# Paths the training script will read from and write to.
# The training file pattern points straight at S3; validation files, weights and
# annotations are read from the channel directories under CHANNELS_DIR on the node.
# All of these are "/"-separated regardless of where this notebook runs, so they
# are joined with posixpath instead of the host-dependent os.path.
CHANNELS_DIR = '/opt/ml/input/data/'  # on node
cfg.PATHS.TRAIN_FILE_PATTERN = posixpath.join(S3_SRC, 'data', 'coco', 'tfrecord', 'train2017', 'train*')
cfg.PATHS.VAL_FILE_PATTERN = posixpath.join(CHANNELS_DIR, "val2017", "val*")
cfg.PATHS.WEIGHTS = posixpath.join(CHANNELS_DIR, "weights", "model.ckpt-112603")
cfg.PATHS.VAL_ANNOTATIONS = posixpath.join(CHANNELS_DIR, "annotations", "instances_val2017.json")
cfg.PATHS.OUT_DIR = '/opt/ml/checkpoints'

In [12]:
# Dump the assembled cfg to a file; its path is later handed to the job as a hyperparameter.
dist_config_file = "configs/dist-training-config.yaml"
with open(dist_config_file, 'w') as outfile:
    # Write directly to the file handle rather than temporarily swapping the
    # process-wide sys.stdout; print() appends a trailing newline after the dump.
    print(cfg.dump(), file=outfile)

In [13]:
# The only hyperparameter handed to the job is the path of the dumped config file.
hyperparameters = dict(config=dist_config_file)

In [14]:
# Build a SageMaker session whose service and runtime clients talk to explicit
# endpoint URLs. Session is imported in the import cell at the top of the notebook.
# NOTE(review): both URLs are hard-coded beta endpoints in us-west-2 -- confirm they
# are reachable from where this notebook runs and match the configured region.
beta_endpoint = "https://api.sagemaker.beta.us-west-2.ml-platform.aws.a2z.com"
beta_runtime_endpoint = "https://maeveruntime.beta.us-west-2.ml-platform.aws.a2z.com/"
sage = boto3.client('sagemaker', endpoint_url=beta_endpoint)
sage_runtime = boto3.client('sagemaker-runtime', endpoint_url=beta_runtime_endpoint)
session = Session(sagemaker_client=sage, sagemaker_runtime_client=sage_runtime)

In [15]:
# Configure the TensorFlow estimator for the training job.
estimator = TensorFlow(
    # What to run
    entry_point=entry_point,
    source_dir=source_dir,
    code_location=code_location,
    hyperparameters=hyperparameters,
    # Framework container
    framework_version='2.4.1',
    py_version='py37',
    # Compute
    role=role,
    instance_type=instance_type,
    instance_count=nodes,
    volume_size=500,
    distribution=distribution,
    # Outputs
    output_path=output_path,
    model_dir=output_path,
    # checkpoint_s3_uri=output_path,  # left disabled
    # NOTE(review): cfg.PATHS.OUT_DIR points at /opt/ml/checkpoints; confirm whether
    # checkpoint_s3_uri must be set for those checkpoints to be persisted to S3.
    # Profiler and debugger hooks are switched off
    disable_profiler=True,
    debugger_hook_config=False,
    # Session built against the custom endpoints
    sagemaker_session=session,
)

In [16]:
# Show the job name that the fit call below will use.
job_name

'jbsnyder-smcv-tutorial-29-10-2021-21-01-52'

In [None]:
# Launch the training job on the configured channels; wait=True keeps the cell
# attached to the job while it runs.
estimator.fit(inputs=channels, job_name=job_name, wait=True)

2021-10-29 21:02:00 Starting - Starting the training job......
2021-10-29 21:02:58 Starting - Preparing the instances for training..................
2021-10-29 21:05:45 Downloading - Downloading input data......
2021-10-29 21:06:34 Training - Downloading the training image........
2021-10-29 21:08:03 Training - Training image download completed. Training in progress.[34m[1,0]<stdout>:[MaskRCNN] INFO    : Using Dataset Sharding[0m
[34m[1,0]<stdout>:[MaskRCNN] INFO    : Using Horovod For Distributed Training[0m
[34m[1,0]<stdout>:[MaskRCNN] INFO    : Using Evaluation Dataset Sharding[0m
[34m[1,0]<stdout>:Start running, work_dir: /opt/ml/checkpoints[0m
[34m[1,0]<stdout>:max: 5 epochs[0m
[34m[1,0]<stdout>:Loading checkpoint from /opt/ml/input/data/weights/model.ckpt-112603...[0m
[34m[1,0]<stdout>:Start time: 2021-10-29 21:09:10.025606[0m
[34m[1,0]<stdout>:Starting epoch: 1 of 5[0m
[34m[1,0]<stdout>:[MaskRCNN] INFO    : Broadcasting model[0m
[34m[1,0]<stdout>:[MaskRCNN] IN

'jbsnyder-smcv-tutorial-29-10-2021-20-37-04'