In [168]:
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow
from datetime import datetime
import os
import pprint

### Hyperparameters that override the default config

In [169]:
# Catalogue of named experiment configurations. The selected entry is merged
# into the hyperparameters sent to the training job and also supplies the
# cluster shape (instance type/count, workers per host) used to launch it.
#
# Key format, as reflected by the values stored in each entry:
#   <schedule>-<num_workers_per_host>-<instance_count>-<batch_size_per_device>-
#   <conv|fc>-<bn|nobn>-<ls|nols>
#
# Values common to most experiments are declared once; every entry below only
# spells out what makes it different from this single-node baseline.
# NOTE(review): 'ls' is presumably the label-smoothing factor -- confirm against
# the training script.
_base_hyperparams = {
    'schedule': '1x',
    'fp16': True,
    'base_learning_rate': 15e-3,
    'warmup_steps': 500,
    'warmup_init_lr_scale': 3.0,
    'batch_size_per_device': 4,
    'instance_type': 'ml.p3.16xlarge',
    'instance_count': 1,
    'num_workers_per_host': 8,
    'use_conv': True,
    'use_rcnn_bn': False,
    'ls': 0.0,
}

# dict(base, **overrides) builds a fresh dict per experiment, so editing one
# entry at runtime never leaks into another.
# The "0.3xx - 12 epoch config" notes are kept from the original notebook;
# presumably the score reached after 12 epochs -- TODO confirm the metric.
hyperparam_collection = {
    # 0.365 - 12 epoch config
    '1x-8-1-2-conv-nobn-nols': dict(
        _base_hyperparams,
        batch_size_per_device=2,
    ),
    # 0.364 - 12 epoch config
    '1x-8-1-4-conv-nobn-nols': dict(_base_hyperparams),
    '1x-8-1-4-conv-nobn-ls': dict(
        _base_hyperparams,
        ls=0.05,
    ),
    '1x-8-1-4-fc-bn-ls': dict(
        _base_hyperparams,
        use_conv=False,
        use_rcnn_bn=True,
        ls=0.1,
    ),
    # 0.364 - 12 epoch config
    '1x-8-2-4-conv-nobn-nols': dict(
        _base_hyperparams,
        warmup_steps=1800,
        instance_type='ml.p3dn.24xlarge',
        instance_count=2,
    ),
    # 0.357 - 12 epoch config
    '1x-8-4-4-conv-nobn-nols': dict(
        _base_hyperparams,
        base_learning_rate=5e-3,
        warmup_steps=900,
        instance_type='ml.p3dn.24xlarge',
        instance_count=4,
    ),
}

### Distributed training configuration

In [170]:
# Name of the entry in `hyperparam_collection` to train with. Switching
# experiments (e.g. for HPO) only requires changing this key. Other keys that
# were tried here:
#   '1x-8-2-4-conv-nobn-nols', '1x-8-1-2-conv-nobn-nols', '1x-8-1-4-conv-nobn-nols'
hyperparam_key = '1x-8-4-4-conv-nobn-nols'
hyperparameters = hyperparam_collection[hyperparam_key]

# NOTE(review): `model_cfg` is not referenced by the other visible cells; the
# job's hyperparameters use 'configs/sagemaker_default_model_config.py'
# instead -- confirm which path is intended.
model_cfg = "configs/sagemaker_default_config.py"

# The cluster shape (processes per host, instance type, instance count) is read
# from the selected hyperparameter set.
hvd_processes_per_host, hvd_instance_type, hvd_instance_count = (
    hyperparameters[field]
    for field in ('num_workers_per_host', 'instance_type', 'instance_count')
)

### SageMaker configuration

In [None]:
import subprocess

# --- Replace the three "<...>" placeholders below before running this cell ---
ecr_repo = "<the ECR repo created earlier>"
s3_bucket = "<mybucket-sagemaker>"  # name of your s3 bucket without s3://
user_id = "<user_name>"  # this can be anything you like, used for keeping track of your training jobs
algo_name = "frcnn-tutorial"

# Stop with a readable message if a placeholder was left untouched, instead of
# failing later with a confusing AWS error once the values have been baked into
# image URIs, S3 paths and job names.
for _setting, _value in (("ecr_repo", ecr_repo),
                         ("s3_bucket", s3_bucket),
                         ("user_id", user_id)):
    _text = str(_value)
    if not _text or (_text.startswith("<") and _text.endswith(">")):
        raise ValueError(
            "Please set `{}` in this cell before running it (current value: {!r})".format(
                _setting, _value))

# Ask the AWS CLI for the account id of the current credentials; it forms the
# registry host of the ECR image URI.
account_call = "aws sts get-caller-identity --query Account --output text"
ecr_account = subprocess.check_output(account_call, shell=True).decode().strip()

# Full image URI: the output of `echo ${FULLNAME}` from the previous section,
# something like 12345.dkr.ecr.us-east-1.amazonaws.com/name:algo
# NOTE(review): the region is hard-coded to us-east-1 -- adjust it if the image
# was pushed to a different region.
docker_image = "{0}.dkr.ecr.us-east-1.amazonaws.com/{1}:{2}".format(ecr_account,
                                                                    ecr_repo,
                                                                    algo_name)

In [1]:
# Assemble everything the training job needs: IAM role, naming, MPI launch
# settings, S3 input channels/output location and the hyperparameters.
role = get_execution_role()
now = datetime.now()
time_str = now.strftime("%d-%m-%Y-%H-%M")
# NOTE(review): experiment_id is assigned but not used when building job_name
# below -- confirm whether it should be part of the name.
experiment_id = hyperparam_key
image = docker_image
source_dir = "."
main_script = "tools/train_sagemaker.py"
# Instance type with the dots removed, e.g. 'ml.p3dn.24xlarge' -> 'mlp3dn24xlarge'.
ec2_instance = hvd_instance_type.replace(".", "")

# MPI launch settings: one process per configured worker on every host.
# Additional tuning flags that are currently disabled:
#   -x HOROVOD_NUM_NCCL_STREAMS=2 -x NCCL_TREE_THRESHOLD=4294967296 -x NCCL_MIN_NRINGS=13
#   -x HOROVOD_CYCLE_TIME=0.5 -x HOROVOD_FUSION_THRESHOLD=67108864
distributions = {
    "mpi": {
        "enabled": True,
        "processes_per_host": hvd_processes_per_host,
        "custom_mpi_options": "-x OMPI_MCA_btl_vader_single_copy_mechanism=none -x TF_CUDNN_USE_AUTOTUNE=0",
    }
}

# Input data channels, both located under the user's bucket.
channels = {
    'coco': 's3://{}/faster-rcnn/data/coco/'.format(s3_bucket),
    'weights': 's3://{}/faster-rcnn/data/weights/'.format(s3_bucket)
}

# S3 URIs always use '/', so they are joined explicitly rather than with
# os.path.join, which inserts the local OS separator ('\' on Windows).
s3_path = 's3://{}/faster-rcnn/{}'.format(s3_bucket, time_str)

# NOTE(review): SageMaker restricts training job names (length and allowed
# characters) -- keep user_id short and limited to letters, digits and hyphens;
# confirm against the CreateTrainingJob documentation.
job_name = '{}-{}-{}'.format(user_id, ec2_instance, time_str)

output_path = '/'.join([s3_path, "output", job_name])

# Hyperparameters for the job: fixed entries first, then every key of the
# selected experiment (a key present in both takes the experiment's value).
configuration = {
    'configuration': 'configs/sagemaker_default_model_config.py',
    's3_path': s3_path,
    'instance_name': job_name
}
configuration.update(hyperparameters)

NameError: name 'get_execution_role' is not defined

In [172]:
# Show the merged hyperparameters that will be passed to the training job.
pprint.pprint(configuration)

{'base_learning_rate': 0.005,
 'batch_size_per_device': 4,
 'configuration': 'configs/sagemaker_default_model_config.py',
 'fp16': True,
 'instance_count': 4,
 'instance_name': 'mzanur-1x-8-4-4-conv-nobn-nols-mlp3dn24xlarge-22-04-2020-06-29',
 'instance_type': 'ml.p3dn.24xlarge',
 'ls': 0.0,
 'num_workers_per_host': 8,
 's3_path': 's3://mzanur-sagemaker/faster-rcnn/22-04-2020-06-29',
 'schedule': '1x',
 'use_conv': True,
 'use_rcnn_bn': False,
 'warmup_init_lr_scale': 3.0,
 'warmup_steps': 900}


In [173]:
# Estimator describing the training job: custom container image, entry script,
# cluster shape, MPI launch settings, hyperparameters and output location.
# NOTE(review): the keyword names (image_name, train_instance_*,
# train_volume_size, distributions) look like the SageMaker Python SDK v1 API --
# confirm the installed SDK version before upgrading.
estimator = TensorFlow(
    # What to run
    entry_point=main_script,
    source_dir=source_dir,
    image_name=image,
    framework_version="2.1.0",
    py_version="py3",
    # Where and how to run it
    role=role,
    train_instance_type=hvd_instance_type,
    train_instance_count=hvd_instance_count,
    train_volume_size=200,  # presumably the training volume size in GB -- see SDK docs
    distributions=distributions,
    # Job inputs and outputs
    hyperparameters=configuration,
    output_path=output_path,
)

In [174]:
# Start the training job on the configured channels under the generated job
# name. wait=False is passed, so the call is not expected to block until the
# job finishes (see the SDK's `fit` documentation).
estimator.fit(inputs=channels, job_name=job_name, wait=False)