In [60]:
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow
from datetime import datetime
import os
import pprint

### Hyperparameters that override default config 

In [68]:
# Experiment presets for the conv / batch-norm / `ls` sweep, keyed by experiment id.
# As the entries show, ids follow
#   <schedule>-<num_workers_per_host>-<instance_count>-<batch_size_per_device>-<conv|fc>-<bn|nobn>-<ls|nols>
# NOTE(review): `ls` is presumably a label-smoothing factor -- confirm against the training script.
# NOTE(review): a later cell in this notebook rebinds `hyperparam_collection`, so these
# presets are only reachable if that cell is skipped.

# Values shared by most experiments in this sweep.
_sweep_defaults = {
    'schedule': '1x',
    'fp16': True,
    'base_learning_rate': 15e-3,
    'warmup_steps': 500,
    'warmup_init_lr_scale': 3.0,
    'batch_size_per_device': 4,
    'instance_type': 'ml.p3.16xlarge',
    'instance_count': 1,
    'num_workers_per_host': 8,
    'use_conv': True,
    'use_rcnn_bn': False,
    'ls': 0.0,
}

# Only the fields in which an experiment deviates from the defaults above.
_sweep_overrides = {
    '1x-8-1-2-conv-nobn-nols': {
        'batch_size_per_device': 2,
    },
    '1x-8-1-4-conv-nobn-nols': {},
    '1x-8-1-4-conv-nobn-ls': {
        'ls': 0.05,
    },
    '1x-8-1-4-fc-bn-ls': {
        'use_conv': False,
        'use_rcnn_bn': True,
        'ls': 0.1,
    },
    '1x-8-2-4-conv-nobn-nols': {
        'warmup_steps': 1800,
        'instance_type': 'ml.p3dn.24xlarge',
        'instance_count': 2,
    },
    '1x-8-4-4-conv-nobn-nols': {
        'base_learning_rate': 5e-3,
        'warmup_steps': 900,
        'instance_type': 'ml.p3dn.24xlarge',
        'instance_count': 4,
    },
}

# Each experiment gets its own independent dict: defaults first, overrides on top.
hyperparam_collection = {
    experiment: dict(_sweep_defaults, **delta)
    for experiment, delta in _sweep_overrides.items()
}

In [100]:
# Experiment presets for the scaling sweep, keyed by experiment id.
# As the entries show, ids follow
#   <schedule>-<num_workers_per_host>-<instance_count>-<batch_size_per_device>
# NOTE(review): this assignment replaces any `hyperparam_collection` bound by an
# earlier cell of the notebook.

# Values shared by most experiments in this sweep.
_scaling_defaults = {
    'schedule': '1x',
    'fp16': True,
    'base_learning_rate': 1e-2,
    'warmup_steps': 500,
    'warmup_init_lr_scale': 3.0,
    'instance_type': 'ml.p3dn.24xlarge',
    'instance_count': 1,
    'batch_size_per_device': 4,
    'num_workers_per_host': 8,
}

# Only the fields in which an experiment deviates from the defaults above.
_scaling_overrides = {
    '1x-8-1-2': {
        'instance_type': 'ml.p3.16xlarge',
        'batch_size_per_device': 2,
    },
    '1x-8-1-4': {},
    '1x-8-4-4': {
        'warmup_steps': 1000,
        'instance_count': 4,
    },
    '1x-8-8-4': {
        'base_learning_rate': 1e-3,
        'warmup_init_lr_scale': 6.0,
        'instance_count': 8,
    },
}

# Each experiment gets its own independent dict: defaults first, overrides on top.
hyperparam_collection = {
    experiment: dict(_scaling_defaults, **delta)
    for experiment, delta in _scaling_overrides.items()
}

### Distributed training configuration

In [101]:
# Pick one preset from `hyperparam_collection`; change `hyperparam_key` to run a
# different experiment (e.g. when sweeping configurations for HPO).
hyperparam_key = '1x-8-8-4'
hyperparameters = hyperparam_collection[hyperparam_key]

# Pull out the launch-related fields of the preset for the cluster setup below.
hvd_processes_per_host, hvd_instance_type, hvd_instance_count = (
    hyperparameters[field]
    for field in ('num_workers_per_host', 'instance_type', 'instance_count')
)

### SageMaker configuration

In [102]:
# --- Identity and naming ---------------------------------------------------
role = get_execution_role()
now = datetime.now()
# Day-month-year-hour-minute stamp; reused in the S3 prefix and in the job name.
time_str = now.strftime("%d-%m-%Y-%H-%M")
user_id = 'mzanur'
model_name = 'retinanet'
experiment_id = hyperparam_key

# --- Code and container ----------------------------------------------------
image = '578276202366.dkr.ecr.us-east-1.amazonaws.com/mzanur-ecr:{}'.format(model_name)
source_dir = "/home/ec2-user/SageMaker/deep-learning-models/models/vision/detection"
main_script = "tools/train_sagemaker.py"
# Instance type with the dots stripped, e.g. 'ml.p3dn.24xlarge' -> 'mlp3dn24xlarge',
# so it can be embedded in the job name.
ec2_instance = hvd_instance_type.replace(".", "")

# --- Distributed (MPI) launch settings ---------------------------------------
distributions = {
    "mpi": {
        "enabled": True,
        "processes_per_host": hvd_processes_per_host,
        "custom_mpi_options": "-x OMPI_MCA_btl_vader_single_copy_mechanism=none -x TF_CUDNN_USE_AUTOTUNE=0"
    }
}

# --- Input data channels -------------------------------------------------------
channels = {
    'coco': 's3://{}-sagemaker/awsdet/data/coco/'.format(user_id),
    'weights': 's3://{}-sagemaker/awsdet/data/weights/'.format(user_id)
}

# --- Output locations ----------------------------------------------------------
# S3 URIs always use '/', so they are assembled with explicit formatting rather
# than os.path.join, which joins with the host operating system's path separator.
s3_path = 's3://{}-sagemaker/{}/{}'.format(user_id, model_name, time_str)

model_config_path = 'configs/{}/sagemaker_default_model_config.py'.format(model_name)

job_name = '{}-{}-{}-{}'.format(user_id, experiment_id, ec2_instance, time_str)

output_path = '{}/output/{}'.format(s3_path, job_name)

# --- Networking ----------------------------------------------------------------
# NOTE(review): `subnets` and `security_group_ids` are not referenced by any other
# code visible in this notebook -- confirm whether they were meant to be passed to
# the estimator.
subnets = ['subnet-58b35b04']

security_group_ids = ['sg-02a21bf8f59e59172']

# --- Hyperparameters handed to the training script -------------------------------
# Fixed job settings first; the selected preset is merged on top, so a preset key
# with the same name as one of these would take precedence.
configuration = {
    'configuration': model_config_path,
    's3_path': s3_path,
    'instance_name': job_name,
    'model_name': model_name,
}
configuration.update(hyperparameters)

In [103]:
# Show the final hyperparameter dict (keys sorted) as a sanity check before launching.
print(pprint.pformat(configuration))

{'base_learning_rate': 0.001,
 'batch_size_per_device': 4,
 'configuration': 'configs/retinanet/sagemaker_default_model_config.py',
 'fp16': True,
 'instance_count': 8,
 'instance_name': 'mzanur-1x-8-8-4-mlp3dn24xlarge-02-06-2020-22-51',
 'instance_type': 'ml.p3dn.24xlarge',
 'model_name': 'retinanet',
 'num_workers_per_host': 8,
 's3_path': 's3://mzanur-sagemaker/retinanet/02-06-2020-22-51',
 'schedule': '1x',
 'warmup_init_lr_scale': 6.0,
 'warmup_steps': 500}


In [104]:
# Assemble the estimator arguments from the values prepared in the cells above.
# NOTE(review): `image_name`, `train_instance_count`, `train_instance_type`,
# `train_volume_size` and `distributions` look like SageMaker Python SDK v1
# parameter names -- confirm they match the SDK version installed in this kernel.
estimator_kwargs = dict(
    # Training code and container
    entry_point=main_script,
    source_dir=source_dir,
    image_name=image,
    role=role,
    framework_version="2.1.0",
    py_version="py3",
    # Cluster shape and MPI launch settings
    train_instance_count=hvd_instance_count,
    train_instance_type=hvd_instance_type,
    distributions=distributions,
    # Storage
    output_path=output_path,
    train_volume_size=200,
    # Values forwarded to the training script
    hyperparameters=configuration,
)

estimator = TensorFlow(**estimator_kwargs)

In [105]:
# Start the training job under the explicit job name; wait=False is passed so the
# call is not asked to wait on the job.
estimator.fit(
    channels,
    job_name=job_name,
    wait=False,
)