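Initialisation: strip the `examples.` package prefix from the entry scripts (`train.py` and `networks/agent.py`) so they can be used from `battlesnake_src`, and patch `train.py` so that it reinstalls the newest pre-release `mxnet-mkl` when it starts.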

In [None]:
!sed -i 's/examples.//g' battlesnake_src/train.py
!sed -i 's/examples.//g' battlesnake_src/networks/agent.py
!sed -i '1i import subprocess\nsubprocess.run(["pip",  "uninstall", "mxnet-mkl", "-y"])\nsubprocess.run(["pip",  "install", "mxnet-mkl", "--pre"])' battlesnake_src/train.py

In [None]:
import boto3
import sagemaker
import numpy as np

from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.mxnet.estimator import MXNet

## Initialise sagemaker
We need to define several parameters before running the training job.

Note: `local_mode` controls whether the training code runs on this notebook instance (`True`) or as a SageMaker training job (`False`).

In [None]:
sage_session = sagemaker.session.Session()
s3_bucket = sage_session.default_bucket()  
s3_output_path = 's3://{}/'.format(s3_bucket)
print("S3 bucket path: {}".format(s3_output_path))

# run in local_mode on this machine or as a SageMaker TrainingJob
local_mode = False

if local_mode:
    instance_type = 'local'
else:
    instance_type = "ml.c5.xlarge"
role = sagemaker.get_execution_role()
print("Using IAM role arn: {}".format(role))
# only run from SageMaker notebook instance
if local_mode:
    !/bin/bash ./setup.sh
cpu_or_gpu = 'gpu' if instance_type.startswith('ml.p') else 'cpu'

## Define the attributes of the training job
Use `job_name_prefix` to identify the sagemaker training job for this.

In [None]:
# Prefix shared by the training job name and the tuning job name.
job_name_prefix = "Battlesnake-job"

# Total number of training jobs the tuner launches, and how many of them
# may run at the same time.
max_jobs = 3
max_parallel_jobs = 3

## Define the metrics to evaluate your training job
The regex for this metric was defined based on what is printed in the training script `examples/train.py`

In [None]:
# Pattern for a signed decimal number with an optional exponent; the outer
# group captures the whole number.
float_pattern = '([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'

# One metric, 'timesteps', taken from log lines containing "Mean timesteps <number>".
metric_definitions = [
    dict(Name='timesteps', Regex='.*Mean timesteps ' + float_pattern),
]

# Define the hyperparameters of your job

In [None]:
# Search space explored by the tuner: each key is a hyperparameter name and
# each value is the inclusive range it is sampled from.
hyperparameter_ranges = dict(
    buffer_size=IntegerParameter(1000, 6000),
    update_every=IntegerParameter(10, 20),
    batch_size=IntegerParameter(16, 256),
    # learning-rate settings (lr_*)
    lr_start=ContinuousParameter(1e-5, 1e-3),
    lr_factor=ContinuousParameter(0.5, 1.0),
    lr_step=IntegerParameter(5000, 30000),
    # tau / gamma
    tau=ContinuousParameter(1e-4, 1e-3),
    gamma=ContinuousParameter(0.85, 0.99),
    # depth / depthS
    depth=IntegerParameter(10, 256),
    depthS=IntegerParameter(10, 256),
)

# Hyperparameters held constant for every training job; they are passed to
# the estimator as-is and are not part of the tuner's search space.
static_hyperparameters = dict(
    qnetwork_type="attention",
    seed=666,
    number_of_snakes=4,
    episodes=22000,
    print_score_steps=10,
    activation_type="softrelu",
    state_type='one_versus_all',
    sequence_length=2,
    repeat_size=3,
    kernel_size=3,
    starting_channels=6,
    map_size="[15, 15]",  # passed as a string, not a Python list
    snake_representation='bordered-51s',
    save_model_every=1000,
    eps_start=0.99,
)

Define the estimator. First, try running this with `local_mode = True` to test your entry-point script.

In [None]:
# Collect the estimator settings in one place, then build the MXNet estimator.
estimator_kwargs = {
    'entry_point': "train.py",                              # script inside source_dir
    'source_dir': 'battlesnake_src',
    'dependencies': ["battlesnake_gym/battlesnake_gym"],    # shipped alongside the source
    'role': role,
    'train_instance_type': instance_type,
    'train_instance_count': 1,
    'output_path': s3_output_path,
    'framework_version': "1.6.0",
    'py_version': 'py3',
    'base_job_name': job_name_prefix,
    'metric_definitions': metric_definitions,
    'hyperparameters': static_hyperparameters,
}
estimator = MXNet(**estimator_kwargs)

# In local mode, run a single training job right away to check the entry point.
if local_mode:
    estimator.fit()

Start the Hyperparameter optimisation sagemaker jobs!

In [None]:
# Configure the tuner around the estimator: it maximises the 'timesteps'
# metric over hyperparameter_ranges, within the job limits set earlier.
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name='timesteps',
    objective_type='Maximize',
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_jobs=max_jobs,
    max_parallel_jobs=max_parallel_jobs,
    base_tuning_job_name=job_name_prefix,
)

# Launch the tuning job.
tuner.fit()

# Collect the best model
Obtain an s3 URL of the best model

In [None]:
# Name of the training job that achieved the best objective value.
best_training_job = tuner.best_training_job()

# Build the artifact URL as <output_path>/<job name>/output/model.tar.gz.
# s3_output_path may already end in "/", so strip it before joining: doubled
# slashes are part of an S3 key and would make the URL point at a
# non-existent object.
best_model_s3 = "{}/{}/output/model.tar.gz".format(s3_output_path.rstrip('/'), best_training_job)
print("Best model location {}".format(best_model_s3))

# Deploying an endpoint -WIP
DO NOT RUN

In [None]:
# WIP - do not run. Intended flow: attach to the best training job found by
# the tuner, compile its model, then deploy the compiled model to an endpoint.
estimator_best_job = estimator.attach(tuner.best_training_job(), sage_session)
# Compile for the 'ml_c5' target, writing the result under s3_output_path.
# NOTE(review): the input shape {'data': (1, 3, 384, 512)} and
# framework_version='1.4.1' are unverified; the estimator above is created with
# framework_version "1.6.0" - confirm both against the trained network.
compiled_model = estimator_best_job.compile_model('ml_c5', 
                                                  {'data' : (1, 3, 384, 512)}, 
                                                          s3_output_path, 
                                                      framework='mxnet', framework_version='1.4.1') 
# Error, Operator _linalg_gemm2 is not supported in frontend MXNet
# Deploy the compiled model on a single ml.c5.9xlarge instance.
endpoint = compiled_model.deploy(1, 'ml.c5.9xlarge')