# Introduction

This notebook outlines the steps involved in building and deploying a Battlesnake model using Ray RLlib and TensorFlow on Amazon SageMaker.

Library versions currently in use:  TensorFlow 2.1, Ray RLlib 0.8.2

The model is first trained using multi-agent PPO, and then deployed to a managed _TensorFlow Serving_ SageMaker endpoint that can be used for inference.

In [None]:
import sagemaker
from sagemaker.rl import RLEstimator, RLToolkit
import boto3

## Initialise SageMaker
We need to define several parameters before running the training job.

In [None]:
# Session object used for the bucket lookup below.
sm_session = sagemaker.session.Session()
# Default bucket associated with this session.
s3_bucket = sm_session.default_bucket()

# Root of the default bucket; used as `output_path` for the training jobs.
s3_output_path = 's3://{}/'.format(s3_bucket)
print("S3 bucket path: {}".format(s3_output_path))

In [None]:
# Execution role reported by the SageMaker SDK; passed as `role` to the estimators.
role = sagemaker.get_execution_role()
print(role)

In [None]:
local_mode = False

if local_mode:
    instance_type = 'local'
else:
    instance_type = "SAGEMAKER_TRAINING_INSTANCE_TYPE"
    
# If training locally, do some Docker housekeeping..
if local_mode:
    !/bin/bash ./common/setup.sh

# Train your model here

In [None]:
# Region of the current session and the device flavour select the image tag.
region = sm_session.boto_region_name
device = "cpu"

# ECR URI of the Ray 0.8.2 / TensorFlow training container.
image_uri_template = '462105765813.dkr.ecr.{region}.amazonaws.com/sagemaker-rl-ray-container:ray-0.8.2-tf-{device}-py36'
image_name = image_uri_template.format(region=region, device=device)

In [None]:
# Common prefix for the name of every training job started from this notebook.
job_name_prefix_base = 'Battlesnake-paper-'

# PPO settings; forwarded to the training job as the `additional_configs`
# hyperparameter.
ppo_additional_config = {
    'lambda': 0.90,
    'gamma': 0.999,
    'kl_coeff': 0.2,
    'clip_rewards': True,
    'vf_clip_param': 175.0,
    'train_batch_size': 9216,
    'sample_batch_size': 96,
    'sgd_minibatch_size': 256,
    'num_sgd_iter': 3,
    'lr': 5.0e-4,
}

# Maps a configuration name to its algorithm and algorithm-specific settings.
configs = {
    "PPO": {
        "algorithm": "PPO",
        "additional_config": ppo_additional_config,
    },
}

In [None]:
# (number of snakes, map size) pairs; one training job is launched per pair.
test_conditions = [(3, 11), (5, 11), (7, 11), (5, 7), (5, 19)]

# Capture group for a signed integer or decimal with an optional exponent.
_METRIC_VALUE_REGEX = '([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'

# Keys scraped from the training logs. Each name appears exactly once
# (`training_iteration` used to be listed twice).
_METRIC_NAMES = [
    # Training progress
    'training_iteration',
    'episodes_total',
    'num_steps_trained',
    'timesteps_total',

    # Episode reward
    'episode_reward_max',
    'episode_reward_mean',
    'episode_reward_min',

    # Episode length
    # NOTE(review): `episode_len_max` / `episode_len_min` previously reused the
    # `episode_len_mean` pattern, so all three reported the mean. Confirm the
    # training logs actually emit these two keys; if they do not, the metrics
    # will stay empty instead of mirroring the mean.
    'episode_len_max',
    'episode_len_mean',
    'episode_len_min',

    # Per-snake episode length
    'best_snake_episode_len_max',
    'worst_snake_episode_len_max',

    # Game outcome counters
    'Snake_hit_wall_max',
    'Snake_was_eaten_max',
    'Killed_another_snake_max',
    'Snake_hit_body_max',
    'Starved_max',
    'Forbidden_move_max',
]

# Generating the regex from the name keeps the two from drifting apart:
# every metric matches "<name>: <number>".
metric_definitions = [
    {'Name': metric_name, 'Regex': '{}: {}'.format(metric_name, _METRIC_VALUE_REGEX)}
    for metric_name in _METRIC_NAMES
]

In [None]:
# Start one training job for every (snake count, map size) condition and every
# entry in `configs`.
for num_agents, map_size in test_conditions:
    for name, config in configs.items():
        algorithm = config["algorithm"]
        additional_config = config["additional_config"]

        job_name_prefix = job_name_prefix_base + "{}-s{}m{}".format(name, num_agents, map_size)
        print("job_prefix {}".format(job_name_prefix))

        # See train-mabs.py to add additional hyperparameters
        # Also see ray_launcher.py for the rl.training.* hyperparameters
        hyperparameters = {
            # number of training iterations
            "num_iters": 500,
            # number of snakes in the gym
            "num_agents": num_agents,

            "iterate_map_size": False,
            "use_heuristics_action_masks": False,
            "map_size": map_size,
            "algorithm": algorithm,
            "additional_configs": additional_config,
        }

        estimator = RLEstimator(
            entry_point="train-mabs.py",
            source_dir='training/training_src',
            dependencies=["training/common/sagemaker_rl", "inference/inference_src/", "../BattlesnakeGym/"],
            image_name=image_name,
            role=role,
            train_instance_type=instance_type,
            train_instance_count=1,
            output_path=s3_output_path,
            base_job_name=job_name_prefix,
            metric_definitions=metric_definitions,
            hyperparameters=hyperparameters,
        )

        # wait=False: submit the job and move on to the next condition.
        estimator.fit(wait=False)

In [None]:
# Single PPO job with `iterate_map_size` switched on (job name suffix "curr").
num_agents = 5
map_size = 7

name = "PPO"
selected_config = configs[name]
algorithm = selected_config["algorithm"]
additional_config = selected_config["additional_config"]

job_name_prefix = job_name_prefix_base + "{}-s{}curr".format(name, num_agents)
print("job_prefix {}".format(job_name_prefix))

# See train-mabs.py to add additional hyperparameters
# Also see ray_launcher.py for the rl.training.* hyperparameters
hyperparameters = {
    # number of training iterations
    "num_iters": 500,
    # number of snakes in the gym
    "num_agents": num_agents,

    "iterate_map_size": True,
    "use_heuristics_action_masks": False,
    "map_size": map_size,
    "algorithm": algorithm,
    "additional_configs": additional_config,
}

estimator = RLEstimator(
    entry_point="train-mabs.py",
    source_dir='training/training_src',
    dependencies=["training/common/sagemaker_rl", "inference/inference_src/", "../BattlesnakeGym/"],
    image_name=image_name,
    role=role,
    train_instance_type=instance_type,
    train_instance_count=1,
    output_path=s3_output_path,
    base_job_name=job_name_prefix,
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
)

# wait=False: submit the job without blocking the notebook.
estimator.fit(wait=False)