# Introduction

This notebook was used to conduct the experiments presented in the paper "Battlesnake Challenge: A Multi-agent Reinforcement Learning Playground with Human-in-the-loop".

In [None]:
import sagemaker
from sagemaker.rl import RLEstimator, RLToolkit
import boto3
import json

In [None]:
# Read the stack outputs file; later cells look up the S3 bucket name and the
# SageMaker training instance type in the resulting `info` dictionary.
with open("../stack_outputs.json") as f:
    info = json.loads(f.read())

## Initialise sagemaker
We need to define several parameters prior to running the training job. 

In [None]:
# Session object; its boto region name is read later to build the container image URI.
sm_session = sagemaker.session.Session()
# Bucket name comes from the stack outputs loaded into `info`.
s3_bucket = info["S3Bucket"]

# S3 URI passed as `output_path` to every estimator created below.
s3_output_path = 's3://{}/'.format(s3_bucket)
print("S3 bucket path: {}".format(s3_output_path))

In [None]:
# Execution role passed as `role` to every estimator created below; printed for reference.
role = sagemaker.get_execution_role()
print(role)

In [None]:
local_mode = False

if local_mode:
    instance_type = 'local'
else:
    instance_type = info["SagemakerTrainingInstanceType"]
    
# If training locally, do some Docker housekeeping..
if local_mode:
    !/bin/bash ./common/setup.sh

# Parameter definitions

In [None]:
# Build the URI of the training container image from the session's region and
# the target device; the result is passed as `image_name` to the estimators.
region = sm_session.boto_region_name
device = "cpu"
image_name = (
    '462105765813.dkr.ecr.{region}.amazonaws.com/'
    'sagemaker-rl-ray-container:ray-0.8.2-tf-{device}-py36'
).format(region=region, device=device)

In [None]:
# Training configurations keyed by a short name. Each entry carries the
# algorithm identifier and the extra settings forwarded to the training script
# through the "additional_configs" hyperparameter.
configs = {
    "PPO": {
        "algorithm": "PPO",
        "additional_config": {
            "lambda": 0.90,
            "gamma": 0.999,
            "kl_coeff": 0.2,
            "clip_rewards": True,
            "vf_clip_param": 175.0,
            "train_batch_size": 9216,
            "sample_batch_size": 96,
            "sgd_minibatch_size": 256,
            "num_sgd_iter": 3,
            "lr": 5.0e-4,
        },
    },
}

# Value sent as the "num_iters" hyperparameter of every job.
iterations = 3000

# Baseline reward table: a small per-turn reward plus win/death terms; every
# other event is worth zero.
vanilla_rewards = {
    "another_turn": 0.01,
    "ate_food": 0,
    "won": 5,
    "died": -5,
    "ate_another_snake": 0,
    "hit_wall": 0,
    "hit_other_snake": 0,
    "hit_self": 0,
    "was_eaten": 0,
    "other_snake_hit_body": 0,
    "forbidden_move": 0,
    "starved": 0,
}

In [None]:
# Metric definitions handed to every estimator. Each entry pairs a metric name
# with a regex of the form "<name>: <number>", where the first capture group is
# a signed decimal with an optional exponent.
#
# The list is generated from the names so that every metric appears exactly
# once (the hand-written list declared 'training_iteration' twice) and all
# entries share a single number pattern.
_METRIC_NUMBER_PATTERN = '([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'

_METRIC_NAMES = [
    # Training progress counters
    'training_iteration',
    'episodes_total',
    'num_steps_trained',
    'timesteps_total',

    # Episode reward statistics
    'episode_reward_max',
    'episode_reward_mean',
    'episode_reward_min',

    # Episode length statistics
    'episode_len_max',
    'episode_len_mean',
    'episode_len_min',

    'best_snake_episode_len_max',
    'worst_snake_episode_len_max',

    # Per-event maxima
    'Snake_hit_wall_max',
    'Snake_was_eaten_max',
    'Killed_another_snake_max',
    'Snake_hit_body_max',
    'Starved_max',
    'Forbidden_move_max',
]

metric_definitions = [
    {'Name': metric_name, 'Regex': '{}: {}'.format(metric_name, _METRIC_NUMBER_PATTERN)}
    for metric_name in _METRIC_NAMES
]

# Vanilla
Train the policy without any HILL, varying the number of agents and the map size.

In [None]:
# Vanilla sweep: start one training job for every (number of snakes, map size)
# pair and every entry in `configs`, using the baseline reward table and with
# heuristic action masks switched off.
test_conditions = [(3, 11), (5, 11), (5, 7), (5, 19), (7, 11)]

job_name_prefix_base = 'Battlesnake-no-hill-'

for num_agents, map_size in test_conditions:
    for name in configs:
        algorithm = configs[name]["algorithm"]
        additional_config = configs[name]["additional_config"]

        # Prefix encodes the config name, snake count (s) and map size (m).
        job_name_prefix = job_name_prefix_base + "{}-s{}m{}".format(name, num_agents, map_size)
        print("job_prefix {}".format(job_name_prefix))

        # See train-mabs.py to add additional hyperparameters
        # Also see ray_launcher.py for the rl.training.* hyperparameters
        hyperparameters = {
            # number of training iterations
            "num_iters": iterations,
            # number of snakes in the gym
            "num_agents": num_agents,

            "iterate_map_size": False,
            "use_heuristics_action_masks": False,
            "map_size": map_size,
            "algorithm": algorithm,
            "additional_configs": additional_config,
            "rewards": vanilla_rewards,
        }

        estimator = RLEstimator(
            entry_point="train-mabs.py",
            source_dir='training/training_src',
            dependencies=["training/common/sagemaker_rl", "inference/inference_src/", "../BattlesnakeGym/"],
            image_name=image_name,
            role=role,
            train_instance_type=instance_type,
            train_instance_count=1,
            output_path=s3_output_path,
            base_job_name=job_name_prefix,
            metric_definitions=metric_definitions,
            hyperparameters=hyperparameters,
        )

        # wait=False: do not block the notebook while the job runs.
        estimator.fit(wait=False)

# HILL: in-training action masking
Train the policies with in-training action masking, using 5 agents and an 11x11 map

In [None]:
# In-training action-masking sweep: start one training job for every
# (condition, heuristic, config) combination, passing a single heuristic per
# job through the "heuristics" hyperparameter and using the baseline rewards.
test_conditions = [(5, 11)]
heuristics = ["banned_wall_hits", "banned_forbidden_moves"]
job_name_prefix_base = 'Battlesnake-in-training-am-'

for num_agents, map_size in test_conditions:
    for heuristic in heuristics:
        for name in configs:
            algorithm = configs[name]["algorithm"]
            additional_config = configs[name]["additional_config"]
            # Swap underscores for hyphens, then drop the first 7 characters
            # (the leading "banned-") to get the label used in the prefix.
            job_name_prefix = job_name_prefix_base + "{}-{}-".format(name, heuristic.replace("_", "-")[7:])
            print("job_prefix {}".format(job_name_prefix))

            estimator = RLEstimator(entry_point="train-mabs.py",
                                    source_dir='training/training_src',
                                    dependencies=["training/common/sagemaker_rl", "inference/inference_src/", "../BattlesnakeGym/"],
                                    image_name=image_name,
                                    role=role,
                                    train_instance_type=instance_type,
                                    train_instance_count=1,
                                    output_path=s3_output_path,
                                    base_job_name=job_name_prefix,
                                    metric_definitions=metric_definitions,
                                    hyperparameters={
                                        # See train-mabs.py to add additional hyperparameters
                                        # Also see ray_launcher.py for the rl.training.* hyperparameters
                                        #
                                        # number of training iterations
                                        "num_iters": iterations,
                                        # number of snakes in the gym
                                        "num_agents": num_agents,

                                        "iterate_map_size": False,
                                        "heuristics": [heuristic],
                                        "map_size": map_size,
                                        "algorithm": algorithm,
                                        "additional_configs": additional_config,
                                        "rewards": vanilla_rewards
                                    }
                                )

            # Must run inside the innermost loop: one level out, only the last
            # config's estimator would be launched for each heuristic.
            estimator.fit(wait=False)

# HILL: reward manipulation
Train the policy with reward manipulation, using 5 agents and an 11x11 map

In [None]:
# Reward tables for the reward-manipulation experiments. Every table starts
# from the same baseline (small per-turn reward, +5 for winning, -5 for dying,
# zero for all other events) and overrides only the event(s) being shaped.
# Overriding an existing key keeps its position, so key order is unchanged.
#
# NOTE(review): `starved_rewards` previously had "died": 5, i.e. a positive
# reward for dying, unlike every other table. It is treated as a sign typo and
# now inherits -5 from the baseline; confirm against the intended experiment.
_manipulation_base_rewards = {"another_turn": 0.01,
                              "ate_food": 0,
                              "won": 5,
                              "died": -5,
                              "ate_another_snake": 0,
                              "hit_wall": 0,
                              "hit_other_snake": 0,
                              "hit_self": 0,
                              "was_eaten": 0,
                              "other_snake_hit_body": 0,
                              "forbidden_move": 0,
                              "starved": 0}

# Penalise forbidden moves.
forbidden_move_rewards = {**_manipulation_base_rewards, "forbidden_move": -2}

# Penalise hitting a wall.
hit_wall_rewards = {**_manipulation_base_rewards, "hit_wall": -2}

# Penalise starving.
starved_rewards = {**_manipulation_base_rewards, "starved": -2}

# Reward eating another snake and another snake hitting this snake's body.
kill_other_snake_rewards = {**_manipulation_base_rewards,
                            "ate_another_snake": 2,
                            "other_snake_hit_body": 2}

In [None]:
# Reward-manipulation sweep: start one training job for every
# (reward table, condition, config) combination, with heuristic action masks
# switched off and the selected reward table passed as "rewards".
test_conditions = [(5, 11)]
# (label used in the job name, reward table). Labels carry no trailing hyphen:
# the format string below already appends the separator, and a label ending in
# "-" would produce a doubled hyphen in the job prefix.
rewards_dicts = [("for-move", forbidden_move_rewards), ("hit-wall", hit_wall_rewards), 
                 ("starved", starved_rewards), ("kill", kill_other_snake_rewards)]

job_name_prefix_base = 'Battlesnake-reward-manipulation-'

for reward_name, reward_dict in rewards_dicts:
    for num_agents, map_size in test_conditions:
        for name in configs:
            algorithm = configs[name]["algorithm"]
            additional_config = configs[name]["additional_config"]
            job_name_prefix = job_name_prefix_base + "{}-{}-".format(name, reward_name)
            print("job_prefix {}".format(job_name_prefix))

            estimator = RLEstimator(entry_point="train-mabs.py",
                                    source_dir='training/training_src',
                                    dependencies=["training/common/sagemaker_rl", "inference/inference_src/", "../BattlesnakeGym/"],
                                    image_name=image_name,
                                    role=role,
                                    train_instance_type=instance_type,
                                    train_instance_count=1,
                                    output_path=s3_output_path,
                                    base_job_name=job_name_prefix,
                                    metric_definitions=metric_definitions,
                                    hyperparameters={
                                        # See train-mabs.py to add additional hyperparameters
                                        # Also see ray_launcher.py for the rl.training.* hyperparameters
                                        #
                                        # number of training iterations
                                        "num_iters": iterations,
                                        # number of snakes in the gym
                                        "num_agents": num_agents,

                                        "iterate_map_size": False,
                                        "use_heuristics_action_masks": False,
                                        "map_size": map_size,
                                        "algorithm": algorithm,
                                        "additional_configs": additional_config,
                                        "rewards": reward_dict
                                    }
                                )

            # wait=False: do not block the notebook while the job runs.
            estimator.fit(wait=False)