In [1]:
import numpy as np
from random import random, choice

from matplotlib import cm
from time import sleep
from colosseumrl.envs.tron import TronGridEnvironment, TronRender, TronRllibEnvironment

import gym
from gym import Env
from gym.spaces import Dict, Discrete, Box

import ray
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.rllib.agents.dqn import DQNTrainer, DEFAULT_CONFIG

from ray.rllib.models.preprocessors import Preprocessor
from ray.rllib.models import ModelCatalog

# Fixed seed so that NumPy's global random state is reproducible between runs.
# NOTE(review): only NumPy is seeded here; the `random` module imported above is
# not seeded by this cell -- confirm whether anything downstream relies on it.
SEED = 1517
np.random.seed(SEED)

# Training on better agents
#### Now that we have mastered playing against our hand-crafted agents, how do we go beyond them and approach some sort of optimum?

We use a common technique in reinforcement learning known as self-play. Here, we allow the opponents to update alongside us, but with a delay. Once we begin defeating our current opponents a certain percentage of the time, we update their weights with our own. This will encourage the policy to continually improve because it has to defeat its previous iteration.

## A more advanced pre-processor
For self-play to work, we need to make sure that the opponents see the exact same configuration of the board that player 0 sees. Otherwise the policies will be very confused and try to make player 0 win even when they're supposed to be opponents!

In [2]:
class TronExtractBoard(Preprocessor):
    """ Wrapper to extract just the board from the game state and simplify it for the network.

    The produced observation has shape (board_size + 2, board_size + 2, 2):
    channel 0 is the board and channel 1 marks the player heads, both padded
    with a one-cell border of -1.
    """
    def _init_shape(self, obs_space, options):
        """ Return the shape of the transformed observation.

        The board size is read from the `obs_space` handed to the preprocessor
        instead of a notebook-level `env` global, so the class does not depend
        on cell execution order or on which environment happens to be bound
        to that global.
        """
        board_size = obs_space['board'].shape[0]
        return (board_size + 2, board_size + 2, 2)

    def transform(self, observation):
        """ Work out which player this observation belongs to and transform it.

        Raises KeyError if the probed cell does not hold one of the values
        1, 2, 3 or 4.
        """
        # Pretty hacky way to get the current player number
        # Requires having exactly 4 players
        board = observation['board']
        hor_offset = board.shape[0] // 2 + 2
        # The value found at this fixed cell in row 1 identifies the observer.
        # NOTE(review): presumably this is a starting position and the
        # environment relabels players per observer -- confirm against the env.
        top_player = board[1, hor_offset]
        player_number = {1: 0, 4: 1, 3: 2, 2: 3}[top_player]

        return self._transform(observation, player_number)

    def _transform(self, observation, rotate: int = 0):
        """ Build the two-channel network input.

        Args:
            observation: mapping with the keys 'board', 'heads' and 'directions'.
            rotate: number of 90-degree rotations (as understood by np.rot90)
                applied to both channels; head directions are shifted by the
                same amount modulo 4.

        Returns:
            Array made of the padded board and the padded head map, stacked
            along a new last axis.
        """
        board = observation['board'].copy()

        # Make all enemies look the same
        board[board > 1] = -1

        # Mark where all of the player heads are
        heads = np.zeros_like(board)

        if (rotate != 0):
            # Heads are written through flat indices before the rotation, so
            # the marks rotate together with the board.
            heads.ravel()[observation['heads']] += 1 + ((observation['directions'] - rotate) % 4)

            board = np.rot90(board, k=rotate)
            heads = np.rot90(heads, k=rotate)

        else:
            heads.ravel()[observation['heads']] += 1 + observation['directions']

        # Pad the outsides so that we know where the wall is
        board = np.pad(board, 1, 'constant', constant_values=-1)
        heads = np.pad(heads, 1, 'constant', constant_values=-1)

        # Combine together
        board = np.expand_dims(board, -1)
        heads = np.expand_dims(heads, -1)

        return np.concatenate([board, heads], axis=-1)

In [3]:
class TeamTron(TronRllibEnvironment):
    """Tron environment variant whose `step` currently forwards to the parent unchanged."""

    def step(self, action_dict):
        """Advance the parent environment and hand back its four results as a tuple."""
        obs, rewards, dones, infos = super().step(action_dict)
        return obs, rewards, dones, infos

In [4]:
def test(render, env, trainer, frame_time = 0.4):
    extractBoard = TronExtractBoard(env.observation_space)
    policy = trainer.get_policy("training_policy")
    policy.cur_epsilon_value = 0
    render.close()
    obsDict = env.reset()
    doneDict = {'__all__' : False}
    actionDict = {}
    rewardDict = {}

    while not doneDict['__all__']:
        for player, obs in obsDict.items():
            actionDict[player] = trainer.compute_action(obs, prev_action=actionDict.get(player, None), prev_reward=rewardDict.get(player, None), policy_id='training_policy')

        obsDict, rewardDict, doneDict, results = env.step(actionDict)
        render.render(env.state)

        sleep(frame_time)

    render.render(env.state)

In [15]:
# A function that updates the opponent policy with the current training policy weights
def synchronize_policies(trainer):
    """Copy the weights of "training_policy" into "opponent_policy".

    The copy used to be commented out, which made this function a no-op and
    left the opponents frozen at their initial weights.

    When the weights come back as a dict keyed by variable name, names that
    start with "training_policy/" are rewritten to start with
    "opponent_policy/" before being applied; other keys and non-dict weight
    containers are passed through unchanged.
    NOTE(review): the rename assumes each policy's variables are scoped under
    its policy id -- confirm against the RLlib version in use.

    Args:
        trainer: object exposing `get_policy(name)` for both policy ids.
    """
    training_policy = trainer.get_policy("training_policy")
    opponent_policy = trainer.get_policy("opponent_policy")

    weights = training_policy.get_weights()
    if isinstance(weights, dict):
        source_prefix = "training_policy/"
        target_prefix = "opponent_policy/"
        weights = {
            (target_prefix + name[len(source_prefix):]
             if name.startswith(source_prefix) else name): value
            for name, value in weights.items()
        }

    opponent_policy.set_weights(weights)
    

# Callback run at the end of every episode: stores a rescaled final reward
# for agent "0" as a custom metric.
# The training loop uses this metric to decide when to update the opponents.
def on_episode_end(info):
    """Record `final_reward` in the finished episode's custom metrics.

    The metric is (last reward of agent "0" + 1) / 11 when that agent's reward
    history holds more than one entry, and 0 otherwise.
    NOTE(review): the +1 and /11 presumably rescale the environment's final
    reward to the range [0, 1] -- confirm against the reward scheme.

    Args:
        info: mapping whose "episode" entry exposes `_agent_reward_history`
            and `custom_metrics`.
    """
    finished_episode = info["episode"]
    player_rewards = finished_episode._agent_reward_history["0"]

    if len(player_rewards) <= 1:
        final_reward = 0
    else:
        final_reward = (player_rewards[-1] + 1) / 11

    finished_episode.custom_metrics['final_reward'] = final_reward

In [16]:
# Initialize training environment
# Shutting down first lets this cell be re-run in the same kernel: ray.init()
# is then always called on a fresh Ray runtime.
ray.shutdown()
ray.init()

def environment_creater(params=None):
    """Build the 4-player Tron environment on a 13x13 board; `params` is not used."""
    tron_env = TronRllibEnvironment(board_size=13, num_players=4)
    return tron_env
    
def team_environment_creater(params=None):
    """Build the 4-player TeamTron environment on a 13x13 board; `params` is not used."""
    team_env = TeamTron(board_size=13, num_players=4)
    return team_env
    
# Local environment instance: its observation/action spaces are used for the
# policy definitions below, and it is the environment played in test().
env = environment_creater()

# Register the environment factories and the custom preprocessor under the
# string names referenced by the trainer and model configuration.
tune.register_env("tron_multi_player", environment_creater)
tune.register_env("tron_team", team_environment_creater)
ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

# Deep Q-Learning settings for multi-agent training: start from a copy of the
# default configuration and override the entries below in one go.
config = DEFAULT_CONFIG.copy()
config.update({
    'num_workers': 4,
    "timesteps_per_iteration": 128,
    'target_network_update_freq': 256,
    'buffer_size': 10_000,
    'schedule_max_timesteps': 100_000,
    'exploration_fraction': 0.9,
    'compress_observations': False,
    'num_envs_per_worker': 1,
    'train_batch_size': 256,
    'n_step': 2,
    # Per-episode hook that records the custom 'final_reward' metric
    'callbacks': {
        "on_episode_end": on_episode_end,
    },
})

# Both policies share one model definition: the same convolutional network as
# before, fed by the registered 'tron_prep' preprocessor.
agent_config = dict(
    model=dict(
        vf_share_layers=True,
        conv_filters=[(64, 5, 2), (128, 3, 2), (256, 3, 2)],
        fcnet_hiddens=[128],
        custom_preprocessor='tron_prep',
    )
)

# Agent "0" is driven by the policy being trained; every other agent uses the
# opponent policy, which is excluded from training.
config['multiagent'] = dict(
    policies_to_train=["training_policy"],
    policy_mapping_fn=lambda agent_id: "opponent_policy" if agent_id != "0" else "training_policy",
    policies=dict(
        training_policy=(None, env.observation_space, env.action_space, agent_config),
        opponent_policy=(None, env.observation_space, env.action_space, agent_config),
    ),
)
       
# Build the trainer, run the self-play training loop and save a checkpoint.
trainer = DQNTrainer(config, "tron_multi_player")
num_epoch = 25
render = TronRender(13, 4)

for epoch in range(num_epoch):
    print("Training iteration: {}".format(epoch), end='')
    res = trainer.train()

    # Fall back to NaN when a metric is missing from the result instead of
    # raising KeyError part-way through training.
    average_reward = res.get('policy_reward_mean', {}).get('training_policy', float('nan'))
    print(f", Average reward: {average_reward}")

    # NaN compares False, so a missing metric never triggers a sync.
    final_reward_mean = res.get('custom_metrics', {}).get('final_reward_mean', float('nan'))

    # Sync once per epoch at most: either the training policy is winning often
    # enough, or we reached the fixed sync point at epoch 10.
    if final_reward_mean > 0.6 or epoch == 10:
        print("Updating opponents")
        synchronize_policies(trainer)

    # With num_epoch = 25 this only renders a game at epoch 0.
    if epoch % 50 == 0:
        test(render, env, trainer)

checkpoint = trainer.save()

2020-03-10 14:56:33,331	INFO resource_spec.py:212 -- Starting Ray with 3.71 GiB memory available for workers and up to 1.88 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-10 14:56:34,767	INFO services.py:1093 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


[2m[36m(pid=2082)[0m E0310 14:56:38.438226100    2082 socket_utils_common_posix.cc:208] check for SO_REUSEPORT: {"created":"@1583877398.438206600","description":"Protocol not available","errno":92,"file":"external/com_github_grpc_grpc/src/core/lib/iomgr/socket_utils_common_posix.cc","file_line":185,"os_error":"Protocol not available","syscall":"getsockopt(SO_REUSEPORT)"}
[2m[36m(pid=2082)[0m E0310 14:56:38.438536400    2082 socket_utils_common_posix.cc:313] setsockopt(TCP_USER_TIMEOUT) Protocol not available
[2m[36m(pid=2079)[0m E0310 14:56:38.287420900    2079 socket_utils_common_posix.cc:208] check for SO_REUSEPORT: {"created":"@1583877398.287403800","description":"Protocol not available","errno":92,"file":"external/com_github_grpc_grpc/src/core/lib/iomgr/socket_utils_common_posix.cc","file_line":185,"os_error":"Protocol not available","syscall":"getsockopt(SO_REUSEPORT)"}
[2m[36m(pid=2079)[0m E0310 14:56:38.287800900    2079 socket_utils_common_posix.cc:313] setsockopt(T





  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


, Average reward: 8.25
Training iteration: 1



, Average reward: 7.545454545454546
Training iteration: 2



, Average reward: 6.60655737704918
Training iteration: 3



, Average reward: 7.109756097560975
Training iteration: 4



, Average reward: 7.07
Training iteration: 5, Average reward: 7.32
Training iteration: 6, Average reward: 7.88
Training iteration: 7, Average reward: 8.42
Training iteration: 8, Average reward: 8.73
Training iteration: 9, Average reward: 8.47
Training iteration: 10, Average reward: 8.92
Updating opponents
{'training_policy/conv1/kernel': array([[[[ 5.43366894e-02,  3.87547985e-02,  1.73504967e-02, ...,
           3.17514092e-02,  5.17089628e-02,  1.34702511e-02],
         [ 5.52712567e-03, -3.67624983e-02, -5.94407022e-02, ...,
          -4.64903265e-02,  5.43430299e-02, -2.02433616e-02]],

        [[-1.04779992e-02, -3.65156904e-02, -5.76316118e-02, ...,
          -4.39119451e-02,  2.53797453e-02,  4.27816473e-02],
         [ 1.42724812e-02, -1.18599096e-02,  4.90300432e-02, ...,
          -5.26092798e-02,  4.10107672e-02, -2.16948222e-02]],

        [[ 8.69056955e-03,  3.93987671e-02, -5.82433399e-03, ...,
          -1.17036188e-02, -5.86499982e-02, -4.54434678e-02],
         [-1.569

In [17]:
# Reload the checkpoint saved after training and play ten rendered games.
trainer.restore(checkpoint)
for _ in range(10):
    test(render, env, trainer)

2020-03-10 14:57:49,060	INFO trainable.py:416 -- Restored on 192.168.24.68 from checkpoint: /home/andranik/ray_results/DQN_tron_multi_player_2020-03-10_14-56-37tw29698f/checkpoint_25/checkpoint-25
2020-03-10 14:57:49,062	INFO trainable.py:423 -- Current state after restoring: {'_iteration': 25, '_timesteps_total': 3456, '_time_total': 61.856730937957764, '_episodes_total': 392}
