In [1]:
# Option: when True, the training/evaluation cell restores a saved checkpoint and
# plays evaluation games instead of training; it also lowers
# config['num_envs_per_worker'] from 4 to 1.
LOAD_FROM_CHECKPOINT = False

In [2]:
import numpy as np
from random import random, choice

from matplotlib import cm
from time import sleep
from colosseumrl.envs.tron import TronGridEnvironment, TronRender, TronRllibEnvironment

import gym
from gym import Env
from gym.spaces import Dict, Discrete, Box

import ray
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.rllib.agents.dqn import DQNTrainer, DEFAULT_CONFIG

from ray.rllib.models.preprocessors import Preprocessor
from ray.rllib.models import ModelCatalog

SEED = 1517
np.random.seed(SEED)

# The scripted opponent draws from the standard-library RNG (`random()` / `choice`),
# which NumPy's seed does not touch, so seed it as well to make local games repeatable.
# Imported under an alias because the bare name `random` is bound to the function above.
import random as stdlib_random
stdlib_random.seed(SEED)

## Training an Agent

##### Thinking of a more intelligent agent is pretty hard, so let's make machine learning find one for us! First, let's train an agent to defeat our own hand-written attempt. We will use RLlib to train an agent with Deep Q-Learning.

## Our manual agent again

In [3]:
class SimpleAvoidAgent:
    """
    Scripted Tron player that keeps going straight while the next cell is empty
    and otherwise turns to one side.

    Parameters
    ----------
    noise
        Threshold compared against a fresh `random()` draw on every call; when the
        draw is <= noise a uniformly random action from {0, 1, 2} is returned.
    """
    def __init__(self, noise=0.1):
        self.noise = noise

    def __call__(self, env, observation):
        """
        Pick an action for the current observation.

        `env` must provide `next_cell(x, y, direction, board_size)` returning the
        (x, y) of the neighbouring cell. `observation` must provide 'board',
        'heads' and 'directions'; entry 0 of the latter two is treated as this
        agent's own head and heading.

        Returns 0 (keep going) or one of the two turn actions 1 / 2.
        """
        # Occasionally act at random so that games do not all look the same.
        if random() <= self.noise:
            return choice([0, 1, 2])

        grid = observation['board']
        own_head = observation['heads'][0]
        heading = observation['directions'][0]

        # The head is a flat index into the square board; recover (row, column).
        side = grid.shape[0]
        row, col = divmod(own_head, side)

        def is_open(facing):
            # True when the cell reached by moving one step in `facing` is empty (0).
            cx, cy = env.next_cell(col, row, facing, side)
            return grid[cy, cx] == 0

        # Straight ahead is free: no need to turn.
        if is_open(heading):
            return 0

        # Try one side at random; if that is blocked too, take the other side's action.
        turn, preferred, fallback = choice([(1, 1, 2), (-1, 2, 1)])
        return preferred if is_open((heading + turn) % 4) else fallback

## Single Player Tron
##### We create a simpler variant of Tron featuring only one actively participating agent. This reduces the RL task to training a single agent to play against a fixed set of opponents. We can think of this as embedding our manual agents within the environment.

In [4]:
class SinglePlayer(gym.Env):
    """
    Single-agent view of a multi-player Tron environment.

    One player (`active_player`) is controlled from outside through `step`; every
    other player is driven by the scripted `agents`, handed out round-robin.
    """
    def __init__(self, env, active_player = '0', agents = SimpleAvoidAgent()):
        # Accept either a single opponent or a list of opponents.
        self.agents = agents if isinstance(agents, list) else [agents]
        self.env = env
        self.active_player = active_player

        # The spaces are taken unchanged from the wrapped environment.
        self.action_space = env.action_space
        self.observation_space = env.observation_space

        # Latest per-player observations; filled in by reset() and step().
        self.observations = None

    def reset(self):
        """ Reset the wrapped environment and return the active player's observation. """
        self.observations = self.env.reset()
        return self.observations[self.active_player]

    def step(self, action, agents = None):
        """
        Advance the game by one move.

        `action` is applied for the active player. All other players listed in
        `self.env.players` get their action from `agents` (defaults to the
        opponents given at construction), called with the wrapped environment's
        inner `env` and that player's latest observation.

        Returns the active player's (observation, reward, done, info).
        """
        opponents = self.agents if agents is None else agents
        opponent_count = len(opponents)

        joint_actions = {}
        next_opponent = 0
        for raw_player in self.env.players:
            player_key = str(raw_player)

            if player_key == self.active_player:
                joint_actions[player_key] = action
                continue

            # Cycle through the opponent list so that fewer agents than players still works.
            policy = opponents[next_opponent]
            joint_actions[player_key] = policy(self.env.env, self.observations[player_key])
            next_opponent = (next_opponent + 1) % opponent_count

        self.observations, rewards, dones, info = self.env.step(joint_actions)

        me = self.active_player
        return self.observations[me], rewards[me], dones[me], info

## Observation Preprocessing
##### Often the original form of the observation is not ideal as neural network input. Therefore, we pre-process the observation to extract the key pieces of information so that the network can more easily learn a value or policy function.

In [5]:
class TronExtractBoard(Preprocessor):
    """
    Preprocessor that reduces a Tron observation to a two-channel image.

    Channel 0 is the board with every cell value > 1 replaced by -1 and a border of
    -1 added around it. Channel 1 is zero everywhere except at the head positions,
    which hold 1 + direction, with the same -1 border.
    """
    def _init_shape(self, obs_space, options):
        """ Output shape: the board padded by one cell on every side, with two channels. """
        # Use the space handed to this preprocessor instead of a notebook-global `env`,
        # so the class does not depend on which cells were executed before it is built.
        board_size = obs_space['board'].shape[0]
        return (board_size + 2, board_size + 2, 2)

    def transform(self, observation):
        """
        Transform a single observation, or a dict of per-player observations.

        A mapping containing the key 'board' is treated as one observation. Anything
        else is treated as {player: observation}; each entry is transformed with a
        rotation equal to int(player), and a dict with the same keys is returned.
        """
        if 'board' in observation:
            return self._transform(observation)
        else:
            return {player: self._transform(obs, int(player)) for player, obs in observation.items()}

    def _transform(self, observation, rotate: int = 0):
        """
        Build the (board_size + 2, board_size + 2, 2) array for one observation.

        `rotate` is the number of quarter turns passed to `np.rot90` for both channels;
        the direction values written into the heads channel are shifted by the same amount.
        """
        # Work on a copy so the caller's observation is left untouched.
        board = observation['board'].copy()

        # Make all enemies look the same
        board[board > 1] = -1

        # Mark where all of the player heads are
        heads = np.zeros_like(board)

        if (rotate != 0):
            # Heads are flat indices into the un-rotated board, so mark them first
            # and rotate afterwards together with the board.
            # NOTE(review): shifting directions by `rotate` presumably keeps headings
            # consistent with the rotated board - confirm against the direction encoding.
            heads.ravel()[observation['heads']] += 1 + ((observation['directions'] - rotate) % 4)

            board = np.rot90(board, k=rotate)
            heads = np.rot90(heads, k=rotate)

        else:
            heads.ravel()[observation['heads']] += 1 + observation['directions']

        # Pad the outsides so that we know where the wall is
        board = np.pad(board, 1, 'constant', constant_values=-1)
        heads = np.pad(heads, 1, 'constant', constant_values=-1)

        # Combine together: add a trailing channel axis to each and stack along it
        board = np.expand_dims(board, -1)
        heads = np.expand_dims(heads, -1)

        return np.concatenate([board, heads], axis=-1)

In [6]:
def test(render, env, trainer, frame_time = 0.4):
    policy = trainer.get_policy()
    policy.cur_epsilon_value = 0
    render.close()
    state = env.reset()
    done = False
    action = None
    reward = None
    cumulative_reward = 0

    while not done:
        action = trainer.compute_action(state, prev_action=action, prev_reward=reward)

        state, reward, done, results = env.step(action)
        cumulative_reward += reward
        render.render(env.env.state)

        sleep(frame_time)

    render.render(env.env.state)    
    return cumulative_reward

In [None]:
# Start from a clean Ray runtime (shut down any instance left over from a previous run)
ray.shutdown()
ray.init()

def environment_creater(params=None):
    """ Build a 13x13, 4-player Tron game wrapped as a single-player environment.

    `params` is accepted so the function can be used as a registered environment
    creator, but it is not read.
    """
    opponent = SimpleAvoidAgent(noise=0.05)
    tron = TronRllibEnvironment(board_size=13, num_players=4)
    return SinglePlayer(tron, agents=opponent)

# Local copy of the environment, used for rendering test games in this notebook
env = environment_creater()

# Make the environment and the board preprocessor available to RLlib by name
tune.register_env("tron_single_player", environment_creater)
ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

# Configure Deep Q Learning with reasonable values
# config = DEFAULT_CONFIG.copy()
# config['num_workers'] = 4
# config['num_gpus'] = 0
# config["timesteps_per_iteration"] = 1024
# config['target_network_update_freq'] = 2048
# config['buffer_size'] = 50_000
# config['schedule_max_timesteps'] = 200_000
# config['exploration_fraction'] = 0.9
# config['compress_observations'] = False
# config['num_envs_per_worker'] = 1 if LOAD_FROM_CHECKPOINT else 4
# config['train_batch_size'] = 4096
# config['n_step'] = 2
# config['seed'] = SEED

# Configure Deep Q Learning for multi-agent training
config = DEFAULT_CONFIG.copy()
# `dict.copy()` is shallow: without this, the nested 'model' dict would still be the
# one owned by DEFAULT_CONFIG and the model edits below would leak into the defaults.
config['model'] = dict(config['model'])

config['num_workers'] = 4
config["timesteps_per_iteration"] = 128
config['target_network_update_freq'] = 256
config['buffer_size'] = 10_000
config['schedule_max_timesteps'] = 100_000
config['exploration_fraction'] = 0.9
config['compress_observations'] = False
# Use a single environment per worker when only replaying a checkpoint
config['num_envs_per_worker'] = 1 if LOAD_FROM_CHECKPOINT else 4
config['train_batch_size'] = 256
config['n_step'] = 2
config['seed'] = SEED

# We will use a simple convolution network with 3 layers as our feature extractor
config['model']['vf_share_layers'] = True
config['model']['conv_filters'] = [(64, 5, 2), (128, 3, 2), (256, 3, 2)]
config['model']['fcnet_hiddens'] = [256]
# Observations go through the preprocessor registered under the name 'tron_prep'
config['model']['custom_preprocessor'] = 'tron_prep'

# Begin training or evaluation
trainer = DQNTrainer(config, "tron_single_player")
render = TronRender(13, 4)

if LOAD_FROM_CHECKPOINT:
    # Bind `checkpoint` in this branch as well, so the replay cell that follows
    # (which calls trainer.restore(checkpoint)) also works when training is skipped.
    # NOTE: this is a machine-specific absolute path; point it at your own checkpoint.
    checkpoint = "/home/andranik/ray_results/DQN_tron_single_player_2020-03-09_17-37-30tbji4p81/checkpoint_101/checkpoint-101"

    np.random.seed(SEED)
    trainer.restore(checkpoint)
    for _ in range(10):
        print(test(render, env, trainer))
        sleep(3)
else:
    num_epoch = 5001
    checkpoint_epochs = 100   # save a checkpoint every this many iterations
    test_epochs = 100         # watch rendered games every this many iterations
    for epoch in range(num_epoch):
        print("Training iteration: {}".format(epoch), end='')
        res = trainer.train()
        print(f", Average reward: {res['episode_reward_mean']}")

        # Iteration 0 satisfies this check, so `checkpoint` is always bound
        # once the loop has completed its first pass.
        if epoch % checkpoint_epochs == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)

        if epoch % test_epochs == 0:
            # Play a few rendered games to see how the current policy behaves.
            for _ in range(3):
                test(render, env, trainer)
                sleep(2)

2020-03-09 23:55:48,972	INFO resource_spec.py:212 -- Starting Ray with 4.88 GiB memory available for workers and up to 2.46 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-09 23:55:50,092	INFO services.py:1093 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m
2020-03-09 23:55:54,381	INFO trainer.py:377 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-03-09 23:55:54,432	INFO trainer.py:524 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=2702)[0m E0309 23:55:54.352404900    2702 socket_utils_common_posix.cc:208] check for SO_REUSEPORT: {"created":"@1583823354.352389200","description":"Protocol not available","errno":92,"file":"external/com_github_grpc_grpc/src/core/lib/iomgr/socket_utils_common_posix.cc","file_line":185,"os_error":"Protocol not available","syscall":"getsockopt(SO_REUSEPORT)"}
[2m[36m(pid=2702)[0m E0309 23:55:54.352686000    2702 socket_utils_common_posix.cc:313] setsockopt(TCP_USER_TIMEOUT) Protocol not available
[2m[36m(pid=2704)[0m E0309 23:55:54.380412700    2704 socket_utils_common_posix.cc:208] check for SO_REUSEPORT: {"created":"@1583823354.380393000","description":"Protocol not available","errno":92,"file":"external/com_github_grpc_grpc/src/core/lib/iomgr/socket_utils_common_posix.cc","file_line":185,"os_error":"Protocol not available","syscall":"getsockopt(SO_REUSEPORT)"}
[2m[36m(pid=2704)[0m E0309 23:55:54.380723200    2704 socket_utils_common_posix.cc:313] setsockopt(T



[2m[36m(pid=2703)[0m E0309 23:55:54.482307000    2703 socket_utils_common_posix.cc:208] check for SO_REUSEPORT: {"created":"@1583823354.482287000","description":"Protocol not available","errno":92,"file":"external/com_github_grpc_grpc/src/core/lib/iomgr/socket_utils_common_posix.cc","file_line":185,"os_error":"Protocol not available","syscall":"getsockopt(SO_REUSEPORT)"}
[2m[36m(pid=2703)[0m E0309 23:55:54.482637600    2703 socket_utils_common_posix.cc:313] setsockopt(TCP_USER_TIMEOUT) Protocol not available
[2m[36m(pid=2705)[0m E0309 23:55:54.596358100    2705 socket_utils_common_posix.cc:208] check for SO_REUSEPORT: {"created":"@1583823354.596337700","description":"Protocol not available","errno":92,"file":"external/com_github_grpc_grpc/src/core/lib/iomgr/socket_utils_common_posix.cc","file_line":185,"os_error":"Protocol not available","syscall":"getsockopt(SO_REUSEPORT)"}
[2m[36m(pid=2705)[0m E0309 23:55:54.596695600    2705 socket_utils_common_posix.cc:313] setsockopt(T

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


, Average reward: 3.066666666666667
checkpoint saved at /home/andranik/ray_results/DQN_tron_single_player_2020-03-09_23-55-54_wowtdvl/checkpoint_1/checkpoint-1
Training iteration: 1



, Average reward: 5.693069306930693
Training iteration: 2, Average reward: 5.01
Training iteration: 3, Average reward: 4.64
Training iteration: 4, Average reward: 4.93
Training iteration: 5, Average reward: 5.03
Training iteration: 6, Average reward: 4.94
Training iteration: 7, Average reward: 5.32
Training iteration: 8, Average reward: 5.36
Training iteration: 9, Average reward: 5.76
Training iteration: 10, Average reward: 5.01
Training iteration: 11, Average reward: 4.78
Training iteration: 12, Average reward: 5.25
Training iteration: 13, Average reward: 5.03
Training iteration: 14, Average reward: 5.13
Training iteration: 15, Average reward: 5.02
Training iteration: 16, Average reward: 4.75
Training iteration: 17, Average reward: 5.31
Training iteration: 18, Average reward: 5.64
Training iteration: 19, Average reward: 5.73
Training iteration: 20, Average reward: 5.1
Training iteration: 21, Average reward: 4.93
Training iteration: 22, Average reward: 4.92
Training iteration: 23, Aver

Training iteration: 179, Average reward: 9.22
Training iteration: 180, Average reward: 10.4
Training iteration: 181, Average reward: 10.02
Training iteration: 182, Average reward: 10.3
Training iteration: 183, Average reward: 9.62
Training iteration: 184, Average reward: 9.78
Training iteration: 185, Average reward: 9.72
Training iteration: 186, Average reward: 9.53
Training iteration: 187, Average reward: 10.6
Training iteration: 188, Average reward: 11.14
Training iteration: 189, Average reward: 10.83
Training iteration: 190, Average reward: 10.92
Training iteration: 191, Average reward: 10.93
Training iteration: 192, Average reward: 11.82
Training iteration: 193, Average reward: 11.69
Training iteration: 194, Average reward: 12.04
Training iteration: 195, Average reward: 10.73
Training iteration: 196, Average reward: 11.21
Training iteration: 197, Average reward: 11.37
Training iteration: 198, Average reward: 10.92
Training iteration: 199, Average reward: 10.64
Training iteration: 2

Training iteration: 349, Average reward: 36.22
Training iteration: 350, Average reward: 34.2
Training iteration: 351, Average reward: 37.84
Training iteration: 352, Average reward: 39.16
Training iteration: 353, Average reward: 34.83
Training iteration: 354, Average reward: 33.52
Training iteration: 355, Average reward: 34.31
Training iteration: 356, Average reward: 34.6
Training iteration: 357, Average reward: 41.7
Training iteration: 358, Average reward: 39.38
Training iteration: 359, Average reward: 44.26
Training iteration: 360, Average reward: 45.05
Training iteration: 361, Average reward: 42.58
Training iteration: 362, Average reward: 32.82
Training iteration: 363, Average reward: 27.64
Training iteration: 364, Average reward: 21.85
Training iteration: 365, Average reward: 21.79
Training iteration: 366, Average reward: 21.78
Training iteration: 367, Average reward: 23.78
Training iteration: 368, Average reward: 24.86
Training iteration: 369, Average reward: 28.62
Training iterati

In [None]:
# Replay ten rendered games from a saved checkpoint.
# Relies on `checkpoint`, `trainer`, `render` and `env` from the previous cell.
np.random.seed(SEED)
trainer.restore(checkpoint)
for _ in range(10):
    game_reward = test(render, env, trainer)
    print(game_reward)
    sleep(3)