# ReinforcementLearningBot training

In [2]:
import random
import sys
import numpy as np
from tqdm.notebook import trange
from stable_baselines3 import PPO, A2C, DQN
import supersuit

sys.path.append('..')
from env import BlockadeEnv
from blockade import Blockade
from players.ReinforcementLearningBot import ReinforcementLearningBot
from players.OptimizedBot import OptimizedBot
from players.HeuristicBot import HeuristicBot
from players.RandomBot import RandomBot

# Environment

In [3]:
# Environment construction follows:
# - the PPO example in the stable-baselines3 docs:
#   https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html
# - the wrapper chain explained in: https://stackoverflow.com/a/73192247
#
# NOTE: training requires stable-baselines3 2.0.0a8 with two manual patches,
# each making the library unpack five values from env.step():
# 1) stable_baselines3/common/on_policy_algorithm.py, line 178:
#        new_obs, rewards, dones, _, infos = env.step(clipped_actions)
# 2) stable_baselines3/common/off_policy_algorithm.py, line 544:
#        new_obs, rewards, dones, _, infos = env.step(actions)

# Build the game environment, then wrap it step by step into the vectorized
# environment (`env`) that the training cells below pass to the algorithms.
blockade_env = BlockadeEnv(arena_size=15)
vectorized_env = supersuit.pettingzoo_env_to_vec_env_v1(blockade_env)
env = supersuit.concat_vec_envs_v1(vectorized_env, 1, base_class='stable_baselines3')

# Attempt 1 (default PPO, 20 mln steps)

In [4]:
# Attempt 1: PPO with the library's default hyperparameters, 20 million steps.
# A training summary is logged every 1000 iterations; the model is then saved
# next to the bot implementations so ReinforcementLearningBot can load it.
model = PPO(policy='MlpPolicy', env=env, verbose=1)
model.learn(
    total_timesteps=20_000_000,
    log_interval=1000,
)
model.save('../players/PPO15v1')

Using cpu device
---------------------------------------
| time/                   |           |
|    fps                  | 716       |
|    iterations           | 1000      |
|    time_elapsed         | 5719      |
|    total_timesteps      | 4096000   |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -6.49e-06 |
|    explained_variance   | 0.866     |
|    learning_rate        | 0.0003    |
|    loss                 | 415       |
|    n_updates            | 9990      |
|    policy_gradient_loss | -7.21e-09 |
|    value_loss           | 1.26e+03  |
---------------------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 691       |
|    iterations           | 2000      |
|    time_elapsed         | 11851     |
|    total_timesteps      | 8192000   |
| train/               

# Attempt 2 (default A2C, 20 mln steps)

In [5]:
# Attempt 2: A2C with the library's default hyperparameters, 20 million steps.
# A training summary is logged every 1000 iterations; the model is then saved
# next to the bot implementations so ReinforcementLearningBot can load it.
model = A2C(policy='MlpPolicy', env=env, verbose=1)
model.learn(
    total_timesteps=20_000_000,
    log_interval=1000,
)
model.save('../players/A2C15v1')

Using cpu device
------------------------------------
| time/                 |          |
|    fps                | 1238     |
|    iterations         | 1000     |
|    time_elapsed       | 8        |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -0.457   |
|    explained_variance | -0.332   |
|    learning_rate      | 0.0007   |
|    n_updates          | 999      |
|    policy_loss        | 46.1     |
|    value_loss         | 5.55e+03 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1248     |
|    iterations         | 2000     |
|    time_elapsed       | 16       |
|    total_timesteps    | 20000    |
| train/                |          |
|    entropy_loss       | -0.513   |
|    explained_variance | 0.771    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1999     |
|    policy_loss        | 0.255    |
|    value_loss      

# Attempt 3 (default DQN, 20 mln steps)

In [6]:
# Attempt 3: DQN with the library's default hyperparameters, 20 million steps.
# A rollout summary is logged every 1000 episodes; the model is then saved
# next to the bot implementations so ReinforcementLearningBot can load it.
model = DQN(policy='MlpPolicy', env=env, verbose=1)
model.learn(
    total_timesteps=20_000_000,
    log_interval=1000,
)
model.save('../players/DQN15v1')

Using cpu device
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.999    |
| time/               |          |
|    episodes         | 1000     |
|    fps              | 5176     |
|    time_elapsed     | 0        |
|    total_timesteps  | 3084     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.997    |
| time/               |          |
|    episodes         | 2000     |
|    fps              | 5220     |
|    time_elapsed     | 1        |
|    total_timesteps  | 6172     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.996    |
| time/               |          |
|    episodes         | 3000     |
|    fps              | 5326     |
|    time_elapsed     | 1        |
|    total_timesteps  | 9286     |
----------------------------------
----------------------------------
| r

# Compare bots

In [4]:
def make_bot_comparison(bot1, bot2, arena_size=15, num_seeds=500, repetitions=2):
    """Play a batch of windowless Blockade games and report bot1's record vs bot2.

    For each seed in ``range(num_seeds)`` the pairing is played ``repetitions``
    times. The global ``random`` generator is re-seeded with that seed before
    every game, so all repetitions of a seed start from the same random state.
    On odd repetitions the bots swap seats (bot1 plays as ``player2``).

    The result of ``Blockade.run_windowless()`` is interpreted as: ``1`` means
    player 1 won, ``2`` means player 2 won, ``0`` is a draw. Any other outcome
    is counted as a loss for bot1.

    Args:
        bot1: The bot whose results are reported.
        bot2: The opponent.
        arena_size: Forwarded to ``Blockade(arena_size=...)``.
        num_seeds: Number of distinct seeds (0 .. num_seeds - 1) to play.
        repetitions: Games per seed; seats alternate between repetitions.

    Returns:
        A tuple ``(won, drawn, lost)`` counted from bot1's perspective.
    """
    bot1_wins = 0
    draws = 0
    total_games = 0
    for seed in trange(num_seeds):
        for rep in range(repetitions):
            random.seed(seed)
            # Track bot1's seat explicitly instead of comparing player objects:
            # object comparison misattributes wins when bot1 and bot2 are the
            # same (or equal) object, e.g. in self-play.
            if rep % 2:
                p1, p2, bot1_seat = bot2, bot1, 2
            else:
                p1, p2, bot1_seat = bot1, bot2, 1
            game = Blockade(player1=p1,
                            player2=p2,
                            arena_size=arena_size,
                            verbose=False)
            outcome = game.run_windowless()
            total_games += 1
            if outcome == 0:
                draws += 1
            elif outcome == bot1_seat:
                bot1_wins += 1

    lost_games = total_games - bot1_wins - draws

    def _percent(count):
        # Guard against ZeroDivisionError when no games were played.
        if total_games == 0:
            return 0.0
        return np.round(count / total_games * 100.0, 2)

    print(f'{bot1} against {bot2} results (arena_size={arena_size}):')
    print(f'{bot1_wins}/{total_games} games won ({_percent(bot1_wins)}%)')
    print(f'{draws}/{total_games} draws ({_percent(draws)}%)')
    print(f'{lost_games}/{total_games} games lost ({_percent(lost_games)}%)')

    return bot1_wins, draws, lost_games

## vs RandomBot

In [5]:
# PPO agent (attempt 1) against the random baseline.
ppo_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/PPO15v1',
    model_type='ppo',
)
random_bot = RandomBot(verbose=False)
make_bot_comparison(ppo_bot, random_bot)

  0%|          | 0/500 [00:00<?, ?it/s]

ReinforcementLearningBot against RandomBot results (arena_size=15):
464/1000 games won (46.4%)
103/1000 draws (10.3%)
433/1000 games lost (43.3%)


(464, 103, 433)

In [6]:
# A2C agent (attempt 2) against the random baseline.
a2c_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/A2C15v1',
    model_type='a2c',
)
random_bot = RandomBot(verbose=False)
make_bot_comparison(a2c_bot, random_bot)

  0%|          | 0/500 [00:00<?, ?it/s]

ReinforcementLearningBot against RandomBot results (arena_size=15):
567/1000 games won (56.7%)
114/1000 draws (11.4%)
319/1000 games lost (31.9%)


(567, 114, 319)

In [7]:
# DQN agent (attempt 3) against the random baseline.
dqn_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/DQN15v1',
    model_type='dqn',
)
random_bot = RandomBot(verbose=False)
make_bot_comparison(dqn_bot, random_bot)

  0%|          | 0/500 [00:00<?, ?it/s]

ReinforcementLearningBot against RandomBot results (arena_size=15):
499/1000 games won (49.9%)
88/1000 draws (8.8%)
413/1000 games lost (41.3%)


(499, 88, 413)

## vs HeuristicBot

In [8]:
# PPO agent (attempt 1) against the hand-written heuristic bot.
ppo_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/PPO15v1',
    model_type='ppo',
)
heuristic_bot = HeuristicBot(verbose=False)
make_bot_comparison(ppo_bot, heuristic_bot)

  0%|          | 0/500 [00:00<?, ?it/s]

ReinforcementLearningBot against HeuristicBot results (arena_size=15):
22/1000 games won (2.2%)
129/1000 draws (12.9%)
849/1000 games lost (84.9%)


(22, 129, 849)

In [9]:
# A2C agent (attempt 2) against the hand-written heuristic bot.
a2c_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/A2C15v1',
    model_type='a2c',
)
heuristic_bot = HeuristicBot(verbose=False)
make_bot_comparison(a2c_bot, heuristic_bot)

  0%|          | 0/500 [00:00<?, ?it/s]

ReinforcementLearningBot against HeuristicBot results (arena_size=15):
144/1000 games won (14.4%)
132/1000 draws (13.2%)
724/1000 games lost (72.4%)


(144, 132, 724)

In [10]:
# DQN agent (attempt 3) against the hand-written heuristic bot.
dqn_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/DQN15v1',
    model_type='dqn',
)
heuristic_bot = HeuristicBot(verbose=False)
make_bot_comparison(dqn_bot, heuristic_bot)

  0%|          | 0/500 [00:00<?, ?it/s]

ReinforcementLearningBot against HeuristicBot results (arena_size=15):
51/1000 games won (5.1%)
87/1000 draws (8.7%)
862/1000 games lost (86.2%)


(51, 87, 862)

## vs other RL bots

In [11]:
# Head-to-head: PPO agent (reported side) against the A2C agent.
ppo_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/PPO15v1',
    model_type='ppo',
)
a2c_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/A2C15v1',
    model_type='a2c',
)
make_bot_comparison(ppo_bot, a2c_bot)

  0%|          | 0/500 [00:00<?, ?it/s]

ReinforcementLearningBot against ReinforcementLearningBot results (arena_size=15):
274/1000 games won (27.4%)
177/1000 draws (17.7%)
549/1000 games lost (54.9%)


(274, 177, 549)

In [12]:
# Head-to-head: PPO agent (reported side) against the DQN agent.
ppo_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/PPO15v1',
    model_type='ppo',
)
dqn_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/DQN15v1',
    model_type='dqn',
)
make_bot_comparison(ppo_bot, dqn_bot)

  0%|          | 0/500 [00:00<?, ?it/s]

ReinforcementLearningBot against ReinforcementLearningBot results (arena_size=15):
549/1000 games won (54.9%)
53/1000 draws (5.3%)
398/1000 games lost (39.8%)


(549, 53, 398)

In [13]:
# Head-to-head: A2C agent (reported side) against the DQN agent.
a2c_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/A2C15v1',
    model_type='a2c',
)
dqn_bot = ReinforcementLearningBot(
    verbose=False,
    model_name='../players/DQN15v1',
    model_type='dqn',
)
make_bot_comparison(a2c_bot, dqn_bot)

  0%|          | 0/500 [00:00<?, ?it/s]

ReinforcementLearningBot against ReinforcementLearningBot results (arena_size=15):
449/1000 games won (44.9%)
62/1000 draws (6.2%)
489/1000 games lost (48.9%)


(449, 62, 489)

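Overall, the best results were achieved by attempt 2 (the default A2C algorithm trained for 20 million steps): it had the highest win rate against RandomBot (56.7%) and HeuristicBot (14.4%) and beat PPO head-to-head, although it narrowly lost the direct comparison with DQN (44.9% won vs. 48.9% lost). As a next step, the results should be further improved by training A2C with different hyperparameters.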