In [62]:
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
import concurrent.futures
from torch import optim
import torch
import os
import random
import copy


%load_ext autoreload
%autoreload 2
import env
import network
import player


# Board geometry is owned by the environment module; mirror it here for brevity.
BOARD_XSIZE = env.BOARD_XSIZE
BOARD_YSIZE = env.BOARD_YSIZE

DIMS = (BOARD_XSIZE, BOARD_YSIZE)


# Training schedule.
EPISODES_PER_AGENT = 20         # games submitted per role (crewmate / impostor) in each epoch
TRAIN_EPOCHS = 500000           # number of collect-then-train iterations
MODEL_SAVE_INTERVAL = 100       # checkpoint whenever step % interval == 0
MAKE_OPPONENT_INTERVAL = 1000   # NOTE(review): not referenced by any visible cell
SUMMARY_STATS_INTERVAL = 10     # log mean episode reward whenever step % interval == 0
RANDOM_SEED = 42

SUMMARY_DIR = './summary'
MODEL_DIR = './models'          # NOTE(review): not referenced by any visible cell

# Create the result directory; exist_ok avoids the check-then-create race.
os.makedirs(SUMMARY_DIR, exist_ok=True)

use_cuda = torch.cuda.is_available()

# Seed every RNG the notebook draws from: the rollout code uses both
# `np.random` (spawn locations, seat shuffling) and `random` (impostor choice),
# so seeding torch alone leaves those draws unseeded.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

cuda = torch.device("cuda")
cpu = torch.device("cpu")

# Prefer the GPU when one is available.
device = cuda if use_cuda else cpu

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
# TODO: restore neural net parameters

# Impostor agent: policy network, value network, and one Adam optimizer for each.
# Networks are constructed in the same order as before (impostor first, actor
# before critic) so the seeded weight initialisation is unaffected.
impostor_actor, impostor_critic = network.Actor().to(device), network.Critic().to(device)
impostor_actor_optimizer = optim.Adam(impostor_actor.parameters(), lr=network.ACTOR_LR)
impostor_critic_optimizer = optim.Adam(impostor_critic.parameters(), lr=network.CRITIC_LR)

# Crewmate agent: same layout, independent parameters.
crewmate_actor, crewmate_critic = network.Actor().to(device), network.Critic().to(device)
crewmate_actor_optimizer = optim.Adam(crewmate_actor.parameters(), lr=network.ACTOR_LR)
crewmate_critic_optimizer = optim.Adam(crewmate_critic.parameters(), lr=network.CRITIC_LR)

# TensorBoard writer shared by both agents.
writer = SummaryWriter(log_dir=SUMMARY_DIR)

# Per-agent step counters, used as the x-axis for logged scalars.
impostor_step, crewmate_step = 0, 0

In [59]:
# Opponent pools, one per role; each starts with a single random-policy player.
crewmate_pool: list[player.Player] = [player.RandomPlayer()]
impostor_pool: list[player.Player] = [player.RandomPlayer()]

In [60]:

def random_valid_location() -> tuple[int, int]:
    """Pick a random board cell.

    Returns:
        An ``(x, y)`` pair with ``0 <= x < BOARD_XSIZE`` and
        ``0 <= y < BOARD_YSIZE``, drawn from NumPy's global RNG
        (x is drawn first, then y).
    """
    x_coord = np.random.randint(BOARD_XSIZE)
    y_coord = np.random.randint(BOARD_YSIZE)
    return x_coord, y_coord


def play(actor_engine: player.ActorPlayer, actor_is_impostor: bool, other_engines: list[player.Player]) -> tuple[
    list[env.Observation],
    list[env.Action],
    list[np.ndarray],
    list[env.Reward],
    list[env.Advantage],
    list[env.Reward],
    bool
]:
    """Play one game and record the trajectory of `actor_engine`.

    The actor and the other engines are placed at random board locations and
    seated in a random turn order. Each round every engine picks an action,
    then the environment is stepped. The rollout ends when the environment
    reports the game over, or when the game is over for the actor.

    Args:
        actor_engine: The engine whose experience is collected. Its `critic`
            is used to compute advantages.
        actor_is_impostor: Whether the actor plays the impostor. When False,
            one of the other players is chosen at random to be the impostor.
        other_engines: Engines controlling the remaining players.

    Returns:
        A tuple ``(s_t, a_t, p_t, r_t, d_t, v_t, actor_is_impostor)``:
        the actor's observations, chosen actions, action probabilities and
        per-round rewards, followed by the advantages from
        `network.compute_advantage`, the values from `network.compute_value`,
        and the `actor_is_impostor` flag passed in.
    """
    # create environment
    e = env.Env()

    # create actor player at random location
    # (third PlayerState argument is presumably a "dead" flag — TODO confirm against env)
    actor_playerstate = env.PlayerState(random_valid_location(), actor_is_impostor, False)
    # create other players at random locations
    other_playerstate = [env.PlayerState(random_valid_location(), False, False) for _ in other_engines]
    # If the actor is not an impostor, then the impostor is randomly chosen from the others.
    if not actor_is_impostor:
        random.choice(other_playerstate).impostor = True

    # set the players in the environment
    e.state.players = [actor_playerstate] + other_playerstate
    # set the player engines
    player_engines = [actor_engine] + other_engines

    # Shuffle the seats with one shared permutation so that player states and
    # engines still line up index-for-index afterwards.
    random_indices = np.random.permutation(len(player_engines))
    e.state.players = [e.state.players[i] for i in random_indices]
    player_engines = [player_engines[i] for i in random_indices]

    # The shuffle puts old element random_indices[j] into new seat j. The actor
    # started at old index 0, so its new seat is the j with random_indices[j] == 0
    # (NOT random_indices[0], which is the old index of whoever now sits in seat 0).
    actor_index = env.Player(random_indices.tolist().index(0))

    s_t: list[env.Observation] = []
    a_t: list[env.Action] = []
    p_t: list[np.ndarray] = []
    r_t: list[env.Reward] = []
    # play the game
    while not e.game_over():
        # `seat` rather than `player`, to avoid shadowing the imported `player` module
        for seat, engine in enumerate(player_engines):
            seat = env.Player(seat)
            if seat == actor_index:
                # this is the actor we're gathering data for: record everything
                obs, action_probs, chosen_action = engine.play(seat, e)
                s_t.append(obs)
                p_t.append(action_probs)
                a_t.append(chosen_action)
                e.play(chosen_action, seat)
                continue
            # skip dead players
            if e.game_over_for(seat):
                continue
            # get chosen action from player engine and play it
            _, _, chosen_action = engine.play(seat, e)
            e.play(chosen_action, seat)
        # step and get rewards
        rewards = e.step()
        r_t.append(rewards[actor_index])
        # if the actor we're gathering data for is dead, then we need to stop
        if e.game_over_for(actor_index):
            break

    # compute advantage and value
    d_t = network.compute_advantage(actor_engine.critic, s_t, r_t)
    v_t = network.compute_value(r_t)

    return s_t, a_t, p_t, r_t, d_t, v_t, actor_is_impostor


In [63]:
# Episode returns accumulated since the last summary write, one buffer per role.
impostor_reward_buf: list[float] = []
crewmate_reward_buf: list[float] = []

# Number of RandomPlayer opponents seated alongside the learning agent in each game.
OPPONENTS_PER_GAME = 3

# Rollouts are farmed out to a thread pool; training and logging stay on the main thread.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    for _epoch in range(TRAIN_EPOCHS):
        # Fresh per-epoch batches: observations, actions, action probabilities,
        # advantages and values, concatenated over all episodes of a role.
        crewmate_s_batch: list[env.Observation] = []
        crewmate_a_batch: list[env.Action] = []
        crewmate_p_batch: list[np.ndarray] = []
        crewmate_d_batch: list[env.Advantage] = []
        crewmate_v_batch: list[env.Value] = []

        impostor_s_batch: list[env.Observation] = []
        impostor_a_batch: list[env.Action] = []
        impostor_p_batch: list[np.ndarray] = []
        impostor_d_batch: list[env.Advantage] = []
        impostor_v_batch: list[env.Value] = []

        # create actor players wrapping the current networks
        crewmate_nn_player = player.ActorPlayer(crewmate_actor, crewmate_critic, crewmate_step)
        impostor_nn_player = player.ActorPlayer(impostor_actor, impostor_critic, impostor_step)

        # Submit EPISODES_PER_AGENT games per role: crewmate games first, then impostor games.
        futures = []
        for nn_player, plays_impostor in ((crewmate_nn_player, False), (impostor_nn_player, True)):
            for _episode in range(EPISODES_PER_AGENT):
                others = [player.RandomPlayer() for _ in range(OPPONENTS_PER_GAME)]
                futures.append(executor.submit(play, nn_player, plays_impostor, others))

        # Collect trajectories as they finish and route them to the matching role's batch.
        for future in concurrent.futures.as_completed(futures):
            s_t, a_t, p_t, r_t, d_t, v_t, was_impostor = future.result()

            if was_impostor:
                impostor_s_batch += s_t
                impostor_a_batch += a_t
                impostor_p_batch += p_t
                impostor_d_batch += d_t
                impostor_v_batch += v_t

                # statistics
                impostor_reward_buf.append(np.sum(r_t))
            else:
                crewmate_s_batch += s_t
                crewmate_a_batch += a_t
                crewmate_p_batch += p_t
                crewmate_d_batch += d_t
                crewmate_v_batch += v_t

                # statistics
                crewmate_reward_buf.append(np.sum(r_t))

        crewmate_actor_losses, crewmate_critic_losses = network.train_ppo(
            crewmate_actor,
            crewmate_critic,
            crewmate_actor_optimizer,
            crewmate_critic_optimizer,
            crewmate_s_batch,
            crewmate_a_batch,
            crewmate_p_batch,
            crewmate_d_batch,
            crewmate_v_batch
        )

        impostor_actor_losses, impostor_critic_losses = network.train_ppo(
            impostor_actor,
            impostor_critic,
            impostor_actor_optimizer,
            impostor_critic_optimizer,
            impostor_s_batch,
            impostor_a_batch,
            impostor_p_batch,
            impostor_d_batch,
            impostor_v_batch
        )

        # One logging step per returned loss; zip stops at the shortest of the four lists.
        for crewmate_actor_loss, crewmate_critic_loss, impostor_actor_loss, impostor_critic_loss in zip(
            crewmate_actor_losses, crewmate_critic_losses, impostor_actor_losses, impostor_critic_losses
        ):
            writer.add_scalar('impostor_actor_loss', impostor_actor_loss, impostor_step)
            writer.add_scalar('impostor_critic_loss', impostor_critic_loss, impostor_step)

            writer.add_scalar('crewmate_actor_loss', crewmate_actor_loss, crewmate_step)
            writer.add_scalar('crewmate_critic_loss', crewmate_critic_loss, crewmate_step)

            # Skip the summary when the buffer is empty: it is only refilled once per
            # epoch, so a second summary step inside the same epoch would otherwise
            # log np.mean([]) == nan.
            if impostor_step % SUMMARY_STATS_INTERVAL == 0 and impostor_reward_buf:
                writer.add_scalar('impostor_reward', np.mean(impostor_reward_buf), impostor_step)
                impostor_reward_buf = []

            if crewmate_step % SUMMARY_STATS_INTERVAL == 0 and crewmate_reward_buf:
                writer.add_scalar('crewmate_reward', np.mean(crewmate_reward_buf), crewmate_step)
                crewmate_reward_buf = []

            # Save the neural net parameters to disk.
            # NOTE(review): checkpoints are written under SUMMARY_DIR while MODEL_DIR
            # is defined but unused — confirm which location is intended.
            if impostor_step % MODEL_SAVE_INTERVAL == 0:
                torch.save(impostor_actor.state_dict(), f"{SUMMARY_DIR}/impostor_model_ep_{impostor_step}_actor.ckpt")
                torch.save(impostor_critic.state_dict(), f"{SUMMARY_DIR}/impostor_model_ep_{impostor_step}_critic.ckpt")

            # Save the neural net parameters to disk.
            if crewmate_step % MODEL_SAVE_INTERVAL == 0:
                torch.save(crewmate_actor.state_dict(), f"{SUMMARY_DIR}/crewmate_model_ep_{crewmate_step}_actor.ckpt")
                torch.save(crewmate_critic.state_dict(), f"{SUMMARY_DIR}/crewmate_model_ep_{crewmate_step}_critic.ckpt")

            crewmate_step += 1
            impostor_step += 1

ValueError: Entropy is too low!

In [None]:
# Single rollout against three random opponents, for manual inspection.
# NOTE(review): the impostor network is passed with actor_is_impostor=False,
# so it plays as a crewmate here — confirm this is intended.
others = [player.RandomPlayer() for _ in range(3)]
# play the game
s_t, a_t, p_t, r_t, d_t, v_t, was_impostor = play(impostor_nn_player, False, others)

In [None]:
# Dump the rollout step by step: board, chosen action, then the action
# probabilities, reward, advantage and value, each on its own line.
for observation, action, probs, reward, advantage, value in zip(s_t, a_t, p_t, r_t, d_t, v_t):
    print("-----------------------------------")
    env.print_obs(observation)
    env.print_action(action)
    for field in (probs, reward, advantage, value):
        print(field)

-----------------------------------
⬛⬛⬛⬛⬛
⬛⬛⬛⬛⬛
⬛⬛📦⬛⬛
🧑‍🚀😇👽⬛⬛
⬛🧑‍🚀⬛⬛⬛
Move Down
[0.15051111 0.18259882 0.18218189 0.31464924 0.17005894]
0.0
0.32521974733856907
0.2112896765117198
-----------------------------------
⬛⬛⬛⬛⬛
⬛⬛⬛⬛⬛
⬛⬛📦⬛⬛
⬛👽😇⬛⬛
🧑‍🚀🧑‍🚀⬛⬛⬛
Move Right
[0.14676518 0.24595995 0.16116141 0.20940875 0.23670471]
0.0
0.3285047952914839
0.21342391566840382
-----------------------------------
⬛⬛⬛⬛⬛
⬛⬛⬛⬛⬛
⬛👽📦⬛⬛
⬛⬛⬛⬛⬛
🧑‍🚀🧑‍🚀😇⬛⬛
Move Up
[0.16334487 0.13441719 0.26717463 0.20144596 0.23361735]
0.0
0.3318230255469534
0.2155797127963675
-----------------------------------
⬛⬛⬛⬛⬛
⬛⬛⬛⬛⬛
⬛👽📦⬛⬛
⬛⬛⬛⬛⬛
🧑‍🚀😇⬛⬛⬛
Move Down
[0.1817972  0.14639219 0.17924973 0.28601114 0.20654974]
0.0
0.3351747732797509
0.21775728565289648
-----------------------------------
⬛⬛⬛⬛⬛
⬛👽⬛⬛⬛
⬛⬛📦⬛⬛
🧑‍🚀⬛⬛⬛⬛
🧑‍🚀⬛😇⬛⬛
Move Left
[0.29646037 0.15855748 0.21904133 0.18223696 0.14370386]
0.0
0.33856037705025344
0.21995685419484493
-----------------------------------
⬛⬛⬛⬛⬛
👽⬛⬛⬛⬛
⬛⬛📦⬛⬛
⬛⬛😇⬛⬛
🧑‍🚀⬛⬛⬛⬛
Move Left
[0.30769467 0.21281081 0.

In [None]:
# Show every (action, observation) pair from the most recent impostor batch.
for action, observation in zip(impostor_a_batch, impostor_s_batch):
    env.print_action(action)
    env.print_obs(observation)

Move Up
------------
|⬛⬛⬛⬛⬛|
|🧑‍🚀⬛⬛⬛💀|
|⬛⬛⬛⬛⬛|
|⬛⬛⬛⬛⬛|
|⬛⬛🧑‍🚀⬛⬛|
------------

Move Left
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛⬛⬛|
|⬛👽⬛⬛⬛|
|⬛🧑‍🚀⬛⬛⬛|
|⬛🧑‍🚀⬛⬛⬛|
------------

Move Right
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛⬛⬛|
|⬛👽⬛⬛⬛|
|⬛🧑‍🚀⬛⬛⬛|
|⬛🧑‍🚀⬛⬛⬛|
------------

Move Down
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛💀⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛⬛⬛⬛|
------------

Move Up
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛💀⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛⬛⬛⬛|
------------

Move Down
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛💀⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛⬛⬛⬛|
------------

Move Up
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛💀⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛⬛⬛⬛|
------------

Move Down
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛💀⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛⬛⬛⬛|
------------

Move Up
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛💀⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛⬛⬛⬛|
------------

Move Up
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛💀⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛⬛⬛⬛|
------------

Move Right
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛💀⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛⬛⬛⬛|
------------

Wait
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛💀⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|
|⬛⬛⬛⬛⬛|
------------

Move Left
------------
|⬛⬛⬛⬛⬛|
|⬛⬛⬛🧑‍🚀⬛|


In [None]:
# Show every (action, observation) pair from the most recent crewmate batch.
for action, observation in zip(crewmate_a_batch, crewmate_s_batch):
    env.print_action(action)
    env.print_obs(observation)

Wait
------------
|💀        |
|💀        |
|    📦    |
|      🧑‍🚀  |
|          |
------------

Wait
------------
|💀        |
|💀        |
|    📦    |
|      🧑‍🚀  |
|          |
------------

Move Up
------------
|💀        |
|💀        |
|    📦    |
|      🧑‍🚀  |
|          |
------------

Move Down
------------
|💀        |
|💀        |
|    📦    |
|      🧑‍🚀  |
|          |
------------

Move Left
------------
|💀        |
|💀        |
|    📦    |
|      🧑‍🚀  |
|          |
------------

Wait
------------
|💀        |
|💀        |
|    📦    |
|      🧑‍🚀  |
|          |
------------

Wait
------------
|          |
|      🧑‍🚀  |
|    📦    |
|        🧑‍🚀|
|👽        |
------------

Move Left
------------
|          |
|      🧑‍🚀  |
|    📦    |
|        🧑‍🚀|
|👽        |
------------

Wait
------------
|          |
|    🧑‍🚀    |
|    📦    |
|  🧑‍🚀      |
|    💀    |
------------

Move Right
------------
|          |
|    🧑‍🚀    |
|    📦    |
|  🧑‍🚀      |
|    💀    |
------------

Move Left
--------