In [1]:
import ChessGame
import numpy as np

from gym import spaces
from pettingzoo import AECEnv
from pettingzoo.utils import agent_selector

from pettingzoo.test import api_test
from pettingzoo.test import performance_benchmark

import ray
import torch

from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork
from ray.rllib.utils.torch_ops import FLOAT_MIN, FLOAT_MAX
from ray.rllib.utils.framework import try_import_tf, try_import_torch

from ray.rllib.agents.callbacks import DefaultCallbacks
# from ray.rllib.env import BaseEnv
# from ray.rllib.evaluation import MultiAgentEpisode, RolloutWorker
# from ray.rllib.policy import Policy
# from ray.rllib.policy.sample_batch import SampleBatch

tf1, tf, tfv = try_import_tf()
torch, nn = try_import_torch()

class ChessEnv(AECEnv):
    """PettingZoo AEC environment that wraps a ``ChessGame.Board``.

    White is represented as ``player_0`` and black as ``player_1``. Each
    observation is a dict with the board encoding (``"observation"``) and a
    0/1 mask over the discrete action space (``"action_mask"``).
    """

    metadata = {'render.modes': ['human']}

    # Size of the discrete action space and of the action mask.
    NUM_ACTIONS = 4672
    # Evaluation baseline both sides start a game with.
    INITIAL_EVAL = 0.5

    def __init__(self):
        """Create the agents, the spaces and the underlying board."""
        super().__init__()
        # White is represented as player_0 and black is represented as player_1
        self.agents = ["player_" + str(num) for num in range(2)]
        self.possible_agents = ["player_" + str(num) for num in range(2)]
        self.agent_name_mapping = {"player_0": 0,
                                   "player_1": 1}
        self._agent_selector = agent_selector(self.agents)

        self.action_spaces = {
            name: spaces.Discrete(self.NUM_ACTIONS) for name in self.possible_agents
        }
        self.observation_spaces = {name: spaces.Dict({
            'observation': spaces.Box(low=-1, high=500, shape=(5, 8, 8, 14), dtype=np.float32),
            'action_mask': spaces.Box(low=0, high=1, shape=(self.NUM_ACTIONS,), dtype=np.float32)
        }) for name in self.possible_agents}

        # Last board evaluation seen by each side; used for reward shaping.
        self.blackEval = self.INITIAL_EVAL
        self.whiteEval = self.INITIAL_EVAL
        self.board = ChessGame.Board()
        self.rewards = None
        self.dones = None
        self.infos = {name: {} for name in self.agents}

        self.agent_selection = None

    def observe(self, agent):
        """Return the observation dict for ``agent``.

        The board encoding is selected by the side to move (index 0 of
        ``GetBoard()`` when white is to move, index 1 otherwise), not by the
        requesting agent. The action mask is all zeros unless ``agent`` is
        the side to move, in which case every index in
        ``board.allLegalMoves`` is set to 1.
        """
        # float32 matches the dtype declared in ``observation_spaces``.
        action_mask = np.zeros(shape=self.NUM_ACTIONS, dtype=np.float32)

        side_to_move = 0 if self.board.whiteToMove else 1
        observation = self.board.GetBoard()[side_to_move]

        if agent == self.possible_agents[side_to_move]:
            for move in self.board.allLegalMoves:
                action_mask[move] = 1

        return {"observation": observation, "action_mask": action_mask}

    def reset(self):
        """Reset the board and all per-episode bookkeeping."""
        self.board.ResetBoard()

        # Restore the evaluation baselines; otherwise the first shaped reward
        # of a new game would be measured against the previous game's last
        # evaluation.
        self.whiteEval = self.INITIAL_EVAL
        self.blackEval = self.INITIAL_EVAL

        self.agents = self.possible_agents[:]
        self.rewards = {agent: 0 for agent in self.agents}
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.dones = {agent: False for agent in self.agents}
        self.observation = {agent: None for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}

        self._agent_selector = agent_selector(self.agents)
        self.agent_selection = self._agent_selector.reset()

    def step(self, action):
        """Apply ``action`` for the currently selected agent.

        Rewards after the move:
          * ``gameState > 0`` (checkmate): the side now to move gets -1 and
            the other side gets +1.
          * ``gameState < 0`` (draw): both agents get 0.
          * otherwise: the side now to move gets five times the change in
            ``board.eval`` since that side's stored baseline, and the
            baseline is updated.

        When the game is over (``gameState != 0``) every agent is marked
        done and its info dict carries the move list under
        ``"listOfMoves"``.
        """
        if self.dones[self.agent_selection]:
            return self._was_done_step(action)

        self.rewards = {agent: 0 for agent in self.agents}
        current_agent = self.agent_selection
        self._cumulative_rewards[current_agent] = 0

        # Make the move
        self.board.Move(action)

        white, black = self.agents[0], self.agents[1]

        if self.board.whiteToMove:
            # If it's white to move and there's checkmate on the board,
            # white lost (and black won); apply appropriate rewards
            if self.board.gameState > 0:
                self.rewards[white] = -1
                self.rewards[black] = 1
            # Game is drawn somehow, each agent gets a reward of 0
            elif self.board.gameState < 0:
                self.rewards[white] = self.rewards[black] = 0
            else:
                self.rewards[white] = (self.board.eval - self.whiteEval) * 5
                self.whiteEval = self.board.eval
        else:
            # If it's black to move and there's checkmate on the board,
            # black lost (and white won); apply appropriate rewards
            if self.board.gameState > 0:
                self.rewards[white] = 1
                self.rewards[black] = -1
            # Game is drawn somehow, each agent gets a reward of 0
            elif self.board.gameState < 0:
                self.rewards[white] = self.rewards[black] = 0
            else:
                self.rewards[black] = (self.board.eval - self.blackEval) * 5
                self.blackEval = self.board.eval

        if self.board.gameState != 0:
            for name in self.agents:
                # Infos must be a dict per agent; store a snapshot of the
                # move list under a key instead of replacing the dict with
                # the list itself.
                self.infos[name] = {"listOfMoves": list(self.board.listOfMoves)}
                self.dones[name] = True

        self.agent_selection = self._agent_selector.next()
        self._accumulate_rewards()

    def render(self, mode="human"):
        """Show the board via ``board.ShowBoard()``; ``mode`` is accepted for API compatibility and otherwise unused."""
        self.board.ShowBoard()

    def close(self):
        """Nothing to release."""
        pass

# Formal PettingZoo API Test & Performance Benchmarks.
# NOTE: these statements run whenever this cell/module is executed, before
# any training code; they build a throwaway ChessEnv instance for the checks.
env = ChessEnv()
# Optional PettingZoo wrappers, currently disabled:
# env = wrappers.BaseWrapper(env)
# env = wrappers.OrderEnforcingWrapper(env)
api_test(env, num_cycles = 50, verbose_progress = True)
performance_benchmark(env)

class ChessNetwork (TorchModelV2, nn.Module):
    """Action-masking policy model built on a fully connected network.

    The wrapped ``FullyConnectedNetwork`` produces one logit per action from
    the board observation; ``forward`` then adds a mask term so that actions
    whose mask entry is 0 receive a very large negative logit.
    """

    def __init__ (self, obs_space, action_space, num_outputs, model_config, name, **kwargs):
        """Initialise both base classes and build the inner logits network."""
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name, **kwargs)
        nn.Module.__init__(self)

        # The inner network only sees the board part of the observation dict.
        board_space = spaces.Box(low=-1, high=500, shape=(5, 8, 8, 14))
        logits_size = 4672
        self.action_embed_model = FullyConnectedNetwork(
            board_space,
            action_space,
            logits_size,
            model_config,
            name + "action_embed",
        )

    def forward(self, input_dict, state, seq_lens):
        """Return masked action logits and the unchanged ``state``."""
        obs = input_dict["obs"]

        # Mask of available actions supplied by the environment.
        legal_actions = obs["action_mask"]

        # Raw logits computed from the board observation alone.
        raw_logits, _ = self.action_embed_model({"obs": obs["observation"]})

        # log(1) == 0 leaves a logit unchanged; log(0) == -inf is clamped to
        # FLOAT_MIN so the sum stays finite.
        mask_term = torch.log(legal_actions).clamp(FLOAT_MIN, FLOAT_MAX)

        return raw_logits + mask_term, state

    def value_function(self):
        """Delegate the value estimate to the inner network."""
        return self.action_embed_model.value_function()

class AlgebraicNotation (DefaultCallbacks):
    """RLlib callback that appends every ``log_freq``-th game to a text file.

    The moves are read from the ``listOfMoves`` attribute of the finished
    episode's board and written as a single ``Game #N: ...`` entry.
    """

    # Destination file for the logged games.
    LOG_PATH = "C:/Users/408aa/Desktop/Code/Python/Chess_AI/AlgebraicNotations.txt"

    def __init__(self):
        super().__init__()
        # Counter variable of number of total episodes
        self.episode_num = 1
        # Log chess games (algebraic notation) every "log_freq" games
        self.log_freq = 25

    def on_episode_end (self, *, worker, base_env, episode, env_index, **kwargs):
        """Append the finished game to the log file on every ``log_freq``-th episode."""
        if self.episode_num % self.log_freq == 0:
            # Read the sub-environment that actually finished; fall back to
            # the first one if no index was supplied.
            sub_env_index = 0 if env_index is None else env_index
            board = base_env.get_unwrapped()[sub_env_index].env.board

            # Each move is prefixed with a single space, which reproduces
            # the entry format of the original incremental join.
            moves_text = "".join(f" {turn}" for turn in board.listOfMoves)
            entry = "".join(["Game #", str(self.episode_num), ": ", moves_text, "\n", "\n"])

            # The context manager closes the file even if the write fails.
            with open(self.LOG_PATH, "a") as game_file:
                game_file.write(entry)
        self.episode_num += 1

def env_creator():
    """Build and return a fresh ``ChessEnv`` instance."""
    return ChessEnv()

# Clear the file with the game logs
def clearLogFile():
    """Truncate the game-log file (creating it if it does not exist)."""
    # Opening in "w" mode empties the file; nothing needs to be written.
    with open("C:/Users/408aa/Desktop/Code/Python/Chess_AI/AlgebraicNotations.txt", "w"):
        pass


In [3]:
from ray.rllib.agents import ppo
from ray.rllib.env import PettingZooEnv
from ray import tune

from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env
# from ray.rllib.agents.registry import get_trainer_class

# from ray.tune.logger import pretty_print

import os


# Register Model and Environment
ModelCatalog.register_custom_model("ChessNetwork", ChessNetwork)

register_env("ChessEnv", lambda config : PettingZooEnv(env_creator()))

# Throwaway environment, used only to read the observation/action spaces
# needed for the per-policy specs below.
test_env = PettingZooEnv(env_creator())
obs_space = test_env.observation_space
act_space = test_env.action_space

config = ppo.DEFAULT_CONFIG.copy()

# One independent policy per colour; each agent id maps to the policy of the
# same name.
config["multiagent"] = {
    "policies": {
        "player_0": (None, obs_space, act_space, {}),
        "player_1": (None, obs_space, act_space, {}),
    },
    "policy_mapping_fn": lambda agent_id: agent_id
}

# Resource layout: 5 workers x 2 CPUs + 4 driver CPUs = 14 of the 16 CPUs
# requested in ray.init below.
config["num_workers"] = 5
config["num_envs_per_worker"] = 1
config["num_cpus_per_worker"] = 2
config["num_cpus_for_driver"] = 4
# config["num_gpus"] = 1
config["framework"] = "torch"
config["model"] = {
    "custom_model" : "ChessNetwork",
}
config["env"] = "ChessEnv"
config["horizon"] = 150
config["rollout_fragment_length"] = 500
config["callbacks"] = AlgebraicNotation
config["log_level"] = "INFO"
ray.init(num_cpus = 16, num_gpus = 1, ignore_reinit_error = True)

# Shut Ray down even when training raises or is interrupted, so a failed run
# does not leave the local Ray runtime alive.
try:
    tune.run(
        "PPO",
        name="Chess_Policy",
        stop={"timesteps_total": 750000},
        checkpoint_freq=1000,
        checkpoint_at_end = True,
        config=config,
        local_dir = os.getcwd()
    )
finally:
    ray.shutdown()

2021-07-26 01:23:55,684	INFO services.py:1274 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Trial name,status,loc
PPO_ChessEnv_d8249_00000,PENDING,


2021-07-26 01:24:07,450	ERROR syncer.py:72 -- Log sync requires rsync to be installed.
[2m[36m(pid=9864)[0m 2021-07-26 01:24:22,558	INFO catalog.py:414 -- Wrapping <class '__main__.ChessNetwork'> as None
[2m[36m(pid=15516)[0m 2021-07-26 01:24:22,681	INFO catalog.py:414 -- Wrapping <class '__main__.ChessNetwork'> as None
[2m[36m(pid=9864)[0m 2021-07-26 01:24:22,748	INFO torch_policy.py:137 -- TorchPolicy (worker=1) running on CPU.
[2m[36m(pid=15516)[0m 2021-07-26 01:24:22,769	INFO torch_policy.py:137 -- TorchPolicy (worker=3) running on CPU.
[2m[36m(pid=9864)[0m 2021-07-26 01:24:22,983	INFO catalog.py:414 -- Wrapping <class '__main__.ChessNetwork'> as None
[2m[36m(pid=15516)[0m 2021-07-26 01:24:22,983	INFO catalog.py:414 -- Wrapping <class '__main__.ChessNetwork'> as None
[2m[36m(pid=16724)[0m 2021-07-26 01:24:22,983	INFO catalog.py:414 -- Wrapping <class '__main__.ChessNetwork'> as None
[2m[36m(pid=9864)[0m 2021-07-26 01:24:23,055	INFO torch_policy.py:137 -- Tor

Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 3975
  custom_metrics: {}
  date: 2021-07-26_01-30-50
  done: false
  episode_len_mean: 170.0
  episode_media: {}
  episode_reward_max: 0.16015395522117615
  episode_reward_mean: 0.006650244817137718
  episode_reward_min: -0.05490392446517944
  episodes_this_iter: 20
  episodes_total: 20
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.2
          cur_lr: 5.0e-05
          entropy: 3.180171713232994
          entropy_coeff: 0.0
          kl: 0.024934864486567676
          policy_loss: 0.22749220672994852
          total_loss: 0.29122585244476795
          vf_explained_var: 0.8705475330352783
          vf_loss: 0.05874667107127607
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.2
          cur_lr: 5.0e-05
          entropy: 3.2271

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,1,386.991,4000,0.00665024,0.160154,-0.0549039,170


[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 7952
  custom_metrics: {}
  date: 2021-07-26_01-37-12
  done: false
  episode_len_mean: 165.04347826086956
  episode_media: {}
  episode_reward_max: 0.5576299130916595
  episode_reward_mean: 0.0003363152348395923
  episode_reward_min: -0.5481317266821861
  episodes_this_iter: 26
  episodes_total: 46
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.30000000000000004
          cur_lr: 5.0e-05
          entropy: 3.086345463991165
          entropy_coeff: 0.0
          kl: 0.015545258182100952
          policy_loss: 0.2350367633625865
          total_loss: 0.3510952116921544
          vf_explained_var: 0.8630298376083374
          vf_loss: 0.11139487684704363
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.30000000000000004
         

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,2,769.026,8000,0.000336315,0.55763,-0.548132,165.043


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 11928
  custom_metrics: {}
  date: 2021-07-26_01-43-30
  done: false
  episode_len_mean: 166.74285714285713
  episode_media: {}
  episode_reward_max: 0.5576299130916595
  episode_reward_mean: -0.0016624246324811662
  episode_reward_min: -0.5481317266821861
  episodes_this_iter: 24
  episodes_total: 70
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.30000000000000004
          cur_lr: 5.0e-05
          entropy: 3.1155604273080826
          entropy_coeff: 0.0
          kl: 0.021649044298101217
          policy_loss: 0.2333999676629901
          total_loss: 0.3178369989618659
          vf_explained_var: 0.8259205222129822
          vf_loss: 0.0779423164203763
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.30000000000000004
       

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,3,1146.45,12000,-0.00166242,0.55763,-0.548132,166.743


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 15907
  custom_metrics: {}
  date: 2021-07-26_01-49-56
  done: false
  episode_len_mean: 165.09574468085106
  episode_media: {}
  episode_reward_max: 0.5576299130916595
  episode_reward_mean: 0.0004177683211387472
  episode_reward_min: -0.5481317266821861
  episodes_this_iter: 24
  episodes_total: 94
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr: 5.0e-05
          entropy: 3.149796962738037
          entropy_coeff: 0.0
          kl: 0.011982064315816388
          policy_loss: 0.23989209905266762
          total_loss: 0.3087699208408594
          vf_explained_var: 0.884534478187561
          vf_loss: 0.06348589132539928
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
        

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,4,1533.03,16000,0.000417768,0.55763,-0.548132,165.096


[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 19884
  custom_metrics: {}
  date: 2021-07-26_01-56-16
  done: false
  episode_len_mean: 164.73
  episode_media: {}
  episode_reward_max: 0.9071803092956543
  episode_reward_mean: -0.005326104164123535
  episode_reward_min: -0.9188584983348846
  episodes_this_iter: 25
  episodes_total: 119
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr: 5.0e-05
          entropy: 3.136022225022316
          entropy_coeff: 0.0
          kl: 0.013263687142170966
          policy_loss: 0.24761915858834982
          total_loss: 0.34293216466903687
          vf_explained_var: 0.8504529595375061
          vf_loss: 0.08934433944523335
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,5,1912.38,20000,-0.0053261,0.90718,-0.918858,164.73


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 23861
  custom_metrics: {}
  date: 2021-07-26_02-02-34
  done: false
  episode_len_mean: 163.75
  episode_media: {}
  episode_reward_max: 0.9071803092956543
  episode_reward_mean: 0.00013033930445089937
  episode_reward_min: -0.9188584983348846
  episodes_this_iter: 26
  episodes_total: 145
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr: 5.0e-05
          entropy: 3.0320945084095
          entropy_coeff: 0.0
          kl: 0.01340043026721105
          policy_loss: 0.23650844395160675
          total_loss: 0.32034061942249537
          vf_explained_var: 0.834111213684082
          vf_loss: 0.07780198007822037
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr: 5.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,6,2290.86,24000,0.000130339,0.90718,-0.918858,163.75


[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 27841
  custom_metrics: {}
  date: 2021-07-26_02-08-52
  done: false
  episode_len_mean: 162.33
  episode_media: {}
  episode_reward_max: 0.9071803092956543
  episode_reward_mean: 0.0018392592668533326
  episode_reward_min: -0.9188584983348846
  episodes_this_iter: 24
  episodes_total: 169
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr: 5.0e-05
          entropy: 2.9400500059127808
          entropy_coeff: 0.0
          kl: 0.01711081143002957
          policy_loss: 0.2340418230742216
          total_loss: 0.3500050902366638
          vf_explained_var: 0.8512314558029175
          vf_loss: 0.1082634013146162
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr: 5.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,7,2668.17,28000,0.00183926,0.90718,-0.918858,162.33


[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 31819
  custom_metrics: {}
  date: 2021-07-26_02-15-14
  done: false
  episode_len_mean: 162.93
  episode_media: {}
  episode_reward_max: 0.9071803092956543
  episode_reward_mean: -0.0026316992938518525
  episode_reward_min: -0.9188584983348846
  episodes_this_iter: 23
  episodes_total: 192
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr: 5.0e-05
          entropy: 2.9406385719776154
          entropy_coeff: 0.0
          kl: 0.014365610666573048
          policy_loss: 0.23203949723392725
          total_loss: 0.3351135775446892
          vf_explained_var: 0.8388615846633911
          vf_loss: 0.09660955425351858
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,8,3049.89,32000,-0.0026317,0.90718,-0.918858,162.93


[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 35795
  custom_metrics: {}
  date: 2021-07-26_02-21-34
  done: false
  episode_len_mean: 162.94
  episode_media: {}
  episode_reward_max: 0.9071803092956543
  episode_reward_mean: 0.012384618818759918
  episode_reward_min: -0.3729225695133209
  episodes_this_iter: 25
  episodes_total: 217
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr: 5.0e-05
          entropy: 2.8688370883464813
          entropy_coeff: 0.0
          kl: 0.017976999166421592
          policy_loss: 0.2269759690389037
          total_loss: 0.31734666135162115
          vf_explained_var: 0.7967200875282288
          vf_loss: 0.08228104095906019
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,9,3430.66,36000,0.0123846,0.90718,-0.372923,162.94


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 39772
  custom_metrics: {}
  date: 2021-07-26_02-27-49
  done: false
  episode_len_mean: 166.2
  episode_media: {}
  episode_reward_max: 0.39528921246528625
  episode_reward_mean: 0.00027446448802948
  episode_reward_min: -0.3729225695133209
  episodes_this_iter: 23
  episodes_total: 240
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr: 5.0e-05
          entropy: 2.7596963346004486
          entropy_coeff: 0.0
          kl: 0.019190570223145187
          policy_loss: 0.21689125802367926
          total_loss: 0.3006228441372514
          vf_explained_var: 0.8396494388580322
          vf_loss: 0.07509583770297468
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
       

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,10,3805.59,40000,0.000274464,0.395289,-0.372923,166.2


[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 43750
  custom_metrics: {}
  date: 2021-07-26_02-34-08
  done: false
  episode_len_mean: 164.14
  episode_media: {}
  episode_reward_max: 0.3079441376030445
  episode_reward_mean: -0.001198873668909073
  episode_reward_min: -0.29993074014782906
  episodes_this_iter: 25
  episodes_total: 265
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.45000000000000007
          cur_lr: 5.0e-05
          entropy: 2.5985332131385803
          entropy_coeff: 0.0
          kl: 0.02525810559745878
          policy_loss: 0.20140550006181002
          total_loss: 0.3264635158702731
          vf_explained_var: 0.8032166361808777
          vf_loss: 0.11369187943637371
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
     

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,11,4184.52,44000,-0.00119887,0.307944,-0.299931,164.14


[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 47728
  custom_metrics: {}
  date: 2021-07-26_02-40-26
  done: false
  episode_len_mean: 162.69
  episode_media: {}
  episode_reward_max: 0.3350786119699478
  episode_reward_mean: 0.002907438576221466
  episode_reward_min: -0.30772536993026733
  episodes_this_iter: 26
  episodes_total: 291
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.658664643764496
          entropy_coeff: 0.0
          kl: 0.011202807683730498
          policy_loss: 0.21770122647285461
          total_loss: 0.294051474891603
          vf_explained_var: 0.8878992199897766
          vf_loss: 0.06878835684619844
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,12,4562.24,48000,0.00290744,0.335079,-0.307725,162.69


[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 51705
  custom_metrics: {}
  date: 2021-07-26_02-46-49
  done: false
  episode_len_mean: 162.92
  episode_media: {}
  episode_reward_max: 0.3350786119699478
  episode_reward_mean: 0.000812993897125125
  episode_reward_min: -0.30772536993026733
  episodes_this_iter: 24
  episodes_total: 315
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.6028587967157364
          entropy_coeff: 0.0
          kl: 0.012891228368971497
          policy_loss: 0.20356919150799513
          total_loss: 0.2957883533090353
          vf_explained_var: 0.8240993022918701
          vf_loss: 0.08351758774369955
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,13,4944.49,52000,0.000812994,0.335079,-0.307725,162.92


[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 55683
  custom_metrics: {}
  date: 2021-07-26_02-53-06
  done: false
  episode_len_mean: 161.68
  episode_media: {}
  episode_reward_max: 0.7909342832863331
  episode_reward_mean: -0.003747224807739258
  episode_reward_min: -0.763214249163866
  episodes_this_iter: 24
  episodes_total: 339
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.5411361902952194
          entropy_coeff: 0.0
          kl: 0.01195088354870677
          policy_loss: 0.2074593175202608
          total_loss: 0.2697366503998637
          vf_explained_var: 0.8563655614852905
          vf_loss: 0.05421048391144723
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.6

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,14,5322.1,56000,-0.00374722,0.790934,-0.763214,161.68


[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 59662
  custom_metrics: {}
  date: 2021-07-26_02-59-28
  done: false
  episode_len_mean: 164.23
  episode_media: {}
  episode_reward_max: 1.3763539493083954
  episode_reward_mean: 0.0004311978816986084
  episode_reward_min: -1.3853581994771957
  episodes_this_iter: 24
  episodes_total: 363
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.5248454362154007
          entropy_coeff: 0.0
          kl: 0.012446825625374913
          policy_loss: 0.20566678047180176
          total_loss: 0.29249664954841137
          vf_explained_var: 0.8177640438079834
          vf_loss: 0.07842825911939144
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,15,5703.5,60000,0.000431198,1.37635,-1.38536,164.23


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 63641
  custom_metrics: {}
  date: 2021-07-26_03-05-49
  done: false
  episode_len_mean: 164.3
  episode_media: {}
  episode_reward_max: 1.3763539493083954
  episode_reward_mean: -0.0008632682263851165
  episode_reward_min: -1.3853581994771957
  episodes_this_iter: 25
  episodes_total: 388
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.3461432605981827
          entropy_coeff: 0.0
          kl: 0.012364266091026366
          policy_loss: 0.19380967412143946
          total_loss: 0.26945517770946026
          vf_explained_var: 0.8502097129821777
          vf_loss: 0.06729962537065148
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,16,6084.91,64000,-0.000863268,1.37635,-1.38536,164.3


[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 67620
  custom_metrics: {}
  date: 2021-07-26_03-12-04
  done: false
  episode_len_mean: 163.94
  episode_media: {}
  episode_reward_max: 1.3763539493083954
  episode_reward_mean: -0.010495791537687182
  episode_reward_min: -1.3853581994771957
  episodes_this_iter: 24
  episodes_total: 412
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.324533224105835
          entropy_coeff: 0.0
          kl: 0.015775291481986642
          policy_loss: 0.18777210265398026
          total_loss: 0.3136245207861066
          vf_explained_var: 0.7587059736251831
          vf_loss: 0.11520409723743796
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,17,6459.99,68000,-0.0104958,1.37635,-1.38536,163.94


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 71598
  custom_metrics: {}
  date: 2021-07-26_03-18-23
  done: false
  episode_len_mean: 164.74
  episode_media: {}
  episode_reward_max: 1.3763539493083954
  episode_reward_mean: 0.0033143579959869383
  episode_reward_min: -1.3853581994771957
  episodes_this_iter: 24
  episodes_total: 436
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.241977274417877
          entropy_coeff: 0.0
          kl: 0.016377863299567252
          policy_loss: 0.18101412989199162
          total_loss: 0.2572501068934798
          vf_explained_var: 0.7797032594680786
          vf_loss: 0.06518092448823154
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,18,6838.95,72000,0.00331436,1.37635,-1.38536,164.74


[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 75574
  custom_metrics: {}
  date: 2021-07-26_03-24-40
  done: false
  episode_len_mean: 163.02
  episode_media: {}
  episode_reward_max: 1.2839394062757492
  episode_reward_mean: 0.0011795565485954284
  episode_reward_min: -1.276942864060402
  episodes_this_iter: 26
  episodes_total: 462
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.121547281742096
          entropy_coeff: 0.0
          kl: 0.015625798900146037
          policy_loss: 0.16834120685234666
          total_loss: 0.2631753711029887
          vf_explained_var: 0.807925820350647
          vf_loss: 0.08428675029426813
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,19,7215.32,76000,0.00117956,1.28394,-1.27694,163.02


[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 79552
  custom_metrics: {}
  date: 2021-07-26_03-30-59
  done: false
  episode_len_mean: 163.23
  episode_media: {}
  episode_reward_max: 1.2839394062757492
  episode_reward_mean: -0.00028880387544631957
  episode_reward_min: -1.276942864060402
  episodes_this_iter: 25
  episodes_total: 487
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.0392955243587494
          entropy_coeff: 0.0
          kl: 0.01735805784119293
          policy_loss: 0.16300051170401275
          total_loss: 0.27337853237986565
          vf_explained_var: 0.8249858617782593
          vf_loss: 0.098661326803267
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,20,7594.13,80000,-0.000288804,1.28394,-1.27694,163.23


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 83531
  custom_metrics: {}
  date: 2021-07-26_03-37-20
  done: false
  episode_len_mean: 165.2
  episode_media: {}
  episode_reward_max: 0.9551995247602463
  episode_reward_mean: 0.00849531516432762
  episode_reward_min: -0.8594801276922226
  episodes_this_iter: 21
  episodes_total: 508
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 1.9808035716414452
          entropy_coeff: 0.0
          kl: 0.019486898032482713
          policy_loss: 0.14318452589213848
          total_loss: 0.23723811376839876
          vf_explained_var: 0.8523565530776978
          vf_loss: 0.08089993079192936
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 2.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,21,7975.39,84000,0.00849532,0.9552,-0.85948,165.2


[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 87507
  custom_metrics: {}
  date: 2021-07-26_03-43-35
  done: false
  episode_len_mean: 164.24
  episode_media: {}
  episode_reward_max: 0.7131193578243256
  episode_reward_mean: -0.0003279343247413635
  episode_reward_min: -0.6151993572711945
  episodes_this_iter: 26
  episodes_total: 534
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 1.9495989009737968
          entropy_coeff: 0.0
          kl: 0.01783649151911959
          policy_loss: 0.14689700934104621
          total_loss: 0.23295787442475557
          vf_explained_var: 0.8215059638023376
          vf_loss: 0.0740212332457304
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,22,8350.22,88000,-0.000327934,0.713119,-0.615199,164.24


[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 91485
  custom_metrics: {}
  date: 2021-07-26_03-49-56
  done: false
  episode_len_mean: 164.7
  episode_media: {}
  episode_reward_max: 0.7131193578243256
  episode_reward_mean: -0.005532904900610447
  episode_reward_min: -0.5424053966999054
  episodes_this_iter: 23
  episodes_total: 557
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 1.847617968916893
          entropy_coeff: 0.0
          kl: 0.018187083129305393
          policy_loss: 0.12917444109916687
          total_loss: 0.24733708798885345
          vf_explained_var: 0.8182804584503174
          vf_loss: 0.1058863669168204
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 1.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,23,8731.2,92000,-0.0055329,0.713119,-0.542405,164.7


[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 95461
  custom_metrics: {}
  date: 2021-07-26_03-56-13
  done: false
  episode_len_mean: 166.91
  episode_media: {}
  episode_reward_max: 0.7131193578243256
  episode_reward_mean: 0.0005707845091819763
  episode_reward_min: -0.5424053966999054
  episodes_this_iter: 25
  episodes_total: 582
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 1.8018948584794998
          entropy_coeff: 0.0
          kl: 0.013654735230375081
          policy_loss: 0.12939692079089582
          total_loss: 0.18475168524309993
          vf_explained_var: 0.7456867694854736
          vf_loss: 0.04613781673833728
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,24,9108.28,96000,0.000570785,0.713119,-0.542405,166.91


[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 99438
  custom_metrics: {}
  date: 2021-07-26_04-02-33
  done: false
  episode_len_mean: 165.94
  episode_media: {}
  episode_reward_max: 0.7131193578243256
  episode_reward_mean: 0.000856366939842701
  episode_reward_min: -0.9960585460066795
  episodes_this_iter: 24
  episodes_total: 606
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 1.7476862370967865
          entropy_coeff: 0.0
          kl: 0.018496771401260048
          policy_loss: 0.12501053628511727
          total_loss: 0.20573822176083922
          vf_explained_var: 0.8250178694725037
          vf_loss: 0.06824236619286239
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,25,9488.16,100000,0.000856367,0.713119,-0.996059,165.94


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 103415
  custom_metrics: {}
  date: 2021-07-26_04-08-50
  done: false
  episode_len_mean: 166.44
  episode_media: {}
  episode_reward_max: 0.6873968243598938
  episode_reward_mean: -0.00043884068727493284
  episode_reward_min: -0.9960585460066795
  episodes_this_iter: 24
  episodes_total: 630
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 1.6283072605729103
          entropy_coeff: 0.0
          kl: 0.01699730783002451
          policy_loss: 0.10434567066840827
          total_loss: 0.17524190992116928
          vf_explained_var: 0.8015459775924683
          vf_loss: 0.05942305573262274
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entrop

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,26,9864.89,104000,-0.000438841,0.687397,-0.996059,166.44


[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 107392
  custom_metrics: {}
  date: 2021-07-26_04-15-10
  done: false
  episode_len_mean: 166.04
  episode_media: {}
  episode_reward_max: 0.6873968243598938
  episode_reward_mean: -0.001523762010037899
  episode_reward_min: -0.9960585460066795
  episodes_this_iter: 24
  episodes_total: 654
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 1.5910507142543793
          entropy_coeff: 0.0
          kl: 0.017119448632001877
          policy_loss: 0.1099407181609422
          total_loss: 0.18457771325483918
          vf_explained_var: 0.755502462387085
          vf_loss: 0.06308136763982475
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,27,10245,108000,-0.00152376,0.687397,-0.996059,166.04


[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 111369
  custom_metrics: {}
  date: 2021-07-26_04-21-30
  done: false
  episode_len_mean: 165.33
  episode_media: {}
  episode_reward_max: 0.6873968243598938
  episode_reward_mean: -0.00250115692615509
  episode_reward_min: -0.9960585460066795
  episodes_this_iter: 25
  episodes_total: 679
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 1.4987719058990479
          entropy_coeff: 0.0
          kl: 0.015977214614395052
          policy_loss: 0.10172899338067509
          total_loss: 0.20405222102999687
          vf_explained_var: 0.7969744801521301
          vf_loss: 0.09153860807418823
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,28,10625.4,112000,-0.00250116,0.687397,-0.996059,165.33


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 115346
  custom_metrics: {}
  date: 2021-07-26_04-27-46
  done: false
  episode_len_mean: 166.3
  episode_media: {}
  episode_reward_max: 0.6222594901919365
  episode_reward_mean: 1.7218291759490967e-05
  episode_reward_min: -0.9960585460066795
  episodes_this_iter: 23
  episodes_total: 702
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 1.39481832832098
          entropy_coeff: 0.0
          kl: 0.017836511367931962
          policy_loss: 0.09435273124836385
          total_loss: 0.21550889313220978
          vf_explained_var: 0.7634519338607788
          vf_loss: 0.10911651747301221
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,29,11000.5,116000,1.72183e-05,0.622259,-0.996059,166.3


[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 119323
  custom_metrics: {}
  date: 2021-07-26_04-34-05
  done: false
  episode_len_mean: 166.06
  episode_media: {}
  episode_reward_max: 0.6099645793437958
  episode_reward_mean: -0.004338023415766656
  episode_reward_min: -0.675106942653656
  episodes_this_iter: 24
  episodes_total: 726
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.675
          cur_lr: 5.0e-05
          entropy: 1.3585397377610207
          entropy_coeff: 0.0
          kl: 0.021098376309964806
          policy_loss: 0.09323408972704783
          total_loss: 0.2310422477312386
          vf_explained_var: 0.6507806181907654
          vf_loss: 0.12356675742194057
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,30,11379.9,120000,-0.00433802,0.609965,-0.675107,166.06


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 123299
  custom_metrics: {}
  date: 2021-07-26_04-40-23
  done: false
  episode_len_mean: 165.2
  episode_media: {}
  episode_reward_max: 0.5103340744972229
  episode_reward_mean: 0.00601176917552948
  episode_reward_min: -0.4690195620059967
  episodes_this_iter: 25
  episodes_total: 751
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 1.3293742164969444
          entropy_coeff: 0.0
          kl: 0.012290777056477964
          policy_loss: 0.0759044298902154
          total_loss: 0.17309262417256832
          vf_explained_var: 0.7882711291313171
          vf_loss: 0.0847437831107527
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,31,11757.4,124000,0.00601177,0.510334,-0.46902,165.2


[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 127278
  custom_metrics: {}
  date: 2021-07-26_04-46-44
  done: false
  episode_len_mean: 166.05
  episode_media: {}
  episode_reward_max: 0.5126515030860901
  episode_reward_mean: 0.0018653854727745055
  episode_reward_min: -0.4690195620059967
  episodes_this_iter: 23
  episodes_total: 774
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 1.2959352284669876
          entropy_coeff: 0.0
          kl: 0.011706114339176565
          policy_loss: 0.09305524069350213
          total_loss: 0.22983677173033357
          vf_explained_var: 0.7763849496841431
          vf_loss: 0.12492909329012036
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,32,12139.2,128000,0.00186539,0.512652,-0.46902,166.05


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 131253
  custom_metrics: {}
  date: 2021-07-26_04-53-06
  done: false
  episode_len_mean: 166.05
  episode_media: {}
  episode_reward_max: 0.5126515030860901
  episode_reward_mean: -0.0025054562836885454
  episode_reward_min: -0.4690195620059967
  episodes_this_iter: 25
  episodes_total: 799
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 1.2343087419867516
          entropy_coeff: 0.0
          kl: 0.010133193951332942
          policy_loss: 0.06223591428715736
          total_loss: 0.14692929957527667
          vf_explained_var: 0.696048378944397
          vf_loss: 0.074433522997424
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,33,12520.3,132000,-0.00250546,0.512652,-0.46902,166.05


[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)
[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 135231
  custom_metrics: {}
  date: 2021-07-26_04-59-20
  done: false
  episode_len_mean: 166.33
  episode_media: {}
  episode_reward_max: 0.5126515030860901
  episode_reward_mean: 0.0035018805181607606
  episode_reward_min: -0.3300616145133972
  episodes_this_iter: 25
  episodes_total: 824
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 1.1663584932684898
          entropy_coeff: 0.0
          kl: 0.012267777521628886
          policy_loss: 0.057156573311658576
          total_loss: 0.14020774164237082
          vf_explained_var: 0.7879290580749512
          vf_loss: 0.07063004141673446
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,34,12894.9,136000,0.00350188,0.512652,-0.330062,166.33


[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 139210
  custom_metrics: {}
  date: 2021-07-26_05-05-41
  done: false
  episode_len_mean: 165.76
  episode_media: {}
  episode_reward_max: 0.5126515030860901
  episode_reward_mean: -0.0006059989333152771
  episode_reward_min: -0.3300616145133972
  episodes_this_iter: 23
  episodes_total: 847
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 1.1884133219718933
          entropy_coeff: 0.0
          kl: 0.014144547400064766
          policy_loss: 0.05569313751766458
          total_loss: 0.13644980953540653
          vf_explained_var: 0.7024601101875305
          vf_loss: 0.0664353147149086
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,35,13275.2,140000,-0.000605999,0.512652,-0.330062,165.76


[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 143186
  custom_metrics: {}
  date: 2021-07-26_05-11-57
  done: false
  episode_len_mean: 166.39
  episode_media: {}
  episode_reward_max: 0.4876578599214554
  episode_reward_mean: -0.006265918165445328
  episode_reward_min: -0.4911380261182785
  episodes_this_iter: 24
  episodes_total: 871
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 1.1754895076155663
          entropy_coeff: 0.0
          kl: 0.016007139871362597
          policy_loss: 0.05244946334278211
          total_loss: 0.1463182977749966
          vf_explained_var: 0.6793840527534485
          vf_loss: 0.07766160136088729
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,36,13651.1,144000,-0.00626592,0.487658,-0.491138,166.39


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 147161
  custom_metrics: {}
  date: 2021-07-26_05-18-17
  done: false
  episode_len_mean: 166.39
  episode_media: {}
  episode_reward_max: 0.4876578599214554
  episode_reward_mean: -0.0024272985756397247
  episode_reward_min: -0.5140547454357147
  episodes_this_iter: 25
  episodes_total: 896
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 1.0736010037362576
          entropy_coeff: 0.0
          kl: 0.013637702912092209
          policy_loss: 0.04017334966920316
          total_loss: 0.12417029635980725
          vf_explained_var: 0.6737267971038818
          vf_loss: 0.07018877123482525
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,37,14031,148000,-0.0024273,0.487658,-0.514055,166.39


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 151139
  custom_metrics: {}
  date: 2021-07-26_05-24-35
  done: false
  episode_len_mean: 166.7
  episode_media: {}
  episode_reward_max: 0.5088929831981659
  episode_reward_mean: 0.0006784375756978989
  episode_reward_min: -0.5140547454357147
  episodes_this_iter: 22
  episodes_total: 918
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 1.1045069582760334
          entropy_coeff: 0.0
          kl: 0.013104680692777038
          policy_loss: 0.04967635066714138
          total_loss: 0.24693943839520216
          vf_explained_var: 0.4892632067203522
          vf_loss: 0.18399460427463055
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,38,14409.8,152000,0.000678438,0.508893,-0.514055,166.7


[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 155117
  custom_metrics: {}
  date: 2021-07-26_05-30-57
  done: false
  episode_len_mean: 168.47
  episode_media: {}
  episode_reward_max: 0.5088929831981659
  episode_reward_mean: -0.0017667628824710846
  episode_reward_min: -0.5140547454357147
  episodes_this_iter: 23
  episodes_total: 941
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 1.141966387629509
          entropy_coeff: 0.0
          kl: 0.011462331749498844
          policy_loss: 0.01982866565231234
          total_loss: 0.11426447797566652
          vf_explained_var: 0.5866849422454834
          vf_loss: 0.0828302032314241
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,39,14791.2,156000,-0.00176676,0.508893,-0.514055,168.47


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 159092
  custom_metrics: {}
  date: 2021-07-26_05-37-19
  done: false
  episode_len_mean: 168.47
  episode_media: {}
  episode_reward_max: 0.5088929831981659
  episode_reward_mean: 0.0031669847667217255
  episode_reward_min: -0.5140547454357147
  episodes_this_iter: 25
  episodes_total: 966
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 1.0323207788169384
          entropy_coeff: 0.0
          kl: 0.013862078660167754
          policy_loss: 0.03198348206933588
          total_loss: 0.1391957150772214
          vf_explained_var: 0.5578508377075195
          vf_loss: 0.0931768745649606
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,40,15173.7,160000,0.00316698,0.508893,-0.514055,168.47


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 163068
  custom_metrics: {}
  date: 2021-07-26_05-43-36
  done: false
  episode_len_mean: 169.79
  episode_media: {}
  episode_reward_max: 0.5088929831981659
  episode_reward_mean: -0.0006911247968673706
  episode_reward_min: -0.5140547454357147
  episodes_this_iter: 24
  episodes_total: 990
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.992459699511528
          entropy_coeff: 0.0
          kl: 0.01531478192191571
          policy_loss: 0.022847257205285132
          total_loss: 0.10361387603916228
          vf_explained_var: 0.47987571358680725
          vf_loss: 0.06526040122844279
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,41,15550.4,164000,-0.000691125,0.508893,-0.514055,169.79


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 167046
  custom_metrics: {}
  date: 2021-07-26_05-49-57
  done: false
  episode_len_mean: 169.79
  episode_media: {}
  episode_reward_max: 0.5088929831981659
  episode_reward_mean: 0.005337438359856605
  episode_reward_min: -0.4646351933479309
  episodes_this_iter: 22
  episodes_total: 1012
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.9807867407798767
          entropy_coeff: 0.0
          kl: 0.013527967967092991
          policy_loss: 0.02068101440090686
          total_loss: 0.08313906809780747
          vf_explained_var: 0.48576194047927856
          vf_loss: 0.04876098642125726
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,42,15931.6,168000,0.00533744,0.508893,-0.464635,169.79


[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 171023
  custom_metrics: {}
  date: 2021-07-26_05-56-17
  done: false
  episode_len_mean: 168.85
  episode_media: {}
  episode_reward_max: 0.47081321477890015
  episode_reward_mean: 0.0018887445330619812
  episode_reward_min: -0.4646351933479309
  episodes_this_iter: 24
  episodes_total: 1036
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.9998495131731033
          entropy_coeff: 0.0
          kl: 0.011447015218436718
          policy_loss: 0.030648142332211137
          total_loss: 0.1217607983853668
          vf_explained_var: 0.5637997388839722
          vf_loss: 0.07952255220152438
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,43,16311.2,172000,0.00188874,0.470813,-0.464635,168.85


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 174998
  custom_metrics: {}
  date: 2021-07-26_06-02-39
  done: false
  episode_len_mean: 169.06
  episode_media: {}
  episode_reward_max: 0.351642370223999
  episode_reward_mean: 0.0030019260942935944
  episode_reward_min: -0.30365467071533203
  episodes_this_iter: 25
  episodes_total: 1061
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.9518574588000774
          entropy_coeff: 0.0
          kl: 0.011318057688185945
          policy_loss: 0.023048970964737236
          total_loss: 0.12699179304763675
          vf_explained_var: 0.4920194149017334
          vf_loss: 0.09248329093679786
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,44,16693.1,176000,0.00300193,0.351642,-0.303655,169.06


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 178976
  custom_metrics: {}
  date: 2021-07-26_06-09-02
  done: false
  episode_len_mean: 168.64
  episode_media: {}
  episode_reward_max: 0.21775811910629272
  episode_reward_mean: 0.00040539950132369997
  episode_reward_min: -0.203237384557724
  episodes_this_iter: 23
  episodes_total: 1084
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.9204311370849609
          entropy_coeff: 0.0
          kl: 0.01669340004445985
          policy_loss: 0.02426050551002845
          total_loss: 0.11532915779389441
          vf_explained_var: 0.6488839387893677
          vf_loss: 0.07416658289730549
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,45,17076.2,180000,0.0004054,0.217758,-0.203237,168.64


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 182953
  custom_metrics: {}
  date: 2021-07-26_06-15-25
  done: false
  episode_len_mean: 167.91
  episode_media: {}
  episode_reward_max: 0.21775811910629272
  episode_reward_mean: -0.0020721897482872008
  episode_reward_min: -0.203237384557724
  episodes_this_iter: 24
  episodes_total: 1108
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.8943781331181526
          entropy_coeff: 0.0
          kl: 0.013129546015989035
          policy_loss: 0.03177569224499166
          total_loss: 0.11319200514117256
          vf_explained_var: 0.5079443454742432
          vf_loss: 0.06812264793552458
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,46,17459,184000,-0.00207219,0.217758,-0.203237,167.91


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 186930
  custom_metrics: {}
  date: 2021-07-26_06-21-48
  done: false
  episode_len_mean: 166.74
  episode_media: {}
  episode_reward_max: 0.21775811910629272
  episode_reward_mean: -0.0025256164371967316
  episode_reward_min: -0.2708292007446289
  episodes_this_iter: 25
  episodes_total: 1133
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.9123660288751125
          entropy_coeff: 0.0
          kl: 0.01145681960042566
          policy_loss: 0.022642184747383
          total_loss: 0.12571207096334547
          vf_explained_var: 0.549735426902771
          vf_loss: 0.09146985318511724
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,47,17841.9,188000,-0.00252562,0.217758,-0.270829,166.74


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 190906
  custom_metrics: {}
  date: 2021-07-26_06-28-08
  done: false
  episode_len_mean: 166.74
  episode_media: {}
  episode_reward_max: 0.2790452539920807
  episode_reward_mean: -0.0011309921741485597
  episode_reward_min: -0.2708292007446289
  episodes_this_iter: 24
  episodes_total: 1157
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.8626994080841541
          entropy_coeff: 0.0
          kl: 0.011026714084437117
          policy_loss: 0.016470959526486695
          total_loss: 0.1895892722532153
          vf_explained_var: 0.4669538140296936
          vf_loss: 0.16195375844836235
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,48,18221.4,192000,-0.00113099,0.279045,-0.270829,166.74


[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 194885
  custom_metrics: {}
  date: 2021-07-26_06-34-31
  done: false
  episode_len_mean: 165.7
  episode_media: {}
  episode_reward_max: 0.2790452539920807
  episode_reward_mean: 0.0008966825902462005
  episode_reward_min: -0.2708292007446289
  episodes_this_iter: 25
  episodes_total: 1182
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.7928293459117413
          entropy_coeff: 0.0
          kl: 0.010642024804838002
          policy_loss: 0.023041911306791008
          total_loss: 0.09499839460477233
          vf_explained_var: 0.7170650362968445
          vf_loss: 0.06118143396452069
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,49,18604.2,196000,0.000896683,0.279045,-0.270829,165.7


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 198862
  custom_metrics: {}
  date: 2021-07-26_06-40-50
  done: false
  episode_len_mean: 166.43
  episode_media: {}
  episode_reward_max: 0.7561546564102173
  episode_reward_mean: 0.0008176550269126892
  episode_reward_min: -0.749002993106842
  episodes_this_iter: 22
  episodes_total: 1204
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.8343536593019962
          entropy_coeff: 0.0
          kl: 0.013209480210207403
          policy_loss: 0.012258114758878946
          total_loss: 0.07539266342064366
          vf_explained_var: 0.5383814573287964
          vf_loss: 0.0497599458321929
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,50,18983.6,200000,0.000817655,0.756155,-0.749003,166.43


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 202838
  custom_metrics: {}
  date: 2021-07-26_06-47-12
  done: false
  episode_len_mean: 168.54
  episode_media: {}
  episode_reward_max: 0.7561546564102173
  episode_reward_mean: 0.003053615987300873
  episode_reward_min: -0.749002993106842
  episodes_this_iter: 24
  episodes_total: 1228
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.8047435916960239
          entropy_coeff: 0.0
          kl: 0.010866865864954889
          policy_loss: 0.012966892449185252
          total_loss: 0.11881432705558836
          vf_explained_var: 0.6061452627182007
          vf_loss: 0.09484473639167845
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,51,19365.4,204000,0.00305362,0.756155,-0.749003,168.54


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 206814
  custom_metrics: {}
  date: 2021-07-26_06-53-35
  done: false
  episode_len_mean: 168.54
  episode_media: {}
  episode_reward_max: 0.7561546564102173
  episode_reward_mean: -0.002671980857849121
  episode_reward_min: -0.749002993106842
  episodes_this_iter: 24
  episodes_total: 1252
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.7724885381758213
          entropy_coeff: 0.0
          kl: 0.01476651115808636
          policy_loss: 0.01585260676802136
          total_loss: 0.11491768469568342
          vf_explained_var: 0.6820502281188965
          vf_loss: 0.08411398786120117
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,52,19748.4,208000,-0.00267198,0.756155,-0.749003,168.54


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 210792
  custom_metrics: {}
  date: 2021-07-26_06-59-52
  done: false
  episode_len_mean: 169.02
  episode_media: {}
  episode_reward_max: 0.7561546564102173
  episode_reward_mean: 0.0019376151263713836
  episode_reward_min: -0.749002993106842
  episodes_this_iter: 22
  episodes_total: 1274
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.7853282205760479
          entropy_coeff: 0.0
          kl: 0.015266374859493226
          policy_loss: 0.005552583723329008
          total_loss: 0.0908972134348005
          vf_explained_var: 0.6873229742050171
          vf_loss: 0.06988742365501821
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,53,20125.3,212000,0.00193762,0.756155,-0.749003,169.02


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 214768
  custom_metrics: {}
  date: 2021-07-26_07-06-14
  done: false
  episode_len_mean: 170.0
  episode_media: {}
  episode_reward_max: 0.42210251092910767
  episode_reward_mean: 0.000954851508140564
  episode_reward_min: -0.3980053961277008
  episodes_this_iter: 24
  episodes_total: 1298
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.8019839338958263
          entropy_coeff: 0.0
          kl: 0.011432848230469972
          policy_loss: 0.026048823492601514
          total_loss: 0.11072681401856244
          vf_explained_var: 0.6535382270812988
          vf_loss: 0.07310223486274481
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,54,20507.2,216000,0.000954852,0.422103,-0.398005,170


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 218744
  custom_metrics: {}
  date: 2021-07-26_07-12-33
  done: false
  episode_len_mean: 168.74
  episode_media: {}
  episode_reward_max: 0.42210251092910767
  episode_reward_mean: -0.00042228251695632937
  episode_reward_min: -0.3980053961277008
  episodes_this_iter: 26
  episodes_total: 1324
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.7783854156732559
          entropy_coeff: 0.0
          kl: 0.01202682062285021
          policy_loss: 0.012889442790765315
          total_loss: 0.08924112690147012
          vf_explained_var: 0.6701078414916992
          vf_loss: 0.06417452543973923
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,55,20886.4,220000,-0.000422283,0.422103,-0.398005,168.74


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 222723
  custom_metrics: {}
  date: 2021-07-26_07-18-54
  done: false
  episode_len_mean: 168.74
  episode_media: {}
  episode_reward_max: 0.42210251092910767
  episode_reward_mean: -5.179047584533691e-05
  episode_reward_min: -0.37687331438064575
  episodes_this_iter: 21
  episodes_total: 1345
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.79128348082304
          entropy_coeff: 0.0
          kl: 0.012285807228181511
          policy_loss: 0.01520316704409197
          total_loss: 0.08294719422701746
          vf_explained_var: 0.7132752537727356
          vf_loss: 0.05530464625917375
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,56,21267.1,224000,-5.17905e-05,0.422103,-0.376873,168.74


[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 226699
  custom_metrics: {}
  date: 2021-07-26_07-25-14
  done: false
  episode_len_mean: 167.69
  episode_media: {}
  episode_reward_max: 0.39110325276851654
  episode_reward_mean: -0.0005258254706859589
  episode_reward_min: -0.37687331438064575
  episodes_this_iter: 25
  episodes_total: 1370
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.7307028062641621
          entropy_coeff: 0.0
          kl: 0.011815391306299716
          policy_loss: 0.009172684338409454
          total_loss: 0.12506542494520545
          vf_explained_var: 0.6861726641654968
          vf_loss: 0.10392966168001294
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cu

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,57,21646.9,228000,-0.000525825,0.391103,-0.376873,167.69


[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 230676
  custom_metrics: {}
  date: 2021-07-26_07-31-36
  done: false
  episode_len_mean: 167.11
  episode_media: {}
  episode_reward_max: 0.39110325276851654
  episode_reward_mean: -0.0004936486482620239
  episode_reward_min: -0.37687331438064575
  episodes_this_iter: 24
  episodes_total: 1394
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.7164573222398758
          entropy_coeff: 0.0
          kl: 0.01409184094518423
          policy_loss: 0.02222554065519944
          total_loss: 0.15146972611546516
          vf_explained_var: 0.49301302433013916
          vf_loss: 0.11497619515284896
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,58,22029.1,232000,-0.000493649,0.391103,-0.376873,167.11


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 234652
  custom_metrics: {}
  date: 2021-07-26_07-37-59
  done: false
  episode_len_mean: 167.25
  episode_media: {}
  episode_reward_max: 0.39110325276851654
  episode_reward_mean: 0.002648817002773285
  episode_reward_min: -0.37687331438064575
  episodes_this_iter: 25
  episodes_total: 1419
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.7396693900227547
          entropy_coeff: 0.0
          kl: 0.012359951884718612
          policy_loss: 0.014976738835684955
          total_loss: 0.09365009737666696
          vf_explained_var: 0.7175599932670593
          vf_loss: 0.0661589065566659
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,59,22412.5,236000,0.00264882,0.391103,-0.376873,167.25


[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 238631
  custom_metrics: {}
  date: 2021-07-26_07-44-18
  done: false
  episode_len_mean: 167.44
  episode_media: {}
  episode_reward_max: 0.39110325276851654
  episode_reward_mean: 0.0027048207819461824
  episode_reward_min: -0.3792615979909897
  episodes_this_iter: 22
  episodes_total: 1441
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.6525150425732136
          entropy_coeff: 0.0
          kl: 0.012785671569872648
          policy_loss: 0.006887557567097247
          total_loss: 0.13999388506636024
          vf_explained_var: 0.6382715702056885
          vf_loss: 0.12016083672642708
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,60,22791.1,240000,0.00270482,0.391103,-0.379262,167.44


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 242607
  custom_metrics: {}
  date: 2021-07-26_07-50-40
  done: false
  episode_len_mean: 168.49
  episode_media: {}
  episode_reward_max: 0.33161893486976624
  episode_reward_mean: 3.018677234649658e-05
  episode_reward_min: -0.3792615979909897
  episodes_this_iter: 24
  episodes_total: 1465
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.6880155950784683
          entropy_coeff: 0.0
          kl: 0.014036066248081625
          policy_loss: 0.02419271832332015
          total_loss: 0.10377682454418391
          vf_explained_var: 0.524625301361084
          vf_loss: 0.06537259463220835
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,61,23173,244000,3.01868e-05,0.331619,-0.379262,168.49


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 246583
  custom_metrics: {}
  date: 2021-07-26_07-57-01
  done: false
  episode_len_mean: 168.49
  episode_media: {}
  episode_reward_max: 0.33161893486976624
  episode_reward_mean: 9.263306856155396e-05
  episode_reward_min: -0.3792615979909897
  episodes_this_iter: 24
  episodes_total: 1489
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.7474690601229668
          entropy_coeff: 0.0
          kl: 0.011552578362170607
          policy_loss: 0.027831846848130226
          total_loss: 0.09721398772671819
          vf_explained_var: 0.6995270252227783
          vf_loss: 0.05768515937961638
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,62,23553.7,248000,9.26331e-05,0.331619,-0.379262,168.49


[2m[36m(pid=13488)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)
[2m[36m(pid=13488)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 250562
  custom_metrics: {}
  date: 2021-07-26_08-03-25
  done: false
  episode_len_mean: 166.75
  episode_media: {}
  episode_reward_max: 0.33161893486976624
  episode_reward_mean: 0.0005428910255432129
  episode_reward_min: -0.3792615979909897
  episodes_this_iter: 25
  episodes_total: 1514
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.6843242086470127
          entropy_coeff: 0.0
          kl: 0.013267039554193616
          policy_loss: 0.024193664983613417
          total_loss: 0.12667646817862988
          vf_explained_var: 0.733250617980957
          vf_loss: 0.08904991997405887
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,63,23937.5,252000,0.000542891,0.331619,-0.379262,166.75


[2m[36m(pid=9864)[0m   arr = np.array(v)
[2m[36m(pid=9864)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 254540
  custom_metrics: {}
  date: 2021-07-26_08-09-49
  done: false
  episode_len_mean: 167.3
  episode_media: {}
  episode_reward_max: 0.32684817910194397
  episode_reward_mean: 0.00044394657015800476
  episode_reward_min: -0.31120985746383667
  episodes_this_iter: 23
  episodes_total: 1537
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.6626420132815838
          entropy_coeff: 0.0
          kl: 0.010949589777737856
          policy_loss: 0.013309201807714999
          total_loss: 0.11881820141570643
          vf_explained_var: 0.6997231245040894
          vf_loss: 0.09442254295572639
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,64,24321.9,256000,0.000443947,0.326848,-0.31121,167.3


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 258516
  custom_metrics: {}
  date: 2021-07-26_08-16-14
  done: false
  episode_len_mean: 167.3
  episode_media: {}
  episode_reward_max: 0.32684817910194397
  episode_reward_mean: -0.0007197976112365723
  episode_reward_min: -0.31120985746383667
  episodes_this_iter: 24
  episodes_total: 1561
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.652814943343401
          entropy_coeff: 0.0
          kl: 0.01309731270885095
          policy_loss: 0.007236378325615078
          total_loss: 0.07453252421692014
          vf_explained_var: 0.7244385480880737
          vf_loss: 0.054035119945183396
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,65,24706.7,260000,-0.000719798,0.326848,-0.31121,167.3


[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 262492
  custom_metrics: {}
  date: 2021-07-26_08-22-38
  done: false
  episode_len_mean: 166.56
  episode_media: {}
  episode_reward_max: 0.32684817910194397
  episode_reward_mean: 0.00033213049173355105
  episode_reward_min: -0.31120985746383667
  episodes_this_iter: 25
  episodes_total: 1586
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.685294859111309
          entropy_coeff: 0.0
          kl: 0.010729820729466155
          policy_loss: 0.007895790389738977
          total_loss: 0.13219217507867143
          vf_explained_var: 0.6894916892051697
          vf_loss: 0.11343244044110179
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,66,25090.9,264000,0.00033213,0.326848,-0.31121,166.56


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 266470
  custom_metrics: {}
  date: 2021-07-26_08-28-56
  done: false
  episode_len_mean: 169.14
  episode_media: {}
  episode_reward_max: 0.3100321814417839
  episode_reward_mean: -0.0006485983729362487
  episode_reward_min: -0.2879130467772484
  episodes_this_iter: 22
  episodes_total: 1608
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.6748069562017918
          entropy_coeff: 0.0
          kl: 0.012365894392132759
          policy_loss: 0.0016048342222347856
          total_loss: 0.10891928721684963
          vf_explained_var: 0.5916390419006348
          vf_loss: 0.09479398652911186
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,67,25468.6,268000,-0.000648598,0.310032,-0.287913,169.14


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 270446
  custom_metrics: {}
  date: 2021-07-26_08-35-18
  done: false
  episode_len_mean: 168.39
  episode_media: {}
  episode_reward_max: 0.9479574114084244
  episode_reward_mean: -0.0024201452732086183
  episode_reward_min: -0.9518609195947647
  episodes_this_iter: 25
  episodes_total: 1633
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.654979694634676
          entropy_coeff: 0.0
          kl: 0.010045646544313058
          policy_loss: 0.00812797254184261
          total_loss: 0.10282703035045415
          vf_explained_var: 0.6926887631416321
          vf_loss: 0.08452783850952983
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,68,25850.5,272000,-0.00242015,0.947957,-0.951861,168.39


[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=17992)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)
[2m[36m(pid=17992)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 274424
  custom_metrics: {}
  date: 2021-07-26_08-41-40
  done: false
  episode_len_mean: 167.37
  episode_media: {}
  episode_reward_max: 0.9479574114084244
  episode_reward_mean: -0.0002464443445205688
  episode_reward_min: -0.9518609195947647
  episodes_this_iter: 24
  episodes_total: 1657
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.7164765745401382
          entropy_coeff: 0.0
          kl: 0.01360721339005977
          policy_loss: 0.006629800016526133
          total_loss: 0.11293217819184065
          vf_explained_var: 0.6374211311340332
          vf_loss: 0.09252507239580154
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,69,26231.9,276000,-0.000246444,0.947957,-0.951861,167.37


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 278401
  custom_metrics: {}
  date: 2021-07-26_08-48-04
  done: false
  episode_len_mean: 167.37
  episode_media: {}
  episode_reward_max: 0.9479574114084244
  episode_reward_mean: 0.00012546777725219727
  episode_reward_min: -0.9518609195947647
  episodes_this_iter: 23
  episodes_total: 1680
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.6740572713315487
          entropy_coeff: 0.0
          kl: 0.012843323842389509
          policy_loss: 0.006223243137355894
          total_loss: 0.09761864342726767
          vf_explained_var: 0.3442728817462921
          vf_loss: 0.07839153753593564
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,70,26616.7,280000,0.000125468,0.947957,-0.951861,167.37


[2m[36m(pid=15516)[0m   arr = np.array(v)
[2m[36m(pid=15516)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 282377
  custom_metrics: {}
  date: 2021-07-26_08-54-33
  done: false
  episode_len_mean: 168.1
  episode_media: {}
  episode_reward_max: 0.9479574114084244
  episode_reward_mean: 0.0010836802423000336
  episode_reward_min: -0.9518609195947647
  episodes_this_iter: 25
  episodes_total: 1705
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.6719315238296986
          entropy_coeff: 0.0
          kl: 0.011900842015165836
          policy_loss: 0.0005236782599240541
          total_loss: 0.08530562371015549
          vf_explained_var: 0.529930830001831
          vf_loss: 0.07273234357126057
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,71,27005.3,284000,0.00108368,0.947957,-0.951861,168.1


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 286355
  custom_metrics: {}
  date: 2021-07-26_09-00-58
  done: false
  episode_len_mean: 168.97
  episode_media: {}
  episode_reward_max: 0.5030038952827454
  episode_reward_mean: 0.0026683688163757326
  episode_reward_min: -0.5076117813587189
  episodes_this_iter: 22
  episodes_total: 1727
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.6827193647623062
          entropy_coeff: 0.0
          kl: 0.010703535983338952
          policy_loss: -0.009723388706333935
          total_loss: 0.09671742626233026
          vf_explained_var: 0.6246081590652466
          vf_loss: 0.09560347837395966
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,72,27390.6,288000,0.00266837,0.503004,-0.507612,168.97


[2m[36m(pid=16724)[0m   arr = np.array(v)
[2m[36m(pid=16724)[0m   self[k] = np.array(v)


Result for PPO_ChessEnv_d8249_00000:
  agent_timesteps_total: 290331
  custom_metrics: {}
  date: 2021-07-26_09-07-27
  done: false
  episode_len_mean: 168.71
  episode_media: {}
  episode_reward_max: 0.5030038952827454
  episode_reward_mean: -0.0016481637954711913
  episode_reward_min: -0.5076117813587189
  episodes_this_iter: 25
  episodes_total: 1752
  experiment_id: 0bb2662462f74b3ab6e6a400b1419099
  hostname: Asus-ZephryusG14
  info:
    learner:
      player_0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_lr: 5.0e-05
          entropy: 0.5815132036805153
          entropy_coeff: 0.0
          kl: 0.011957329872529954
          policy_loss: 0.007326827268116176
          total_loss: 0.06937311892397702
          vf_explained_var: 0.6833558082580566
          vf_loss: 0.04993949248455465
      player_1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0125000000000002
          cur_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_ChessEnv_d8249_00000,RUNNING,10.0.0.37:5700,73,27779.4,292000,-0.00164816,0.503004,-0.507612,168.71


