In [1]:
from email import utils
import os
import datetime
from typing import Optional, Tuple
import json


os.environ["WANDB_NOTEBOOK_NAME"] = "Tianshow_Centralized_Training"

import numpy as np
import torch
from tianshou.data import Collector, VectorReplayBuffer, PrioritizedVectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.env.pettingzoo_env_parallel import PettingZooParallelEnv

from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy, RainbowPolicy
from tianshou.trainer import OffpolicyTrainer
from torch.utils.tensorboard import SummaryWriter

# from pettingzoo.sisl import pursuit_v4
from pettingzoo.mpe import simple_spread_v3
#import Mods.TaskSpreadEnv as TaskSpreadEnv

from TaskAllocation.RL_Policies.DNN_Spread import DNN_Spread
from TaskAllocation.RL_Policies.MPE_Task_MultiHead import MPE_Task_MultiHead

#import Mods.TaskPursuitEnv as TaskPursuitEnv
import Mods.ActionLoggerWrapper as ActionLoggerWrapper
import Mods.VDNPolicy as VDNPolicy
import Mods.PettingZooParallelEnv2 as PettingZooParallelEnv2
import Mods.CollectorMA as CollectorMA

from TaskAllocation.RL_Policies.Custom_Classes import CustomNet
from TaskAllocation.RL_Policies.Custom_Classes import CustomCollector
from TaskAllocation.RL_Policies.Custom_Classes import CustomParallelToAECWrapper

# Add specific modification to tianshou
import wandb
from tianshou.utils import WandbLogger
from tianshou.utils.logger.base import LOG_DATA_TYPE

def new_write(self, step_type: str, step: int, data: LOG_DATA_TYPE) -> None:
    """Send one batch of metrics to Weights & Biases.

    Installed below as a replacement for ``WandbLogger.write``. The step
    counter is stored inside the logged payload under the ``step_type`` key
    and the whole payload is sent with a single ``wandb.log`` call.

    Args:
        self: The ``WandbLogger`` instance the method is bound to (not used).
        step_type: Key under which ``step`` is added to the payload.
        step: Current value of the step counter.
        data: Metrics to log.
    """
    # Log a shallow copy so the caller's dict is not modified as a side effect.
    payload = dict(data)
    payload[step_type] = step
    wandb.log(payload)


# Monkey-patch: every WandbLogger created after this point uses new_write.
WandbLogger.write = new_write

from pettingzoo.utils import wrappers
import gym

class ActionLoggerWrapper(gym.Wrapper):
    """Environment wrapper that records actions and logs them as a histogram.

    Every action passed to :meth:`step` is stored. On :meth:`reset`, the
    actions stored since the previous reset are logged to Weights & Biases
    under the key ``"actions_histogram"`` and the store is emptied.

    NOTE(review): this class reuses the name bound earlier in the file by
    ``import Mods.ActionLoggerWrapper as ActionLoggerWrapper``, so from here
    on the name refers to this class, not to that module -- confirm that is
    intended.
    """

    def __init__(self, env):
        """Wrap ``env`` and start with an empty action record."""
        super().__init__(env)
        self.actions = []

    def step(self, action):
        """Record ``action`` and forward it to the wrapped environment."""
        self.actions.append(action)
        return self.env.step(action)

    def reset(self, **kwargs):
        """Log the recorded actions (best effort), then reset the wrapped env.

        Logging must never prevent the environment from resetting, so any
        failure while building or sending the histogram is reported at debug
        level and otherwise ignored.
        """
        if self.actions:
            try:
                # Flatten every action to 1-D so actions of different shapes
                # can be concatenated into a single sample vector.
                samples = np.concatenate(
                    [np.asarray(action).ravel() for action in self.actions]
                )
                counts, bin_edges = np.histogram(samples, bins='auto')
                wandb.log(
                    {"actions_histogram": wandb.Histogram(np_histogram=(counts, bin_edges))}
                )
            except Exception:
                # Best effort: keep training running, but leave a trace for
                # anyone who enables debug logging.
                import logging

                logging.getLogger(__name__).debug(
                    "Skipping actions histogram logging", exc_info=True
                )
            finally:
                # Always start the next episode with an empty record.
                self.actions = []
        return self.env.reset(**kwargs)


#from tianshou_DQN import train
# ---- Experiment identity -------------------------------------------------
# Network architecture; _get_agents handles "DNN_Spread", "VDN_Spread" and
# "MPE_Task_MultiHead".
model = "DNN_Spread"
test_num = "_Desk_01_8feat"
# Policy manager type; _get_agents handles "DQN" and "VDN".
policyModel = "DQN"

# Number of parallel environments used for training and for evaluation.
train_env_num = 10
test_env_num = 10

name = model + test_num

# ---- Logging paths ---------------------------------------------------------
# The timestamp makes every run write to its own log directory / checkpoint.
now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
log_name = name + str(now)
log_path = os.path.join('./', "Logs", "dqn_sisl", log_name)

# ---- Checkpoint paths ------------------------------------------------------
load_policy_name = 'policy_MPE_Task_MultiHead_Desk_01_8feat240112-145703_29_BestRew.pth'
save_policy_name = f'policy_{log_name}'
policy_path = "vdn_Spread"

Policy_Config = {
    "same_policy": True,   # share a single policy object between all agents
    "load_model": False,   # load weights from model_load_path before training
    "freeze_CNN": False,
}

# Keyword arguments for the simple_spread environment.
Spread_Config = {
    "N": 3,                      # Default = 3
    "local_ratio": 0.5,          # Default = 0.5
    "max_cycles": 25,            # Default = 25
    "continuous_actions": False, # Default = False
    "render_mode": None,         # Default = None
}

max_cycles = Spread_Config["max_cycles"]
n_agents = Spread_Config["N"]

# NOTE(review): the key "optminizer" is misspelled; it is kept unchanged
# because it is part of the config that gets logged with the run.
dqn_params = {
    "discount_factor": 0.98,
    "estimation_step": 5,
    "target_update_freq": 1000,
    "optminizer": "Adam",
    "lr": 0.0001,
}

trainer_params = {
    "max_epoch": 500,
    "step_per_epoch": 2000 * max_cycles,
    "step_per_collect": 250 * max_cycles,  # 6250 with max_cycles = 25
    "episode_per_test": 50,
    "batch_size": 256,
    # Author's note: updates only run after a collect finishes, repeated as
    # many times as needed to meet this ratio.
    "update_per_step": 1 / 30,
    "tn_eps_max": 0.30,   # starting epsilon during training
    "ts_eps_max": 0.01,   # epsilon used during evaluation
    "warmup_size": 1,     # number of warm-up collection rounds
}

# Flat view of every setting, used for experiment logging. Built as a NEW
# dict so that dqn_params is not polluted with the other sections' keys.
runConfig = {**dqn_params, **Policy_Config, **trainer_params, **Spread_Config}

model_load_path = os.path.join(policy_path, load_policy_name)
model_save_path = os.path.join(policy_path, save_policy_name)
os.makedirs(policy_path, exist_ok=True)
os.makedirs(log_path, exist_ok=True)

def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
    policy_load_path = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    """Build the per-agent DQN policies and wrap them in a multi-agent policy.

    The network architecture is selected by the module-level ``model`` and the
    multi-agent wrapper by the module-level ``policyModel``.

    Args:
        agent_learn: Currently ignored; a new policy is always created.
        agent_opponent: Currently unused.
        optim: Currently ignored; a new Adam optimizer is always created.
        policy_load_path: Currently unused; checkpoints are read from the
            module-level ``model_load_path`` when
            ``Policy_Config["load_model"]`` is True.

    Returns:
        A tuple ``(policy, optim, agents)``: the multi-agent policy, the
        optimizer of the last policy that was created, and ``env.agents``.

    Raises:
        ValueError: If ``model`` or ``policyModel`` holds an unsupported value.
    """
    # Fail early with a clear message instead of hitting an unbound
    # ``net`` / ``policy`` variable further down.
    if model not in ("DNN_Spread", "VDN_Spread", "MPE_Task_MultiHead"):
        raise ValueError(f"Unsupported model: {model!r}")
    if policyModel not in ("DQN", "VDN"):
        raise ValueError(f"Unsupported policyModel: {policyModel!r}")

    env = _get_env()
    agent_observation_space = env.observation_space.shape
    action_shape = env.action_space

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # One shared policy, or a fixed set of three independent ones.
    policies_number = 1 if Policy_Config["same_policy"] else 3  # len(env.agents)

    agents = []
    for _ in range(policies_number):
        if model in ("DNN_Spread", "VDN_Spread"):
            # Both names currently map to the same network.
            net = DNN_Spread(
                obs_shape=agent_observation_space[0],
                action_shape=5,
                device=device,
            ).to(device)
        else:  # "MPE_Task_MultiHead"
            net = MPE_Task_MultiHead(
                num_tasks=Spread_Config['N'] * 2 + 5,
                num_features_per_task=2,
                device=device,
            ).to(device)

        optim = torch.optim.Adam(
            net.parameters(),
            lr=dqn_params["lr"],
            weight_decay=0.0,
            amsgrad=True,
            eps=1e-06,
        )

        # "DQN" and "VDN" use the same per-agent learner; they differ only in
        # the multi-agent wrapper chosen below.
        agent_learn = DQNPolicy(
            model=net,
            optim=optim,
            action_space=action_shape,
            discount_factor=dqn_params["discount_factor"],
            estimation_step=dqn_params["estimation_step"],
            target_update_freq=dqn_params["target_update_freq"],
            reward_normalization=False,
            clip_loss_grad=False,
        )

        if Policy_Config["load_model"] is True:
            # map_location lets a checkpoint saved on another device (e.g. a
            # GPU) be loaded on the device selected above.
            agent_learn.load_state_dict(
                torch.load(model_load_path, map_location=device)
            )
            print(f'Loaded-> {model_load_path}')

        agents.append(agent_learn)

    if Policy_Config["same_policy"]:
        # Every agent refers to the very same policy object.
        agents = [agents[0] for _ in range(len(env.agents))]
    else:
        # Agents beyond the independently created policies reuse the first one.
        for _ in range(len(env.agents) - policies_number):
            agents.append(agents[0])

    if policyModel == "DQN":
        policy = MultiAgentPolicyManager(policies=agents, env=env)
    else:  # "VDN"
        policy = VDNPolicy.VDNMAPolicy(policies=agents, env=env, device=device)

    return policy, optim, env.agents

def _get_env(test=False):
    """This function is needed to provide callables for DummyVectorEnv.

    Creates one ``simple_spread_v3`` environment from ``Spread_Config``, wraps
    it with ``ActionLoggerWrapper`` and then with tianshou's ``PettingZooEnv``.

    Args:
        test: Currently unused; training and test environments are built the
            same way.

    Returns:
        The wrapped environment.
    """
    env = simple_spread_v3.env(
        max_cycles=Spread_Config["max_cycles"],
        local_ratio=Spread_Config["local_ratio"],
        N=Spread_Config["N"],
        continuous_actions=Spread_Config["continuous_actions"],
        # Use the configured value. The previous hard-coded literal " human"
        # (with a leading space) is not equal to "human" and silently ignored
        # Spread_Config["render_mode"].
        render_mode=Spread_Config["render_mode"],
    )

    env = ActionLoggerWrapper(env)
    env = PettingZooEnv(env)

    return env

# print(json.dumps(runConfig, indent=4))


In [2]:
if __name__ == "__main__":
    # Training entry point: build the vectorized environments, the policies,
    # the collectors and the logger, then run tianshou's off-policy trainer.

    torch.set_grad_enabled(True)

    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(train_env_num)])
    test_envs = DummyVectorEnv([_get_env for _ in range(test_env_num)])

    # Seed numpy, torch and both sets of environments.
    seed = 0
    np.random.seed(seed)
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)

    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents()

    # ======== Step 3: Collector setup =========
    # True  -> project CollectorMA with one prioritized buffer per agent.
    # False -> tianshou Collector with a single prioritized buffer.
    use_multi_agent_collector = False

    if use_multi_agent_collector:
        agents_buffers_training = {
            agent: PrioritizedVectorReplayBuffer(
                300_000, len(train_envs), alpha=0.6, beta=0.4
            )
            for agent in agents
        }
        # Sized with the number of TEST environments (was len(train_envs)).
        agents_buffers_test = {
            agent: PrioritizedVectorReplayBuffer(
                300_000, len(test_envs), alpha=0.6, beta=0.4
            )
            for agent in agents
        }
        train_collector = CollectorMA.CollectorMA(
            policy,
            train_envs,
            agents_buffers_training,
            exploration_noise=True,
        )
        test_collector = CollectorMA.CollectorMA(
            policy, test_envs, agents_buffers_test, exploration_noise=True
        )
    else:
        train_collector = Collector(
            policy,
            train_envs,
            PrioritizedVectorReplayBuffer(300_000, len(train_envs), alpha=0.6, beta=0.4),
            exploration_noise=True,
        )
        test_collector = Collector(policy, test_envs, exploration_noise=True)

    # Pre-fill the replay buffer before the trainer starts sampling from it.
    print("Buffer Warming Up ")
    for _ in range(trainer_params["warmup_size"]):
        train_collector.collect(n_episode=train_env_num)
        print(".", end="")

    # NOTE(review): ``info`` is not passed to the logger or trainer below.
    info = { "Buffer"  : "PriorizedReplayBuffer", " Warmup_ep" : runConfig["warmup_size"]}

    # ======== Logging setup (Weights & Biases + TensorBoard writer) =========
    logger = WandbLogger(
        train_interval = runConfig["max_cycles"] * runConfig["N"] ,
        test_interval = 1,
        update_interval = runConfig["max_cycles"],
        save_interval = 1,
        write_flush = True,
        project = "Spread_Eval01",
        name = log_name,
        entity = None,
        run_id = log_name,
        config = runConfig,
        monitor_gym = True )

    writer = SummaryWriter(log_path)
    writer.add_text("args", str(runConfig))
    logger.load(writer)

    # Single-element list so the callbacks below can mutate the counter.
    # It is incremented once per call to reward_metric.
    global_step_holder = [0]

    # ======== Step 4: Callback functions setup =========
    def _set_eps(epsilon):
        """Apply ``epsilon`` to the shared policy, or to every agent's policy."""
        targets = [agents[0]] if Policy_Config["same_policy"] else agents
        for agent in targets:
            policy.policies[agent].set_eps(epsilon)

    def _save_policies(source_policy, shared_suffix, per_agent_suffix):
        """Write checkpoint(s) named after the current counter value.

        With a shared policy one file ``<save path>_<counter><shared_suffix>``
        is written; otherwise one file
        ``<save path>_<counter>_<agent><per_agent_suffix>`` per agent.
        """
        prefix = model_save_path + "_" + str(global_step_holder[0])
        if Policy_Config["same_policy"]:
            torch.save(source_policy.policies[agents[0]].state_dict(), prefix + shared_suffix)
        else:
            for agent in agents:
                torch.save(
                    source_policy.policies[agent].state_dict(),
                    prefix + "_" + agent + per_agent_suffix,
                )

    def save_best_fn(policy):
        """Checkpoint the policy passed by the trainer as the best-reward model."""
        _save_policies(policy, "_BestRew.pth", ".pth")
        if Policy_Config["same_policy"]:
            print("Best Saved Rew" , str(global_step_holder[0]))
        else:
            print("Bests Saved Rew" , str(global_step_holder[0]))

    def save_test_best_fn(policy):
        """Checkpoint the given policy as the best-length model.

        NOTE(review): not passed to the trainer below. In the per-agent case
        it writes the same file names as ``save_best_fn``, so the two would
        overwrite each other if both were enabled.
        """
        _save_policies(policy, "_BestLen.pth", ".pth")
        print("Best Saved Length" , str(global_step_holder[0]))

    def stop_fn(mean_rewards):
        """Return True once ``mean_rewards`` reaches the (very high) threshold."""
        return mean_rewards >= 99999939.0

    def train_fn(epoch, env_step):
        """Set the training epsilon, decayed linearly with the epoch number.

        Epsilon starts at ``tn_eps_max`` and would reach ``tn_eps_max / 100``
        when ``epoch`` equals ``max_epoch``.
        """
        eps_max = trainer_params['tn_eps_max']
        epsilon = eps_max - (eps_max - eps_max/100)*(epoch/trainer_params['max_epoch'])
        _set_eps(epsilon)

    def test_fn(epoch, env_step):
        """Set the evaluation epsilon and periodically checkpoint the policy."""
        _set_eps(trainer_params['ts_eps_max'])

        # Save whenever the counter is a multiple of 10.
        if global_step_holder[0] % 10 == 0:
            step = str(global_step_holder[0])
            _save_policies(policy, "_Step.pth", "Step" + step + ".pth")
            print("Steps Policy Saved " , step)

    def reward_metric(rews):
        """Return the rewards unchanged and advance the shared counter."""
        global_step_holder[0] +=1
        return rews

    # ======== Step 5: Run the trainer =========
    offPolicyTrainer = OffpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=trainer_params['max_epoch'],
        step_per_epoch=trainer_params['step_per_epoch'],
        step_per_collect=trainer_params['step_per_collect'],
        episode_per_test= trainer_params['episode_per_test'],
        batch_size=trainer_params['batch_size'],
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        # save_test_best_fn=save_test_best_fn,
        update_per_step=trainer_params['update_per_step'],
        logger=logger,
        test_in_train=True,
        reward_metric=reward_metric,
        show_progress = True
        )

    try:
        result = offPolicyTrainer.run()
    finally:
        # Close the TensorBoard writer even if training is interrupted.
        writer.close()

    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[0]])")



Buffer Warming Up 




.

[34m[1mwandb[0m: Currently logged in as: [33mandrekuros[0m. Use [1m`wandb login --relogin`[0m to force relogin
  from IPython.core.display import HTML, display  # type: ignore


Steps Policy Saved  0
Bests Saved Rew 1


Epoch #1: 25001it [00:40, 620.97it/s, agent_0/loss=0.180, agent_1/loss=0.787, agent_2/loss=0.848, env_step=25000, len=75, n/ep=80, n/st=6250, rew=-35.01]                           


Bests Saved Rew 6
Epoch #1: test_reward: -28.572341 ± 8.322666, best_reward: -28.572341 ± 8.322666 in #1


Epoch #2: 25001it [00:38, 648.82it/s, agent_0/loss=0.168, agent_1/loss=0.830, agent_2/loss=0.803, env_step=50000, len=75, n/ep=80, n/st=6250, rew=-28.69]                           


Steps Policy Saved  10
Epoch #2: test_reward: -29.140289 ± 9.330465, best_reward: -28.572341 ± 8.322666 in #1


Epoch #3: 25001it [00:37, 664.50it/s, agent_0/loss=0.154, agent_1/loss=0.707, agent_2/loss=0.704, env_step=75000, len=75, n/ep=90, n/st=6250, rew=-27.78]                           


Bests Saved Rew 16
Epoch #3: test_reward: -24.448605 ± 6.144603, best_reward: -24.448605 ± 6.144603 in #3


Epoch #4: 25001it [00:38, 652.36it/s, agent_0/loss=0.141, agent_1/loss=0.697, agent_2/loss=0.739, env_step=100000, len=75, n/ep=80, n/st=6250, rew=-25.50]                           


Steps Policy Saved  20
Epoch #4: test_reward: -26.471742 ± 6.515872, best_reward: -24.448605 ± 6.144603 in #3


Epoch #5: 25001it [00:38, 657.51it/s, agent_0/loss=0.140, agent_1/loss=0.698, agent_2/loss=0.733, env_step=125000, len=75, n/ep=80, n/st=6250, rew=-25.26]                           


Bests Saved Rew 26
Epoch #5: test_reward: -23.074004 ± 5.092964, best_reward: -23.074004 ± 5.092964 in #5


Epoch #6: 25001it [00:37, 660.20it/s, agent_0/loss=0.118, agent_1/loss=0.589, agent_2/loss=0.655, env_step=150000, len=75, n/ep=90, n/st=6250, rew=-24.29]                           


Steps Policy Saved  30
Epoch #6: test_reward: -27.943002 ± 7.241477, best_reward: -23.074004 ± 5.092964 in #5


Epoch #7: 25001it [00:37, 661.16it/s, agent_0/loss=0.119, agent_1/loss=0.639, agent_2/loss=0.668, env_step=175000, len=75, n/ep=80, n/st=6250, rew=-23.57]                           


Epoch #7: test_reward: -24.236488 ± 5.728032, best_reward: -23.074004 ± 5.092964 in #5


Epoch #8: 25001it [00:38, 652.15it/s, agent_0/loss=0.121, agent_1/loss=0.667, agent_2/loss=0.683, env_step=200000, len=75, n/ep=80, n/st=6250, rew=-23.41]                           


Steps Policy Saved  40
Epoch #8: test_reward: -24.687556 ± 6.703183, best_reward: -23.074004 ± 5.092964 in #5


Epoch #9: 25001it [00:37, 670.20it/s, agent_0/loss=0.122, agent_1/loss=0.743, agent_2/loss=0.727, env_step=225000, len=75, n/ep=90, n/st=6250, rew=-23.25]                            


Epoch #9: test_reward: -24.428342 ± 5.932659, best_reward: -23.074004 ± 5.092964 in #5


Epoch #10: 25001it [00:38, 649.47it/s, agent_0/loss=0.135, agent_1/loss=0.760, agent_2/loss=0.811, env_step=250000, len=75, n/ep=80, n/st=6250, rew=-23.80]                           


Steps Policy Saved  50
Epoch #10: test_reward: -24.491059 ± 6.133425, best_reward: -23.074004 ± 5.092964 in #5


Epoch #11: 25001it [00:38, 647.73it/s, agent_0/loss=0.138, agent_1/loss=0.871, agent_2/loss=0.916, env_step=275000, len=75, n/ep=80, n/st=6250, rew=-23.03]                           


Bests Saved Rew 56
Epoch #11: test_reward: -22.860642 ± 5.997847, best_reward: -22.860642 ± 5.997847 in #11


Epoch #12: 25001it [00:37, 663.49it/s, agent_0/loss=0.129, agent_1/loss=0.785, agent_2/loss=0.849, env_step=300000, len=75, n/ep=90, n/st=6250, rew=-23.04]                           


Steps Policy Saved  60
Bests Saved Rew 61
Epoch #12: test_reward: -22.547580 ± 5.349286, best_reward: -22.547580 ± 5.349286 in #12


Epoch #13: 25001it [00:37, 664.61it/s, agent_0/loss=0.127, agent_1/loss=0.788, agent_2/loss=0.925, env_step=325000, len=75, n/ep=80, n/st=6250, rew=-21.68]                           


Epoch #13: test_reward: -23.351046 ± 5.653278, best_reward: -22.547580 ± 5.349286 in #12


Epoch #14: 25001it [00:38, 652.90it/s, agent_0/loss=0.129, agent_1/loss=0.853, agent_2/loss=0.889, env_step=350000, len=75, n/ep=80, n/st=6250, rew=-21.71]                           


Steps Policy Saved  70
Bests Saved Rew 71
Epoch #14: test_reward: -21.657522 ± 5.314194, best_reward: -21.657522 ± 5.314194 in #14


Epoch #15: 25001it [00:38, 657.56it/s, agent_0/loss=0.134, agent_1/loss=0.818, agent_2/loss=0.855, env_step=375000, len=75, n/ep=90, n/st=6250, rew=-21.47]                           


Bests Saved Rew 76
Epoch #15: test_reward: -20.037465 ± 5.031871, best_reward: -20.037465 ± 5.031871 in #15


Epoch #16: 25001it [00:38, 653.11it/s, agent_0/loss=0.136, agent_1/loss=0.859, agent_2/loss=0.903, env_step=400000, len=75, n/ep=80, n/st=6250, rew=-22.07]                           


Steps Policy Saved  80
Epoch #16: test_reward: -21.103815 ± 4.685271, best_reward: -20.037465 ± 5.031871 in #15


Epoch #17: 25001it [00:36, 688.88it/s, agent_0/loss=0.134, agent_1/loss=0.866, agent_2/loss=0.912, env_step=425000, len=75, n/ep=80, n/st=6250, rew=-22.43]                           


Epoch #17: test_reward: -21.055746 ± 5.953920, best_reward: -20.037465 ± 5.031871 in #15


Epoch #18: 25001it [00:37, 665.12it/s, agent_0/loss=0.130, agent_1/loss=0.805, agent_2/loss=0.882, env_step=450000, len=75, n/ep=90, n/st=6250, rew=-22.04]                           


Steps Policy Saved  90
Bests Saved Rew 91
Epoch #18: test_reward: -19.561938 ± 5.183604, best_reward: -19.561938 ± 5.183604 in #18


Epoch #19: 25001it [00:37, 670.93it/s, agent_0/loss=0.131, agent_1/loss=0.881, agent_2/loss=0.836, env_step=475000, len=75, n/ep=80, n/st=6250, rew=-21.44]                           


Epoch #19: test_reward: -21.307860 ± 4.100875, best_reward: -19.561938 ± 5.183604 in #18


Epoch #20: 25001it [00:36, 680.09it/s, agent_0/loss=0.122, agent_1/loss=0.827, agent_2/loss=0.857, env_step=500000, len=75, n/ep=80, n/st=6250, rew=-22.16]                           


Steps Policy Saved  100
Epoch #20: test_reward: -20.711333 ± 4.120550, best_reward: -19.561938 ± 5.183604 in #18


Epoch #21: 25001it [00:36, 677.90it/s, agent_0/loss=0.131, agent_1/loss=0.859, agent_2/loss=0.858, env_step=525000, len=75, n/ep=90, n/st=6250, rew=-20.86]                           


Epoch #21: test_reward: -21.650793 ± 4.815897, best_reward: -19.561938 ± 5.183604 in #18


Epoch #22: 25001it [00:37, 674.05it/s, agent_0/loss=0.122, agent_1/loss=0.870, agent_2/loss=0.842, env_step=550000, len=75, n/ep=80, n/st=6250, rew=-21.88]                           


Steps Policy Saved  110
Epoch #22: test_reward: -21.774877 ± 5.281846, best_reward: -19.561938 ± 5.183604 in #18


Epoch #23: 25001it [00:36, 678.73it/s, agent_0/loss=0.123, agent_1/loss=0.931, agent_2/loss=0.848, env_step=575000, len=75, n/ep=80, n/st=6250, rew=-21.70]                           


Epoch #23: test_reward: -20.082534 ± 5.141089, best_reward: -19.561938 ± 5.183604 in #18


Epoch #24: 25001it [00:36, 687.10it/s, agent_0/loss=0.116, agent_1/loss=0.890, agent_2/loss=0.838, env_step=600000, len=75, n/ep=90, n/st=6250, rew=-21.26]                            


Steps Policy Saved  120
Epoch #24: test_reward: -19.950928 ± 4.970826, best_reward: -19.561938 ± 5.183604 in #18


Epoch #25: 25001it [00:37, 669.02it/s, agent_0/loss=0.116, agent_1/loss=0.894, agent_2/loss=0.829, env_step=625000, len=75, n/ep=80, n/st=6250, rew=-23.22]                           


Epoch #25: test_reward: -20.891955 ± 5.464428, best_reward: -19.561938 ± 5.183604 in #18


Epoch #26: 25001it [00:36, 676.59it/s, agent_0/loss=0.116, agent_1/loss=0.955, agent_2/loss=0.789, env_step=650000, len=75, n/ep=80, n/st=6250, rew=-21.77]                           


Steps Policy Saved  130
Epoch #26: test_reward: -22.086153 ± 4.086262, best_reward: -19.561938 ± 5.183604 in #18


Epoch #27: 25001it [00:36, 676.26it/s, agent_0/loss=0.116, agent_1/loss=0.994, agent_2/loss=0.801, env_step=675000, len=75, n/ep=90, n/st=6250, rew=-22.72]                           


Epoch #27: test_reward: -20.821669 ± 5.587397, best_reward: -19.561938 ± 5.183604 in #18


Epoch #28: 25001it [00:37, 674.52it/s, agent_0/loss=0.110, agent_1/loss=1.119, agent_2/loss=0.805, env_step=700000, len=75, n/ep=80, n/st=6250, rew=-21.02]                           


Steps Policy Saved  140
Epoch #28: test_reward: -20.953780 ± 5.842889, best_reward: -19.561938 ± 5.183604 in #18


Epoch #29: 25001it [00:36, 690.41it/s, agent_0/loss=0.114, agent_1/loss=1.118, agent_2/loss=0.825, env_step=725000, len=75, n/ep=80, n/st=6250, rew=-20.86]                           


Epoch #29: test_reward: -23.927776 ± 6.109648, best_reward: -19.561938 ± 5.183604 in #18


Epoch #30: 25001it [00:36, 689.15it/s, agent_0/loss=0.107, agent_1/loss=1.108, agent_2/loss=0.861, env_step=750000, len=75, n/ep=90, n/st=6250, rew=-21.62]                            


Steps Policy Saved  150
Epoch #30: test_reward: -20.705076 ± 5.036365, best_reward: -19.561938 ± 5.183604 in #18


Epoch #31: 25001it [00:37, 672.48it/s, agent_0/loss=0.108, agent_1/loss=1.238, agent_2/loss=0.881, env_step=775000, len=75, n/ep=80, n/st=6250, rew=-22.96]                           


Epoch #31: test_reward: -23.315482 ± 5.462823, best_reward: -19.561938 ± 5.183604 in #18


Epoch #32: 25001it [00:36, 678.85it/s, agent_0/loss=0.110, agent_1/loss=1.291, agent_2/loss=0.860, env_step=800000, len=75, n/ep=80, n/st=6250, rew=-23.21]                           


Steps Policy Saved  160
Epoch #32: test_reward: -21.086562 ± 5.963281, best_reward: -19.561938 ± 5.183604 in #18


Epoch #33: 25001it [00:36, 677.43it/s, agent_0/loss=0.109, agent_1/loss=1.403, agent_2/loss=0.873, env_step=825000, len=75, n/ep=90, n/st=6250, rew=-22.57]                           


Epoch #33: test_reward: -23.077714 ± 6.709586, best_reward: -19.561938 ± 5.183604 in #18


Epoch #34: 25001it [00:37, 672.55it/s, agent_0/loss=0.105, agent_1/loss=1.561, agent_2/loss=0.857, env_step=850000, len=75, n/ep=80, n/st=6250, rew=-23.57]                           


Steps Policy Saved  170
Epoch #34: test_reward: -21.089815 ± 4.615624, best_reward: -19.561938 ± 5.183604 in #18


Epoch #35: 25001it [00:37, 668.57it/s, agent_0/loss=0.110, agent_1/loss=1.572, agent_2/loss=0.838, env_step=875000, len=75, n/ep=80, n/st=6250, rew=-21.66]                           


Epoch #35: test_reward: -23.301956 ± 5.028620, best_reward: -19.561938 ± 5.183604 in #18


Epoch #36: 25001it [00:37, 669.19it/s, agent_0/loss=0.106, agent_1/loss=1.570, agent_2/loss=0.857, env_step=900000, len=75, n/ep=90, n/st=6250, rew=-22.18]                           


Steps Policy Saved  180
Epoch #36: test_reward: -21.493766 ± 5.055444, best_reward: -19.561938 ± 5.183604 in #18


Epoch #37: 25001it [00:36, 680.06it/s, agent_0/loss=0.110, agent_1/loss=1.663, agent_2/loss=0.843, env_step=925000, len=75, n/ep=80, n/st=6250, rew=-21.53]                           


Epoch #37: test_reward: -21.102559 ± 5.439577, best_reward: -19.561938 ± 5.183604 in #18


Epoch #38: 25001it [00:36, 688.72it/s, agent_0/loss=0.103, agent_1/loss=1.792, agent_2/loss=0.838, env_step=950000, len=75, n/ep=80, n/st=6250, rew=-22.31]                            


Steps Policy Saved  190
Epoch #38: test_reward: -20.139896 ± 4.921473, best_reward: -19.561938 ± 5.183604 in #18


Epoch #39: 25001it [00:36, 680.51it/s, agent_0/loss=0.106, agent_1/loss=1.817, agent_2/loss=0.796, env_step=975000, len=75, n/ep=90, n/st=6250, rew=-20.83]                           


Epoch #39: test_reward: -21.244649 ± 5.641709, best_reward: -19.561938 ± 5.183604 in #18


Epoch #40: 25001it [00:36, 676.34it/s, agent_0/loss=0.110, agent_1/loss=1.792, agent_2/loss=0.787, env_step=1000000, len=75, n/ep=80, n/st=6250, rew=-22.54]                           


Steps Policy Saved  200
Epoch #40: test_reward: -23.103881 ± 5.572133, best_reward: -19.561938 ± 5.183604 in #18


Epoch #41: 25001it [00:37, 670.52it/s, agent_0/loss=0.108, agent_1/loss=1.849, agent_2/loss=0.775, env_step=1025000, len=75, n/ep=80, n/st=6250, rew=-21.83]                           


Epoch #41: test_reward: -20.922868 ± 5.166278, best_reward: -19.561938 ± 5.183604 in #18


Epoch #42: 25001it [00:36, 681.04it/s, agent_0/loss=0.107, agent_1/loss=1.770, agent_2/loss=0.740, env_step=1050000, len=75, n/ep=90, n/st=6250, rew=-22.25]                            


Steps Policy Saved  210
Epoch #42: test_reward: -21.397510 ± 5.965227, best_reward: -19.561938 ± 5.183604 in #18


Epoch #43: 25001it [00:37, 669.71it/s, agent_0/loss=0.107, agent_1/loss=1.759, agent_2/loss=0.731, env_step=1075000, len=75, n/ep=80, n/st=6250, rew=-21.18]                           


Epoch #43: test_reward: -20.983980 ± 5.653256, best_reward: -19.561938 ± 5.183604 in #18


Epoch #44: 25001it [00:37, 674.85it/s, agent_0/loss=0.103, agent_1/loss=1.718, agent_2/loss=0.744, env_step=1100000, len=75, n/ep=80, n/st=6250, rew=-22.29]                           


Steps Policy Saved  220
Epoch #44: test_reward: -22.073666 ± 5.714171, best_reward: -19.561938 ± 5.183604 in #18


Epoch #45: 25001it [00:37, 672.75it/s, agent_0/loss=0.109, agent_1/loss=1.625, agent_2/loss=0.748, env_step=1125000, len=75, n/ep=90, n/st=6250, rew=-21.39]                           


Epoch #45: test_reward: -22.655006 ± 5.617088, best_reward: -19.561938 ± 5.183604 in #18


Epoch #46: 25001it [00:36, 676.38it/s, agent_0/loss=0.107, agent_1/loss=1.798, agent_2/loss=0.701, env_step=1150000, len=75, n/ep=80, n/st=6250, rew=-22.12]                           


Steps Policy Saved  230
Epoch #46: test_reward: -23.201281 ± 5.510550, best_reward: -19.561938 ± 5.183604 in #18


Epoch #47: 25001it [00:37, 670.03it/s, agent_0/loss=0.105, agent_1/loss=1.766, agent_2/loss=0.666, env_step=1175000, len=75, n/ep=80, n/st=6250, rew=-22.78]                           


Epoch #47: test_reward: -21.929433 ± 5.803890, best_reward: -19.561938 ± 5.183604 in #18


Epoch #48: 25001it [00:37, 666.70it/s, agent_0/loss=0.100, agent_1/loss=1.732, agent_2/loss=0.715, env_step=1200000, len=75, n/ep=90, n/st=6250, rew=-22.49]                           


Steps Policy Saved  240
Epoch #48: test_reward: -20.910840 ± 5.231448, best_reward: -19.561938 ± 5.183604 in #18


Epoch #49: 25001it [00:36, 682.67it/s, agent_0/loss=0.105, agent_1/loss=1.848, agent_2/loss=0.704, env_step=1225000, len=75, n/ep=80, n/st=6250, rew=-23.16]                           


Epoch #49: test_reward: -22.473139 ± 6.554896, best_reward: -19.561938 ± 5.183604 in #18


Epoch #50: 25001it [00:37, 661.17it/s, agent_0/loss=0.111, agent_1/loss=1.753, agent_2/loss=0.686, env_step=1250000, len=75, n/ep=80, n/st=6250, rew=-22.04]                           


Steps Policy Saved  250
Epoch #50: test_reward: -21.638234 ± 5.077276, best_reward: -19.561938 ± 5.183604 in #18


Epoch #51: 25001it [00:37, 673.27it/s, agent_0/loss=0.103, agent_1/loss=1.749, agent_2/loss=0.718, env_step=1275000, len=75, n/ep=90, n/st=6250, rew=-21.13]                           


Epoch #51: test_reward: -22.896583 ± 6.075582, best_reward: -19.561938 ± 5.183604 in #18


Epoch #52: 25001it [00:36, 682.74it/s, agent_0/loss=0.106, agent_1/loss=1.864, agent_2/loss=0.698, env_step=1300000, len=75, n/ep=80, n/st=6250, rew=-22.05]                            


Steps Policy Saved  260
Epoch #52: test_reward: -23.180570 ± 4.760844, best_reward: -19.561938 ± 5.183604 in #18


Epoch #53: 25001it [00:37, 670.72it/s, agent_0/loss=0.107, agent_1/loss=1.860, agent_2/loss=0.707, env_step=1325000, len=75, n/ep=80, n/st=6250, rew=-21.94]                           


Epoch #53: test_reward: -23.907511 ± 5.511402, best_reward: -19.561938 ± 5.183604 in #18


Epoch #54: 25001it [00:37, 666.76it/s, agent_0/loss=0.103, agent_1/loss=1.758, agent_2/loss=0.739, env_step=1350000, len=75, n/ep=90, n/st=6250, rew=-21.52]                           


Steps Policy Saved  270
Epoch #54: test_reward: -21.161816 ± 5.607961, best_reward: -19.561938 ± 5.183604 in #18


Epoch #55: 25001it [00:36, 681.19it/s, agent_0/loss=0.104, agent_1/loss=1.833, agent_2/loss=0.669, env_step=1375000, len=75, n/ep=80, n/st=6250, rew=-21.41]                            


Epoch #55: test_reward: -21.716060 ± 5.009540, best_reward: -19.561938 ± 5.183604 in #18


Epoch #56: 25001it [00:37, 669.37it/s, agent_0/loss=0.104, agent_1/loss=1.805, agent_2/loss=0.689, env_step=1400000, len=75, n/ep=80, n/st=6250, rew=-23.16]                           


Steps Policy Saved  280
Epoch #56: test_reward: -23.080426 ± 5.984202, best_reward: -19.561938 ± 5.183604 in #18


Epoch #57: 25001it [00:37, 667.33it/s, agent_0/loss=0.105, agent_1/loss=1.838, agent_2/loss=0.653, env_step=1425000, len=75, n/ep=90, n/st=6250, rew=-23.66]                           


Epoch #57: test_reward: -19.588593 ± 4.880962, best_reward: -19.561938 ± 5.183604 in #18


Epoch #58: 25001it [00:36, 686.43it/s, agent_0/loss=0.106, agent_1/loss=1.883, agent_2/loss=0.654, env_step=1450000, len=75, n/ep=80, n/st=6250, rew=-22.00]                            


Steps Policy Saved  290
Epoch #58: test_reward: -21.763866 ± 4.722534, best_reward: -19.561938 ± 5.183604 in #18


Epoch #59: 25001it [00:37, 672.96it/s, agent_0/loss=0.107, agent_1/loss=1.859, agent_2/loss=0.646, env_step=1475000, len=75, n/ep=80, n/st=6250, rew=-21.42]                           


Epoch #59: test_reward: -20.152047 ± 5.189259, best_reward: -19.561938 ± 5.183604 in #18


Epoch #60: 25001it [00:37, 674.93it/s, agent_0/loss=0.101, agent_1/loss=1.829, agent_2/loss=0.636, env_step=1500000, len=75, n/ep=90, n/st=6250, rew=-21.82]                           


Steps Policy Saved  300
Epoch #60: test_reward: -23.926611 ± 6.714333, best_reward: -19.561938 ± 5.183604 in #18


Epoch #61: 25001it [00:37, 674.75it/s, agent_0/loss=0.101, agent_1/loss=1.680, agent_2/loss=0.641, env_step=1525000, len=75, n/ep=80, n/st=6250, rew=-23.05]                           


Epoch #61: test_reward: -21.860168 ± 6.518977, best_reward: -19.561938 ± 5.183604 in #18


Epoch #62: 25001it [00:37, 672.13it/s, agent_0/loss=0.107, agent_1/loss=1.903, agent_2/loss=0.637, env_step=1550000, len=75, n/ep=80, n/st=6250, rew=-23.36]                           


Steps Policy Saved  310
Epoch #62: test_reward: -22.172437 ± 5.124099, best_reward: -19.561938 ± 5.183604 in #18


Epoch #63: 25001it [00:37, 674.48it/s, agent_0/loss=0.108, agent_1/loss=1.803, agent_2/loss=0.664, env_step=1575000, len=75, n/ep=90, n/st=6250, rew=-22.24]                           


Epoch #63: test_reward: -20.525994 ± 6.012541, best_reward: -19.561938 ± 5.183604 in #18


Epoch #64: 25001it [00:37, 664.50it/s, agent_0/loss=0.110, agent_1/loss=1.870, agent_2/loss=0.656, env_step=1600000, len=75, n/ep=80, n/st=6250, rew=-22.77]                           


Steps Policy Saved  320
Epoch #64: test_reward: -22.564471 ± 5.859548, best_reward: -19.561938 ± 5.183604 in #18


Epoch #65: 25001it [00:37, 659.30it/s, agent_0/loss=0.108, agent_1/loss=1.817, agent_2/loss=0.645, env_step=1625000, len=75, n/ep=80, n/st=6250, rew=-23.30]                           


Epoch #65: test_reward: -21.608489 ± 5.482060, best_reward: -19.561938 ± 5.183604 in #18


Epoch #66: 25001it [00:36, 688.00it/s, agent_0/loss=0.106, agent_1/loss=1.712, agent_2/loss=0.653, env_step=1650000, len=75, n/ep=90, n/st=6250, rew=-22.67]                           


Steps Policy Saved  330
Epoch #66: test_reward: -21.834525 ± 4.423151, best_reward: -19.561938 ± 5.183604 in #18


Epoch #67: 25001it [00:36, 679.81it/s, agent_0/loss=0.111, agent_1/loss=1.759, agent_2/loss=0.654, env_step=1675000, len=75, n/ep=80, n/st=6250, rew=-21.49]                           


Epoch #67: test_reward: -23.775690 ± 6.963774, best_reward: -19.561938 ± 5.183604 in #18


Epoch #68: 25001it [00:37, 663.22it/s, agent_0/loss=0.109, agent_1/loss=1.667, agent_2/loss=0.676, env_step=1700000, len=75, n/ep=80, n/st=6250, rew=-21.18]                           


Steps Policy Saved  340
Epoch #68: test_reward: -21.231197 ± 4.948625, best_reward: -19.561938 ± 5.183604 in #18


Epoch #69: 25001it [00:37, 665.69it/s, agent_0/loss=0.108, agent_1/loss=1.776, agent_2/loss=0.687, env_step=1725000, len=75, n/ep=90, n/st=6250, rew=-23.03]                           


Epoch #69: test_reward: -21.734304 ± 7.300340, best_reward: -19.561938 ± 5.183604 in #18


Epoch #70: 25001it [00:36, 675.70it/s, agent_0/loss=0.108, agent_1/loss=1.678, agent_2/loss=0.666, env_step=1750000, len=75, n/ep=80, n/st=6250, rew=-21.72]                           


Steps Policy Saved  350
Epoch #70: test_reward: -21.955084 ± 5.828222, best_reward: -19.561938 ± 5.183604 in #18


Epoch #71: 25001it [00:36, 675.70it/s, agent_0/loss=0.115, agent_1/loss=1.701, agent_2/loss=0.705, env_step=1775000, len=75, n/ep=80, n/st=6250, rew=-23.18]                           


Epoch #71: test_reward: -20.704151 ± 4.883298, best_reward: -19.561938 ± 5.183604 in #18


Epoch #72: 25001it [00:37, 668.75it/s, agent_0/loss=0.115, agent_1/loss=1.716, agent_2/loss=0.691, env_step=1800000, len=75, n/ep=90, n/st=6250, rew=-22.15]                           


Steps Policy Saved  360
Epoch #72: test_reward: -22.047434 ± 5.602587, best_reward: -19.561938 ± 5.183604 in #18


Epoch #73: 25001it [00:37, 671.69it/s, agent_0/loss=0.116, agent_1/loss=1.713, agent_2/loss=0.716, env_step=1825000, len=75, n/ep=80, n/st=6250, rew=-22.35]                           


Epoch #73: test_reward: -24.100353 ± 6.809801, best_reward: -19.561938 ± 5.183604 in #18


Epoch #74: 25001it [00:37, 661.71it/s, agent_0/loss=0.116, agent_1/loss=1.716, agent_2/loss=0.717, env_step=1850000, len=75, n/ep=80, n/st=6250, rew=-22.82]                           


Steps Policy Saved  370
Epoch #74: test_reward: -22.801630 ± 5.824508, best_reward: -19.561938 ± 5.183604 in #18


Epoch #75: 25001it [00:36, 676.21it/s, agent_0/loss=0.121, agent_1/loss=1.782, agent_2/loss=0.738, env_step=1875000, len=75, n/ep=90, n/st=6250, rew=-21.89]                           


Epoch #75: test_reward: -23.525403 ± 5.675313, best_reward: -19.561938 ± 5.183604 in #18


Epoch #76: 25001it [00:37, 673.12it/s, agent_0/loss=0.119, agent_1/loss=1.761, agent_2/loss=0.707, env_step=1900000, len=75, n/ep=80, n/st=6250, rew=-21.58]                           


Steps Policy Saved  380
Epoch #76: test_reward: -22.242692 ± 6.203020, best_reward: -19.561938 ± 5.183604 in #18


Epoch #77: 25001it [00:37, 674.32it/s, agent_0/loss=0.118, agent_1/loss=1.807, agent_2/loss=0.684, env_step=1925000, len=75, n/ep=80, n/st=6250, rew=-21.19]                           


Epoch #77: test_reward: -21.399269 ± 6.814529, best_reward: -19.561938 ± 5.183604 in #18


Epoch #78: 25001it [00:36, 688.90it/s, agent_0/loss=0.121, agent_1/loss=1.777, agent_2/loss=0.720, env_step=1950000, len=75, n/ep=90, n/st=6250, rew=-23.46]                            


Steps Policy Saved  390
Epoch #78: test_reward: -21.615475 ± 5.346808, best_reward: -19.561938 ± 5.183604 in #18


Epoch #79: 25001it [00:36, 676.27it/s, agent_0/loss=0.118, agent_1/loss=1.877, agent_2/loss=0.743, env_step=1975000, len=75, n/ep=80, n/st=6250, rew=-21.84]                           


Epoch #79: test_reward: -22.094981 ± 5.881762, best_reward: -19.561938 ± 5.183604 in #18


Epoch #80: 25001it [00:37, 673.24it/s, agent_0/loss=0.121, agent_1/loss=1.818, agent_2/loss=0.752, env_step=2000000, len=75, n/ep=80, n/st=6250, rew=-23.12]                           


Steps Policy Saved  400
Epoch #80: test_reward: -21.632960 ± 5.229812, best_reward: -19.561938 ± 5.183604 in #18


Epoch #81: 25001it [00:37, 675.01it/s, agent_0/loss=0.116, agent_1/loss=1.926, agent_2/loss=0.754, env_step=2025000, len=75, n/ep=90, n/st=6250, rew=-21.21]                           


Epoch #81: test_reward: -21.850786 ± 5.097189, best_reward: -19.561938 ± 5.183604 in #18


Epoch #82: 25001it [00:37, 669.82it/s, agent_0/loss=0.122, agent_1/loss=1.914, agent_2/loss=0.731, env_step=2050000, len=75, n/ep=80, n/st=6250, rew=-21.51]                           


Steps Policy Saved  410
Epoch #82: test_reward: -23.500082 ± 5.033359, best_reward: -19.561938 ± 5.183604 in #18


Epoch #83: 25001it [00:37, 667.28it/s, agent_0/loss=0.119, agent_1/loss=2.078, agent_2/loss=0.770, env_step=2075000, len=75, n/ep=80, n/st=6250, rew=-22.55]                           


Epoch #83: test_reward: -21.320193 ± 5.809093, best_reward: -19.561938 ± 5.183604 in #18


Epoch #84: 25001it [00:36, 677.91it/s, agent_0/loss=0.120, agent_1/loss=2.109, agent_2/loss=0.801, env_step=2100000, len=75, n/ep=90, n/st=6250, rew=-23.04]                           


Steps Policy Saved  420
Epoch #84: test_reward: -22.374945 ± 5.444317, best_reward: -19.561938 ± 5.183604 in #18


Epoch #85: 25001it [00:37, 667.94it/s, agent_0/loss=0.117, agent_1/loss=2.065, agent_2/loss=0.753, env_step=2125000, len=75, n/ep=80, n/st=6250, rew=-22.35]                           


Epoch #85: test_reward: -22.091433 ± 4.872588, best_reward: -19.561938 ± 5.183604 in #18


Epoch #86: 25001it [00:37, 668.60it/s, agent_0/loss=0.118, agent_1/loss=2.036, agent_2/loss=0.743, env_step=2150000, len=75, n/ep=80, n/st=6250, rew=-21.42]                           


Steps Policy Saved  430
Epoch #86: test_reward: -22.041180 ± 6.161554, best_reward: -19.561938 ± 5.183604 in #18


Epoch #87: 25001it [00:36, 680.31it/s, agent_0/loss=0.124, agent_1/loss=2.098, agent_2/loss=0.730, env_step=2175000, len=75, n/ep=90, n/st=6250, rew=-21.92]                           


Epoch #87: test_reward: -21.782127 ± 5.650386, best_reward: -19.561938 ± 5.183604 in #18


Epoch #88: 25001it [00:36, 676.36it/s, agent_0/loss=0.116, agent_1/loss=2.129, agent_2/loss=0.768, env_step=2200000, len=75, n/ep=80, n/st=6250, rew=-22.59]                           


Steps Policy Saved  440
Epoch #88: test_reward: -21.730274 ± 5.141314, best_reward: -19.561938 ± 5.183604 in #18


Epoch #89: 25001it [00:36, 680.51it/s, agent_0/loss=0.124, agent_1/loss=2.280, agent_2/loss=0.788, env_step=2225000, len=75, n/ep=80, n/st=6250, rew=-23.30]                           


Epoch #89: test_reward: -21.677077 ± 5.379996, best_reward: -19.561938 ± 5.183604 in #18


Epoch #90: 25001it [00:36, 686.20it/s, agent_0/loss=0.117, agent_1/loss=2.271, agent_2/loss=0.765, env_step=2250000, len=75, n/ep=90, n/st=6250, rew=-21.22]                            


Steps Policy Saved  450
Epoch #90: test_reward: -22.465733 ± 4.883785, best_reward: -19.561938 ± 5.183604 in #18


Epoch #91: 25001it [00:36, 678.31it/s, agent_0/loss=0.114, agent_1/loss=2.062, agent_2/loss=0.777, env_step=2275000, len=75, n/ep=80, n/st=6250, rew=-22.87]                           


Epoch #91: test_reward: -22.295109 ± 5.724512, best_reward: -19.561938 ± 5.183604 in #18


Epoch #92: 25001it [00:37, 671.32it/s, agent_0/loss=0.117, agent_1/loss=2.158, agent_2/loss=0.716, env_step=2300000, len=75, n/ep=80, n/st=6250, rew=-22.41]                           


Steps Policy Saved  460
Epoch #92: test_reward: -22.503671 ± 5.858489, best_reward: -19.561938 ± 5.183604 in #18


Epoch #93: 25001it [00:36, 678.38it/s, agent_0/loss=0.114, agent_1/loss=2.104, agent_2/loss=0.746, env_step=2325000, len=75, n/ep=90, n/st=6250, rew=-22.36]                           


Epoch #93: test_reward: -21.133923 ± 3.976560, best_reward: -19.561938 ± 5.183604 in #18


Epoch #94: 25001it [00:37, 675.41it/s, agent_0/loss=0.115, agent_1/loss=2.353, agent_2/loss=0.735, env_step=2350000, len=75, n/ep=80, n/st=6250, rew=-21.94]                           


Steps Policy Saved  470
Epoch #94: test_reward: -22.688392 ± 5.153698, best_reward: -19.561938 ± 5.183604 in #18


Epoch #95: 25001it [00:37, 673.08it/s, agent_0/loss=0.111, agent_1/loss=2.180, agent_2/loss=0.736, env_step=2375000, len=75, n/ep=80, n/st=6250, rew=-21.31]                           


Epoch #95: test_reward: -22.271677 ± 6.458102, best_reward: -19.561938 ± 5.183604 in #18


Epoch #96: 25001it [00:37, 662.75it/s, agent_0/loss=0.111, agent_1/loss=2.298, agent_2/loss=0.702, env_step=2400000, len=75, n/ep=90, n/st=6250, rew=-20.73]                           


Steps Policy Saved  480
Epoch #96: test_reward: -21.832145 ± 6.143395, best_reward: -19.561938 ± 5.183604 in #18


Epoch #97: 25001it [00:37, 671.18it/s, agent_0/loss=0.107, agent_1/loss=2.352, agent_2/loss=0.720, env_step=2425000, len=75, n/ep=80, n/st=6250, rew=-22.10]                           


Epoch #97: test_reward: -22.791206 ± 5.067688, best_reward: -19.561938 ± 5.183604 in #18


Epoch #98: 25001it [00:36, 685.64it/s, agent_0/loss=0.115, agent_1/loss=2.440, agent_2/loss=0.770, env_step=2450000, len=75, n/ep=80, n/st=6250, rew=-22.90]                            


Steps Policy Saved  490
Epoch #98: test_reward: -23.749663 ± 5.306764, best_reward: -19.561938 ± 5.183604 in #18


Epoch #99: 25001it [00:36, 680.23it/s, agent_0/loss=0.115, agent_1/loss=2.368, agent_2/loss=0.814, env_step=2475000, len=75, n/ep=90, n/st=6250, rew=-22.63]                           


Epoch #99: test_reward: -22.175370 ± 6.229390, best_reward: -19.561938 ± 5.183604 in #18


Epoch #100: 25001it [00:37, 669.19it/s, agent_0/loss=0.112, agent_1/loss=2.397, agent_2/loss=0.798, env_step=2500000, len=75, n/ep=80, n/st=6250, rew=-22.56]                           


Steps Policy Saved  500
Epoch #100: test_reward: -22.160959 ± 5.917115, best_reward: -19.561938 ± 5.183604 in #18


Epoch #101: 25001it [00:37, 671.76it/s, agent_0/loss=0.110, agent_1/loss=2.398, agent_2/loss=0.806, env_step=2525000, len=75, n/ep=80, n/st=6250, rew=-22.17]                            


Epoch #101: test_reward: -22.299464 ± 4.975260, best_reward: -19.561938 ± 5.183604 in #18


Epoch #102: 25001it [00:37, 673.38it/s, agent_0/loss=0.116, agent_1/loss=2.380, agent_2/loss=0.797, env_step=2550000, len=75, n/ep=90, n/st=6250, rew=-22.47]                           


Steps Policy Saved  510
Epoch #102: test_reward: -21.613463 ± 5.250341, best_reward: -19.561938 ± 5.183604 in #18


Epoch #103: 25001it [00:36, 676.70it/s, agent_0/loss=0.107, agent_1/loss=2.453, agent_2/loss=0.828, env_step=2575000, len=75, n/ep=80, n/st=6250, rew=-22.58]                            


Epoch #103: test_reward: -20.862072 ± 5.382030, best_reward: -19.561938 ± 5.183604 in #18


Epoch #104: 25001it [00:37, 663.01it/s, agent_0/loss=0.115, agent_1/loss=2.579, agent_2/loss=0.825, env_step=2600000, len=75, n/ep=80, n/st=6250, rew=-23.06]                           


Steps Policy Saved  520
Epoch #104: test_reward: -20.817407 ± 5.952690, best_reward: -19.561938 ± 5.183604 in #18


Epoch #105: 25001it [00:37, 673.95it/s, agent_0/loss=0.115, agent_1/loss=2.673, agent_2/loss=0.866, env_step=2625000, len=75, n/ep=90, n/st=6250, rew=-22.44]                           


Epoch #105: test_reward: -22.356222 ± 5.965334, best_reward: -19.561938 ± 5.183604 in #18


Epoch #106: 25001it [00:37, 667.91it/s, agent_0/loss=0.116, agent_1/loss=2.706, agent_2/loss=0.836, env_step=2650000, len=75, n/ep=80, n/st=6250, rew=-22.86]                           


Steps Policy Saved  530
Epoch #106: test_reward: -22.309251 ± 5.619907, best_reward: -19.561938 ± 5.183604 in #18


Epoch #107: 25001it [00:37, 666.12it/s, agent_0/loss=0.118, agent_1/loss=2.800, agent_2/loss=0.835, env_step=2675000, len=75, n/ep=80, n/st=6250, rew=-22.68]                           


Epoch #107: test_reward: -23.946846 ± 6.704443, best_reward: -19.561938 ± 5.183604 in #18


Epoch #108: 25001it [00:37, 672.95it/s, agent_0/loss=0.109, agent_1/loss=2.752, agent_2/loss=0.829, env_step=2700000, len=75, n/ep=90, n/st=6250, rew=-23.56]                           


Steps Policy Saved  540
Epoch #108: test_reward: -21.884476 ± 5.328155, best_reward: -19.561938 ± 5.183604 in #18


Epoch #109: 25001it [00:37, 666.14it/s, agent_0/loss=0.115, agent_1/loss=2.610, agent_2/loss=0.814, env_step=2725000, len=75, n/ep=80, n/st=6250, rew=-21.95]                           


Epoch #109: test_reward: -22.899843 ± 6.067396, best_reward: -19.561938 ± 5.183604 in #18


Epoch #110: 25001it [00:38, 655.61it/s, agent_0/loss=0.118, agent_1/loss=2.811, agent_2/loss=0.824, env_step=2750000, len=75, n/ep=80, n/st=6250, rew=-23.79]                           


Steps Policy Saved  550
Epoch #110: test_reward: -23.386083 ± 5.861976, best_reward: -19.561938 ± 5.183604 in #18


Epoch #111: 25001it [00:36, 676.15it/s, agent_0/loss=0.118, agent_1/loss=2.724, agent_2/loss=0.815, env_step=2775000, len=75, n/ep=90, n/st=6250, rew=-21.62]                           


Epoch #111: test_reward: -21.753446 ± 6.331835, best_reward: -19.561938 ± 5.183604 in #18


Epoch #112: 25001it [00:37, 668.69it/s, agent_0/loss=0.119, agent_1/loss=2.959, agent_2/loss=0.862, env_step=2800000, len=75, n/ep=80, n/st=6250, rew=-22.76]                           


Steps Policy Saved  560
Epoch #112: test_reward: -22.294198 ± 6.491027, best_reward: -19.561938 ± 5.183604 in #18


Epoch #113: 25001it [00:37, 663.43it/s, agent_0/loss=0.122, agent_1/loss=2.925, agent_2/loss=0.828, env_step=2825000, len=75, n/ep=80, n/st=6250, rew=-23.10]                           


Epoch #113: test_reward: -21.493673 ± 6.099589, best_reward: -19.561938 ± 5.183604 in #18


Epoch #114: 25001it [00:37, 673.76it/s, agent_0/loss=0.116, agent_1/loss=2.984, agent_2/loss=0.824, env_step=2850000, len=75, n/ep=90, n/st=6250, rew=-21.57]                           


Steps Policy Saved  570
Epoch #114: test_reward: -22.953397 ± 5.744012, best_reward: -19.561938 ± 5.183604 in #18


Epoch #115: 25001it [00:36, 675.95it/s, agent_0/loss=0.112, agent_1/loss=3.140, agent_2/loss=0.834, env_step=2875000, len=75, n/ep=80, n/st=6250, rew=-22.55]                           


Epoch #115: test_reward: -23.154501 ± 4.862374, best_reward: -19.561938 ± 5.183604 in #18


Epoch #116: 25001it [00:36, 687.53it/s, agent_0/loss=0.115, agent_1/loss=3.324, agent_2/loss=0.841, env_step=2900000, len=75, n/ep=80, n/st=6250, rew=-22.46]                            


Steps Policy Saved  580
Epoch #116: test_reward: -22.136703 ± 6.888936, best_reward: -19.561938 ± 5.183604 in #18


Epoch #117: 25001it [00:37, 671.63it/s, agent_0/loss=0.116, agent_1/loss=3.292, agent_2/loss=0.811, env_step=2925000, len=75, n/ep=90, n/st=6250, rew=-22.74]                           


Epoch #117: test_reward: -19.935038 ± 5.200613, best_reward: -19.561938 ± 5.183604 in #18


Epoch #118: 25001it [00:37, 670.18it/s, agent_0/loss=0.111, agent_1/loss=3.542, agent_2/loss=0.831, env_step=2950000, len=75, n/ep=80, n/st=6250, rew=-21.98]                           


Steps Policy Saved  590
Epoch #118: test_reward: -23.818289 ± 5.820777, best_reward: -19.561938 ± 5.183604 in #18


Epoch #119: 25001it [00:37, 671.58it/s, agent_0/loss=0.112, agent_1/loss=3.623, agent_2/loss=0.853, env_step=2975000, len=75, n/ep=80, n/st=6250, rew=-21.60]                           


Epoch #119: test_reward: -22.318479 ± 5.303616, best_reward: -19.561938 ± 5.183604 in #18


Epoch #120: 25001it [00:37, 672.80it/s, agent_0/loss=0.115, agent_1/loss=3.819, agent_2/loss=0.871, env_step=3000000, len=75, n/ep=90, n/st=6250, rew=-21.33]                           


Steps Policy Saved  600
Epoch #120: test_reward: -22.230720 ± 5.117157, best_reward: -19.561938 ± 5.183604 in #18


Epoch #121: 25001it [00:36, 678.52it/s, agent_0/loss=0.118, agent_1/loss=4.298, agent_2/loss=0.877, env_step=3025000, len=75, n/ep=80, n/st=6250, rew=-22.69]                           


Epoch #121: test_reward: -20.376609 ± 5.047527, best_reward: -19.561938 ± 5.183604 in #18


Epoch #122: 25001it [00:36, 677.50it/s, agent_0/loss=0.109, agent_1/loss=4.389, agent_2/loss=0.876, env_step=3050000, len=75, n/ep=80, n/st=6250, rew=-23.08]                            


Steps Policy Saved  610
Epoch #122: test_reward: -22.170391 ± 4.954196, best_reward: -19.561938 ± 5.183604 in #18


Epoch #123: 25001it [00:37, 668.10it/s, agent_0/loss=0.112, agent_1/loss=4.632, agent_2/loss=0.889, env_step=3075000, len=75, n/ep=90, n/st=6250, rew=-20.48]                           


Epoch #123: test_reward: -20.643321 ± 5.237300, best_reward: -19.561938 ± 5.183604 in #18


Epoch #124: 25001it [00:37, 669.28it/s, agent_0/loss=0.119, agent_1/loss=4.592, agent_2/loss=0.918, env_step=3100000, len=75, n/ep=80, n/st=6250, rew=-22.51]                           


Steps Policy Saved  620
Epoch #124: test_reward: -22.348237 ± 5.507128, best_reward: -19.561938 ± 5.183604 in #18


Epoch #125: 25001it [00:37, 668.32it/s, agent_0/loss=0.109, agent_1/loss=4.808, agent_2/loss=0.874, env_step=3125000, len=75, n/ep=80, n/st=6250, rew=-23.09]                           


Epoch #125: test_reward: -23.678939 ± 5.836737, best_reward: -19.561938 ± 5.183604 in #18


Epoch #126: 25001it [00:37, 671.10it/s, agent_0/loss=0.110, agent_1/loss=4.853, agent_2/loss=0.946, env_step=3150000, len=75, n/ep=90, n/st=6250, rew=-21.92]                           


Steps Policy Saved  630
Epoch #126: test_reward: -21.120788 ± 5.424585, best_reward: -19.561938 ± 5.183604 in #18


Epoch #127: 25001it [00:37, 670.76it/s, agent_0/loss=0.114, agent_1/loss=4.637, agent_2/loss=0.976, env_step=3175000, len=75, n/ep=80, n/st=6250, rew=-22.45]                           


Epoch #127: test_reward: -23.290054 ± 5.004513, best_reward: -19.561938 ± 5.183604 in #18


Epoch #128: 25001it [00:36, 683.80it/s, agent_0/loss=0.111, agent_1/loss=4.847, agent_2/loss=0.948, env_step=3200000, len=75, n/ep=80, n/st=6250, rew=-21.62]                            


Steps Policy Saved  640
Epoch #128: test_reward: -23.850110 ± 5.684213, best_reward: -19.561938 ± 5.183604 in #18


Epoch #129: 25001it [00:37, 675.05it/s, agent_0/loss=0.116, agent_1/loss=4.887, agent_2/loss=0.921, env_step=3225000, len=75, n/ep=90, n/st=6250, rew=-23.20]                            


Epoch #129: test_reward: -23.583745 ± 6.246853, best_reward: -19.561938 ± 5.183604 in #18


Epoch #130: 25001it [00:37, 659.83it/s, agent_0/loss=0.118, agent_1/loss=5.366, agent_2/loss=1.004, env_step=3250000, len=75, n/ep=80, n/st=6250, rew=-22.87]                           


Steps Policy Saved  650
Epoch #130: test_reward: -21.162242 ± 4.912563, best_reward: -19.561938 ± 5.183604 in #18


Epoch #131: 25001it [00:37, 665.10it/s, agent_0/loss=0.117, agent_1/loss=4.884, agent_2/loss=0.982, env_step=3275000, len=75, n/ep=80, n/st=6250, rew=-23.41]                           


Epoch #131: test_reward: -23.893773 ± 5.526943, best_reward: -19.561938 ± 5.183604 in #18


Epoch #132: 25001it [00:37, 674.86it/s, agent_0/loss=0.113, agent_1/loss=4.861, agent_2/loss=1.016, env_step=3300000, len=75, n/ep=90, n/st=6250, rew=-22.96]                           


Steps Policy Saved  660
Epoch #132: test_reward: -20.926925 ± 5.491453, best_reward: -19.561938 ± 5.183604 in #18


Epoch #133: 25001it [00:36, 675.92it/s, agent_0/loss=0.117, agent_1/loss=4.754, agent_2/loss=1.070, env_step=3325000, len=75, n/ep=80, n/st=6250, rew=-22.46]                           


Epoch #133: test_reward: -20.994067 ± 5.556536, best_reward: -19.561938 ± 5.183604 in #18


Epoch #134: 25001it [00:36, 677.83it/s, agent_0/loss=0.123, agent_1/loss=5.196, agent_2/loss=1.057, env_step=3350000, len=75, n/ep=80, n/st=6250, rew=-23.39]                            


Steps Policy Saved  670
Epoch #134: test_reward: -21.783523 ± 5.895973, best_reward: -19.561938 ± 5.183604 in #18


Epoch #135: 25001it [00:37, 675.26it/s, agent_0/loss=0.124, agent_1/loss=5.191, agent_2/loss=1.146, env_step=3375000, len=75, n/ep=90, n/st=6250, rew=-22.64]                           


Epoch #135: test_reward: -22.322557 ± 6.427406, best_reward: -19.561938 ± 5.183604 in #18


Epoch #136: 25001it [00:36, 679.28it/s, agent_0/loss=0.122, agent_1/loss=5.061, agent_2/loss=1.168, env_step=3400000, len=75, n/ep=80, n/st=6250, rew=-23.41]                           


Steps Policy Saved  680
Epoch #136: test_reward: -22.151269 ± 6.290540, best_reward: -19.561938 ± 5.183604 in #18


Epoch #137: 25001it [00:37, 666.12it/s, agent_0/loss=0.123, agent_1/loss=5.193, agent_2/loss=1.192, env_step=3425000, len=75, n/ep=80, n/st=6250, rew=-23.05]                           


Epoch #137: test_reward: -21.940273 ± 5.758359, best_reward: -19.561938 ± 5.183604 in #18


Epoch #138: 25001it [00:37, 673.77it/s, agent_0/loss=0.126, agent_1/loss=5.287, agent_2/loss=1.290, env_step=3450000, len=75, n/ep=90, n/st=6250, rew=-24.75]                           


Steps Policy Saved  690
Epoch #138: test_reward: -22.822360 ± 5.012565, best_reward: -19.561938 ± 5.183604 in #18


Epoch #139: 25001it [00:37, 675.55it/s, agent_0/loss=0.131, agent_1/loss=5.323, agent_2/loss=1.329, env_step=3475000, len=75, n/ep=80, n/st=6250, rew=-22.86]                           


Epoch #139: test_reward: -22.461543 ± 6.074702, best_reward: -19.561938 ± 5.183604 in #18


Epoch #140: 25001it [00:36, 676.62it/s, agent_0/loss=0.125, agent_1/loss=5.403, agent_2/loss=1.376, env_step=3500000, len=75, n/ep=80, n/st=6250, rew=-22.56]                            


Steps Policy Saved  700
Epoch #140: test_reward: -22.497828 ± 4.818070, best_reward: -19.561938 ± 5.183604 in #18


Epoch #141: 25001it [00:37, 669.65it/s, agent_0/loss=0.122, agent_1/loss=5.045, agent_2/loss=1.318, env_step=3525000, len=75, n/ep=90, n/st=6250, rew=-23.12]                           


Epoch #141: test_reward: -23.281082 ± 7.269137, best_reward: -19.561938 ± 5.183604 in #18


Epoch #142: 25001it [00:37, 668.88it/s, agent_0/loss=0.128, agent_1/loss=4.839, agent_2/loss=1.323, env_step=3550000, len=75, n/ep=80, n/st=6250, rew=-23.11]                           


Steps Policy Saved  710
Epoch #142: test_reward: -22.800541 ± 5.412067, best_reward: -19.561938 ± 5.183604 in #18


Epoch #143: 25001it [00:36, 685.22it/s, agent_0/loss=0.129, agent_1/loss=4.855, agent_2/loss=1.357, env_step=3575000, len=75, n/ep=80, n/st=6250, rew=-22.76]                           


Epoch #143: test_reward: -21.215246 ± 5.459678, best_reward: -19.561938 ± 5.183604 in #18


Epoch #144: 25001it [00:37, 675.61it/s, agent_0/loss=0.121, agent_1/loss=5.068, agent_2/loss=1.295, env_step=3600000, len=75, n/ep=90, n/st=6250, rew=-22.92]                            


Steps Policy Saved  720
Epoch #144: test_reward: -23.257903 ± 5.418807, best_reward: -19.561938 ± 5.183604 in #18


Epoch #145: 25001it [00:36, 678.07it/s, agent_0/loss=0.115, agent_1/loss=5.152, agent_2/loss=1.354, env_step=3625000, len=75, n/ep=80, n/st=6250, rew=-22.39]                           


Epoch #145: test_reward: -21.788221 ± 5.058685, best_reward: -19.561938 ± 5.183604 in #18


Epoch #146: 25001it [00:37, 668.35it/s, agent_0/loss=0.126, agent_1/loss=5.308, agent_2/loss=1.268, env_step=3650000, len=75, n/ep=80, n/st=6250, rew=-21.73]                           


Steps Policy Saved  730
Epoch #146: test_reward: -22.973689 ± 5.846022, best_reward: -19.561938 ± 5.183604 in #18


Epoch #147: 25001it [00:37, 673.59it/s, agent_0/loss=0.122, agent_1/loss=5.149, agent_2/loss=1.302, env_step=3675000, len=75, n/ep=90, n/st=6250, rew=-22.67]                           


Epoch #147: test_reward: -22.716581 ± 5.952240, best_reward: -19.561938 ± 5.183604 in #18


Epoch #148: 25001it [00:37, 671.78it/s, agent_0/loss=0.123, agent_1/loss=5.382, agent_2/loss=1.296, env_step=3700000, len=75, n/ep=80, n/st=6250, rew=-22.70]                           


Steps Policy Saved  740
Epoch #148: test_reward: -23.956130 ± 6.340063, best_reward: -19.561938 ± 5.183604 in #18


Epoch #149: 25001it [00:37, 664.68it/s, agent_0/loss=0.121, agent_1/loss=5.159, agent_2/loss=1.343, env_step=3725000, len=75, n/ep=80, n/st=6250, rew=-22.84]                           


Epoch #149: test_reward: -23.175443 ± 5.645475, best_reward: -19.561938 ± 5.183604 in #18


Epoch #150: 25001it [00:37, 665.75it/s, agent_0/loss=0.120, agent_1/loss=5.501, agent_2/loss=1.281, env_step=3750000, len=75, n/ep=90, n/st=6250, rew=-24.36]                           


Steps Policy Saved  750
Epoch #150: test_reward: -24.431230 ± 8.080755, best_reward: -19.561938 ± 5.183604 in #18


Epoch #151: 25001it [00:38, 655.57it/s, agent_0/loss=0.128, agent_1/loss=5.537, agent_2/loss=1.351, env_step=3775000, len=75, n/ep=80, n/st=6250, rew=-23.28]                           


Epoch #151: test_reward: -22.569840 ± 6.104083, best_reward: -19.561938 ± 5.183604 in #18


Epoch #152: 25001it [00:37, 662.65it/s, agent_0/loss=0.125, agent_1/loss=5.635, agent_2/loss=1.438, env_step=3800000, len=75, n/ep=80, n/st=6250, rew=-22.50]                           


Steps Policy Saved  760
Epoch #152: test_reward: -22.759193 ± 6.747307, best_reward: -19.561938 ± 5.183604 in #18


Epoch #153: 25001it [00:37, 665.19it/s, agent_0/loss=0.137, agent_1/loss=6.482, agent_2/loss=1.343, env_step=3825000, len=75, n/ep=90, n/st=6250, rew=-22.57]                           


Epoch #153: test_reward: -23.085709 ± 5.100723, best_reward: -19.561938 ± 5.183604 in #18


Epoch #154: 25001it [00:37, 666.79it/s, agent_0/loss=0.129, agent_1/loss=6.545, agent_2/loss=1.352, env_step=3850000, len=75, n/ep=80, n/st=6250, rew=-22.38]                           


Steps Policy Saved  770
Epoch #154: test_reward: -22.523286 ± 5.477719, best_reward: -19.561938 ± 5.183604 in #18


Epoch #155: 25001it [00:36, 676.75it/s, agent_0/loss=0.130, agent_1/loss=6.460, agent_2/loss=1.356, env_step=3875000, len=75, n/ep=80, n/st=6250, rew=-24.05]                           


Epoch #155: test_reward: -22.235737 ± 4.646235, best_reward: -19.561938 ± 5.183604 in #18


Epoch #156: 25001it [00:36, 681.99it/s, agent_0/loss=0.128, agent_1/loss=6.687, agent_2/loss=1.248, env_step=3900000, len=75, n/ep=90, n/st=6250, rew=-24.23]                            


Steps Policy Saved  780
Epoch #156: test_reward: -23.160760 ± 5.670305, best_reward: -19.561938 ± 5.183604 in #18


Epoch #157: 25001it [00:37, 670.46it/s, agent_0/loss=0.129, agent_1/loss=6.930, agent_2/loss=1.312, env_step=3925000, len=75, n/ep=80, n/st=6250, rew=-23.86]                           


Epoch #157: test_reward: -22.360173 ± 6.459101, best_reward: -19.561938 ± 5.183604 in #18


Epoch #158: 25001it [00:36, 682.03it/s, agent_0/loss=0.132, agent_1/loss=7.079, agent_2/loss=1.379, env_step=3950000, len=75, n/ep=80, n/st=6250, rew=-22.20]                            


Steps Policy Saved  790
Epoch #158: test_reward: -21.011265 ± 5.462896, best_reward: -19.561938 ± 5.183604 in #18


Epoch #159: 25001it [00:36, 683.85it/s, agent_0/loss=0.129, agent_1/loss=7.193, agent_2/loss=1.309, env_step=3975000, len=75, n/ep=90, n/st=6250, rew=-21.60]                           


Epoch #159: test_reward: -22.224223 ± 5.991949, best_reward: -19.561938 ± 5.183604 in #18


Epoch #160: 25001it [00:36, 676.66it/s, agent_0/loss=0.131, agent_1/loss=7.228, agent_2/loss=1.304, env_step=4000000, len=75, n/ep=80, n/st=6250, rew=-23.89]                           


Steps Policy Saved  800
Epoch #160: test_reward: -22.825863 ± 6.544934, best_reward: -19.561938 ± 5.183604 in #18


Epoch #161: 25001it [00:36, 678.04it/s, agent_0/loss=0.126, agent_1/loss=7.111, agent_2/loss=1.236, env_step=4025000, len=75, n/ep=80, n/st=6250, rew=-24.23]                           


Epoch #161: test_reward: -24.218172 ± 5.898534, best_reward: -19.561938 ± 5.183604 in #18


Epoch #162: 25001it [00:37, 675.24it/s, agent_0/loss=0.129, agent_1/loss=7.565, agent_2/loss=1.175, env_step=4050000, len=75, n/ep=90, n/st=6250, rew=-22.40]                           


Steps Policy Saved  810
Epoch #162: test_reward: -22.590332 ± 5.523637, best_reward: -19.561938 ± 5.183604 in #18


Epoch #163: 25001it [00:37, 675.70it/s, agent_0/loss=0.125, agent_1/loss=7.486, agent_2/loss=1.178, env_step=4075000, len=75, n/ep=80, n/st=6250, rew=-22.74]                           


Epoch #163: test_reward: -21.645664 ± 5.494630, best_reward: -19.561938 ± 5.183604 in #18


Epoch #164: 25001it [00:37, 666.00it/s, agent_0/loss=0.121, agent_1/loss=7.660, agent_2/loss=1.218, env_step=4100000, len=75, n/ep=80, n/st=6250, rew=-22.72]                           


Steps Policy Saved  820
Epoch #164: test_reward: -23.130785 ± 5.564605, best_reward: -19.561938 ± 5.183604 in #18


Epoch #165: 25001it [00:38, 657.00it/s, agent_0/loss=0.116, agent_1/loss=7.315, agent_2/loss=1.221, env_step=4125000, len=75, n/ep=90, n/st=6250, rew=-22.81]                           


Epoch #165: test_reward: -22.892066 ± 5.645934, best_reward: -19.561938 ± 5.183604 in #18


Epoch #166: 25001it [00:37, 669.55it/s, agent_0/loss=0.118, agent_1/loss=7.445, agent_2/loss=1.201, env_step=4150000, len=75, n/ep=80, n/st=6250, rew=-22.53]                           


Steps Policy Saved  830
Epoch #166: test_reward: -21.178503 ± 5.452819, best_reward: -19.561938 ± 5.183604 in #18


Epoch #167: 25001it [00:37, 667.23it/s, agent_0/loss=0.121, agent_1/loss=7.645, agent_2/loss=1.253, env_step=4175000, len=75, n/ep=80, n/st=6250, rew=-23.96]                           


Epoch #167: test_reward: -24.061956 ± 6.822286, best_reward: -19.561938 ± 5.183604 in #18


Epoch #168: 25001it [00:37, 660.24it/s, agent_0/loss=0.118, agent_1/loss=7.298, agent_2/loss=1.208, env_step=4200000, len=75, n/ep=90, n/st=6250, rew=-21.69]                           


Steps Policy Saved  840
Epoch #168: test_reward: -22.813340 ± 5.929744, best_reward: -19.561938 ± 5.183604 in #18


Epoch #169: 25001it [00:34, 734.18it/s, agent_0/loss=0.119, agent_1/loss=6.682, agent_2/loss=1.234, env_step=4225000, len=75, n/ep=80, n/st=6250, rew=-22.69]                            


Epoch #169: test_reward: -23.143089 ± 5.675607, best_reward: -19.561938 ± 5.183604 in #18


Epoch #170: 25001it [00:35, 696.96it/s, agent_0/loss=0.116, agent_1/loss=6.550, agent_2/loss=1.291, env_step=4250000, len=75, n/ep=80, n/st=6250, rew=-23.14]                           


Steps Policy Saved  850
Epoch #170: test_reward: -24.282327 ± 6.039897, best_reward: -19.561938 ± 5.183604 in #18


Epoch #171: 25001it [00:34, 714.61it/s, agent_0/loss=0.124, agent_1/loss=6.653, agent_2/loss=1.325, env_step=4275000, len=75, n/ep=90, n/st=6250, rew=-23.20]                           


Epoch #171: test_reward: -23.277258 ± 6.548444, best_reward: -19.561938 ± 5.183604 in #18


Epoch #172: 25001it [00:35, 699.63it/s, agent_0/loss=0.113, agent_1/loss=5.995, agent_2/loss=1.352, env_step=4300000, len=75, n/ep=80, n/st=6250, rew=-22.56]                            


Steps Policy Saved  860
Epoch #172: test_reward: -23.023154 ± 7.765280, best_reward: -19.561938 ± 5.183604 in #18


Epoch #173: 25001it [00:36, 678.20it/s, agent_0/loss=0.111, agent_1/loss=6.150, agent_2/loss=1.367, env_step=4325000, len=75, n/ep=80, n/st=6250, rew=-22.15]                           


Epoch #173: test_reward: -22.331586 ± 5.741615, best_reward: -19.561938 ± 5.183604 in #18


Epoch #174: 25001it [00:36, 681.47it/s, agent_0/loss=0.115, agent_1/loss=5.975, agent_2/loss=1.481, env_step=4350000, len=75, n/ep=90, n/st=6250, rew=-23.53]                           


Steps Policy Saved  870
Epoch #174: test_reward: -23.539597 ± 6.763303, best_reward: -19.561938 ± 5.183604 in #18


Epoch #175: 25001it [00:36, 676.85it/s, agent_0/loss=0.121, agent_1/loss=5.685, agent_2/loss=1.541, env_step=4375000, len=75, n/ep=80, n/st=6250, rew=-23.51]                           


Epoch #175: test_reward: -23.847809 ± 5.551110, best_reward: -19.561938 ± 5.183604 in #18


Epoch #176: 25001it [00:36, 689.69it/s, agent_0/loss=0.121, agent_1/loss=5.481, agent_2/loss=1.613, env_step=4400000, len=75, n/ep=80, n/st=6250, rew=-23.63]                           


Steps Policy Saved  880
Epoch #176: test_reward: -21.343905 ± 5.167458, best_reward: -19.561938 ± 5.183604 in #18


Epoch #177: 25001it [00:36, 680.02it/s, agent_0/loss=0.119, agent_1/loss=5.309, agent_2/loss=1.754, env_step=4425000, len=75, n/ep=90, n/st=6250, rew=-24.16]                           


Epoch #177: test_reward: -24.802887 ± 7.153807, best_reward: -19.561938 ± 5.183604 in #18


Epoch #178: 25001it [00:36, 679.31it/s, agent_0/loss=0.128, agent_1/loss=5.465, agent_2/loss=1.847, env_step=4450000, len=75, n/ep=80, n/st=6250, rew=-25.10]                           


Steps Policy Saved  890
Epoch #178: test_reward: -22.948554 ± 6.224735, best_reward: -19.561938 ± 5.183604 in #18


Epoch #179: 25001it [00:36, 679.42it/s, agent_0/loss=0.123, agent_1/loss=5.412, agent_2/loss=1.847, env_step=4475000, len=75, n/ep=80, n/st=6250, rew=-24.61]                           


Epoch #179: test_reward: -23.665119 ± 5.830498, best_reward: -19.561938 ± 5.183604 in #18


Epoch #180: 25001it [00:36, 678.98it/s, agent_0/loss=0.125, agent_1/loss=5.372, agent_2/loss=1.849, env_step=4500000, len=75, n/ep=90, n/st=6250, rew=-24.22]                           


Steps Policy Saved  900
Epoch #180: test_reward: -22.156512 ± 4.930647, best_reward: -19.561938 ± 5.183604 in #18


Epoch #181: 25001it [00:35, 709.15it/s, agent_0/loss=0.126, agent_1/loss=5.097, agent_2/loss=2.011, env_step=4525000, len=75, n/ep=80, n/st=6250, rew=-23.76]                           


Epoch #181: test_reward: -22.908979 ± 5.341692, best_reward: -19.561938 ± 5.183604 in #18


Epoch #182: 25001it [00:34, 720.48it/s, agent_0/loss=0.123, agent_1/loss=5.253, agent_2/loss=2.050, env_step=4550000, len=75, n/ep=80, n/st=6250, rew=-22.44]                            


Steps Policy Saved  910
Epoch #182: test_reward: -21.590033 ± 5.640022, best_reward: -19.561938 ± 5.183604 in #18


Epoch #183: 25001it [00:34, 726.00it/s, agent_0/loss=0.125, agent_1/loss=5.377, agent_2/loss=2.224, env_step=4575000, len=75, n/ep=90, n/st=6250, rew=-21.92]                            


Epoch #183: test_reward: -22.638071 ± 5.079784, best_reward: -19.561938 ± 5.183604 in #18


Epoch #184: 25001it [00:34, 718.80it/s, agent_0/loss=0.128, agent_1/loss=5.154, agent_2/loss=2.182, env_step=4600000, len=75, n/ep=80, n/st=6250, rew=-22.04]                           


Steps Policy Saved  920
Epoch #184: test_reward: -22.356080 ± 5.859852, best_reward: -19.561938 ± 5.183604 in #18


Epoch #185: 25001it [00:33, 743.47it/s, agent_0/loss=0.125, agent_1/loss=4.994, agent_2/loss=2.106, env_step=4625000, len=75, n/ep=80, n/st=6250, rew=-23.19]                            


Epoch #185: test_reward: -23.394141 ± 8.197020, best_reward: -19.561938 ± 5.183604 in #18


Epoch #186: 25001it [00:36, 677.20it/s, agent_0/loss=0.123, agent_1/loss=5.066, agent_2/loss=2.400, env_step=4650000, len=75, n/ep=90, n/st=6250, rew=-24.24]                           


Steps Policy Saved  930
Epoch #186: test_reward: -22.561078 ± 5.936533, best_reward: -19.561938 ± 5.183604 in #18


Epoch #187: 25001it [00:35, 708.58it/s, agent_0/loss=0.120, agent_1/loss=5.038, agent_2/loss=2.480, env_step=4675000, len=75, n/ep=80, n/st=6250, rew=-21.59]                           


Epoch #187: test_reward: -22.354056 ± 5.188515, best_reward: -19.561938 ± 5.183604 in #18


Epoch #188: 25001it [00:33, 753.24it/s, agent_0/loss=0.128, agent_1/loss=4.859, agent_2/loss=2.640, env_step=4700000, len=75, n/ep=80, n/st=6250, rew=-23.25]                            


Steps Policy Saved  940
Epoch #188: test_reward: -22.272031 ± 6.582320, best_reward: -19.561938 ± 5.183604 in #18


Epoch #189: 25001it [00:32, 761.78it/s, agent_0/loss=0.122, agent_1/loss=4.770, agent_2/loss=2.707, env_step=4725000, len=75, n/ep=90, n/st=6250, rew=-22.39]                            


Epoch #189: test_reward: -23.494139 ± 5.575799, best_reward: -19.561938 ± 5.183604 in #18


Epoch #190: 25001it [00:34, 714.63it/s, agent_0/loss=0.119, agent_1/loss=4.757, agent_2/loss=2.936, env_step=4750000, len=75, n/ep=80, n/st=6250, rew=-23.31]                            


Steps Policy Saved  950
Epoch #190: test_reward: -23.258517 ± 5.620782, best_reward: -19.561938 ± 5.183604 in #18


Epoch #191: 25001it [00:35, 701.74it/s, agent_0/loss=0.116, agent_1/loss=4.910, agent_2/loss=2.830, env_step=4775000, len=75, n/ep=80, n/st=6250, rew=-22.71]                           


Epoch #191: test_reward: -23.487002 ± 5.705182, best_reward: -19.561938 ± 5.183604 in #18


Epoch #192: 25001it [00:35, 708.93it/s, agent_0/loss=0.125, agent_1/loss=4.770, agent_2/loss=2.777, env_step=4800000, len=75, n/ep=90, n/st=6250, rew=-22.91]                           


Steps Policy Saved  960
Epoch #192: test_reward: -21.173844 ± 4.548594, best_reward: -19.561938 ± 5.183604 in #18


Epoch #193: 25001it [00:35, 698.90it/s, agent_0/loss=0.123, agent_1/loss=5.052, agent_2/loss=2.841, env_step=4825000, len=75, n/ep=80, n/st=6250, rew=-22.65]                           


Epoch #193: test_reward: -23.227503 ± 6.364078, best_reward: -19.561938 ± 5.183604 in #18


Epoch #194: 25001it [00:36, 691.10it/s, agent_0/loss=0.118, agent_1/loss=4.575, agent_2/loss=2.763, env_step=4850000, len=75, n/ep=80, n/st=6250, rew=-22.70]                           


Steps Policy Saved  970
Epoch #194: test_reward: -23.013107 ± 5.150618, best_reward: -19.561938 ± 5.183604 in #18


Epoch #195: 25001it [00:35, 698.75it/s, agent_0/loss=0.119, agent_1/loss=4.977, agent_2/loss=2.674, env_step=4875000, len=75, n/ep=90, n/st=6250, rew=-22.52]                           


Epoch #195: test_reward: -23.121834 ± 5.327790, best_reward: -19.561938 ± 5.183604 in #18


Epoch #196: 25001it [00:36, 692.41it/s, agent_0/loss=0.125, agent_1/loss=4.853, agent_2/loss=2.784, env_step=4900000, len=75, n/ep=80, n/st=6250, rew=-23.61]                           


Steps Policy Saved  980
Epoch #196: test_reward: -22.511536 ± 5.108747, best_reward: -19.561938 ± 5.183604 in #18


Epoch #197: 25001it [00:35, 698.48it/s, agent_0/loss=0.123, agent_1/loss=4.440, agent_2/loss=2.851, env_step=4925000, len=75, n/ep=80, n/st=6250, rew=-23.73]                           


Epoch #197: test_reward: -23.071806 ± 5.645987, best_reward: -19.561938 ± 5.183604 in #18


Epoch #198: 25001it [00:35, 696.19it/s, agent_0/loss=0.116, agent_1/loss=4.821, agent_2/loss=2.920, env_step=4950000, len=75, n/ep=90, n/st=6250, rew=-23.51]                           


Steps Policy Saved  990
Epoch #198: test_reward: -21.721226 ± 5.458633, best_reward: -19.561938 ± 5.183604 in #18


Epoch #199: 25001it [00:32, 758.14it/s, agent_0/loss=0.115, agent_1/loss=4.304, agent_2/loss=2.848, env_step=4975000, len=75, n/ep=80, n/st=6250, rew=-22.76]                            


Epoch #199: test_reward: -21.405403 ± 5.999276, best_reward: -19.561938 ± 5.183604 in #18


Epoch #200: 25001it [00:35, 707.83it/s, agent_0/loss=0.120, agent_1/loss=5.008, agent_2/loss=2.681, env_step=5000000, len=75, n/ep=80, n/st=6250, rew=-22.40]                            


Steps Policy Saved  1000
Epoch #200: test_reward: -22.016923 ± 4.210466, best_reward: -19.561938 ± 5.183604 in #18

{'duration': '7746.96s', 'train_time/model': '5057.65s', 'test_step': 753750, 'test_episode': 10050, 'test_time': '352.16s', 'test_speed': '2140.39 step/s', 'best_reward': -19.561938321409354, 'best_result': '-19.56 ± 5.18', 'train_step': 5000000, 'train_episode': 66660, 'train_time/collector': '2337.16s', 'train_speed': '676.15 step/s'}

(the trained policy can be accessed via policy.policies[agents[0]])
