In [1]:
from email import utils
import os
import datetime
from typing import Optional, Tuple
import json


os.environ["WANDB_NOTEBOOK_NAME"] = "Tianshow_Centralized_Training"

import numpy as np
import torch
from tianshou.data import Collector, VectorReplayBuffer, PrioritizedVectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
# from tianshou.env.pettingzoo_env_parallel import PettingZooParallelEnv

from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy, RainbowPolicy
from tianshou.trainer import OffpolicyTrainer
from torch.utils.tensorboard import SummaryWriter

# from pettingzoo.sisl import pursuit_v4
from pettingzoo.mpe import simple_spread_v3
import Mods.TaskSpreadEnv as TaskSpreadEnv

from TaskAllocation.RL_Policies.DNN_Spread import DNN_Spread
from TaskAllocation.RL_Policies.MPE_Task_MultiHead import MPE_Task_MultiHead

#import Mods.TaskPursuitEnv as TaskPursuitEnv
import Mods.ActionLoggerWrapper as ActionLoggerWrapper
# import Mods.VDNPolicy as VDNPolicy
# import Mods.PettingZooParallelEnv2 as PettingZooParallelEnv2
# import Mods.CollectorMA as CollectorMA

from TaskAllocation.RL_Policies.Custom_Classes import CustomNet
from TaskAllocation.RL_Policies.Custom_Classes import CustomCollector
from TaskAllocation.RL_Policies.Custom_Classes import CustomParallelToAECWrapper

# Add specific modification to tianshou
import wandb
from tianshou.utils import WandbLogger
from tianshou.utils.logger.base import LOG_DATA_TYPE

def new_write(self, step_type: str, step: int, data: LOG_DATA_TYPE) -> None:
    """Replacement for ``WandbLogger.write`` that logs straight to wandb.

    The step counter is sent as an ordinary metric under the key
    ``step_type`` together with the payload, in a single ``wandb.log`` call.

    Args:
        step_type: Name of the step counter; used as the metric key.
        step: Current value of that counter.
        data: Metrics to log. The mapping is not modified.
    """
    # Log a merged copy so the caller's dict is not polluted with the step key.
    wandb.log({**data, step_type: step})


# Monkey-patch: every WandbLogger instance created afterwards uses new_write.
WandbLogger.write = new_write 

from pettingzoo.utils import wrappers
import gym

class ActionLoggerWrapper(gym.Wrapper):
    """Gym wrapper that records actions and logs them to wandb as a histogram.

    Every action passed to ``step`` is stored. On ``reset`` the stored actions
    are flattened, turned into a histogram, logged under the key
    ``"actions_histogram"`` and then discarded.

    NOTE(review): this class has the same name as the module imported earlier
    in the file (``import Mods.ActionLoggerWrapper as ActionLoggerWrapper``)
    and therefore rebinds that name to this class.
    """

    def __init__(self, env):
        super().__init__(env)
        # Actions seen since the last reset.
        self.actions = []

    def step(self, action):
        """Record ``action`` and forward it to the wrapped environment."""
        self.actions.append(action)
        return self.env.step(action)

    def reset(self, **kwargs):
        """Log the recorded actions (if any), clear them, and reset the env."""
        if self.actions:
            self._log_action_histogram()
            self.actions = []
        return self.env.reset(**kwargs)

    def _log_action_histogram(self):
        """Best-effort logging of the recorded actions as a wandb histogram."""
        import warnings

        # ``None`` entries cannot be binned, so they are skipped
        # (presumably placeholders for finished agents; verify against caller).
        recorded = [np.array(a).flatten() for a in self.actions if a is not None]
        if not recorded:
            return

        try:
            flattened_actions = np.concatenate(recorded)
            hist_data, bin_edges = np.histogram(flattened_actions, bins='auto')
            wandb.log({"actions_histogram": wandb.Histogram(np_histogram=(hist_data, bin_edges))})
        except Exception as exc:
            # Logging must never break an episode reset, but a failure should
            # be visible instead of being silently discarded.
            warnings.warn(f"ActionLoggerWrapper: could not log action histogram: {exc!r}")


#from tianshou_DQN import train
# ---------------------------------------------------------------------------
# Run configuration (module-level settings read by _get_agents, _get_env and
# the training cell).
# ---------------------------------------------------------------------------

# Network architecture; _get_agents handles "DNN_Spread", "VDN_Spread" and
# "MPE_Task_MultiHead".
model = "MPE_Task_MultiHead"
# Free-form tag appended to the model name to identify this experiment.
test_num = "_Desk_01_4feat_land_alli"
# Policy type; _get_agents handles "DQN" and "VDN".
policyModel = "DQN"

# Number of environments in the training / test vector envs.
train_env_num = 10
test_env_num = 10

name = model + test_num

# log
now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
log_name = name + str(now)
log_path = os.path.join('./', "Logs", "dqn_sisl", log_name)

# policy
load_policy_name = 'policy_MPE_Task_MultiHead_Desk_01_8feat240112-145703_29_BestRew.pth'
save_policy_name = f'policy_{log_name}'
policy_path = "dqn_Spread"

Policy_Config = {
    "same_policy": True,   # one policy object shared by every agent
    "load_model": False,   # load weights from model_load_path before training
    "freeze_CNN": False,
}

Spread_Config = {
    "N": 3,                       # Default = 3
    "local_ratio": 0.5,           # Default = 0.5
    "max_cycles": 25,             # Default = 25
    "continuous_actions": False,  # Default = False
    "render_mode": None,          # Default = None
}

max_cycles = Spread_Config["max_cycles"]
n_agents = Spread_Config["N"]

dqn_params = {
    "discount_factor": 0.99,
    "estimation_step": 3,
    "target_update_freq": 2400,  # * max_cycles,
    # NOTE(review): the key is misspelled; it is kept unchanged because it is
    # part of the config that gets logged.
    "optminizer": "Adam",
    "lr": 0.00001,
}

trainer_params = {
    "max_epoch": 200,
    "step_per_epoch": 20000,    # 1000 * max_cycles,
    "step_per_collect": 800,    # 20 * max_cycles,
    "episode_per_test": 50,
    "batch_size": 1024,
    # 1 / 75. Updates only run after a collect finishes (as many times as
    # necessary to meet this ratio).
    "update_per_step": 0.0125,
    "tn_eps_max": 0.15,         # starting epsilon used during training
    "ts_eps_max": 0.0,          # epsilon used during testing
    "warmup_size": 10,          # number of warm-up collect rounds
}

# Flat view of every setting, used for logging. It is built as a NEW dict:
# the previous `runConfig = dqn_params` aliased the two names, so the merge
# also injected all policy/trainer/environment keys into dqn_params.
runConfig = {**dqn_params, **Policy_Config, **trainer_params, **Spread_Config}

model_load_path = os.path.join(policy_path, load_policy_name)
model_save_path = os.path.join(policy_path, save_policy_name)
os.makedirs(policy_path, exist_ok=True)
os.makedirs(log_path, exist_ok=True)

def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
    policy_load_path: Optional[str] = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    """Build the multi-agent policy used for training.

    A fresh environment is created to read the observation/action spaces and
    the agent names. One network + Adam optimizer + DQN policy is built per
    distinct policy: a single one when ``Policy_Config["same_policy"]`` is
    true (the same object is then assigned to every agent), otherwise one per
    agent.

    Args:
        agent_learn: Only used as a fallback when ``policyModel`` is not one
            of the built-in policy types; otherwise it is replaced.
        agent_opponent: Unused; kept for interface compatibility.
        optim: Ignored on input; a new optimizer is always created.
        policy_load_path: Checkpoint to load when
            ``Policy_Config["load_model"]`` is true. Defaults to the
            module-level ``model_load_path``.

    Returns:
        ``(policy, optim, agent_names)`` where ``policy`` is the
        ``MultiAgentPolicyManager``, ``optim`` is the optimizer of the last
        policy that was built, and ``agent_names`` is ``env.agents``.

    Raises:
        ValueError: If ``model`` is unknown, or ``policyModel`` is unknown
            and no ``agent_learn`` was supplied.
    """
    env = _get_env()
    agent_observation_space = env.observation_space.shape
    action_space = env.action_space

    device = "cuda" if torch.cuda.is_available() else "cpu"
    load_path = model_load_path if policy_load_path is None else policy_load_path

    # One policy per agent unless all agents share a single policy.
    policies_number = 1 if Policy_Config["same_policy"] else len(env.agents)

    agents = []
    for _ in range(policies_number):

        if model in ("DNN_Spread", "VDN_Spread"):
            net = DNN_Spread(
                obs_shape=agent_observation_space[0],
                action_shape=5,
                device=device,
            ).to(device)
        elif model == "MPE_Task_MultiHead":
            net = MPE_Task_MultiHead(
                num_tasks=Spread_Config['N'] * 2 + 5,
                num_features_per_task=8,  # 6 + 2 + 1,
                device=device,
            ).to(device)
        else:
            raise ValueError(f"Unknown model: {model!r}")

        optim = torch.optim.Adam(
            net.parameters(),
            lr=dqn_params["lr"],
            weight_decay=0.0,
            amsgrad=True,
        )

        # "VDN" currently uses the same DQNPolicy construction as "DQN".
        if policyModel in ("DQN", "VDN"):
            agent_learn = DQNPolicy(
                model=net,
                optim=optim,
                action_space=action_space,
                discount_factor=dqn_params["discount_factor"],
                estimation_step=dqn_params["estimation_step"],
                target_update_freq=dqn_params["target_update_freq"],
                reward_normalization=False,
                clip_loss_grad=False,
            )
        elif agent_learn is None:
            raise ValueError(f"Unknown policyModel: {policyModel!r}")

        if Policy_Config["load_model"] is True:
            # map_location lets a checkpoint saved on GPU load on a CPU-only host.
            agent_learn.load_state_dict(torch.load(load_path, map_location=device))
            print(f'Loaded-> {load_path}')

        agents.append(agent_learn)

    if Policy_Config["same_policy"]:
        # Every agent references the same policy object.
        agents = [agents[0] for _ in range(len(env.agents))]

    policy = MultiAgentPolicyManager(policies=agents, env=env)

    return policy, optim, env.agents

def _get_env(test=False):
    """Create one TaskSpread environment wrapped for Tianshou.

    This function is needed to provide callables for DummyVectorEnv.

    Args:
        test: Unused; kept for interface compatibility.

    Returns:
        The ``TaskSpreadEnv`` environment, configured from ``Spread_Config``
        and wrapped in ``PettingZooEnv``.
    """
    env = TaskSpreadEnv.env(
        max_cycles=Spread_Config["max_cycles"],
        local_ratio=Spread_Config["local_ratio"],
        N=Spread_Config["N"],
        continuous_actions=Spread_Config["continuous_actions"],
        # Previously the literal " human" (with a leading space) was passed,
        # which matches no standard render mode and bypassed the configured
        # value. Set Spread_Config["render_mode"] = "human" to watch episodes.
        render_mode=Spread_Config["render_mode"],
    )

    return PettingZooEnv(env)

# print(json.dumps(runConfig, indent=4))


In [2]:
if __name__ == "__main__":
    # Training entry point: builds the vector envs, the shared/individual DQN
    # policies, the collectors and the logger, then runs the off-policy trainer.

    torch.set_grad_enabled(True)

    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(train_env_num)])
    test_envs = DummyVectorEnv([_get_env for _ in range(test_env_num)])

    # seed
    seed = 0
    np.random.seed(seed)
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)

    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents()

    # ======== Step 3: Collector setup =========
    # (The former per-agent `CollectorMA` variant was unreachable and relied
    # on an import that is commented out, so it has been removed.)
    train_collector = Collector(
        policy,
        train_envs,
        PrioritizedVectorReplayBuffer(30_000, len(train_envs), alpha=0.6, beta=0.4),
        exploration_noise=True,
    )
    test_collector = Collector(policy, test_envs, exploration_noise=True)

    # Pre-fill the replay buffer before any gradient update.
    print("Buffer Warming Up ")
    for _ in range(trainer_params["warmup_size"]):
        train_collector.collect(n_episode=train_env_num)
        print(".", end="")

    info = { "Buffer"  : "PriorizedReplayBuffer", " Warmup_ep" : runConfig["warmup_size"]}

    # ======== wandb / tensorboard logging setup =========
    logger = WandbLogger(
        train_interval = runConfig["max_cycles"] * runConfig["N"],
        test_interval = 1,
        update_interval = runConfig["max_cycles"],
        save_interval = 1,
        write_flush = True,
        project = "Spread_Eval01",
        name = log_name,
        entity = None,
        run_id = log_name,
        config = runConfig,
        monitor_gym = True,
    )

    writer = SummaryWriter(log_path)
    writer.add_text("args", str(runConfig))
    logger.load(writer)

    # Counter incremented on every call to `reward_metric`; it is used to tag
    # checkpoint file names. A one-element list is used so the nested
    # callbacks can mutate it.
    global_step_holder = [0]

    # ======== Step 4: Callback functions setup =========
    def _set_eps(epsilon):
        """Set the exploration rate on every distinct policy."""
        if Policy_Config["same_policy"]:
            policy.policies[agents[0]].set_eps(epsilon)
        else:
            for agent in agents:
                policy.policies[agent].set_eps(epsilon)

    def _save_policies(policy_to_save, shared_suffix, agent_suffix):
        """Save the shared policy, or one file per agent, tagged with the counter."""
        prefix = model_save_path + "_" + str(global_step_holder[0])
        if Policy_Config["same_policy"]:
            torch.save(policy_to_save.policies[agents[0]].state_dict(), prefix + shared_suffix)
        else:
            for agent in agents:
                torch.save(policy_to_save.policies[agent].state_dict(), prefix + "_" + agent + agent_suffix)

    def save_best_fn(policy):
        """Trainer callback: checkpoint the policy with the best test reward."""
        _save_policies(policy, "_BestRew.pth", ".pth")
        label = "Best Saved Rew" if Policy_Config["same_policy"] else "Bests Saved Rew"
        print(label, str(global_step_holder[0]))

    def save_test_best_fn(policy):
        """Checkpoint helper for a 'best length' criterion (currently not registered)."""
        _save_policies(policy, "_BestLen.pth", ".pth")
        print("Best Saved Length", str(global_step_holder[0]))

    def stop_fn(mean_rewards):
        """Early-stop criterion; the threshold is effectively unreachable."""
        return mean_rewards >= 99999939.0

    def train_fn(epoch, env_step):
        """Linearly decay epsilon from tn_eps_max (epoch 0) to tn_eps_max/100 (max_epoch)."""
        eps_max = trainer_params['tn_eps_max']
        epsilon = eps_max - (eps_max - eps_max/100)*(epoch/trainer_params['max_epoch'])
        _set_eps(epsilon)

    def test_fn(epoch, env_step):
        """Use the test epsilon and save a periodic checkpoint."""
        _set_eps(trainer_params['ts_eps_max'])

        # Periodic snapshot whenever the reward_metric call counter is a
        # multiple of 10 at test time.
        if global_step_holder[0] % 10 == 0:
            step_tag = str(global_step_holder[0])
            _save_policies(policy, "_Step.pth", "Step" + step_tag + ".pth")
            print("Steps Policy Saved ", step_tag)

    def reward_metric(rews):
        """Pass rewards through unchanged while counting calls."""
        global_step_holder[0] += 1
        return rews

    # ======== Step 5: Run the trainer =========
    offPolicyTrainer = OffpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=trainer_params['max_epoch'],
        step_per_epoch=trainer_params['step_per_epoch'],
        step_per_collect=trainer_params['step_per_collect'],
        episode_per_test=trainer_params['episode_per_test'],
        batch_size=trainer_params['batch_size'],
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        # save_test_best_fn=save_test_best_fn,
        update_per_step=trainer_params['update_per_step'],
        logger=logger,
        test_in_train=True,
        reward_metric=reward_metric,
        show_progress=True,
    )

    try:
        result = offPolicyTrainer.run()
    finally:
        # Flush and close the tensorboard writer even if training is interrupted.
        writer.close()

    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[0]])")

Buffer Warming Up 
.........



.

[34m[1mwandb[0m: Currently logged in as: [33mandrekuros[0m. Use [1m`wandb login --relogin`[0m to force relogin
  from IPython.core.display import HTML, display  # type: ignore


Steps Policy Saved  0
Best Saved Rew 1


Epoch #1: 20001it [00:37, 531.27it/s, agent_0/loss=0.259, agent_1/loss=0.271, agent_2/loss=0.264, env_step=20000, len=75, n/ep=10, n/st=800, rew=-16.28]                           


Best Saved Rew 27
Epoch #1: test_reward: -18.851344 ± 7.151752, best_reward: -18.851344 ± 7.151752 in #1


Epoch #2: 20001it [00:36, 551.23it/s, agent_0/loss=0.135, agent_1/loss=0.140, agent_2/loss=0.141, env_step=40000, len=75, n/ep=10, n/st=800, rew=-19.20]                           


Epoch #2: test_reward: -23.535522 ± 8.176556, best_reward: -18.851344 ± 7.151752 in #1


Epoch #3: 20001it [00:34, 576.14it/s, agent_0/loss=0.156, agent_1/loss=0.160, agent_2/loss=0.160, env_step=60000, len=75, n/ep=20, n/st=800, rew=-20.06]                           


Best Saved Rew 79
Epoch #3: test_reward: -17.444675 ± 7.063890, best_reward: -17.444675 ± 7.063890 in #3


Epoch #4: 20001it [00:34, 574.44it/s, agent_0/loss=0.165, agent_1/loss=0.165, agent_2/loss=0.171, env_step=80000, len=75, n/ep=10, n/st=800, rew=-20.97]                           


Epoch #4: test_reward: -19.836096 ± 7.138915, best_reward: -17.444675 ± 7.063890 in #3


Epoch #5: 20001it [00:34, 578.14it/s, agent_0/loss=0.160, agent_1/loss=0.167, agent_2/loss=0.164, env_step=100000, len=75, n/ep=10, n/st=800, rew=-21.56]                           


Steps Policy Saved  130
Best Saved Rew 131
Epoch #5: test_reward: -17.360558 ± 5.491127, best_reward: -17.360558 ± 5.491127 in #5


Epoch #6: 20001it [00:35, 568.57it/s, agent_0/loss=0.146, agent_1/loss=0.147, agent_2/loss=0.153, env_step=120000, len=75, n/ep=20, n/st=800, rew=-17.56]                           


Epoch #6: test_reward: -18.490801 ± 5.972650, best_reward: -17.360558 ± 5.491127 in #5


Epoch #7: 20001it [00:34, 574.33it/s, agent_0/loss=0.143, agent_1/loss=0.149, agent_2/loss=0.150, env_step=140000, len=75, n/ep=10, n/st=800, rew=-17.77]                           


Best Saved Rew 183
Epoch #7: test_reward: -17.147465 ± 7.686320, best_reward: -17.147465 ± 7.686320 in #7


Epoch #8: 20001it [00:35, 556.92it/s, agent_0/loss=0.107, agent_1/loss=0.113, agent_2/loss=0.115, env_step=160000, len=75, n/ep=10, n/st=800, rew=-18.75]                           


Best Saved Rew 209
Epoch #8: test_reward: -16.940000 ± 7.225419, best_reward: -16.940000 ± 7.225419 in #8


Epoch #9: 20001it [00:34, 573.79it/s, agent_0/loss=0.108, agent_1/loss=0.114, agent_2/loss=0.116, env_step=180000, len=75, n/ep=20, n/st=800, rew=-14.61]                           


Epoch #9: test_reward: -18.958657 ± 7.317113, best_reward: -16.940000 ± 7.225419 in #8


Epoch #10: 20001it [00:34, 576.10it/s, agent_0/loss=0.143, agent_1/loss=0.151, agent_2/loss=0.155, env_step=200000, len=75, n/ep=10, n/st=800, rew=-20.63]                           


Steps Policy Saved  260
Epoch #10: test_reward: -17.918890 ± 8.159875, best_reward: -16.940000 ± 7.225419 in #8


Epoch #11: 20001it [00:38, 513.82it/s, agent_0/loss=0.115, agent_1/loss=0.117, agent_2/loss=0.124, env_step=220000, len=75, n/ep=10, n/st=800, rew=-15.20]                           


Best Saved Rew 287
Epoch #11: test_reward: -15.156786 ± 6.711757, best_reward: -15.156786 ± 6.711757 in #11


Epoch #12: 20001it [00:35, 568.82it/s, agent_0/loss=0.117, agent_1/loss=0.124, agent_2/loss=0.128, env_step=240000, len=75, n/ep=20, n/st=800, rew=-16.70]                           


Epoch #12: test_reward: -17.662780 ± 8.321141, best_reward: -15.156786 ± 6.711757 in #11


Epoch #13: 20001it [00:35, 570.75it/s, agent_0/loss=0.141, agent_1/loss=0.156, agent_2/loss=0.158, env_step=260000, len=75, n/ep=10, n/st=800, rew=-21.10]                           


Epoch #13: test_reward: -17.120500 ± 8.186881, best_reward: -15.156786 ± 6.711757 in #11


Epoch #14: 20001it [00:35, 567.66it/s, agent_0/loss=0.129, agent_1/loss=0.154, agent_2/loss=0.154, env_step=280000, len=75, n/ep=10, n/st=800, rew=-19.32]                           


Epoch #14: test_reward: -18.598816 ± 9.579704, best_reward: -15.156786 ± 6.711757 in #11


Epoch #15: 20001it [00:35, 557.57it/s, agent_0/loss=0.129, agent_1/loss=0.152, agent_2/loss=0.154, env_step=300000, len=75, n/ep=20, n/st=800, rew=-18.29]                           


Steps Policy Saved  390
Epoch #15: test_reward: -18.526315 ± 7.616661, best_reward: -15.156786 ± 6.711757 in #11


Epoch #16: 20001it [00:35, 560.49it/s, agent_0/loss=0.126, agent_1/loss=0.142, agent_2/loss=0.147, env_step=320000, len=75, n/ep=10, n/st=800, rew=-14.96]                           


Epoch #16: test_reward: -18.597491 ± 10.846791, best_reward: -15.156786 ± 6.711757 in #11


Epoch #17: 20001it [00:34, 579.09it/s, agent_0/loss=0.143, agent_1/loss=0.168, agent_2/loss=0.170, env_step=340000, len=75, n/ep=10, n/st=800, rew=-24.06]                           


Epoch #17: test_reward: -17.577299 ± 7.701587, best_reward: -15.156786 ± 6.711757 in #11


Epoch #18: 20001it [00:34, 578.32it/s, agent_0/loss=0.124, agent_1/loss=0.164, agent_2/loss=0.156, env_step=360000, len=75, n/ep=20, n/st=800, rew=-17.59]                           


Epoch #18: test_reward: -17.382879 ± 8.189764, best_reward: -15.156786 ± 6.711757 in #11


Epoch #19: 20001it [00:34, 578.41it/s, agent_0/loss=0.117, agent_1/loss=0.148, agent_2/loss=0.156, env_step=380000, len=75, n/ep=10, n/st=800, rew=-15.40]                           


Epoch #19: test_reward: -19.563680 ± 7.581879, best_reward: -15.156786 ± 6.711757 in #11


Epoch #20: 20001it [00:34, 579.74it/s, agent_0/loss=0.137, agent_1/loss=0.170, agent_2/loss=0.188, env_step=400000, len=75, n/ep=10, n/st=800, rew=-20.37]                           


Steps Policy Saved  520
Epoch #20: test_reward: -18.892167 ± 8.200212, best_reward: -15.156786 ± 6.711757 in #11


Epoch #21: 20001it [00:34, 575.12it/s, agent_0/loss=0.138, agent_1/loss=0.184, agent_2/loss=0.190, env_step=420000, len=75, n/ep=20, n/st=800, rew=-19.53]                           


Epoch #21: test_reward: -18.989800 ± 6.713043, best_reward: -15.156786 ± 6.711757 in #11


Epoch #22: 20001it [00:34, 573.58it/s, agent_0/loss=0.125, agent_1/loss=0.173, agent_2/loss=0.168, env_step=440000, len=75, n/ep=10, n/st=800, rew=-18.62]                           


Epoch #22: test_reward: -16.510295 ± 7.361907, best_reward: -15.156786 ± 6.711757 in #11


Epoch #23: 20001it [00:35, 562.30it/s, agent_0/loss=0.142, agent_1/loss=0.180, agent_2/loss=0.185, env_step=460000, len=75, n/ep=10, n/st=800, rew=-17.73]                           


Epoch #23: test_reward: -15.326543 ± 5.879495, best_reward: -15.156786 ± 6.711757 in #11


Epoch #24: 20001it [00:34, 577.68it/s, agent_0/loss=0.137, agent_1/loss=0.174, agent_2/loss=0.182, env_step=480000, len=75, n/ep=20, n/st=800, rew=-15.60]                           


Epoch #24: test_reward: -16.227349 ± 8.288442, best_reward: -15.156786 ± 6.711757 in #11


Epoch #25: 20001it [00:34, 572.71it/s, agent_0/loss=0.128, agent_1/loss=0.170, agent_2/loss=0.168, env_step=500000, len=75, n/ep=10, n/st=800, rew=-17.06]                           


Steps Policy Saved  650
Epoch #25: test_reward: -17.463149 ± 9.819872, best_reward: -15.156786 ± 6.711757 in #11


Epoch #26: 20001it [00:34, 577.89it/s, agent_0/loss=0.165, agent_1/loss=0.222, agent_2/loss=0.211, env_step=520000, len=75, n/ep=10, n/st=800, rew=-14.63]                           


Epoch #26: test_reward: -15.597110 ± 6.576033, best_reward: -15.156786 ± 6.711757 in #11


Epoch #27: 20001it [00:34, 579.52it/s, agent_0/loss=0.140, agent_1/loss=0.191, agent_2/loss=0.187, env_step=540000, len=75, n/ep=20, n/st=800, rew=-13.50]                           


Best Saved Rew 703
Epoch #27: test_reward: -12.611545 ± 6.474710, best_reward: -12.611545 ± 6.474710 in #27


Epoch #28: 20001it [00:34, 574.25it/s, agent_0/loss=0.142, agent_1/loss=0.198, agent_2/loss=0.203, env_step=560000, len=75, n/ep=10, n/st=800, rew=-13.54]                           


Epoch #28: test_reward: -14.277900 ± 7.311873, best_reward: -12.611545 ± 6.474710 in #27


Epoch #29: 20001it [00:34, 578.19it/s, agent_0/loss=0.168, agent_1/loss=0.234, agent_2/loss=0.240, env_step=580000, len=75, n/ep=10, n/st=800, rew=-15.36]                           


Epoch #29: test_reward: -15.086500 ± 8.043775, best_reward: -12.611545 ± 6.474710 in #27


Epoch #30: 20001it [00:34, 576.20it/s, agent_0/loss=0.163, agent_1/loss=0.234, agent_2/loss=0.228, env_step=600000, len=75, n/ep=20, n/st=800, rew=-15.51]                           


Steps Policy Saved  780
Epoch #30: test_reward: -12.884762 ± 5.883568, best_reward: -12.611545 ± 6.474710 in #27


Epoch #31: 20001it [00:34, 584.18it/s, agent_0/loss=0.160, agent_1/loss=0.215, agent_2/loss=0.217, env_step=620000, len=75, n/ep=10, n/st=800, rew=-15.43]                           


Epoch #31: test_reward: -16.853491 ± 8.635331, best_reward: -12.611545 ± 6.474710 in #27


Epoch #32: 20001it [00:34, 583.17it/s, agent_0/loss=0.156, agent_1/loss=0.212, agent_2/loss=0.213, env_step=640000, len=75, n/ep=10, n/st=800, rew=-21.21]                           


Epoch #32: test_reward: -15.216094 ± 7.270135, best_reward: -12.611545 ± 6.474710 in #27


Epoch #33: 20001it [00:34, 583.23it/s, agent_0/loss=0.168, agent_1/loss=0.238, agent_2/loss=0.231, env_step=660000, len=75, n/ep=20, n/st=800, rew=-14.86]                           


Epoch #33: test_reward: -16.351363 ± 8.226359, best_reward: -12.611545 ± 6.474710 in #27


Epoch #34: 20001it [00:33, 591.79it/s, agent_0/loss=0.148, agent_1/loss=0.228, agent_2/loss=0.227, env_step=680000, len=75, n/ep=10, n/st=800, rew=-14.02]                           


Epoch #34: test_reward: -13.404053 ± 8.663321, best_reward: -12.611545 ± 6.474710 in #27


Epoch #35: 20001it [00:34, 585.25it/s, agent_0/loss=0.140, agent_1/loss=0.201, agent_2/loss=0.216, env_step=700000, len=75, n/ep=10, n/st=800, rew=-13.23]                           


Steps Policy Saved  910
Epoch #35: test_reward: -15.330479 ± 7.854981, best_reward: -12.611545 ± 6.474710 in #27


Epoch #36: 20001it [00:34, 587.36it/s, agent_0/loss=0.171, agent_1/loss=0.241, agent_2/loss=0.243, env_step=720000, len=75, n/ep=20, n/st=800, rew=-15.86]                           


Epoch #36: test_reward: -14.901518 ± 7.355478, best_reward: -12.611545 ± 6.474710 in #27


Epoch #37: 20001it [00:34, 585.35it/s, agent_0/loss=0.179, agent_1/loss=0.223, agent_2/loss=0.237, env_step=740000, len=75, n/ep=10, n/st=800, rew=-14.75]                           


Epoch #37: test_reward: -14.567665 ± 7.925979, best_reward: -12.611545 ± 6.474710 in #27


Epoch #38: 20001it [00:34, 581.54it/s, agent_0/loss=0.161, agent_1/loss=0.205, agent_2/loss=0.228, env_step=760000, len=75, n/ep=10, n/st=800, rew=-14.96]                           


Epoch #38: test_reward: -12.825742 ± 7.057582, best_reward: -12.611545 ± 6.474710 in #27


Epoch #39: 20001it [00:33, 588.83it/s, agent_0/loss=0.209, agent_1/loss=0.277, agent_2/loss=0.282, env_step=780000, len=75, n/ep=20, n/st=800, rew=-13.49]                           


Epoch #39: test_reward: -13.748420 ± 6.252888, best_reward: -12.611545 ± 6.474710 in #27


Epoch #40: 20001it [00:34, 579.34it/s, agent_0/loss=0.190, agent_1/loss=0.251, agent_2/loss=0.259, env_step=800000, len=75, n/ep=10, n/st=800, rew=-14.85]                           


Steps Policy Saved  1040
Epoch #40: test_reward: -15.414245 ± 10.057166, best_reward: -12.611545 ± 6.474710 in #27


Epoch #41: 20001it [00:34, 582.98it/s, agent_0/loss=0.177, agent_1/loss=0.249, agent_2/loss=0.251, env_step=820000, len=75, n/ep=10, n/st=800, rew=-14.53]                           


Epoch #41: test_reward: -13.092808 ± 6.957101, best_reward: -12.611545 ± 6.474710 in #27


Epoch #42: 20001it [00:34, 586.29it/s, agent_0/loss=0.203, agent_1/loss=0.289, agent_2/loss=0.282, env_step=840000, len=75, n/ep=20, n/st=800, rew=-12.93]                           


Best Saved Rew 1093
Epoch #42: test_reward: -11.043515 ± 5.017869, best_reward: -11.043515 ± 5.017869 in #42


Epoch #43: 20001it [00:33, 591.35it/s, agent_0/loss=0.190, agent_1/loss=0.270, agent_2/loss=0.280, env_step=860000, len=75, n/ep=10, n/st=800, rew=-11.09]                           


Epoch #43: test_reward: -13.043082 ± 6.166365, best_reward: -11.043515 ± 5.017869 in #42


Epoch #44: 20001it [00:34, 579.47it/s, agent_0/loss=0.187, agent_1/loss=0.254, agent_2/loss=0.275, env_step=880000, len=75, n/ep=10, n/st=800, rew=-16.35]                           


Epoch #44: test_reward: -13.551426 ± 7.014577, best_reward: -11.043515 ± 5.017869 in #42


Epoch #45: 20001it [00:34, 585.50it/s, agent_0/loss=0.208, agent_1/loss=0.285, agent_2/loss=0.290, env_step=900000, len=75, n/ep=20, n/st=800, rew=-14.46]                           


Steps Policy Saved  1170
Epoch #45: test_reward: -16.762025 ± 7.104546, best_reward: -11.043515 ± 5.017869 in #42


Epoch #46: 20001it [00:34, 585.89it/s, agent_0/loss=0.207, agent_1/loss=0.291, agent_2/loss=0.278, env_step=920000, len=75, n/ep=10, n/st=800, rew=-15.12]                           


Epoch #46: test_reward: -13.689791 ± 6.640342, best_reward: -11.043515 ± 5.017869 in #42


Epoch #47: 20001it [00:34, 583.61it/s, agent_0/loss=0.202, agent_1/loss=0.293, agent_2/loss=0.285, env_step=940000, len=75, n/ep=10, n/st=800, rew=-13.73]                           


Epoch #47: test_reward: -14.328737 ± 6.985687, best_reward: -11.043515 ± 5.017869 in #42


Epoch #48: 20001it [00:34, 586.48it/s, agent_0/loss=0.196, agent_1/loss=0.303, agent_2/loss=0.286, env_step=960000, len=75, n/ep=20, n/st=800, rew=-15.12]                           


Epoch #48: test_reward: -13.199630 ± 6.175133, best_reward: -11.043515 ± 5.017869 in #42


Epoch #49: 20001it [00:34, 578.98it/s, agent_0/loss=0.221, agent_1/loss=0.318, agent_2/loss=0.316, env_step=980000, len=75, n/ep=10, n/st=800, rew=-17.98]                           


Epoch #49: test_reward: -15.764013 ± 6.969645, best_reward: -11.043515 ± 5.017869 in #42


Epoch #50: 20001it [00:34, 585.31it/s, agent_0/loss=0.211, agent_1/loss=0.301, agent_2/loss=0.307, env_step=1000000, len=75, n/ep=10, n/st=800, rew=-16.71]                           


Steps Policy Saved  1300
Epoch #50: test_reward: -13.316969 ± 6.643016, best_reward: -11.043515 ± 5.017869 in #42


Epoch #51: 20001it [00:34, 587.83it/s, agent_0/loss=0.220, agent_1/loss=0.305, agent_2/loss=0.304, env_step=1020000, len=75, n/ep=20, n/st=800, rew=-14.78]                           


Epoch #51: test_reward: -13.986399 ± 6.091827, best_reward: -11.043515 ± 5.017869 in #42


Epoch #52: 20001it [00:34, 584.94it/s, agent_0/loss=0.237, agent_1/loss=0.342, agent_2/loss=0.344, env_step=1040000, len=75, n/ep=10, n/st=800, rew=-14.50]                           


Epoch #52: test_reward: -13.206723 ± 6.875438, best_reward: -11.043515 ± 5.017869 in #42


Epoch #53: 20001it [00:34, 586.10it/s, agent_0/loss=0.237, agent_1/loss=0.314, agent_2/loss=0.319, env_step=1060000, len=75, n/ep=10, n/st=800, rew=-15.47]                           


Epoch #53: test_reward: -16.096437 ± 9.024568, best_reward: -11.043515 ± 5.017869 in #42


Epoch #54: 20001it [00:34, 583.16it/s, agent_0/loss=0.234, agent_1/loss=0.329, agent_2/loss=0.340, env_step=1080000, len=75, n/ep=20, n/st=800, rew=-16.06]                           


Epoch #54: test_reward: -14.127627 ± 6.931253, best_reward: -11.043515 ± 5.017869 in #42


Epoch #55: 20001it [00:35, 564.22it/s, agent_0/loss=0.264, agent_1/loss=0.373, agent_2/loss=0.377, env_step=1100000, len=75, n/ep=10, n/st=800, rew=-14.55]                           


Steps Policy Saved  1430
Epoch #55: test_reward: -12.420759 ± 5.118742, best_reward: -11.043515 ± 5.017869 in #42


Epoch #56: 20001it [00:36, 552.67it/s, agent_0/loss=0.252, agent_1/loss=0.384, agent_2/loss=0.386, env_step=1120000, len=75, n/ep=10, n/st=800, rew=-12.44]                           


Epoch #56: test_reward: -16.496498 ± 8.534962, best_reward: -11.043515 ± 5.017869 in #42


Epoch #57: 20001it [00:37, 538.25it/s, agent_0/loss=0.247, agent_1/loss=0.359, agent_2/loss=0.374, env_step=1140000, len=75, n/ep=20, n/st=800, rew=-13.23]                           


Epoch #57: test_reward: -12.580334 ± 7.167077, best_reward: -11.043515 ± 5.017869 in #42


Epoch #58: 20001it [00:34, 578.55it/s, agent_0/loss=0.268, agent_1/loss=0.406, agent_2/loss=0.384, env_step=1160000, len=75, n/ep=10, n/st=800, rew=-10.44]                           


Epoch #58: test_reward: -11.322626 ± 5.939105, best_reward: -11.043515 ± 5.017869 in #42


Epoch #59: 20001it [00:34, 580.65it/s, agent_0/loss=0.264, agent_1/loss=0.354, agent_2/loss=0.344, env_step=1180000, len=75, n/ep=10, n/st=800, rew=-15.00]                           


Epoch #59: test_reward: -12.887982 ± 7.731156, best_reward: -11.043515 ± 5.017869 in #42


Epoch #60: 20001it [00:34, 572.23it/s, agent_0/loss=0.265, agent_1/loss=0.377, agent_2/loss=0.367, env_step=1200000, len=75, n/ep=20, n/st=800, rew=-13.15]                           


Steps Policy Saved  1560
Epoch #60: test_reward: -14.635018 ± 7.950731, best_reward: -11.043515 ± 5.017869 in #42


Epoch #61: 20001it [00:34, 581.01it/s, agent_0/loss=0.289, agent_1/loss=0.376, agent_2/loss=0.378, env_step=1220000, len=75, n/ep=10, n/st=800, rew=-12.50]                           


Epoch #61: test_reward: -13.767555 ± 6.603706, best_reward: -11.043515 ± 5.017869 in #42


Epoch #62: 20001it [00:34, 572.83it/s, agent_0/loss=0.296, agent_1/loss=0.398, agent_2/loss=0.414, env_step=1240000, len=75, n/ep=10, n/st=800, rew=-13.21]                           


Epoch #62: test_reward: -13.518346 ± 6.330955, best_reward: -11.043515 ± 5.017869 in #42


Epoch #63: 20001it [00:34, 575.68it/s, agent_0/loss=0.268, agent_1/loss=0.385, agent_2/loss=0.399, env_step=1260000, len=75, n/ep=20, n/st=800, rew=-13.59]                           


Epoch #63: test_reward: -13.100167 ± 6.763961, best_reward: -11.043515 ± 5.017869 in #42


Epoch #64: 20001it [00:35, 563.29it/s, agent_0/loss=0.271, agent_1/loss=0.372, agent_2/loss=0.386, env_step=1280000, len=75, n/ep=10, n/st=800, rew=-12.68]                           


Epoch #64: test_reward: -12.554373 ± 6.169741, best_reward: -11.043515 ± 5.017869 in #42


Epoch #65: 20001it [00:34, 572.88it/s, agent_0/loss=0.311, agent_1/loss=0.409, agent_2/loss=0.431, env_step=1300000, len=75, n/ep=10, n/st=800, rew=-12.38]                           


Steps Policy Saved  1690
Epoch #65: test_reward: -13.805037 ± 7.867755, best_reward: -11.043515 ± 5.017869 in #42


Epoch #66: 20001it [00:34, 579.46it/s, agent_0/loss=0.298, agent_1/loss=0.427, agent_2/loss=0.410, env_step=1320000, len=75, n/ep=20, n/st=800, rew=-15.57]                           


Epoch #66: test_reward: -13.270596 ± 6.317469, best_reward: -11.043515 ± 5.017869 in #42


Epoch #67: 20001it [00:34, 578.21it/s, agent_0/loss=0.275, agent_1/loss=0.387, agent_2/loss=0.403, env_step=1340000, len=75, n/ep=10, n/st=800, rew=-14.89]                           


Epoch #67: test_reward: -13.578004 ± 7.348245, best_reward: -11.043515 ± 5.017869 in #42


Epoch #68: 20001it [00:35, 571.34it/s, agent_0/loss=0.304, agent_1/loss=0.429, agent_2/loss=0.440, env_step=1360000, len=75, n/ep=10, n/st=800, rew=-13.26]                           


Epoch #68: test_reward: -13.123690 ± 5.494127, best_reward: -11.043515 ± 5.017869 in #42


Epoch #69: 20001it [00:35, 570.80it/s, agent_0/loss=0.289, agent_1/loss=0.409, agent_2/loss=0.427, env_step=1380000, len=75, n/ep=20, n/st=800, rew=-14.82]                           


Epoch #69: test_reward: -13.412743 ± 6.241280, best_reward: -11.043515 ± 5.017869 in #42


Epoch #70: 20001it [00:34, 582.02it/s, agent_0/loss=0.308, agent_1/loss=0.453, agent_2/loss=0.443, env_step=1400000, len=75, n/ep=10, n/st=800, rew=-15.45]                           


Steps Policy Saved  1820
Epoch #70: test_reward: -12.738128 ± 7.822553, best_reward: -11.043515 ± 5.017869 in #42


Epoch #71: 20001it [00:35, 570.03it/s, agent_0/loss=0.312, agent_1/loss=0.423, agent_2/loss=0.445, env_step=1420000, len=75, n/ep=10, n/st=800, rew=-9.93]                            


Epoch #71: test_reward: -12.433844 ± 5.221982, best_reward: -11.043515 ± 5.017869 in #42


Epoch #72: 20001it [00:34, 576.08it/s, agent_0/loss=0.316, agent_1/loss=0.428, agent_2/loss=0.438, env_step=1440000, len=75, n/ep=20, n/st=800, rew=-12.22]                           


Epoch #72: test_reward: -13.664111 ± 6.810185, best_reward: -11.043515 ± 5.017869 in #42


Epoch #73: 20001it [00:34, 578.32it/s, agent_0/loss=0.305, agent_1/loss=0.427, agent_2/loss=0.419, env_step=1460000, len=75, n/ep=10, n/st=800, rew=-10.58]                           


Epoch #73: test_reward: -11.837762 ± 5.825406, best_reward: -11.043515 ± 5.017869 in #42


Epoch #74: 20001it [00:35, 567.33it/s, agent_0/loss=0.304, agent_1/loss=0.447, agent_2/loss=0.444, env_step=1480000, len=75, n/ep=10, n/st=800, rew=-16.61]                           


Epoch #74: test_reward: -11.730436 ± 6.199725, best_reward: -11.043515 ± 5.017869 in #42


Epoch #75: 20001it [00:34, 581.44it/s, agent_0/loss=0.306, agent_1/loss=0.425, agent_2/loss=0.462, env_step=1500000, len=75, n/ep=20, n/st=800, rew=-13.15]                           


Steps Policy Saved  1950
Epoch #75: test_reward: -14.404898 ± 6.896298, best_reward: -11.043515 ± 5.017869 in #42


Epoch #76: 20001it [00:35, 564.71it/s, agent_0/loss=0.304, agent_1/loss=0.465, agent_2/loss=0.467, env_step=1520000, len=75, n/ep=10, n/st=800, rew=-10.92]                           


Epoch #76: test_reward: -12.951003 ± 6.204547, best_reward: -11.043515 ± 5.017869 in #42


Epoch #77: 20001it [00:39, 507.02it/s, agent_0/loss=0.295, agent_1/loss=0.469, agent_2/loss=0.428, env_step=1540000, len=75, n/ep=10, n/st=800, rew=-14.69]                           


Epoch #77: test_reward: -11.953252 ± 5.656091, best_reward: -11.043515 ± 5.017869 in #42


Epoch #78: 20001it [00:35, 565.00it/s, agent_0/loss=0.310, agent_1/loss=0.461, agent_2/loss=0.458, env_step=1560000, len=75, n/ep=20, n/st=800, rew=-14.43]                           


Epoch #78: test_reward: -13.674564 ± 8.085698, best_reward: -11.043515 ± 5.017869 in #42


Epoch #79: 20001it [00:35, 565.58it/s, agent_0/loss=0.307, agent_1/loss=0.471, agent_2/loss=0.430, env_step=1580000, len=75, n/ep=10, n/st=800, rew=-13.92]                           


Epoch #79: test_reward: -13.660847 ± 7.462176, best_reward: -11.043515 ± 5.017869 in #42


Epoch #80: 20001it [00:35, 567.13it/s, agent_0/loss=0.327, agent_1/loss=0.484, agent_2/loss=0.444, env_step=1600000, len=75, n/ep=10, n/st=800, rew=-12.19]                           


Steps Policy Saved  2080
Epoch #80: test_reward: -14.741714 ± 9.330006, best_reward: -11.043515 ± 5.017869 in #42


Epoch #81: 20001it [00:35, 570.10it/s, agent_0/loss=0.352, agent_1/loss=0.485, agent_2/loss=0.503, env_step=1620000, len=75, n/ep=20, n/st=800, rew=-11.28]                           


Epoch #81: test_reward: -13.712855 ± 7.230678, best_reward: -11.043515 ± 5.017869 in #42


Epoch #82: 20001it [00:35, 563.10it/s, agent_0/loss=0.354, agent_1/loss=0.475, agent_2/loss=0.474, env_step=1640000, len=75, n/ep=10, n/st=800, rew=-14.06]                           


Epoch #82: test_reward: -12.702710 ± 7.643124, best_reward: -11.043515 ± 5.017869 in #42


Epoch #83: 20001it [00:35, 567.22it/s, agent_0/loss=0.344, agent_1/loss=0.463, agent_2/loss=0.478, env_step=1660000, len=75, n/ep=10, n/st=800, rew=-9.59]                            


Epoch #83: test_reward: -11.854295 ± 5.935349, best_reward: -11.043515 ± 5.017869 in #42


Epoch #84: 20001it [00:35, 569.56it/s, agent_0/loss=0.343, agent_1/loss=0.450, agent_2/loss=0.510, env_step=1680000, len=75, n/ep=20, n/st=800, rew=-10.10]                           


Epoch #84: test_reward: -11.751647 ± 6.056054, best_reward: -11.043515 ± 5.017869 in #42


Epoch #85: 20001it [00:35, 570.90it/s, agent_0/loss=0.344, agent_1/loss=0.446, agent_2/loss=0.473, env_step=1700000, len=75, n/ep=10, n/st=800, rew=-12.55]                           


Steps Policy Saved  2210
Best Saved Rew 2211
Epoch #85: test_reward: -10.866622 ± 6.004604, best_reward: -10.866622 ± 6.004604 in #85


Epoch #86: 20001it [00:35, 570.93it/s, agent_0/loss=0.325, agent_1/loss=0.420, agent_2/loss=0.451, env_step=1720000, len=75, n/ep=10, n/st=800, rew=-10.62]                           


Epoch #86: test_reward: -11.443381 ± 4.749487, best_reward: -10.866622 ± 6.004604 in #85


Epoch #87: 20001it [00:34, 573.84it/s, agent_0/loss=0.334, agent_1/loss=0.452, agent_2/loss=0.462, env_step=1740000, len=75, n/ep=20, n/st=800, rew=-14.40]                           


Epoch #87: test_reward: -11.832654 ± 5.780268, best_reward: -10.866622 ± 6.004604 in #85


Epoch #88: 20001it [00:35, 566.68it/s, agent_0/loss=0.349, agent_1/loss=0.475, agent_2/loss=0.503, env_step=1760000, len=75, n/ep=10, n/st=800, rew=-8.63]                            


Epoch #88: test_reward: -12.576067 ± 6.637888, best_reward: -10.866622 ± 6.004604 in #85


Epoch #89: 20001it [00:36, 546.60it/s, agent_0/loss=0.331, agent_1/loss=0.473, agent_2/loss=0.506, env_step=1780000, len=75, n/ep=10, n/st=800, rew=-12.78]                           


Epoch #89: test_reward: -11.882980 ± 6.178765, best_reward: -10.866622 ± 6.004604 in #85


Epoch #90: 20001it [00:34, 571.84it/s, agent_0/loss=0.345, agent_1/loss=0.460, agent_2/loss=0.490, env_step=1800000, len=75, n/ep=20, n/st=800, rew=-10.21]                           


Steps Policy Saved  2340
Epoch #90: test_reward: -11.643105 ± 5.622509, best_reward: -10.866622 ± 6.004604 in #85


Epoch #91: 20001it [00:35, 567.87it/s, agent_0/loss=0.332, agent_1/loss=0.447, agent_2/loss=0.466, env_step=1820000, len=75, n/ep=10, n/st=800, rew=-11.53]                           


Epoch #91: test_reward: -13.220094 ± 6.703675, best_reward: -10.866622 ± 6.004604 in #85


Epoch #92: 20001it [00:35, 557.87it/s, agent_0/loss=0.347, agent_1/loss=0.503, agent_2/loss=0.487, env_step=1840000, len=75, n/ep=10, n/st=800, rew=-12.07]                           


Epoch #92: test_reward: -12.248351 ± 5.544505, best_reward: -10.866622 ± 6.004604 in #85


Epoch #93: 20001it [00:35, 569.67it/s, agent_0/loss=0.342, agent_1/loss=0.468, agent_2/loss=0.459, env_step=1860000, len=75, n/ep=20, n/st=800, rew=-12.33]                           


Epoch #93: test_reward: -11.564010 ± 5.980436, best_reward: -10.866622 ± 6.004604 in #85


Epoch #94: 20001it [00:35, 570.92it/s, agent_0/loss=0.329, agent_1/loss=0.453, agent_2/loss=0.470, env_step=1880000, len=75, n/ep=10, n/st=800, rew=-12.47]                           


Epoch #94: test_reward: -11.476759 ± 6.470915, best_reward: -10.866622 ± 6.004604 in #85


Epoch #95: 20001it [00:34, 575.06it/s, agent_0/loss=0.348, agent_1/loss=0.490, agent_2/loss=0.488, env_step=1900000, len=75, n/ep=10, n/st=800, rew=-13.56]                           


Steps Policy Saved  2470
Epoch #95: test_reward: -11.327754 ± 5.715237, best_reward: -10.866622 ± 6.004604 in #85


Epoch #96: 20001it [00:35, 570.68it/s, agent_0/loss=0.350, agent_1/loss=0.481, agent_2/loss=0.480, env_step=1920000, len=75, n/ep=20, n/st=800, rew=-11.81]                           


Best Saved Rew 2497
Epoch #96: test_reward: -10.839567 ± 4.527871, best_reward: -10.839567 ± 4.527871 in #96


Epoch #97: 20001it [00:34, 573.84it/s, agent_0/loss=0.352, agent_1/loss=0.460, agent_2/loss=0.446, env_step=1940000, len=75, n/ep=10, n/st=800, rew=-14.56]                           


Epoch #97: test_reward: -11.980600 ± 6.465296, best_reward: -10.839567 ± 4.527871 in #96


Epoch #98: 20001it [00:35, 570.85it/s, agent_0/loss=0.356, agent_1/loss=0.444, agent_2/loss=0.456, env_step=1960000, len=75, n/ep=10, n/st=800, rew=-10.06]                           


Epoch #98: test_reward: -12.297905 ± 5.084365, best_reward: -10.839567 ± 4.527871 in #96


Epoch #99: 20001it [00:35, 568.82it/s, agent_0/loss=0.329, agent_1/loss=0.443, agent_2/loss=0.447, env_step=1980000, len=75, n/ep=20, n/st=800, rew=-12.48]                           


Epoch #99: test_reward: -11.533498 ± 7.055826, best_reward: -10.839567 ± 4.527871 in #96


Epoch #100: 20001it [00:34, 573.31it/s, agent_0/loss=0.342, agent_1/loss=0.447, agent_2/loss=0.470, env_step=2000000, len=75, n/ep=10, n/st=800, rew=-8.83]                            


Steps Policy Saved  2600
Best Saved Rew 2601
Epoch #100: test_reward: -9.272048 ± 5.842043, best_reward: -9.272048 ± 5.842043 in #100


Epoch #101: 20001it [00:34, 573.45it/s, agent_0/loss=0.312, agent_1/loss=0.421, agent_2/loss=0.442, env_step=2020000, len=75, n/ep=10, n/st=800, rew=-7.14]                            


Epoch #101: test_reward: -11.314835 ± 4.963106, best_reward: -9.272048 ± 5.842043 in #100


Epoch #102: 20001it [00:34, 576.58it/s, agent_0/loss=0.321, agent_1/loss=0.430, agent_2/loss=0.433, env_step=2040000, len=75, n/ep=20, n/st=800, rew=-11.05]                           


Epoch #102: test_reward: -11.365619 ± 4.896710, best_reward: -9.272048 ± 5.842043 in #100


Epoch #103: 20001it [00:35, 570.26it/s, agent_0/loss=0.349, agent_1/loss=0.414, agent_2/loss=0.448, env_step=2060000, len=75, n/ep=10, n/st=800, rew=-13.35]                           


Epoch #103: test_reward: -10.964331 ± 5.869108, best_reward: -9.272048 ± 5.842043 in #100


Epoch #104: 20001it [00:34, 572.50it/s, agent_0/loss=0.349, agent_1/loss=0.408, agent_2/loss=0.438, env_step=2080000, len=75, n/ep=10, n/st=800, rew=-13.29]                           


Epoch #104: test_reward: -11.765620 ± 6.491712, best_reward: -9.272048 ± 5.842043 in #100


Epoch #105: 20001it [00:35, 569.42it/s, agent_0/loss=0.327, agent_1/loss=0.412, agent_2/loss=0.414, env_step=2100000, len=75, n/ep=20, n/st=800, rew=-10.76]                           


Steps Policy Saved  2730
Epoch #105: test_reward: -9.967134 ± 4.971542, best_reward: -9.272048 ± 5.842043 in #100


Epoch #106: 20001it [00:34, 571.80it/s, agent_0/loss=0.305, agent_1/loss=0.401, agent_2/loss=0.393, env_step=2120000, len=75, n/ep=10, n/st=800, rew=-8.45]                            


Epoch #106: test_reward: -11.740309 ± 6.586666, best_reward: -9.272048 ± 5.842043 in #100


Epoch #107: 20001it [00:35, 562.97it/s, agent_0/loss=0.325, agent_1/loss=0.388, agent_2/loss=0.400, env_step=2140000, len=75, n/ep=10, n/st=800, rew=-9.95]                            


Epoch #107: test_reward: -11.305383 ± 6.895594, best_reward: -9.272048 ± 5.842043 in #100


Epoch #108: 20001it [00:36, 554.81it/s, agent_0/loss=0.323, agent_1/loss=0.388, agent_2/loss=0.413, env_step=2160000, len=75, n/ep=20, n/st=800, rew=-12.44]                           


Epoch #108: test_reward: -10.001176 ± 5.083861, best_reward: -9.272048 ± 5.842043 in #100


Epoch #109: 20001it [00:34, 578.86it/s, agent_0/loss=0.332, agent_1/loss=0.423, agent_2/loss=0.423, env_step=2180000, len=75, n/ep=10, n/st=800, rew=-9.39]                            


Epoch #109: test_reward: -10.610229 ± 4.671438, best_reward: -9.272048 ± 5.842043 in #100


Epoch #110: 20001it [00:34, 575.34it/s, agent_0/loss=0.339, agent_1/loss=0.415, agent_2/loss=0.418, env_step=2200000, len=75, n/ep=10, n/st=800, rew=-10.39]                           


Steps Policy Saved  2860
Epoch #110: test_reward: -10.973770 ± 4.962072, best_reward: -9.272048 ± 5.842043 in #100


Epoch #111: 20001it [00:34, 572.26it/s, agent_0/loss=0.336, agent_1/loss=0.406, agent_2/loss=0.409, env_step=2220000, len=75, n/ep=20, n/st=800, rew=-10.66]                           


Epoch #111: test_reward: -11.098862 ± 4.817202, best_reward: -9.272048 ± 5.842043 in #100


Epoch #112: 20001it [00:35, 569.59it/s, agent_0/loss=0.304, agent_1/loss=0.389, agent_2/loss=0.389, env_step=2240000, len=75, n/ep=10, n/st=800, rew=-9.93]                            


Epoch #112: test_reward: -10.095122 ± 5.231123, best_reward: -9.272048 ± 5.842043 in #100


Epoch #113: 20001it [00:34, 576.28it/s, agent_0/loss=0.289, agent_1/loss=0.347, agent_2/loss=0.393, env_step=2260000, len=75, n/ep=10, n/st=800, rew=-11.93]                           


Epoch #113: test_reward: -10.267037 ± 5.264308, best_reward: -9.272048 ± 5.842043 in #100


Epoch #114: 20001it [00:34, 575.78it/s, agent_0/loss=0.295, agent_1/loss=0.350, agent_2/loss=0.382, env_step=2280000, len=75, n/ep=20, n/st=800, rew=-11.56]                           


Epoch #114: test_reward: -10.058566 ± 4.642512, best_reward: -9.272048 ± 5.842043 in #100


Epoch #115: 20001it [00:34, 572.45it/s, agent_0/loss=0.300, agent_1/loss=0.376, agent_2/loss=0.371, env_step=2300000, len=75, n/ep=10, n/st=800, rew=-9.52]                            


Steps Policy Saved  2990
Epoch #115: test_reward: -9.878284 ± 4.198153, best_reward: -9.272048 ± 5.842043 in #100


Epoch #116: 20001it [00:34, 575.03it/s, agent_0/loss=0.289, agent_1/loss=0.370, agent_2/loss=0.374, env_step=2320000, len=75, n/ep=10, n/st=800, rew=-10.28]                           


Epoch #116: test_reward: -9.808738 ± 7.049243, best_reward: -9.272048 ± 5.842043 in #100


Epoch #117: 20001it [00:37, 530.87it/s, agent_0/loss=0.278, agent_1/loss=0.386, agent_2/loss=0.412, env_step=2340000, len=75, n/ep=20, n/st=800, rew=-9.99]                            


Epoch #117: test_reward: -10.222614 ± 4.715020, best_reward: -9.272048 ± 5.842043 in #100


Epoch #118: 20001it [00:35, 563.28it/s, agent_0/loss=0.277, agent_1/loss=0.367, agent_2/loss=0.381, env_step=2360000, len=75, n/ep=10, n/st=800, rew=-8.91]                            


Epoch #118: test_reward: -11.179644 ± 4.670053, best_reward: -9.272048 ± 5.842043 in #100


Epoch #119: 20001it [00:35, 562.07it/s, agent_0/loss=0.282, agent_1/loss=0.363, agent_2/loss=0.344, env_step=2380000, len=75, n/ep=10, n/st=800, rew=-8.58]                            


Epoch #119: test_reward: -9.575688 ± 4.451048, best_reward: -9.272048 ± 5.842043 in #100


Epoch #120: 20001it [00:35, 562.79it/s, agent_0/loss=0.275, agent_1/loss=0.360, agent_2/loss=0.348, env_step=2400000, len=75, n/ep=20, n/st=800, rew=-9.37]                            


Steps Policy Saved  3120
Epoch #120: test_reward: -10.312085 ± 7.130776, best_reward: -9.272048 ± 5.842043 in #100


Epoch #121: 20001it [00:34, 572.18it/s, agent_0/loss=0.260, agent_1/loss=0.334, agent_2/loss=0.341, env_step=2420000, len=75, n/ep=10, n/st=800, rew=-8.47]                            


Best Saved Rew 3147
Epoch #121: test_reward: -8.012583 ± 3.209611, best_reward: -8.012583 ± 3.209611 in #121


Epoch #122: 20001it [00:34, 575.42it/s, agent_0/loss=0.264, agent_1/loss=0.375, agent_2/loss=0.342, env_step=2440000, len=75, n/ep=10, n/st=800, rew=-10.42]                           


Epoch #122: test_reward: -10.185394 ± 4.011883, best_reward: -8.012583 ± 3.209611 in #121


Epoch #123: 20001it [00:34, 580.62it/s, agent_0/loss=0.269, agent_1/loss=0.343, agent_2/loss=0.349, env_step=2460000, len=75, n/ep=20, n/st=800, rew=-11.11]                           


Epoch #123: test_reward: -9.183181 ± 5.313994, best_reward: -8.012583 ± 3.209611 in #121


Epoch #124: 20001it [00:34, 572.41it/s, agent_0/loss=0.270, agent_1/loss=0.328, agent_2/loss=0.347, env_step=2480000, len=75, n/ep=10, n/st=800, rew=-8.43]                            


Epoch #124: test_reward: -11.421088 ± 6.141855, best_reward: -8.012583 ± 3.209611 in #121


Epoch #125: 20001it [00:34, 572.79it/s, agent_0/loss=0.259, agent_1/loss=0.307, agent_2/loss=0.317, env_step=2500000, len=75, n/ep=10, n/st=800, rew=-10.12]                           


Steps Policy Saved  3250
Epoch #125: test_reward: -10.524530 ± 5.310665, best_reward: -8.012583 ± 3.209611 in #121


Epoch #126: 20001it [00:37, 534.99it/s, agent_0/loss=0.263, agent_1/loss=0.331, agent_2/loss=0.324, env_step=2520000, len=75, n/ep=20, n/st=800, rew=-10.78]                           


Epoch #126: test_reward: -9.112960 ± 5.022667, best_reward: -8.012583 ± 3.209611 in #121


Epoch #127: 20001it [00:35, 561.87it/s, agent_0/loss=0.267, agent_1/loss=0.333, agent_2/loss=0.331, env_step=2540000, len=75, n/ep=10, n/st=800, rew=-11.61]                           


Epoch #127: test_reward: -9.726134 ± 5.166072, best_reward: -8.012583 ± 3.209611 in #121


Epoch #128: 20001it [00:33, 596.20it/s, agent_0/loss=0.266, agent_1/loss=0.344, agent_2/loss=0.324, env_step=2560000, len=75, n/ep=10, n/st=800, rew=-8.04]                            


Epoch #128: test_reward: -9.050146 ± 4.193676, best_reward: -8.012583 ± 3.209611 in #121


Epoch #129: 20001it [00:33, 589.95it/s, agent_0/loss=0.267, agent_1/loss=0.329, agent_2/loss=0.324, env_step=2580000, len=75, n/ep=20, n/st=800, rew=-8.36]                            


Epoch #129: test_reward: -8.901389 ± 4.845243, best_reward: -8.012583 ± 3.209611 in #121


Epoch #130: 20001it [00:33, 590.08it/s, agent_0/loss=0.261, agent_1/loss=0.341, agent_2/loss=0.352, env_step=2600000, len=75, n/ep=10, n/st=800, rew=-10.37]                           


Steps Policy Saved  3380
Epoch #130: test_reward: -9.470214 ± 5.040372, best_reward: -8.012583 ± 3.209611 in #121


Epoch #131: 20001it [00:33, 593.81it/s, agent_0/loss=0.239, agent_1/loss=0.321, agent_2/loss=0.323, env_step=2620000, len=75, n/ep=10, n/st=800, rew=-10.67]                           


Epoch #131: test_reward: -9.677211 ± 4.215299, best_reward: -8.012583 ± 3.209611 in #121


Epoch #132: 20001it [00:33, 592.42it/s, agent_0/loss=0.232, agent_1/loss=0.312, agent_2/loss=0.295, env_step=2640000, len=75, n/ep=20, n/st=800, rew=-11.39]                           


Epoch #132: test_reward: -8.674048 ± 4.302035, best_reward: -8.012583 ± 3.209611 in #121


Epoch #133: 20001it [00:36, 547.89it/s, agent_0/loss=0.285, agent_1/loss=0.323, agent_2/loss=0.327, env_step=2660000, len=75, n/ep=10, n/st=800, rew=-7.58]                            


Epoch #133: test_reward: -8.785231 ± 4.109295, best_reward: -8.012583 ± 3.209611 in #121


Epoch #134: 20001it [00:34, 579.96it/s, agent_0/loss=0.243, agent_1/loss=0.303, agent_2/loss=0.291, env_step=2680000, len=75, n/ep=10, n/st=800, rew=-8.57]                            


Epoch #134: test_reward: -9.564940 ± 4.172010, best_reward: -8.012583 ± 3.209611 in #121


Epoch #135: 20001it [00:34, 579.56it/s, agent_0/loss=0.230, agent_1/loss=0.298, agent_2/loss=0.281, env_step=2700000, len=75, n/ep=20, n/st=800, rew=-10.22]                           


Steps Policy Saved  3510
Epoch #135: test_reward: -9.604478 ± 4.373172, best_reward: -8.012583 ± 3.209611 in #121


Epoch #136: 20001it [00:35, 565.82it/s, agent_0/loss=0.226, agent_1/loss=0.281, agent_2/loss=0.301, env_step=2720000, len=75, n/ep=10, n/st=800, rew=-9.63]                            


Epoch #136: test_reward: -9.403795 ± 4.078298, best_reward: -8.012583 ± 3.209611 in #121


Epoch #137:  40%|####      | 8000/20000 [00:14<00:23, 520.73it/s, agent_0/loss=0.242, agent_1/loss=0.301, agent_2/loss=0.325, env_step=2728000, len=75, n/ep=10, n/st=800, rew=-8.08] 