In [1]:
from email import utils
import os
import datetime
from typing import Optional, Tuple
import json


os.environ["WANDB_NOTEBOOK_NAME"] = "Tianshow_Centralized_Training"

import numpy as np
import torch
from tianshou.data import Collector, VectorReplayBuffer, PrioritizedVectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.env.pettingzoo_env_parallel import PettingZooParallelEnv

from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy, RainbowPolicy
from tianshou.trainer import OffpolicyTrainer
from torch.utils.tensorboard import SummaryWriter

# from pettingzoo.sisl import pursuit_v4
from pettingzoo.mpe import simple_spread_v3
#import Mods.TaskSpreadEnv as TaskSpreadEnv

from TaskAllocation.RL_Policies.DNN_Spread import DNN_Spread
from TaskAllocation.RL_Policies.MPE_Task_MultiHead import MPE_Task_MultiHead

#import Mods.TaskPursuitEnv as TaskPursuitEnv
import Mods.ActionLoggerWrapper as ActionLoggerWrapper
import Mods.VDNPolicy as VDNPolicy
import Mods.PettingZooParallelEnv2 as PettingZooParallelEnv2
import Mods.CollectorMA as CollectorMA

from TaskAllocation.RL_Policies.Custom_Classes import CustomNet
from TaskAllocation.RL_Policies.Custom_Classes import CustomCollector
from TaskAllocation.RL_Policies.Custom_Classes import CustomParallelToAECWrapper

# Add specific modification to tianshou
import wandb
from tianshou.utils import WandbLogger
from tianshou.utils.logger.base import LOG_DATA_TYPE

def new_write(self, step_type: str, step: int, data: LOG_DATA_TYPE) -> None:
    """Forward one logging record directly to Weights & Biases.

    This function is installed below as ``WandbLogger.write``, so every
    record the logger writes ends up in a single ``wandb.log`` call.

    Args:
        step_type: Name of the step counter; it is used as the key under
            which ``step`` is stored in the logged record.
        step: Current value of that counter.
        data: Mapping of metric names to values to log.
    """
    # Log a merged copy instead of inserting the step into ``data`` itself,
    # so the caller's dictionary is not modified as a side effect.
    wandb.log({**data, step_type: step})


# Monkey-patch Tianshou's logger so all of its writes go through new_write.
WandbLogger.write = new_write

from pettingzoo.utils import wrappers
import gym

class ActionLoggerWrapper(gym.Wrapper):
    """Gym wrapper that records actions and logs them as a wandb histogram.

    Every action passed to ``step`` is stored in ``self.actions``.  On the
    next ``reset`` the stored actions are flattened, logged to wandb under
    the key ``"actions_histogram"`` and the store is cleared.

    NOTE(review): this class reuses the name of the ``Mods.ActionLoggerWrapper``
    module imported earlier in the file and therefore shadows it from here on.
    """

    def __init__(self, env):
        # Zero-argument super() does not depend on the (shadowed) class name.
        super().__init__(env)
        self.actions = []

    def step(self, action):
        """Remember ``action`` and forward the call to the wrapped env."""
        self.actions.append(action)
        return self.env.step(action)

    def reset(self, **kwargs):
        """Log the histogram of stored actions (if any), then reset the env.

        Histogram logging is best-effort: a failure while building or sending
        the histogram never prevents the environment from being reset.
        """
        if self.actions:
            # Convert all actions to 1-D numpy arrays so they can be joined
            formatted_actions = [np.array(a).flatten() for a in self.actions]
            flattened_actions = np.concatenate(formatted_actions)

            try:
                # Compute the histogram
                hist_data, bin_edges = np.histogram(flattened_actions, bins='auto')

                # Log the actions as a histogram to wandb
                wandb.log({"actions_histogram": wandb.Histogram(np_histogram=(hist_data, bin_edges))})
            except Exception:
                # Keep the failure discoverable instead of discarding it.
                # DEBUG level keeps normal training output quiet, because
                # this can repeat on every reset.
                import logging

                logging.getLogger(__name__).debug(
                    "Could not log the actions histogram", exc_info=True
                )

            self.actions = []
        return self.env.reset(**kwargs)


#from tianshou_DQN import train

# ---- Run identification ---------------------------------------------------
# Network architecture name; _get_agents selects the network class from it.
model = "DNN_Spread"  # other names used before: "MPE_Task_MultiHead", "CNN_ATT_SISL", "MultiHead_SISL"
test_num = "_Desk_01_8feat"
# Policy type name; _get_agents selects the policy class from it.
policyModel = "DQN"

# Number of parallel environments for training and for evaluation.
train_env_num = 10
test_env_num = 10

name = model + test_num

# ---- Logging --------------------------------------------------------------
# A timestamp makes every run name (and therefore every log dir) unique.
now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
log_name = name + str(now)
log_path = os.path.join('./', "Logs", "dqn_sisl", log_name)

# ---- Policy checkpoints ---------------------------------------------------
load_policy_name = 'policy_MPE_Task_MultiHead_Desk_01_8feat240112-145703_29_BestRew.pth'
save_policy_name = f'policy_{log_name}'
policy_path = "dqn_Spread"

Policy_Config = {
    "same_policy": True,   # one policy object shared by all agents
    "load_model": False,   # load weights from model_load_path before training
    "freeze_CNN": False,
}

# Arguments for simple_spread_v3 (defaults noted on the right).
Spread_Config = {
    "N": 3,                       # Default = 3
    "local_ratio": 0.5,           # Default = 0.5
    "max_cycles": 25,             # Default = 25
    "continuous_actions": False,  # Default = False
    "render_mode": None,          # Default = None
}

max_cycles = Spread_Config["max_cycles"]
n_agents = Spread_Config["N"]

dqn_params = {
    "discount_factor": 0.99,
    "estimation_step": 5,
    "target_update_freq": 40 * max_cycles,
    # NOTE(review): the key is misspelled ("optminizer") and nothing in this
    # file reads it; it is kept as-is so the logged run config stays
    # comparable with earlier runs.
    "optminizer": "Adam",
    "lr": 0.000001,
}

trainer_params = {
    "max_epoch": 500,
    "step_per_epoch": 1000 * max_cycles,
    "step_per_collect": 200 * max_cycles,
    "episode_per_test": 50,
    "batch_size": max_cycles,
    "update_per_step": 1 / 250,  # Only run after close a Collect (run many times as necessary to meet the value)
    "tn_eps_max": 0.2,           # starting epsilon during training
    "ts_eps_max": 0.0,           # epsilon during testing
    "warmup_size": 10,           # number of warm-up collect rounds
}

# Flat view of every setting, used for experiment logging.  It is built as a
# new dict: aliasing dqn_params and calling update() on it would leak the
# policy/trainer/environment keys into dqn_params as well.
runConfig = {**dqn_params, **Policy_Config, **trainer_params, **Spread_Config}

model_load_path = os.path.join(policy_path, load_policy_name)
model_save_path = os.path.join(policy_path, save_policy_name)
# Make sure the checkpoint and log directories exist before anything is written.
os.makedirs(policy_path, exist_ok=True)
os.makedirs(log_path, exist_ok=True)

def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
    policy_load_path = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    """Build the per-agent policies and wrap them in a multi-agent manager.

    The network class is chosen from the module-level ``model`` name and the
    policy class from ``policyModel``.  With ``Policy_Config["same_policy"]``
    a single policy object is shared by every agent; otherwise up to three
    independent policies are created and any remaining agents reuse the
    first one.

    Args:
        agent_learn: Only used as a fallback when ``policyModel`` is not one
            of the built-in names; otherwise a new policy replaces it.
        agent_opponent: Unused; kept for interface compatibility.
        optim: Ignored on input; a new Adam optimizer is created per network.
        policy_load_path: Checkpoint to load when
            ``Policy_Config["load_model"]`` is true.  Defaults to the
            module-level ``model_load_path``.

    Returns:
        ``(policy, optim, agents)``: the ``MultiAgentPolicyManager``, the
        optimizer of the last network created, and ``env.agents``.

    Raises:
        ValueError: If ``model`` is not a supported network name, or if
            ``policyModel`` is unsupported and no ``agent_learn`` was given.
    """
    supported_models = ("DNN_Spread", "VDN_Spread", "MPE_Task_MultiHead")
    if model not in supported_models:
        # Previously this surfaced later as a NameError on ``net``.
        raise ValueError(
            f"Unsupported model {model!r}; expected one of {supported_models}"
        )

    env = _get_env()
    agent_observation_space = env.observation_space.shape
    action_space = env.action_space
    device = "cuda" if torch.cuda.is_available() else "cpu"

    load_path = model_load_path if policy_load_path is None else policy_load_path

    if Policy_Config["same_policy"]:
        policies_number = 1
    else:
        # At most three independent policies, and never more than there are
        # agents (the manager needs exactly one policy per agent).
        policies_number = min(3, len(env.agents))

    agents = []
    for _ in range(policies_number):

        if model == "MPE_Task_MultiHead":
            net = MPE_Task_MultiHead(
                num_tasks=Spread_Config['N'] * 2 + 5,
                num_features_per_task=2,  # 6 + 2 + 1,
                device=device,
            ).to(device)
        else:
            # "DNN_Spread" and "VDN_Spread" use the same network definition.
            net = DNN_Spread(
                obs_shape=agent_observation_space[0],
                action_shape=5,
                device=device,
            ).to(device)

        optim = torch.optim.Adam(
            net.parameters(), lr=dqn_params["lr"], weight_decay=0.0, amsgrad=True
        )

        if policyModel in ("DQN", "VDN"):
            # Both names currently build the same DQN policy.
            agent_learn = DQNPolicy(
                model=net,
                optim=optim,
                action_space=action_space,
                discount_factor=dqn_params["discount_factor"],
                estimation_step=dqn_params["estimation_step"],
                target_update_freq=dqn_params["target_update_freq"],
                reward_normalization=False,
                clip_loss_grad=False,
            )
        elif agent_learn is None:
            # Previously ``None`` was silently appended as a policy.
            raise ValueError(
                f"Unsupported policyModel {policyModel!r}; expected 'DQN' or 'VDN'"
            )

        if Policy_Config["load_model"] is True:
            # Load the saved checkpoint.  map_location lets a checkpoint that
            # was saved on a GPU be restored on a CPU-only machine.
            agent_learn.load_state_dict(torch.load(load_path, map_location=device))
            print(f'Loaded-> {load_path}')

        agents.append(agent_learn)

    if Policy_Config["same_policy"]:
        # Every agent refers to the very same policy object.
        agents = [agents[0]] * len(env.agents)
    else:
        # Agents beyond the independently created policies reuse the first one.
        agents.extend([agents[0]] * (len(env.agents) - policies_number))

    # policy = VDNPolicy.VDNMAPolicy(policies = agents, env=env, device="cuda" if torch.cuda.is_available() else "cpu" )
    policy = MultiAgentPolicyManager(policies=agents, env=env)

    return policy, optim, env.agents

def _get_env(test=False):
    """Create one simple_spread_v3 environment wrapped for Tianshou.

    This function is needed to provide callables for DummyVectorEnv.  All
    environment arguments are read from ``Spread_Config``.

    Args:
        test: Unused; kept for interface compatibility.

    Returns:
        The AEC environment wrapped in ``PettingZooEnv``.
    """
    # Alternatives tried earlier in this file: simple_spread_v3.parallel_env
    # with PettingZooParallelEnv, CustomParallelToAECWrapper and
    # ActionLoggerWrapper.
    env = simple_spread_v3.env(
        max_cycles=Spread_Config["max_cycles"],
        local_ratio=Spread_Config["local_ratio"],
        N=Spread_Config["N"],
        continuous_actions=Spread_Config["continuous_actions"],
        # Take the render mode from the config.  The previous hard-coded
        # literal " human" (with a leading space) is not a valid render mode
        # and did not match the configuration that gets logged for the run.
        render_mode=Spread_Config["render_mode"],
    )

    return PettingZooEnv(env)

# print(json.dumps(runConfig, indent=4))


In [2]:
if __name__ == "__main__":
    # Training entry point: build the vectorized environments, the policies,
    # the collectors and the logger, warm up the replay buffer, then run the
    # off-policy trainer.

    torch.set_grad_enabled(True)

    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(train_env_num)])
    test_envs = DummyVectorEnv([_get_env for _ in range(test_env_num)])

    # seed
    seed = 0
    np.random.seed(seed)
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)

    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents()

    # ======== Step 3: Collector setup =========
    # Switch between the custom multi-agent collector (one prioritized buffer
    # per agent) and the stock Tianshou collector with one shared buffer.
    use_per_agent_buffers = False

    if use_per_agent_buffers:
        agents_buffers_training = {
            agent: PrioritizedVectorReplayBuffer(
                30_000, len(train_envs), alpha=0.6, beta=0.4
            )
            for agent in agents
        }
        # The test buffers are sized from the test environments.
        agents_buffers_test = {
            agent: PrioritizedVectorReplayBuffer(
                30_000, len(test_envs), alpha=0.6, beta=0.4
            )
            for agent in agents
        }

        train_collector = CollectorMA.CollectorMA(
            policy,
            train_envs,
            agents_buffers_training,
            exploration_noise=True,
        )
        test_collector = CollectorMA.CollectorMA(
            policy, test_envs, agents_buffers_test, exploration_noise=True
        )
    else:
        train_collector = Collector(
            policy,
            train_envs,
            # VectorReplayBuffer(300_000, len(train_envs)),
            PrioritizedVectorReplayBuffer(300_000, len(train_envs), alpha=0.6, beta=0.4),
            exploration_noise=True,
        )
        test_collector = Collector(policy, test_envs, exploration_noise=True)

    # Fill the replay buffer before training starts: each round collects one
    # episode per training environment.
    print("Buffer Warming Up ")
    for _ in range(trainer_params["warmup_size"]):
        train_collector.collect(n_episode=train_env_num)  # ,random=True)
        print(".", end="")

    # len_buffer = len(train_collector.buffer) / (Spread_Config["max_cycles"] * Spread_Config["N"])
    # print("\nBuffer Lenght: ", len_buffer )

    info = { "Buffer"  : "PriorizedReplayBuffer", " Warmup_ep" : runConfig["warmup_size"]}

    # ======== tensorboard logging setup =========
    logger = WandbLogger(
        train_interval=runConfig["max_cycles"] * runConfig["N"],
        test_interval=1,  # runConfig["max_cycles"] * runConfig["n_pursuers"],
        update_interval=runConfig["max_cycles"],
        save_interval=1,
        write_flush=True,
        project="Spread_Eval01",
        name=log_name,
        entity=None,
        run_id=log_name,
        config=runConfig,
        monitor_gym=True,
    )

    writer = SummaryWriter(log_path)
    writer.add_text("args", str(runConfig))
    logger.load(writer)

    # Counter shared with the callbacks below.  It is a one-element list so
    # the nested functions can update it in place; reward_metric increments
    # it on every call, and its value is used in checkpoint file names.
    global_step_holder = [0]

    # ======== Step 4: Callback functions setup =========
    def _set_epsilon(epsilon):
        """Apply ``epsilon`` to every agent's policy.

        With a shared policy all agents map to the same object, so this is
        equivalent to setting it once on the first agent.
        """
        for agent in agents:
            policy.policies[agent].set_eps(epsilon)

    def save_best_fn(policy):
        """Checkpoint the policy weights when the trainer reports a new best."""
        step = global_step_holder[0]

        if Policy_Config["same_policy"]:
            torch.save(policy.policies[agents[0]].state_dict(), model_save_path + "_" + str(step) + "_BestRew.pth")
            print("Best Saved Rew" , str(step))

        else:
            for agent in agents:
                torch.save(policy.policies[agent].state_dict(), model_save_path + "_" + str(step) + "_" + agent + ".pth")

            print("Bests Saved Rew" , str(step))

    def save_test_best_fn(policy):
        """Checkpoint variant for the best episode length.

        Currently not passed to the trainer (see the commented-out argument
        in the trainer construction below).
        """
        step = global_step_holder[0]

        if Policy_Config["same_policy"]:
            torch.save(policy.policies[agents[0]].state_dict(), model_save_path + "_" + str(step) + "_BestLen.pth")
            print("Best Saved Length" , str(step))

        else:
            for agent in agents:
                torch.save(policy.policies[agent].state_dict(), model_save_path + "_" + str(step) + "_" + agent + ".pth")

            print("Best Saved Length" , str(step))

    def stop_fn(mean_rewards):
        """Early-stop criterion; the threshold is far above the rewards seen in practice."""
        return mean_rewards >= 99999939.0

    def train_fn(epoch, env_step):
        """Set the training epsilon.

        Epsilon decays linearly with the epoch from ``tn_eps_max`` towards
        ``tn_eps_max / 100`` at ``max_epoch``.
        """
        eps_max = trainer_params['tn_eps_max']
        epsilon = eps_max - (eps_max - eps_max/100)*(epoch/trainer_params['max_epoch'])
        _set_epsilon(epsilon)

        # if env_step % 500 == 0:
            # logger.write("train/env_step", env_step, {"train/eps": eps})

    def test_fn(epoch, env_step):
        """Set the evaluation epsilon and periodically checkpoint the policy."""
        _set_epsilon(trainer_params['ts_eps_max'])  # 0.01#max(0.001, 0.1 - epoch * 0.001)

        step = global_step_holder[0]
        # Save whenever the shared counter is a multiple of 10.
        if step % 10 == 0:

            if Policy_Config["same_policy"]:
                torch.save(policy.policies[agents[0]].state_dict(), model_save_path + "_" + str(step) + "_Step.pth")
                print("Steps Policy Saved " , str(step))

            else:
                for agent in agents:
                    torch.save(policy.policies[agent].state_dict(), model_save_path + "_" + str(step) + "_" + agent + "Step" + str(step) + ".pth")

                print("Steps Policy Saved " , str(step))

    def reward_metric(rews):
        """Return the rewards unchanged and bump the shared counter."""
        global_step_holder[0] += 1
        return rews

    # # ======== Step 5: Run the trainer =========
    offPolicyTrainer = OffpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=trainer_params['max_epoch'],
        step_per_epoch=trainer_params['step_per_epoch'],
        step_per_collect=trainer_params['step_per_collect'],
        episode_per_test=trainer_params['episode_per_test'],
        batch_size=trainer_params['batch_size'],
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        # save_test_best_fn=save_test_best_fn,
        update_per_step=trainer_params['update_per_step'],
        logger=logger,
        test_in_train=True,
        reward_metric=reward_metric,
        show_progress=True,
    )

    try:
        result = offPolicyTrainer.run()
    finally:
        # Close the TensorBoard writer even when training is interrupted
        # (for example with Ctrl+C), so pending events are written out.
        writer.close()

    # return result, policy.policies[agents[1]]
    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[0]])")



Buffer Warming Up 
.........



.

[34m[1mwandb[0m: Currently logged in as: [33mandrekuros[0m. Use [1m`wandb login --relogin`[0m to force relogin
  from IPython.core.display import HTML, display  # type: ignore


Steps Policy Saved  0
Best Saved Rew 1


Epoch #1: 25001it [00:15, 1641.40it/s, agent_0/loss=2.093, agent_1/loss=8.336, agent_2/loss=9.349, env_step=25000, len=75, n/ep=70, n/st=5000, rew=-32.42]                           


Best Saved Rew 7
Epoch #1: test_reward: -32.786428 ± 8.368494, best_reward: -32.786428 ± 8.368494 in #1


Epoch #2: 25001it [00:12, 1960.13it/s, agent_0/loss=1.806, agent_1/loss=7.717, agent_2/loss=7.150, env_step=50000, len=75, n/ep=60, n/st=5000, rew=-27.72]                           


Best Saved Rew 13
Epoch #2: test_reward: -27.106016 ± 7.294407, best_reward: -27.106016 ± 7.294407 in #2


Epoch #3: 25001it [00:14, 1758.72it/s, agent_0/loss=1.567, agent_1/loss=6.088, agent_2/loss=6.913, env_step=75000, len=75, n/ep=70, n/st=5000, rew=-27.16]                           


Best Saved Rew 19
Epoch #3: test_reward: -24.415293 ± 7.123224, best_reward: -24.415293 ± 7.123224 in #3


Epoch #4: 25001it [00:13, 1803.79it/s, agent_0/loss=1.474, agent_1/loss=5.658, agent_2/loss=6.277, env_step=100000, len=75, n/ep=70, n/st=5000, rew=-26.43]                           


Epoch #4: test_reward: -26.412855 ± 9.111578, best_reward: -24.415293 ± 7.123224 in #3


Epoch #5: 25001it [00:14, 1783.26it/s, agent_0/loss=1.275, agent_1/loss=5.464, agent_2/loss=5.939, env_step=125000, len=75, n/ep=60, n/st=5000, rew=-26.20]                           


Steps Policy Saved  30
Best Saved Rew 31
Epoch #5: test_reward: -21.800845 ± 4.701187, best_reward: -21.800845 ± 4.701187 in #5


Epoch #6: 25001it [00:14, 1783.42it/s, agent_0/loss=1.202, agent_1/loss=5.088, agent_2/loss=5.258, env_step=150000, len=75, n/ep=70, n/st=5000, rew=-24.65]                           


Epoch #6: test_reward: -23.998107 ± 6.512105, best_reward: -21.800845 ± 4.701187 in #5


Epoch #7: 25001it [00:13, 1873.66it/s, agent_0/loss=1.166, agent_1/loss=4.909, agent_2/loss=5.229, env_step=175000, len=75, n/ep=70, n/st=5000, rew=-24.28]                           


Epoch #7: test_reward: -24.399072 ± 8.803554, best_reward: -21.800845 ± 4.701187 in #5


Epoch #8: 25001it [00:13, 1844.99it/s, agent_0/loss=1.162, agent_1/loss=4.751, agent_2/loss=5.025, env_step=200000, len=75, n/ep=60, n/st=5000, rew=-25.52]                           


Epoch #8: test_reward: -23.500906 ± 7.101769, best_reward: -21.800845 ± 4.701187 in #5


Epoch #9: 25001it [00:13, 1792.59it/s, agent_0/loss=1.075, agent_1/loss=4.536, agent_2/loss=4.653, env_step=225000, len=75, n/ep=70, n/st=5000, rew=-25.53]                           


Epoch #9: test_reward: -25.646778 ± 6.469532, best_reward: -21.800845 ± 4.701187 in #5


Epoch #10: 25001it [00:13, 1855.46it/s, agent_0/loss=1.031, agent_1/loss=4.524, agent_2/loss=4.806, env_step=250000, len=75, n/ep=70, n/st=5000, rew=-25.59]                           


Steps Policy Saved  60
Epoch #10: test_reward: -25.502821 ± 7.334415, best_reward: -21.800845 ± 4.701187 in #5


Epoch #11: 25001it [00:13, 1830.22it/s, agent_0/loss=1.160, agent_1/loss=4.773, agent_2/loss=4.300, env_step=275000, len=75, n/ep=60, n/st=5000, rew=-25.13]                           


Epoch #11: test_reward: -26.371300 ± 9.714925, best_reward: -21.800845 ± 4.701187 in #5


Epoch #12: 25001it [00:13, 1888.72it/s, agent_0/loss=1.131, agent_1/loss=4.695, agent_2/loss=4.439, env_step=300000, len=75, n/ep=70, n/st=5000, rew=-26.95]                           


Epoch #12: test_reward: -29.565189 ± 12.858147, best_reward: -21.800845 ± 4.701187 in #5


Epoch #13: 25001it [00:13, 1861.32it/s, agent_0/loss=1.074, agent_1/loss=4.320, agent_2/loss=4.597, env_step=325000, len=75, n/ep=70, n/st=5000, rew=-25.88]                           


Epoch #13: test_reward: -27.986512 ± 8.464801, best_reward: -21.800845 ± 4.701187 in #5


Epoch #14: 25001it [00:13, 1793.21it/s, agent_0/loss=1.047, agent_1/loss=4.289, agent_2/loss=4.218, env_step=350000, len=75, n/ep=60, n/st=5000, rew=-25.95]                           


Epoch #14: test_reward: -27.743299 ± 10.934744, best_reward: -21.800845 ± 4.701187 in #5


Epoch #15: 25001it [00:14, 1671.57it/s, agent_0/loss=1.054, agent_1/loss=4.057, agent_2/loss=4.316, env_step=375000, len=75, n/ep=70, n/st=5000, rew=-29.56]                           


Steps Policy Saved  90
Epoch #15: test_reward: -27.990545 ± 14.498297, best_reward: -21.800845 ± 4.701187 in #5


Epoch #16: 25001it [00:13, 1821.39it/s, agent_0/loss=1.065, agent_1/loss=4.475, agent_2/loss=4.532, env_step=400000, len=75, n/ep=70, n/st=5000, rew=-25.81]                           


Epoch #16: test_reward: -30.264157 ± 12.392549, best_reward: -21.800845 ± 4.701187 in #5


Epoch #17: 25001it [00:12, 1931.68it/s, agent_0/loss=1.086, agent_1/loss=4.314, agent_2/loss=4.489, env_step=425000, len=75, n/ep=60, n/st=5000, rew=-25.75]                           


Epoch #17: test_reward: -25.671774 ± 10.192229, best_reward: -21.800845 ± 4.701187 in #5


Epoch #18: 25001it [00:12, 1981.03it/s, agent_0/loss=1.136, agent_1/loss=4.666, agent_2/loss=4.823, env_step=450000, len=75, n/ep=70, n/st=5000, rew=-25.77]                           


Epoch #18: test_reward: -26.386160 ± 7.609387, best_reward: -21.800845 ± 4.701187 in #5


Epoch #19: 25001it [00:13, 1794.50it/s, agent_0/loss=1.162, agent_1/loss=4.479, agent_2/loss=4.871, env_step=475000, len=75, n/ep=70, n/st=5000, rew=-24.92]                           


Epoch #19: test_reward: -23.976548 ± 7.185846, best_reward: -21.800845 ± 4.701187 in #5


Epoch #20: 25001it [00:12, 1923.31it/s, agent_0/loss=1.085, agent_1/loss=4.391, agent_2/loss=4.323, env_step=500000, len=75, n/ep=60, n/st=5000, rew=-24.27]                           


Steps Policy Saved  120
Epoch #20: test_reward: -25.291711 ± 7.316747, best_reward: -21.800845 ± 4.701187 in #5


Epoch #21: 25001it [00:13, 1823.85it/s, agent_0/loss=1.286, agent_1/loss=4.734, agent_2/loss=4.776, env_step=525000, len=75, n/ep=70, n/st=5000, rew=-23.33]                           


Epoch #21: test_reward: -26.673417 ± 8.046495, best_reward: -21.800845 ± 4.701187 in #5


Epoch #22: 25001it [00:13, 1862.20it/s, agent_0/loss=1.128, agent_1/loss=4.517, agent_2/loss=4.782, env_step=550000, len=75, n/ep=70, n/st=5000, rew=-23.74]                           


Epoch #22: test_reward: -24.029168 ± 8.668722, best_reward: -21.800845 ± 4.701187 in #5


Epoch #23: 25001it [00:14, 1782.30it/s, agent_0/loss=1.121, agent_1/loss=4.224, agent_2/loss=4.455, env_step=575000, len=75, n/ep=60, n/st=5000, rew=-24.74]                           


Epoch #23: test_reward: -22.595780 ± 5.885327, best_reward: -21.800845 ± 4.701187 in #5


Epoch #24: 25001it [00:13, 1852.71it/s, agent_0/loss=1.140, agent_1/loss=4.522, agent_2/loss=4.485, env_step=600000, len=75, n/ep=70, n/st=5000, rew=-24.46]                           


Epoch #24: test_reward: -24.002158 ± 6.681380, best_reward: -21.800845 ± 4.701187 in #5


Epoch #25: 25001it [00:13, 1811.51it/s, agent_0/loss=1.143, agent_1/loss=4.584, agent_2/loss=4.325, env_step=625000, len=75, n/ep=70, n/st=5000, rew=-24.03]                           


Steps Policy Saved  150
Epoch #25: test_reward: -25.982572 ± 9.189334, best_reward: -21.800845 ± 4.701187 in #5


Epoch #26: 25001it [00:14, 1756.93it/s, agent_0/loss=1.060, agent_1/loss=4.098, agent_2/loss=4.251, env_step=650000, len=75, n/ep=60, n/st=5000, rew=-24.41]                           


Epoch #26: test_reward: -26.897340 ± 7.662981, best_reward: -21.800845 ± 4.701187 in #5


Epoch #27: 25001it [00:14, 1701.17it/s, agent_0/loss=1.019, agent_1/loss=3.936, agent_2/loss=4.104, env_step=675000, len=75, n/ep=70, n/st=5000, rew=-24.82]                           


Epoch #27: test_reward: -24.472946 ± 7.668221, best_reward: -21.800845 ± 4.701187 in #5


Epoch #28: 25001it [00:13, 1898.17it/s, agent_0/loss=1.051, agent_1/loss=4.037, agent_2/loss=4.136, env_step=700000, len=75, n/ep=70, n/st=5000, rew=-25.17]                           


Epoch #28: test_reward: -24.678652 ± 8.631463, best_reward: -21.800845 ± 4.701187 in #5


Epoch #29: 25001it [00:12, 1932.83it/s, agent_0/loss=1.085, agent_1/loss=3.914, agent_2/loss=4.052, env_step=725000, len=75, n/ep=60, n/st=5000, rew=-25.18]                           


Epoch #29: test_reward: -24.885482 ± 7.550542, best_reward: -21.800845 ± 4.701187 in #5


Epoch #30: 25001it [00:13, 1848.04it/s, agent_0/loss=1.005, agent_1/loss=4.077, agent_2/loss=4.023, env_step=750000, len=75, n/ep=70, n/st=5000, rew=-26.43]                           


Steps Policy Saved  180
Epoch #30: test_reward: -24.014413 ± 8.541785, best_reward: -21.800845 ± 4.701187 in #5


Epoch #31: 25001it [00:13, 1888.47it/s, agent_0/loss=0.989, agent_1/loss=4.005, agent_2/loss=3.934, env_step=775000, len=75, n/ep=70, n/st=5000, rew=-26.86]                           


Epoch #31: test_reward: -24.614041 ± 6.711253, best_reward: -21.800845 ± 4.701187 in #5


Epoch #32: 25001it [00:12, 1931.14it/s, agent_0/loss=0.921, agent_1/loss=4.016, agent_2/loss=4.111, env_step=800000, len=75, n/ep=60, n/st=5000, rew=-24.23]                           


Epoch #32: test_reward: -22.708160 ± 6.329089, best_reward: -21.800845 ± 4.701187 in #5


Epoch #33: 25001it [00:13, 1853.60it/s, agent_0/loss=0.989, agent_1/loss=3.994, agent_2/loss=4.164, env_step=825000, len=75, n/ep=70, n/st=5000, rew=-22.76]                           


Epoch #33: test_reward: -27.105521 ± 8.758653, best_reward: -21.800845 ± 4.701187 in #5


Epoch #34: 25001it [00:12, 1936.01it/s, agent_0/loss=0.963, agent_1/loss=4.098, agent_2/loss=3.804, env_step=850000, len=75, n/ep=70, n/st=5000, rew=-22.70]                           


Epoch #34: test_reward: -24.024483 ± 8.087622, best_reward: -21.800845 ± 4.701187 in #5


Epoch #35: 25001it [00:12, 1959.41it/s, agent_0/loss=1.016, agent_1/loss=3.745, agent_2/loss=3.930, env_step=875000, len=75, n/ep=60, n/st=5000, rew=-23.49]                           


Steps Policy Saved  210
Epoch #35: test_reward: -23.778391 ± 6.603443, best_reward: -21.800845 ± 4.701187 in #5


Epoch #36: 25001it [00:13, 1798.51it/s, agent_0/loss=0.946, agent_1/loss=4.089, agent_2/loss=3.797, env_step=900000, len=75, n/ep=70, n/st=5000, rew=-24.04]                           


Epoch #36: test_reward: -24.687897 ± 7.822769, best_reward: -21.800845 ± 4.701187 in #5


Epoch #37: 25001it [00:13, 1860.22it/s, agent_0/loss=0.996, agent_1/loss=3.830, agent_2/loss=3.744, env_step=925000, len=75, n/ep=70, n/st=5000, rew=-22.49]                           


Epoch #37: test_reward: -22.999036 ± 5.675187, best_reward: -21.800845 ± 4.701187 in #5


Epoch #38: 25001it [00:13, 1864.15it/s, agent_0/loss=0.965, agent_1/loss=3.743, agent_2/loss=3.835, env_step=950000, len=75, n/ep=60, n/st=5000, rew=-23.08]                           


Epoch #38: test_reward: -22.445793 ± 6.700429, best_reward: -21.800845 ± 4.701187 in #5


Epoch #39: 25001it [00:14, 1770.71it/s, agent_0/loss=0.936, agent_1/loss=3.931, agent_2/loss=3.527, env_step=975000, len=75, n/ep=70, n/st=5000, rew=-23.40]                           


Epoch #39: test_reward: -23.406530 ± 7.120128, best_reward: -21.800845 ± 4.701187 in #5


Epoch #40: 25001it [00:14, 1702.70it/s, agent_0/loss=0.867, agent_1/loss=3.563, agent_2/loss=3.665, env_step=1000000, len=75, n/ep=70, n/st=5000, rew=-23.69]                           


Steps Policy Saved  240
Epoch #40: test_reward: -24.770804 ± 7.131076, best_reward: -21.800845 ± 4.701187 in #5


Epoch #41: 25001it [00:14, 1756.07it/s, agent_0/loss=0.935, agent_1/loss=3.726, agent_2/loss=3.725, env_step=1025000, len=75, n/ep=60, n/st=5000, rew=-23.24]                           


Epoch #41: test_reward: -23.196835 ± 6.527367, best_reward: -21.800845 ± 4.701187 in #5


Epoch #42: 25001it [00:13, 1793.38it/s, agent_0/loss=0.926, agent_1/loss=3.525, agent_2/loss=3.605, env_step=1050000, len=75, n/ep=70, n/st=5000, rew=-23.09]                           


Epoch #42: test_reward: -22.350942 ± 6.121632, best_reward: -21.800845 ± 4.701187 in #5


Epoch #43: 25001it [00:14, 1776.55it/s, agent_0/loss=0.910, agent_1/loss=3.559, agent_2/loss=3.537, env_step=1075000, len=75, n/ep=70, n/st=5000, rew=-22.86]                           


Epoch #43: test_reward: -24.328476 ± 6.809702, best_reward: -21.800845 ± 4.701187 in #5


Epoch #44:  40%|####      | 10000/25000 [00:07<00:11, 1277.41it/s, agent_0/loss=0.915, agent_1/loss=3.451, agent_2/loss=3.639, env_step=1085000, len=75, n/ep=60, n/st=5000, rew=-23.81]


KeyboardInterrupt: 