In [1]:
import os
from typing import Optional, Tuple

import gym
import numpy as np
import torch
from tianshou.data import Collector, VectorReplayBuffer, PrioritizedVectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import offpolicy_trainer
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger

from Custom_Classes import CustomNet
from Custom_Classes import CustomCollector
from Custom_Classes import CustomParallelToAECWrapper

#from CustomClass_multi_head import CustomNet
from Custom_Classes_simplified import CustomNetSimple
#from Custom_Classes_simplified import CustomCollectorSimple
#from Custom_Classes_simplified import CustomParallelToAECWrapperSimple

from CustomClasses_Transformer_Reduced import CustomNetReduced
from CustomClass_MultiHead_Transformer import CustomNetMultiHead
import importlib

from DroneEnv import MultiDroneEnv
from tianshou_DQN import train


model = "CustomNetSimple" # "CustomNet" or "CustomNetSimple" or "CustomNetReduced" or "CustomNetMultiHead"
test_num = "Eval_TBTA_03_pre_process_N"

train_env_num = 5
test_env_num = 5

name = model + test_num

load_policy_name = f'policy_CustomNetSimple1605_01_1_Priorized_1605_01_1_Priorized.pth'
save_policy_name = f'policy_{name}.pth'
policy_path = "dqn_Custom"
load_model = False

log_path = os.path.join('./', "Logs", "dqn", name)

dqn_params = {"discount_factor": 0.98, 
              "estimation_step": 1, 
              "target_update_freq": 100,
              "optminizer": "Adam",
              "lr": 1e-4  }

trainer_params = {"max_epoch": 200,
                  "step_per_epoch": 200 * train_env_num,
                  "step_per_collect": 100 * train_env_num,
                  "episode_per_test": 10 * test_env_num,
                  "batch_size" : 32,
                  "update_per_step": 0.1,
                  "tn_eps_max": 0.60,
                  "ts_eps_max": 0.001,
                  }

Run_Data = f'{name}\n\
        Loaded_Model: {load_policy_name if load_model == True else "no"} \n\
        log_path: {log_path} \n\
        train/test_env_num: {train_env_num} / {test_env_num} \n\
        model: {model} \n\
        dqn_params: {dqn_params} \n\
        trainer_params: {trainer_params} \n\
        obs: Task Info -> Dist / Quality for own drone \
            Agents_info -> Post_next / Time_next / Type \
            Scene:  F1:6, R1:6 | Rec:16, Att:4'

model_load_path = os.path.join(policy_path, load_policy_name)  
model_save_path = os.path.join(policy_path, save_policy_name)        
os.makedirs(os.path.join(policy_path), exist_ok=True)  
os.makedirs(os.path.join(log_path), exist_ok=True)


def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
    policy_load_path = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    
    env = _get_env()
    agent_name = env.agents[0]  # Get the name of the first agent

    #print(env.observation_space )
    agent_observation_space = env.observation_space # assuming 'agent0' is a valid agent name
    state_shape_agent_position = agent_observation_space["agent_position"].shape[0]
    state_shape_agent_state = agent_observation_space["agent_state"].shape[0]
    state_shape_agent_type = agent_observation_space["agent_type"].shape[0]
    state_shape_next_free_time = agent_observation_space["next_free_time"].shape[0]
    state_shape_position_after_last_task = agent_observation_space["position_after_last_task"].shape[0]       
    #state_shape_agent_relay_area = agent_observation_space["agent_relay_area"].shape[0]
        
    state_shape_agent = (state_shape_agent_position + state_shape_agent_state +
                     state_shape_agent_type+ state_shape_next_free_time + state_shape_position_after_last_task #+                     
                     #state_shape_agent_relay_area
                     )                 
    

    state_shape_task = env.observation_space["tasks_info"].shape[0]
                  
    action_shape = env.action_space[agent_name].shape[0]
    #action_shape = env.action_space[agent_name].n
               
    if agent_learn is None:
        # model
        if model == "CustomNet":        
            net = CustomNet(
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
        
        if model == "CustomNetSimple":
            net = CustomNetSimple(            
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
        
        if model == "CustomNetReduced":
            net = CustomNetReduced(            
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
        
        if model == "CustomNetMultiHead":
            net = CustomNetMultiHead(
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")

    
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=dqn_params["lr"])
    
        agent_learn = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor= dqn_params["discount_factor"],
            estimation_step=dqn_params["estimation_step"],
            target_update_freq=dqn_params["target_update_freq"],
        )  
        
        if load_model == True:
            # Load the saved checkpoint             
            agent_learn.load_state_dict(torch.load(model_load_path))
            print(f'Loaded-> {model_load_path}')
            
        
        agents = [agent_learn for _ in range(len(env.agents))]
        
    policy = MultiAgentPolicyManager(agents, env)    
        
    return policy, optim, env.agents


def _get_env():
    """This function is needed to provide callables for DummyVectorEnv."""
    env_paralell = MultiDroneEnv()
    #env = parallel_to_aec_wrapper(env_paralell)    
    env = CustomParallelToAECWrapper(env_paralell)
    
    return PettingZooEnv(env)

print(Run_Data)

  from .autonotebook import tqdm as notebook_tqdm


CustomNetSimpleEval_TBTA_03_pre_process_N
        Loaded_Model: no 
        log_path: ./Logs\dqn\CustomNetSimpleEval_TBTA_03_pre_process_N 
        train/test_env_num: 5 / 5 
        model: CustomNetSimple 
        dqn_params: {'discount_factor': 0.98, 'estimation_step': 1, 'target_update_freq': 100, 'optminizer': 'Adam', 'lr': 0.0001} 
        trainer_params: {'max_epoch': 200, 'step_per_epoch': 1000, 'step_per_collect': 500, 'episode_per_test': 50, 'batch_size': 32, 'update_per_step': 0.1, 'tn_eps_max': 0.6, 'ts_eps_max': 0.001} 
        obs: Task Info -> Dist / Quality for own drone             Agents_info -> Post_next / Time_next / Type             Scene:  F1:6, R1:6 | Rec:16, Att:4


In [2]:
if __name__ == "__main__":
                        
    torch.set_grad_enabled(True) 
    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(train_env_num)])
    test_envs = DummyVectorEnv([_get_env for _ in range(test_env_num)]) 

    # seed
    seed = 1
    np.random.seed(seed)
    
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)

    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents()
    

    # ======== Step 3: Collector setup =========
    train_collector = CustomCollector(
        policy,
        train_envs,
        #VectorReplayBuffer(100_000, len(train_envs)),
        PrioritizedVectorReplayBuffer( 100_000, len(train_envs), alpha=0.6, beta=0.4) ,       
        exploration_noise=True        
    )
    test_collector = CustomCollector(policy, test_envs, exploration_noise=True)
     
    train_collector.collect(n_step=trainer_params['batch_size'] * train_env_num)
    #test_collector.collect(n_step=trainer_params['batch size'] * train_env_num)
    
    # ======== tensorboard logging setup =========
    #         
    writer = SummaryWriter(log_path)
    writer.add_text(name, str(Run_Data))
    logger = TensorboardLogger(writer)
        
    # ======== Step 4: Callback functions setup =========
    def save_best_fn(policy):                
        torch.save(policy.policies[agents[0]].state_dict(), model_save_path)

    def stop_fn(mean_rewards):
        return mean_rewards >= 9939.0

    def train_fn(epoch, env_step):
        epsilon = trainer_params['tn_eps_max'] - (trainer_params['tn_eps_max'] - trainer_params['tn_eps_max']/100)*(epoch/trainer_params['max_epoch'])  
        policy.policies[agents[0]].set_eps(epsilon)

    def test_fn(epoch, env_step):
        epsilon = trainer_params['ts_eps_max']#0.01#max(0.001, 0.1 - epoch * 0.001)
        policy.policies[agents[0]].set_eps(epsilon)
        
    def reward_metric(rews):       
        #print(rews)
        return rews.mean()#[:,0]
                           
    # ======== Step 5: Run the trainer =========
    result = offpolicy_trainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,        
        max_epoch=trainer_params['max_epoch'],
        step_per_epoch=trainer_params['step_per_epoch'],
        step_per_collect=trainer_params['step_per_collect'],
        episode_per_test= trainer_params['episode_per_test'],
        batch_size=trainer_params['batch_size'],
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        update_per_step=trainer_params['update_per_step'],
        logger=logger,
        test_in_train=False,
        reward_metric=reward_metric,
        show_progress = True
        )

    # return result, policy.policies[agents[1]]
    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[0]])")



.


Epoch #1: 1001it [00:21, 47.20it/s, agent0/loss=68.014, agent1/loss=82.025, agent2/loss=74.386, agent3/loss=78.707, agent4/loss=65.482, agent5/loss=58.189, agent6/loss=49.161, agent7/loss=48.764, agent8/loss=46.854, agent9/loss=39.563, env_step=1000, len=210, n/ep=2, n/st=500, rew=-1075.65]                          


Epoch #1: test_reward: -1832.960508 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #2: 1001it [00:26, 37.18it/s, agent0/loss=40.354, agent1/loss=49.283, agent2/loss=52.088, agent3/loss=46.185, agent4/loss=40.325, agent5/loss=34.769, agent6/loss=36.361, agent7/loss=42.740, agent8/loss=36.504, agent9/loss=29.772, env_step=2000, len=253, n/ep=0, n/st=500, rew=-1055.07]                          


Epoch #2: test_reward: -2698.064065 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #3: 1001it [00:30, 33.24it/s, agent0/loss=33.551, agent1/loss=45.373, agent2/loss=48.619, agent3/loss=35.407, agent4/loss=42.933, agent5/loss=31.529, agent6/loss=29.015, agent7/loss=31.573, agent8/loss=29.767, agent9/loss=20.154, env_step=3000, len=300, n/ep=1, n/st=500, rew=-448.47]                          


Epoch #3: test_reward: -2417.949159 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #4: 1001it [00:27, 35.86it/s, agent0/loss=27.575, agent1/loss=38.270, agent2/loss=32.348, agent3/loss=31.446, agent4/loss=33.673, agent5/loss=22.348, agent6/loss=28.108, agent7/loss=27.811, agent8/loss=26.385, agent9/loss=24.790, env_step=4000, len=298, n/ep=4, n/st=500, rew=-622.60]                          


Epoch #4: test_reward: -2410.824242 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #5: 1001it [00:25, 38.80it/s, agent0/loss=33.681, agent1/loss=35.433, agent2/loss=44.242, agent3/loss=30.001, agent4/loss=42.466, agent5/loss=26.595, agent6/loss=26.481, agent7/loss=30.357, agent8/loss=21.244, agent9/loss=28.516, env_step=5000, len=299, n/ep=0, n/st=500, rew=-735.34]                          


Epoch #5: test_reward: -2131.469569 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #6: 1001it [00:35, 28.41it/s, agent0/loss=26.638, agent1/loss=37.593, agent2/loss=37.589, agent3/loss=28.464, agent4/loss=26.340, agent5/loss=33.756, agent6/loss=28.149, agent7/loss=26.377, agent8/loss=22.480, agent9/loss=24.802, env_step=6000, len=300, n/ep=1, n/st=500, rew=-1176.80]                          


Epoch #6: test_reward: -9655.864522 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #7: 1001it [00:29, 34.07it/s, agent0/loss=29.741, agent1/loss=40.405, agent2/loss=40.385, agent3/loss=44.789, agent4/loss=47.571, agent5/loss=27.271, agent6/loss=29.373, agent7/loss=29.128, agent8/loss=26.704, agent9/loss=34.084, env_step=7000, len=257, n/ep=2, n/st=500, rew=-559.55]                           


Epoch #7: test_reward: -9664.522223 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #8: 1001it [00:28, 35.06it/s, agent0/loss=28.833, agent1/loss=43.937, agent2/loss=43.885, agent3/loss=32.577, agent4/loss=34.141, agent5/loss=40.906, agent6/loss=33.174, agent7/loss=33.800, agent8/loss=30.665, agent9/loss=44.347, env_step=8000, len=282, n/ep=3, n/st=500, rew=-1087.92]                          


Epoch #8: test_reward: -9668.641488 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #9: 1001it [00:27, 36.16it/s, agent0/loss=36.545, agent1/loss=43.454, agent2/loss=61.187, agent3/loss=39.163, agent4/loss=42.769, agent5/loss=35.206, agent6/loss=44.540, agent7/loss=28.399, agent8/loss=28.442, agent9/loss=42.768, env_step=9000, len=300, n/ep=0, n/st=500, rew=-1246.77]                          


Epoch #9: test_reward: -3003.057964 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #10: 1001it [00:35, 27.97it/s, agent0/loss=34.245, agent1/loss=43.777, agent2/loss=35.547, agent3/loss=31.062, agent4/loss=41.418, agent5/loss=40.008, agent6/loss=21.558, agent7/loss=42.850, agent8/loss=40.140, agent9/loss=39.066, env_step=10000, len=294, n/ep=1, n/st=500, rew=-1161.52]                          


Epoch #10: test_reward: -4181.885357 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #11: 1001it [00:33, 29.92it/s, agent0/loss=48.708, agent1/loss=52.661, agent2/loss=38.300, agent3/loss=52.621, agent4/loss=35.710, agent5/loss=30.597, agent6/loss=43.073, agent7/loss=29.575, agent8/loss=37.348, agent9/loss=43.648, env_step=11000, len=300, n/ep=3, n/st=500, rew=-996.88]                          


Epoch #11: test_reward: -4176.018700 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #12: 1001it [00:34, 29.33it/s, agent0/loss=40.409, agent1/loss=48.569, agent2/loss=42.356, agent3/loss=51.710, agent4/loss=40.512, agent5/loss=44.892, agent6/loss=36.429, agent7/loss=31.790, agent8/loss=37.145, agent9/loss=25.076, env_step=12000, len=300, n/ep=1, n/st=500, rew=-477.42]                          


Epoch #12: test_reward: -4189.435768 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #13: 1001it [00:29, 34.48it/s, agent0/loss=30.855, agent1/loss=56.614, agent2/loss=48.825, agent3/loss=44.669, agent4/loss=38.755, agent5/loss=39.129, agent6/loss=39.177, agent7/loss=47.715, agent8/loss=48.768, agent9/loss=32.265, env_step=13000, len=277, n/ep=0, n/st=500, rew=-976.76]                          


Epoch #13: test_reward: -4178.302246 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #14: 1001it [00:31, 32.03it/s, agent0/loss=35.961, agent1/loss=44.919, agent2/loss=37.726, agent3/loss=40.423, agent4/loss=35.450, agent5/loss=41.648, agent6/loss=29.755, agent7/loss=33.050, agent8/loss=40.867, agent9/loss=33.972, env_step=14000, len=297, n/ep=4, n/st=500, rew=-878.31]                           


Epoch #14: test_reward: -4172.305697 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #15: 1001it [00:32, 30.45it/s, agent0/loss=35.587, agent1/loss=45.891, agent2/loss=42.371, agent3/loss=33.254, agent4/loss=35.346, agent5/loss=33.570, agent6/loss=34.633, agent7/loss=31.042, agent8/loss=35.270, agent9/loss=30.110, env_step=15000, len=223, n/ep=3, n/st=500, rew=-965.03]                          


Epoch #15: test_reward: -4173.408549 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #16: 1001it [00:31, 31.56it/s, agent0/loss=37.815, agent1/loss=45.611, agent2/loss=32.344, agent3/loss=32.684, agent4/loss=32.943, agent5/loss=34.523, agent6/loss=34.524, agent7/loss=44.131, agent8/loss=35.472, agent9/loss=35.032, env_step=16000, len=298, n/ep=0, n/st=500, rew=-993.13]                          


Epoch #16: test_reward: -4179.286732 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #17: 1001it [00:32, 31.22it/s, agent0/loss=34.469, agent1/loss=37.749, agent2/loss=31.845, agent3/loss=40.486, agent4/loss=31.445, agent5/loss=37.390, agent6/loss=34.908, agent7/loss=30.530, agent8/loss=41.499, agent9/loss=24.028, env_step=17000, len=300, n/ep=2, n/st=500, rew=-921.35]                          


Epoch #17: test_reward: -4175.025959 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #18: 1001it [00:30, 33.15it/s, agent0/loss=39.504, agent1/loss=49.512, agent2/loss=40.592, agent3/loss=42.661, agent4/loss=38.300, agent5/loss=35.406, agent6/loss=34.207, agent7/loss=28.883, agent8/loss=29.962, agent9/loss=26.174, env_step=18000, len=300, n/ep=3, n/st=500, rew=-860.52]                          


Epoch #18: test_reward: -4184.497188 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #19: 1001it [00:36, 27.31it/s, agent0/loss=34.946, agent1/loss=36.878, agent2/loss=40.726, agent3/loss=53.783, agent4/loss=44.243, agent5/loss=41.630, agent6/loss=42.124, agent7/loss=29.952, agent8/loss=29.523, agent9/loss=29.833, env_step=19000, len=300, n/ep=0, n/st=500, rew=-1086.17]                          


Epoch #19: test_reward: -4180.508369 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #20: 1001it [00:30, 33.10it/s, agent0/loss=41.502, agent1/loss=41.553, agent2/loss=33.704, agent3/loss=38.015, agent4/loss=29.295, agent5/loss=28.448, agent6/loss=32.467, agent7/loss=27.529, agent8/loss=29.795, agent9/loss=33.145, env_step=20000, len=300, n/ep=2, n/st=500, rew=-994.84]                          


Epoch #20: test_reward: -4184.413620 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #21: 1001it [00:30, 33.36it/s, agent0/loss=30.047, agent1/loss=33.241, agent2/loss=41.059, agent3/loss=34.544, agent4/loss=40.261, agent5/loss=35.864, agent6/loss=30.027, agent7/loss=26.501, agent8/loss=28.976, agent9/loss=25.773, env_step=21000, len=300, n/ep=3, n/st=500, rew=-1029.06]                          


Epoch #21: test_reward: -4176.811168 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #22: 1001it [00:28, 34.52it/s, agent0/loss=42.029, agent1/loss=44.867, agent2/loss=42.480, agent3/loss=39.559, agent4/loss=37.392, agent5/loss=29.876, agent6/loss=39.819, agent7/loss=38.494, agent8/loss=26.170, agent9/loss=28.494, env_step=22000, len=300, n/ep=0, n/st=500, rew=-1049.12]                          


Epoch #22: test_reward: -2999.941675 ± 0.000000, best_reward: -1832.960508 ± 0.000000 in #1


Epoch #23: 1001it [00:29, 33.90it/s, agent0/loss=40.321, agent1/loss=44.201, agent2/loss=27.410, agent3/loss=38.472, agent4/loss=33.622, agent5/loss=32.124, agent6/loss=31.919, agent7/loss=33.764, agent8/loss=37.978, agent9/loss=24.266, env_step=23000, len=300, n/ep=2, n/st=500, rew=-1028.83]                          


Epoch #23: test_reward: -260.124235 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #24: 1001it [00:31, 32.07it/s, agent0/loss=40.892, agent1/loss=32.012, agent2/loss=45.424, agent3/loss=38.802, agent4/loss=29.759, agent5/loss=21.824, agent6/loss=29.302, agent7/loss=23.860, agent8/loss=36.189, agent9/loss=31.316, env_step=24000, len=300, n/ep=3, n/st=500, rew=-729.19]                           


Epoch #24: test_reward: -271.023204 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #25: 1001it [00:32, 31.05it/s, agent0/loss=31.788, agent1/loss=36.800, agent2/loss=32.762, agent3/loss=36.114, agent4/loss=36.538, agent5/loss=34.630, agent6/loss=36.247, agent7/loss=25.826, agent8/loss=28.444, agent9/loss=29.016, env_step=25000, len=300, n/ep=0, n/st=500, rew=-645.58]                          


Epoch #25: test_reward: -267.089168 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #26: 1001it [00:32, 31.02it/s, agent0/loss=43.830, agent1/loss=34.015, agent2/loss=42.219, agent3/loss=38.557, agent4/loss=25.635, agent5/loss=33.015, agent6/loss=32.188, agent7/loss=23.688, agent8/loss=31.880, agent9/loss=32.634, env_step=26000, len=300, n/ep=2, n/st=500, rew=-365.13]                          


Epoch #26: test_reward: -267.982022 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #27: 1001it [00:31, 31.41it/s, agent0/loss=30.688, agent1/loss=40.637, agent2/loss=36.686, agent3/loss=33.372, agent4/loss=27.170, agent5/loss=32.600, agent6/loss=21.073, agent7/loss=31.458, agent8/loss=21.953, agent9/loss=29.025, env_step=27000, len=292, n/ep=3, n/st=500, rew=-381.71]                          


Epoch #27: test_reward: -271.121704 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #28: 1001it [00:34, 28.96it/s, agent0/loss=33.737, agent1/loss=39.032, agent2/loss=33.211, agent3/loss=32.480, agent4/loss=37.394, agent5/loss=33.795, agent6/loss=26.983, agent7/loss=24.408, agent8/loss=30.087, agent9/loss=18.846, env_step=28000, len=300, n/ep=0, n/st=500, rew=-371.42]                          


Epoch #28: test_reward: -8730.530025 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #29: 1001it [00:31, 31.45it/s, agent0/loss=21.788, agent1/loss=38.143, agent2/loss=37.421, agent3/loss=42.277, agent4/loss=34.867, agent5/loss=36.422, agent6/loss=35.405, agent7/loss=27.537, agent8/loss=33.911, agent9/loss=23.777, env_step=29000, len=300, n/ep=2, n/st=500, rew=-381.07]                          


Epoch #29: test_reward: -8681.680307 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #30: 1001it [00:32, 30.74it/s, agent0/loss=33.190, agent1/loss=40.539, agent2/loss=43.451, agent3/loss=33.102, agent4/loss=33.591, agent5/loss=27.735, agent6/loss=30.016, agent7/loss=31.101, agent8/loss=24.401, agent9/loss=26.980, env_step=30000, len=284, n/ep=3, n/st=500, rew=-1336.60]                          


Epoch #30: test_reward: -8713.940747 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #31: 1001it [00:32, 31.28it/s, agent0/loss=36.852, agent1/loss=39.718, agent2/loss=32.957, agent3/loss=51.628, agent4/loss=38.939, agent5/loss=35.517, agent6/loss=39.933, agent7/loss=34.783, agent8/loss=32.434, agent9/loss=28.460, env_step=31000, len=299, n/ep=0, n/st=500, rew=-1382.00]                          


Epoch #31: test_reward: -8735.516828 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #32: 1001it [00:32, 30.85it/s, agent0/loss=38.990, agent1/loss=39.789, agent2/loss=39.324, agent3/loss=31.708, agent4/loss=31.137, agent5/loss=30.974, agent6/loss=33.739, agent7/loss=37.430, agent8/loss=26.154, agent9/loss=21.477, env_step=32000, len=297, n/ep=2, n/st=500, rew=-1525.07]                          


Epoch #32: test_reward: -8723.371565 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #33: 1001it [00:32, 30.89it/s, agent0/loss=32.249, agent1/loss=47.754, agent2/loss=31.310, agent3/loss=37.155, agent4/loss=41.160, agent5/loss=33.413, agent6/loss=33.490, agent7/loss=29.985, agent8/loss=39.579, agent9/loss=28.518, env_step=33000, len=281, n/ep=3, n/st=500, rew=-1729.92]                          


Epoch #33: test_reward: -8703.245074 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #34: 1001it [00:32, 31.10it/s, agent0/loss=35.936, agent1/loss=44.937, agent2/loss=43.136, agent3/loss=37.631, agent4/loss=34.285, agent5/loss=39.276, agent6/loss=32.025, agent7/loss=27.570, agent8/loss=28.500, agent9/loss=27.273, env_step=34000, len=264, n/ep=0, n/st=500, rew=-1554.55]                          


Epoch #34: test_reward: -8728.704306 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #35: 1001it [00:32, 30.51it/s, agent0/loss=42.038, agent1/loss=40.017, agent2/loss=42.085, agent3/loss=50.451, agent4/loss=36.767, agent5/loss=35.919, agent6/loss=55.264, agent7/loss=35.452, agent8/loss=36.867, agent9/loss=29.659, env_step=35000, len=300, n/ep=2, n/st=500, rew=-1766.40]                          


Epoch #35: test_reward: -8733.136426 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #36: 1001it [00:34, 29.10it/s, agent0/loss=53.420, agent1/loss=42.459, agent2/loss=43.740, agent3/loss=32.091, agent4/loss=48.797, agent5/loss=46.366, agent6/loss=47.700, agent7/loss=44.544, agent8/loss=42.077, agent9/loss=30.707, env_step=36000, len=300, n/ep=3, n/st=500, rew=-1421.50]                          


Epoch #36: test_reward: -8740.928514 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #37: 1001it [00:32, 30.60it/s, agent0/loss=45.929, agent1/loss=54.056, agent2/loss=47.374, agent3/loss=40.081, agent4/loss=49.669, agent5/loss=37.230, agent6/loss=45.426, agent7/loss=45.816, agent8/loss=42.929, agent9/loss=36.280, env_step=37000, len=293, n/ep=0, n/st=500, rew=-1224.79]                          


Epoch #37: test_reward: -8706.535730 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #38: 1001it [00:32, 31.03it/s, agent0/loss=54.948, agent1/loss=55.010, agent2/loss=47.594, agent3/loss=47.198, agent4/loss=40.655, agent5/loss=45.122, agent6/loss=35.894, agent7/loss=25.309, agent8/loss=32.462, agent9/loss=32.838, env_step=38000, len=300, n/ep=2, n/st=500, rew=-1454.02]                          


Epoch #38: test_reward: -8709.092031 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #39: 1001it [00:32, 30.95it/s, agent0/loss=54.139, agent1/loss=48.886, agent2/loss=49.873, agent3/loss=51.074, agent4/loss=47.999, agent5/loss=39.639, agent6/loss=41.186, agent7/loss=37.374, agent8/loss=31.815, agent9/loss=29.096, env_step=39000, len=300, n/ep=3, n/st=500, rew=-1240.92]                          


Epoch #39: test_reward: -8720.956702 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #40: 1001it [00:32, 31.08it/s, agent0/loss=53.416, agent1/loss=61.027, agent2/loss=55.309, agent3/loss=33.978, agent4/loss=55.184, agent5/loss=41.341, agent6/loss=39.769, agent7/loss=32.678, agent8/loss=38.115, agent9/loss=38.836, env_step=40000, len=300, n/ep=0, n/st=500, rew=-1425.33]                          


Epoch #40: test_reward: -8732.110493 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #41: 1001it [00:32, 30.93it/s, agent0/loss=44.997, agent1/loss=64.603, agent2/loss=43.831, agent3/loss=44.982, agent4/loss=42.253, agent5/loss=35.652, agent6/loss=30.547, agent7/loss=45.174, agent8/loss=46.105, agent9/loss=38.340, env_step=41000, len=287, n/ep=2, n/st=500, rew=-1404.00]                          


Epoch #41: test_reward: -8724.013558 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #42: 1001it [00:32, 30.43it/s, agent0/loss=42.916, agent1/loss=71.434, agent2/loss=37.553, agent3/loss=39.693, agent4/loss=62.436, agent5/loss=41.591, agent6/loss=40.897, agent7/loss=40.302, agent8/loss=48.081, agent9/loss=38.640, env_step=42000, len=295, n/ep=3, n/st=500, rew=-1921.97]                          


Epoch #42: test_reward: -8722.470745 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #43: 1001it [00:32, 31.26it/s, agent0/loss=51.661, agent1/loss=54.798, agent2/loss=59.357, agent3/loss=47.667, agent4/loss=41.924, agent5/loss=49.241, agent6/loss=43.564, agent7/loss=47.981, agent8/loss=37.962, agent9/loss=37.823, env_step=43000, len=300, n/ep=0, n/st=500, rew=-1608.68]                          


Epoch #43: test_reward: -8733.695778 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #44: 1001it [00:32, 30.48it/s, agent0/loss=53.939, agent1/loss=60.613, agent2/loss=50.495, agent3/loss=56.234, agent4/loss=42.538, agent5/loss=52.401, agent6/loss=41.703, agent7/loss=49.569, agent8/loss=50.094, agent9/loss=39.532, env_step=44000, len=300, n/ep=2, n/st=500, rew=-1561.34]                          


Epoch #44: test_reward: -8750.782059 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #45: 1001it [00:32, 30.48it/s, agent0/loss=56.217, agent1/loss=60.126, agent2/loss=61.156, agent3/loss=67.317, agent4/loss=42.773, agent5/loss=42.706, agent6/loss=38.439, agent7/loss=58.402, agent8/loss=47.732, agent9/loss=42.047, env_step=45000, len=300, n/ep=2, n/st=500, rew=-1438.71]                          


Epoch #45: test_reward: -8735.211342 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #46: 1001it [00:32, 30.98it/s, agent0/loss=45.381, agent1/loss=55.542, agent2/loss=49.334, agent3/loss=54.131, agent4/loss=55.972, agent5/loss=33.156, agent6/loss=60.380, agent7/loss=49.844, agent8/loss=38.790, agent9/loss=43.494, env_step=46000, len=255, n/ep=1, n/st=500, rew=-1807.94]                          


Epoch #46: test_reward: -8697.389255 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #47: 1001it [00:34, 29.04it/s, agent0/loss=45.253, agent1/loss=62.893, agent2/loss=55.969, agent3/loss=48.640, agent4/loss=60.348, agent5/loss=53.835, agent6/loss=44.793, agent7/loss=45.914, agent8/loss=39.881, agent9/loss=40.382, env_step=47000, len=286, n/ep=2, n/st=500, rew=-1787.67]                          


Epoch #47: test_reward: -8713.776470 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #48: 1001it [00:32, 30.40it/s, agent0/loss=42.103, agent1/loss=61.915, agent2/loss=50.483, agent3/loss=50.527, agent4/loss=56.888, agent5/loss=37.663, agent6/loss=31.840, agent7/loss=42.925, agent8/loss=45.407, agent9/loss=40.492, env_step=48000, len=300, n/ep=2, n/st=500, rew=-1952.61]                          


Epoch #48: test_reward: -8705.670663 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #49: 1001it [00:32, 30.37it/s, agent0/loss=55.250, agent1/loss=62.816, agent2/loss=53.996, agent3/loss=50.752, agent4/loss=57.445, agent5/loss=43.618, agent6/loss=46.503, agent7/loss=53.980, agent8/loss=51.415, agent9/loss=48.206, env_step=49000, len=281, n/ep=1, n/st=500, rew=-1538.31]                          


Epoch #49: test_reward: -8732.962756 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #50: 1001it [00:34, 28.84it/s, agent0/loss=56.299, agent1/loss=55.436, agent2/loss=67.342, agent3/loss=47.942, agent4/loss=57.965, agent5/loss=49.685, agent6/loss=59.146, agent7/loss=58.112, agent8/loss=50.775, agent9/loss=44.066, env_step=50000, len=300, n/ep=2, n/st=500, rew=-1827.39]                          


Epoch #50: test_reward: -8734.403040 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #51: 1001it [00:33, 29.71it/s, agent0/loss=64.154, agent1/loss=52.426, agent2/loss=39.808, agent3/loss=47.840, agent4/loss=42.802, agent5/loss=50.671, agent6/loss=60.499, agent7/loss=62.677, agent8/loss=54.496, agent9/loss=45.580, env_step=51000, len=300, n/ep=2, n/st=500, rew=-2239.36]                          


Epoch #51: test_reward: -8720.543934 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #52: 1001it [00:33, 29.88it/s, agent0/loss=60.590, agent1/loss=56.506, agent2/loss=44.985, agent3/loss=59.280, agent4/loss=67.325, agent5/loss=45.048, agent6/loss=46.721, agent7/loss=53.347, agent8/loss=47.218, agent9/loss=53.874, env_step=52000, len=300, n/ep=1, n/st=500, rew=-1947.00]                          


Epoch #52: test_reward: -8751.263443 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #53: 1001it [00:34, 28.90it/s, agent0/loss=64.450, agent1/loss=60.822, agent2/loss=56.057, agent3/loss=52.955, agent4/loss=64.857, agent5/loss=56.244, agent6/loss=50.056, agent7/loss=51.571, agent8/loss=48.711, agent9/loss=54.211, env_step=53000, len=300, n/ep=1, n/st=500, rew=-1012.20]                          


Epoch #53: test_reward: -8737.699016 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #54: 1001it [00:34, 29.28it/s, agent0/loss=64.183, agent1/loss=62.544, agent2/loss=59.199, agent3/loss=61.971, agent4/loss=59.288, agent5/loss=61.538, agent6/loss=48.273, agent7/loss=47.560, agent8/loss=45.773, agent9/loss=66.695, env_step=54000, len=288, n/ep=3, n/st=500, rew=-1712.42]                          


Epoch #54: test_reward: -8720.242554 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #55: 1001it [00:34, 29.32it/s, agent0/loss=47.420, agent1/loss=56.969, agent2/loss=61.469, agent3/loss=41.933, agent4/loss=42.859, agent5/loss=43.447, agent6/loss=53.331, agent7/loss=65.088, agent8/loss=54.147, agent9/loss=54.791, env_step=55000, len=300, n/ep=1, n/st=500, rew=-1825.08]                          


Epoch #55: test_reward: -8731.213479 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #56: 1001it [00:35, 28.15it/s, agent0/loss=57.332, agent1/loss=63.048, agent2/loss=62.178, agent3/loss=48.142, agent4/loss=51.707, agent5/loss=55.539, agent6/loss=55.399, agent7/loss=57.399, agent8/loss=46.299, agent9/loss=48.186, env_step=56000, len=300, n/ep=1, n/st=500, rew=-1634.73]                          


Epoch #56: test_reward: -8719.528939 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #57: 1001it [00:36, 27.62it/s, agent0/loss=61.618, agent1/loss=69.213, agent2/loss=62.663, agent3/loss=52.437, agent4/loss=65.409, agent5/loss=54.838, agent6/loss=56.366, agent7/loss=44.925, agent8/loss=61.037, agent9/loss=54.039, env_step=57000, len=290, n/ep=4, n/st=500, rew=-1923.46]                          


Epoch #57: test_reward: -8712.300620 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #58: 1001it [00:35, 28.55it/s, agent0/loss=50.604, agent1/loss=76.060, agent2/loss=60.753, agent3/loss=58.104, agent4/loss=73.068, agent5/loss=48.276, agent6/loss=57.600, agent7/loss=51.998, agent8/loss=52.798, agent9/loss=47.357, env_step=58000, len=300, n/ep=1, n/st=500, rew=-1708.65]                          


Epoch #58: test_reward: -8738.596054 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #59: 1001it [00:36, 27.52it/s, agent0/loss=47.560, agent1/loss=61.646, agent2/loss=57.364, agent3/loss=64.124, agent4/loss=53.924, agent5/loss=54.893, agent6/loss=71.186, agent7/loss=62.881, agent8/loss=47.537, agent9/loss=63.865, env_step=59000, len=299, n/ep=0, n/st=500, rew=-2484.80]                          


Epoch #59: test_reward: -8707.248893 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #60: 1001it [00:38, 25.67it/s, agent0/loss=58.644, agent1/loss=53.101, agent2/loss=68.224, agent3/loss=70.661, agent4/loss=76.668, agent5/loss=61.438, agent6/loss=68.720, agent7/loss=46.987, agent8/loss=58.545, agent9/loss=64.007, env_step=60000, len=300, n/ep=4, n/st=500, rew=-2152.47]                          


Epoch #60: test_reward: -8724.580523 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #61: 1001it [00:36, 27.48it/s, agent0/loss=59.889, agent1/loss=40.866, agent2/loss=57.551, agent3/loss=57.216, agent4/loss=52.177, agent5/loss=54.272, agent6/loss=74.064, agent7/loss=48.882, agent8/loss=49.533, agent9/loss=69.168, env_step=61000, len=300, n/ep=1, n/st=500, rew=-1934.40]                          


Epoch #61: test_reward: -8716.276416 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #62: 1001it [00:43, 23.03it/s, agent0/loss=72.241, agent1/loss=74.676, agent2/loss=58.806, agent3/loss=60.819, agent4/loss=73.256, agent5/loss=61.592, agent6/loss=44.603, agent7/loss=56.704, agent8/loss=57.141, agent9/loss=55.169, env_step=62000, len=285, n/ep=0, n/st=500, rew=-1917.59]                          


Epoch #62: test_reward: -8727.978806 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #63: 1001it [00:40, 24.86it/s, agent0/loss=52.439, agent1/loss=63.821, agent2/loss=73.755, agent3/loss=59.181, agent4/loss=82.716, agent5/loss=52.589, agent6/loss=50.547, agent7/loss=58.549, agent8/loss=55.774, agent9/loss=50.520, env_step=63000, len=300, n/ep=3, n/st=500, rew=-2258.21]                          


Epoch #63: test_reward: -8739.605065 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #64: 1001it [00:44, 22.63it/s, agent0/loss=76.287, agent1/loss=77.715, agent2/loss=68.838, agent3/loss=58.497, agent4/loss=56.592, agent5/loss=52.502, agent6/loss=61.598, agent7/loss=42.189, agent8/loss=60.586, agent9/loss=59.865, env_step=64000, len=290, n/ep=3, n/st=500, rew=-1881.31]                          


Epoch #64: test_reward: -8735.330027 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #65: 1001it [00:46, 21.43it/s, agent0/loss=67.856, agent1/loss=57.004, agent2/loss=52.527, agent3/loss=64.649, agent4/loss=53.507, agent5/loss=63.023, agent6/loss=63.136, agent7/loss=60.046, agent8/loss=76.006, agent9/loss=49.136, env_step=65000, len=300, n/ep=0, n/st=500, rew=-2731.11]                          


Epoch #65: test_reward: -8703.906344 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #66: 1001it [00:51, 19.46it/s, agent0/loss=63.591, agent1/loss=68.152, agent2/loss=51.558, agent3/loss=67.958, agent4/loss=58.350, agent5/loss=67.931, agent6/loss=45.224, agent7/loss=61.531, agent8/loss=52.019, agent9/loss=53.652, env_step=66000, len=300, n/ep=2, n/st=500, rew=-1488.08]                          


Epoch #66: test_reward: -8734.976753 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #67: 1001it [00:45, 21.98it/s, agent0/loss=70.266, agent1/loss=93.521, agent2/loss=70.918, agent3/loss=80.416, agent4/loss=65.674, agent5/loss=58.791, agent6/loss=62.906, agent7/loss=68.410, agent8/loss=72.897, agent9/loss=65.149, env_step=67000, len=300, n/ep=3, n/st=500, rew=-2113.37]                          


Epoch #67: test_reward: -8724.238238 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #68: 1001it [00:55, 17.88it/s, agent0/loss=51.052, agent1/loss=80.109, agent2/loss=81.316, agent3/loss=67.613, agent4/loss=47.978, agent5/loss=60.425, agent6/loss=59.527, agent7/loss=69.184, agent8/loss=54.439, agent9/loss=50.339, env_step=68000, len=298, n/ep=0, n/st=500, rew=-2204.25]                          


Epoch #68: test_reward: -8717.671648 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #69: 1001it [00:47, 21.21it/s, agent0/loss=72.223, agent1/loss=67.304, agent2/loss=70.142, agent3/loss=76.097, agent4/loss=69.820, agent5/loss=59.094, agent6/loss=55.095, agent7/loss=64.746, agent8/loss=53.757, agent9/loss=58.458, env_step=69000, len=299, n/ep=1, n/st=500, rew=-2051.43]                          


Epoch #69: test_reward: -8728.930591 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #70: 1001it [01:02, 15.90it/s, agent0/loss=61.191, agent1/loss=70.084, agent2/loss=69.876, agent3/loss=61.721, agent4/loss=66.065, agent5/loss=75.097, agent6/loss=67.074, agent7/loss=64.934, agent8/loss=42.309, agent9/loss=64.855, env_step=70000, len=262, n/ep=3, n/st=500, rew=-2267.83]                          


Epoch #70: test_reward: -8730.197809 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #71: 1001it [00:59, 16.70it/s, agent0/loss=63.671, agent1/loss=82.174, agent2/loss=81.597, agent3/loss=76.502, agent4/loss=58.052, agent5/loss=65.904, agent6/loss=66.614, agent7/loss=51.444, agent8/loss=62.591, agent9/loss=50.188, env_step=71000, len=280, n/ep=1, n/st=500, rew=-2184.96]                          


Epoch #71: test_reward: -8713.035401 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #72: 1001it [00:49, 20.20it/s, agent0/loss=74.803, agent1/loss=87.130, agent2/loss=63.542, agent3/loss=66.073, agent4/loss=78.889, agent5/loss=63.401, agent6/loss=82.431, agent7/loss=61.292, agent8/loss=62.966, agent9/loss=60.122, env_step=72000, len=296, n/ep=1, n/st=500, rew=-2294.47]                          


Epoch #72: test_reward: -8721.389127 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #73: 1001it [00:50, 19.66it/s, agent0/loss=67.141, agent1/loss=70.629, agent2/loss=73.551, agent3/loss=60.927, agent4/loss=47.567, agent5/loss=58.334, agent6/loss=57.511, agent7/loss=81.338, agent8/loss=72.423, agent9/loss=66.229, env_step=73000, len=300, n/ep=3, n/st=500, rew=-2227.19]                          


Epoch #73: test_reward: -8707.747682 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #74: 1001it [00:46, 21.64it/s, agent0/loss=71.990, agent1/loss=74.889, agent2/loss=53.116, agent3/loss=60.368, agent4/loss=63.732, agent5/loss=55.150, agent6/loss=69.841, agent7/loss=74.589, agent8/loss=59.100, agent9/loss=60.807, env_step=74000, len=300, n/ep=1, n/st=500, rew=-3132.48]                          


Epoch #74: test_reward: -8729.431010 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #75: 1001it [00:43, 22.92it/s, agent0/loss=70.205, agent1/loss=71.270, agent2/loss=82.171, agent3/loss=62.689, agent4/loss=59.424, agent5/loss=76.095, agent6/loss=60.743, agent7/loss=50.984, agent8/loss=67.834, agent9/loss=56.803, env_step=75000, len=300, n/ep=1, n/st=500, rew=-2708.19]                          


Epoch #75: test_reward: -8741.423441 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #76: 1001it [00:44, 22.71it/s, agent0/loss=69.920, agent1/loss=78.593, agent2/loss=85.079, agent3/loss=76.012, agent4/loss=72.137, agent5/loss=67.990, agent6/loss=63.456, agent7/loss=66.850, agent8/loss=56.942, agent9/loss=63.223, env_step=76000, len=273, n/ep=4, n/st=500, rew=-2086.96]                          


Epoch #76: test_reward: -8729.025016 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #77: 1001it [00:45, 22.01it/s, agent0/loss=85.103, agent1/loss=76.163, agent2/loss=57.375, agent3/loss=64.206, agent4/loss=73.729, agent5/loss=58.651, agent6/loss=73.702, agent7/loss=50.793, agent8/loss=72.413, agent9/loss=59.860, env_step=77000, len=275, n/ep=0, n/st=500, rew=-1411.61]                          


Epoch #77: test_reward: -8738.502938 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #78: 1001it [00:46, 21.60it/s, agent0/loss=81.915, agent1/loss=75.597, agent2/loss=65.300, agent3/loss=70.072, agent4/loss=70.132, agent5/loss=60.975, agent6/loss=64.235, agent7/loss=78.805, agent8/loss=71.344, agent9/loss=58.916, env_step=78000, len=300, n/ep=1, n/st=500, rew=-1750.63]                          


Epoch #78: test_reward: -8718.118863 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #79: 1001it [00:46, 21.49it/s, agent0/loss=78.742, agent1/loss=66.944, agent2/loss=65.935, agent3/loss=74.703, agent4/loss=53.437, agent5/loss=76.618, agent6/loss=78.878, agent7/loss=74.988, agent8/loss=52.938, agent9/loss=60.990, env_step=79000, len=300, n/ep=3, n/st=500, rew=-2333.20]                          


Epoch #79: test_reward: -8728.858250 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #80: 1001it [01:16, 13.03it/s, agent0/loss=85.056, agent1/loss=70.045, agent2/loss=57.683, agent3/loss=54.195, agent4/loss=66.782, agent5/loss=66.429, agent6/loss=63.625, agent7/loss=59.864, agent8/loss=64.732, agent9/loss=49.274, env_step=80000, len=282, n/ep=3, n/st=500, rew=-2219.17]                          


Epoch #80: test_reward: -8740.743024 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #81: 1001it [00:56, 17.87it/s, agent0/loss=78.613, agent1/loss=84.703, agent2/loss=84.439, agent3/loss=63.751, agent4/loss=59.850, agent5/loss=71.341, agent6/loss=73.926, agent7/loss=51.084, agent8/loss=78.642, agent9/loss=60.484, env_step=81000, len=280, n/ep=1, n/st=500, rew=-2102.54]                          


Epoch #81: test_reward: -8739.771650 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #82: 1001it [00:56, 17.81it/s, agent0/loss=79.981, agent1/loss=78.169, agent2/loss=74.943, agent3/loss=80.492, agent4/loss=66.905, agent5/loss=71.491, agent6/loss=71.077, agent7/loss=62.924, agent8/loss=59.529, agent9/loss=59.880, env_step=82000, len=300, n/ep=1, n/st=500, rew=-2557.23]                          


Epoch #82: test_reward: -8744.646374 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #83: 1001it [00:58, 17.05it/s, agent0/loss=79.966, agent1/loss=108.606, agent2/loss=73.044, agent3/loss=72.863, agent4/loss=70.312, agent5/loss=74.283, agent6/loss=61.369, agent7/loss=59.027, agent8/loss=73.478, agent9/loss=72.345, env_step=83000, len=300, n/ep=3, n/st=500, rew=-2339.66]                          


Epoch #83: test_reward: -8736.859827 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #84: 1001it [01:24, 11.85it/s, agent0/loss=71.822, agent1/loss=81.986, agent2/loss=68.346, agent3/loss=74.580, agent4/loss=85.169, agent5/loss=82.129, agent6/loss=68.415, agent7/loss=58.870, agent8/loss=67.647, agent9/loss=76.485, env_step=84000, len=300, n/ep=1, n/st=500, rew=-2230.26]                           


Epoch #84: test_reward: -8728.273804 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #85: 1001it [01:01, 16.37it/s, agent0/loss=71.901, agent1/loss=82.215, agent2/loss=79.557, agent3/loss=70.878, agent4/loss=71.856, agent5/loss=68.362, agent6/loss=72.724, agent7/loss=66.244, agent8/loss=74.923, agent9/loss=60.667, env_step=85000, len=300, n/ep=1, n/st=500, rew=-1460.58]                          


Epoch #85: test_reward: -8747.719323 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #86: 1001it [00:59, 16.70it/s, agent0/loss=95.710, agent1/loss=88.895, agent2/loss=76.728, agent3/loss=80.541, agent4/loss=65.221, agent5/loss=71.535, agent6/loss=65.586, agent7/loss=82.843, agent8/loss=77.130, agent9/loss=61.698, env_step=86000, len=300, n/ep=3, n/st=500, rew=-2715.41]                          


Epoch #86: test_reward: -8699.186861 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #87: 1001it [00:52, 19.01it/s, agent0/loss=83.888, agent1/loss=75.645, agent2/loss=74.382, agent3/loss=88.826, agent4/loss=75.756, agent5/loss=79.698, agent6/loss=73.416, agent7/loss=71.432, agent8/loss=70.996, agent9/loss=71.868, env_step=87000, len=300, n/ep=1, n/st=500, rew=-2341.90]                          


Epoch #87: test_reward: -8725.069946 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #88: 1001it [00:53, 18.85it/s, agent0/loss=76.606, agent1/loss=75.750, agent2/loss=82.452, agent3/loss=74.679, agent4/loss=71.506, agent5/loss=56.601, agent6/loss=66.972, agent7/loss=84.634, agent8/loss=61.979, agent9/loss=81.982, env_step=88000, len=300, n/ep=1, n/st=500, rew=-2808.17]                          


Epoch #88: test_reward: -8739.204218 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #89: 1001it [00:53, 18.69it/s, agent0/loss=79.976, agent1/loss=69.092, agent2/loss=72.117, agent3/loss=92.682, agent4/loss=77.122, agent5/loss=65.541, agent6/loss=77.587, agent7/loss=69.801, agent8/loss=79.903, agent9/loss=54.709, env_step=89000, len=300, n/ep=3, n/st=500, rew=-2788.04]                          


Epoch #89: test_reward: -8747.742756 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #90: 1001it [00:54, 18.28it/s, agent0/loss=78.949, agent1/loss=107.203, agent2/loss=84.232, agent3/loss=80.783, agent4/loss=58.857, agent5/loss=71.870, agent6/loss=70.389, agent7/loss=93.979, agent8/loss=70.769, agent9/loss=76.541, env_step=90000, len=300, n/ep=1, n/st=500, rew=-2778.76]                          


Epoch #90: test_reward: -8733.517589 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #91: 1001it [00:54, 18.50it/s, agent0/loss=74.771, agent1/loss=87.681, agent2/loss=82.785, agent3/loss=84.533, agent4/loss=76.360, agent5/loss=68.518, agent6/loss=80.839, agent7/loss=68.276, agent8/loss=74.446, agent9/loss=80.121, env_step=91000, len=300, n/ep=1, n/st=500, rew=-1894.61]                          


Epoch #91: test_reward: -8736.943697 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #92: 1001it [00:55, 18.09it/s, agent0/loss=75.540, agent1/loss=88.168, agent2/loss=82.437, agent3/loss=93.879, agent4/loss=62.020, agent5/loss=77.193, agent6/loss=79.920, agent7/loss=76.806, agent8/loss=85.535, agent9/loss=65.974, env_step=92000, len=297, n/ep=3, n/st=500, rew=-2419.03]                          


Epoch #92: test_reward: -8707.835870 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #93: 1001it [00:55, 17.89it/s, agent0/loss=79.422, agent1/loss=76.321, agent2/loss=80.498, agent3/loss=57.370, agent4/loss=72.171, agent5/loss=66.683, agent6/loss=70.220, agent7/loss=78.877, agent8/loss=79.585, agent9/loss=79.621, env_step=93000, len=278, n/ep=2, n/st=500, rew=-2330.83]                          


Epoch #93: test_reward: -8704.599467 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #94: 1001it [00:58, 17.10it/s, agent0/loss=103.314, agent1/loss=72.572, agent2/loss=90.304, agent3/loss=82.875, agent4/loss=89.819, agent5/loss=83.951, agent6/loss=83.841, agent7/loss=78.642, agent8/loss=69.875, agent9/loss=77.442, env_step=94000, len=300, n/ep=1, n/st=500, rew=-3287.64]                          


Epoch #94: test_reward: -8729.265268 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #95: 1001it [00:57, 17.38it/s, agent0/loss=99.487, agent1/loss=80.163, agent2/loss=80.597, agent3/loss=74.705, agent4/loss=62.177, agent5/loss=87.852, agent6/loss=62.741, agent7/loss=83.350, agent8/loss=96.219, agent9/loss=63.442, env_step=95000, len=300, n/ep=2, n/st=500, rew=-3016.34]                           


Epoch #95: test_reward: -8745.236899 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #96: 1001it [01:58,  8.45it/s, agent0/loss=83.070, agent1/loss=72.931, agent2/loss=96.457, agent3/loss=83.228, agent4/loss=68.712, agent5/loss=86.161, agent6/loss=67.856, agent7/loss=75.477, agent8/loss=79.414, agent9/loss=59.784, env_step=96000, len=300, n/ep=2, n/st=500, rew=-2045.00]                           


Epoch #96: test_reward: -8724.099161 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #97: 1001it [01:15, 13.34it/s, agent0/loss=83.065, agent1/loss=89.273, agent2/loss=77.153, agent3/loss=73.344, agent4/loss=59.759, agent5/loss=71.694, agent6/loss=88.352, agent7/loss=64.785, agent8/loss=67.501, agent9/loss=67.352, env_step=97000, len=300, n/ep=1, n/st=500, rew=-2131.99]                          


Epoch #97: test_reward: -8735.122019 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #98: 1001it [02:11,  7.62it/s, agent0/loss=91.295, agent1/loss=101.990, agent2/loss=87.440, agent3/loss=99.901, agent4/loss=101.258, agent5/loss=69.347, agent6/loss=76.378, agent7/loss=81.635, agent8/loss=68.758, agent9/loss=77.464, env_step=98000, len=293, n/ep=2, n/st=500, rew=-2307.41]                          


Epoch #98: test_reward: -8725.550525 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #99: 1001it [01:32, 10.83it/s, agent0/loss=68.729, agent1/loss=75.756, agent2/loss=68.584, agent3/loss=68.810, agent4/loss=79.695, agent5/loss=78.713, agent6/loss=85.371, agent7/loss=62.558, agent8/loss=66.097, agent9/loss=69.179, env_step=99000, len=300, n/ep=2, n/st=500, rew=-2338.53]                          


Epoch #99: test_reward: -8725.948378 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #100: 1001it [01:37, 10.27it/s, agent0/loss=95.465, agent1/loss=80.618, agent2/loss=70.465, agent3/loss=66.075, agent4/loss=79.126, agent5/loss=87.447, agent6/loss=84.995, agent7/loss=75.871, agent8/loss=77.557, agent9/loss=73.881, env_step=100000, len=277, n/ep=1, n/st=500, rew=-2976.43]                          


Epoch #100: test_reward: -8736.059148 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #101: 1001it [01:29, 11.20it/s, agent0/loss=73.544, agent1/loss=92.277, agent2/loss=88.028, agent3/loss=104.678, agent4/loss=83.516, agent5/loss=80.138, agent6/loss=82.210, agent7/loss=78.835, agent8/loss=75.017, agent9/loss=70.636, env_step=101000, len=300, n/ep=2, n/st=500, rew=-3293.91]                          


Epoch #101: test_reward: -8741.291154 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #102: 1001it [01:36, 10.41it/s, agent0/loss=79.729, agent1/loss=95.969, agent2/loss=79.803, agent3/loss=67.312, agent4/loss=95.404, agent5/loss=65.529, agent6/loss=63.800, agent7/loss=70.788, agent8/loss=75.547, agent9/loss=89.227, env_step=102000, len=300, n/ep=2, n/st=500, rew=-2939.62]                          


Epoch #102: test_reward: -8715.867887 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #103: 1001it [01:36, 10.35it/s, agent0/loss=90.061, agent1/loss=87.997, agent2/loss=90.632, agent3/loss=86.111, agent4/loss=66.932, agent5/loss=75.169, agent6/loss=82.938, agent7/loss=80.411, agent8/loss=78.996, agent9/loss=77.766, env_step=103000, len=300, n/ep=1, n/st=500, rew=-3073.95]                           


Epoch #103: test_reward: -8706.343390 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #104: 1001it [02:03,  8.09it/s, agent0/loss=92.047, agent1/loss=80.516, agent2/loss=71.765, agent3/loss=94.475, agent4/loss=86.607, agent5/loss=86.048, agent6/loss=90.951, agent7/loss=93.478, agent8/loss=88.116, agent9/loss=99.769, env_step=104000, len=289, n/ep=3, n/st=500, rew=-2584.63]                          


Epoch #104: test_reward: -8750.039257 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #105: 1001it [01:32, 10.86it/s, agent0/loss=89.214, agent1/loss=110.249, agent2/loss=89.853, agent3/loss=90.676, agent4/loss=97.541, agent5/loss=83.424, agent6/loss=94.344, agent7/loss=83.333, agent8/loss=77.029, agent9/loss=86.600, env_step=105000, len=281, n/ep=2, n/st=500, rew=-2718.03]                          


Epoch #105: test_reward: -8719.935864 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #106: 1001it [01:31, 10.92it/s, agent0/loss=97.393, agent1/loss=91.382, agent2/loss=91.400, agent3/loss=86.344, agent4/loss=76.244, agent5/loss=82.533, agent6/loss=82.449, agent7/loss=86.621, agent8/loss=82.581, agent9/loss=97.499, env_step=106000, len=300, n/ep=0, n/st=500, rew=-2636.66]                          


Epoch #106: test_reward: -8716.629226 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #107: 1001it [01:19, 12.60it/s, agent0/loss=101.852, agent1/loss=90.174, agent2/loss=97.370, agent3/loss=102.758, agent4/loss=87.517, agent5/loss=85.332, agent6/loss=82.643, agent7/loss=109.080, agent8/loss=73.937, agent9/loss=91.995, env_step=107000, len=300, n/ep=3, n/st=500, rew=-2615.37]                          


Epoch #107: test_reward: -8747.258811 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #108: 1001it [01:22, 12.10it/s, agent0/loss=87.019, agent1/loss=96.939, agent2/loss=97.183, agent3/loss=87.808, agent4/loss=107.202, agent5/loss=76.634, agent6/loss=109.725, agent7/loss=74.278, agent8/loss=87.969, agent9/loss=80.950, env_step=108000, len=294, n/ep=2, n/st=500, rew=-2661.13]                          


Epoch #108: test_reward: -8720.033635 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #109: 1001it [02:06,  7.89it/s, agent0/loss=84.434, agent1/loss=94.236, agent2/loss=86.360, agent3/loss=103.239, agent4/loss=92.083, agent5/loss=72.192, agent6/loss=103.735, agent7/loss=103.047, agent8/loss=111.740, agent9/loss=88.709, env_step=109000, len=265, n/ep=2, n/st=500, rew=-2778.51]                          


Epoch #109: test_reward: -8717.609452 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #110: 1001it [01:36, 10.37it/s, agent0/loss=111.975, agent1/loss=103.151, agent2/loss=89.113, agent3/loss=101.612, agent4/loss=106.808, agent5/loss=96.204, agent6/loss=100.149, agent7/loss=84.241, agent8/loss=94.744, agent9/loss=92.081, env_step=110000, len=297, n/ep=3, n/st=500, rew=-2615.21]                          


Epoch #110: test_reward: -8719.623653 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #111: 1001it [01:29, 11.25it/s, agent0/loss=113.677, agent1/loss=108.855, agent2/loss=102.708, agent3/loss=88.400, agent4/loss=89.339, agent5/loss=78.615, agent6/loss=90.411, agent7/loss=82.022, agent8/loss=99.224, agent9/loss=74.820, env_step=111000, len=300, n/ep=0, n/st=500, rew=-3291.16]                          


Epoch #111: test_reward: -8736.737437 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #112: 1001it [01:27, 11.38it/s, agent0/loss=118.807, agent1/loss=91.679, agent2/loss=92.357, agent3/loss=83.477, agent4/loss=98.080, agent5/loss=88.298, agent6/loss=116.025, agent7/loss=87.857, agent8/loss=86.925, agent9/loss=88.852, env_step=112000, len=300, n/ep=2, n/st=500, rew=-2689.16]                           


Epoch #112: test_reward: -8732.635726 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #113: 1001it [01:30, 11.02it/s, agent0/loss=108.599, agent1/loss=94.683, agent2/loss=114.767, agent3/loss=93.777, agent4/loss=101.382, agent5/loss=87.237, agent6/loss=101.090, agent7/loss=88.374, agent8/loss=82.901, agent9/loss=108.261, env_step=113000, len=300, n/ep=3, n/st=500, rew=-3376.06]                          


Epoch #113: test_reward: -8745.633678 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #114: 1001it [01:28, 11.35it/s, agent0/loss=94.305, agent1/loss=103.288, agent2/loss=96.745, agent3/loss=137.387, agent4/loss=113.467, agent5/loss=96.793, agent6/loss=89.588, agent7/loss=84.697, agent8/loss=88.253, agent9/loss=78.715, env_step=114000, len=300, n/ep=0, n/st=500, rew=-2978.09]                            


Epoch #114: test_reward: -8743.622875 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #115: 1001it [01:30, 11.03it/s, agent0/loss=101.207, agent1/loss=79.526, agent2/loss=106.174, agent3/loss=103.576, agent4/loss=90.444, agent5/loss=109.947, agent6/loss=105.465, agent7/loss=87.851, agent8/loss=105.688, agent9/loss=82.561, env_step=115000, len=300, n/ep=2, n/st=500, rew=-2582.08]                          


Epoch #115: test_reward: -8742.250870 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #116: 1001it [01:30, 11.04it/s, agent0/loss=108.769, agent1/loss=98.665, agent2/loss=99.944, agent3/loss=110.631, agent4/loss=115.195, agent5/loss=100.977, agent6/loss=103.512, agent7/loss=100.304, agent8/loss=105.389, agent9/loss=89.716, env_step=116000, len=300, n/ep=3, n/st=500, rew=-3350.52]                          


Epoch #116: test_reward: -8733.964770 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #117: 1001it [02:06,  7.94it/s, agent0/loss=118.765, agent1/loss=112.202, agent2/loss=105.462, agent3/loss=113.800, agent4/loss=102.708, agent5/loss=100.828, agent6/loss=108.200, agent7/loss=93.613, agent8/loss=97.058, agent9/loss=120.264, env_step=117000, len=296, n/ep=0, n/st=500, rew=-3264.61]                          


Epoch #117: test_reward: -8751.310473 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #118: 1001it [01:31, 10.92it/s, agent0/loss=101.147, agent1/loss=100.980, agent2/loss=105.694, agent3/loss=121.790, agent4/loss=88.536, agent5/loss=102.894, agent6/loss=105.087, agent7/loss=104.137, agent8/loss=127.168, agent9/loss=102.084, env_step=118000, len=300, n/ep=2, n/st=500, rew=-3018.48]                           


Epoch #118: test_reward: -8729.321262 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #119: 1001it [01:29, 11.24it/s, agent0/loss=104.709, agent1/loss=122.484, agent2/loss=99.409, agent3/loss=122.808, agent4/loss=101.200, agent5/loss=107.361, agent6/loss=88.149, agent7/loss=101.732, agent8/loss=98.676, agent9/loss=86.778, env_step=119000, len=300, n/ep=3, n/st=500, rew=-3455.16]                          


Epoch #119: test_reward: -8743.896250 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #120: 1001it [01:29, 11.20it/s, agent0/loss=120.238, agent1/loss=116.103, agent2/loss=112.665, agent3/loss=117.526, agent4/loss=117.227, agent5/loss=76.415, agent6/loss=109.963, agent7/loss=92.979, agent8/loss=91.042, agent9/loss=119.677, env_step=120000, len=300, n/ep=0, n/st=500, rew=-3974.28]                            


Epoch #120: test_reward: -8708.089854 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #121: 1001it [01:28, 11.27it/s, agent0/loss=134.272, agent1/loss=97.092, agent2/loss=127.483, agent3/loss=97.025, agent4/loss=106.439, agent5/loss=102.105, agent6/loss=98.757, agent7/loss=124.245, agent8/loss=84.810, agent9/loss=112.663, env_step=121000, len=300, n/ep=2, n/st=500, rew=-3866.26]                           


Epoch #121: test_reward: -8732.912833 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #122: 1001it [01:30, 11.07it/s, agent0/loss=106.230, agent1/loss=131.266, agent2/loss=114.310, agent3/loss=113.981, agent4/loss=114.856, agent5/loss=103.541, agent6/loss=120.434, agent7/loss=122.198, agent8/loss=131.517, agent9/loss=79.793, env_step=122000, len=300, n/ep=3, n/st=500, rew=-3518.76]                          


Epoch #122: test_reward: -8729.421122 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #123: 1001it [01:34, 10.56it/s, agent0/loss=113.920, agent1/loss=105.897, agent2/loss=122.994, agent3/loss=120.853, agent4/loss=109.208, agent5/loss=132.179, agent6/loss=127.250, agent7/loss=131.919, agent8/loss=119.576, agent9/loss=105.805, env_step=123000, len=300, n/ep=0, n/st=500, rew=-3026.38]                          


Epoch #123: test_reward: -8698.888579 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #124: 1001it [01:28, 11.26it/s, agent0/loss=114.063, agent1/loss=112.801, agent2/loss=87.843, agent3/loss=97.814, agent4/loss=125.895, agent5/loss=98.816, agent6/loss=114.880, agent7/loss=113.903, agent8/loss=106.680, agent9/loss=116.457, env_step=124000, len=300, n/ep=2, n/st=500, rew=-3254.65]                            


Epoch #124: test_reward: -8718.632175 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #125: 1001it [01:25, 11.76it/s, agent0/loss=126.628, agent1/loss=100.596, agent2/loss=85.553, agent3/loss=113.688, agent4/loss=110.526, agent5/loss=101.854, agent6/loss=124.042, agent7/loss=118.964, agent8/loss=108.753, agent9/loss=102.187, env_step=125000, len=287, n/ep=3, n/st=500, rew=-2682.54]                          


Epoch #125: test_reward: -8754.878282 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #126: 1001it [01:14, 13.48it/s, agent0/loss=118.273, agent1/loss=108.115, agent2/loss=107.960, agent3/loss=130.541, agent4/loss=122.053, agent5/loss=108.282, agent6/loss=107.260, agent7/loss=122.234, agent8/loss=123.345, agent9/loss=110.660, env_step=126000, len=282, n/ep=0, n/st=500, rew=-3023.16]                          


Epoch #126: test_reward: -8716.990715 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #127: 1001it [01:03, 15.82it/s, agent0/loss=131.905, agent1/loss=102.509, agent2/loss=127.440, agent3/loss=109.828, agent4/loss=107.352, agent5/loss=100.930, agent6/loss=125.055, agent7/loss=118.254, agent8/loss=119.093, agent9/loss=110.053, env_step=127000, len=300, n/ep=2, n/st=500, rew=-3369.58]                          


Epoch #127: test_reward: -8716.944636 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #128: 1001it [01:03, 15.80it/s, agent0/loss=118.764, agent1/loss=130.216, agent2/loss=113.576, agent3/loss=110.937, agent4/loss=115.113, agent5/loss=118.066, agent6/loss=122.647, agent7/loss=126.454, agent8/loss=113.131, agent9/loss=112.277, env_step=128000, len=300, n/ep=3, n/st=500, rew=-3621.00]                          


Epoch #128: test_reward: -8719.640338 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #129: 1001it [01:00, 16.56it/s, agent0/loss=131.230, agent1/loss=137.540, agent2/loss=111.093, agent3/loss=114.166, agent4/loss=107.349, agent5/loss=122.942, agent6/loss=118.643, agent7/loss=126.917, agent8/loss=103.025, agent9/loss=118.014, env_step=129000, len=300, n/ep=0, n/st=500, rew=-3912.17]                          


Epoch #129: test_reward: -8730.670372 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #130: 1001it [01:00, 16.41it/s, agent0/loss=144.198, agent1/loss=131.976, agent2/loss=132.192, agent3/loss=147.446, agent4/loss=118.823, agent5/loss=109.005, agent6/loss=131.694, agent7/loss=135.469, agent8/loss=115.565, agent9/loss=106.859, env_step=130000, len=300, n/ep=2, n/st=500, rew=-3925.55]                          


Epoch #130: test_reward: -8712.042308 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #131: 1001it [01:02, 16.05it/s, agent0/loss=111.570, agent1/loss=123.059, agent2/loss=123.215, agent3/loss=116.219, agent4/loss=105.546, agent5/loss=118.964, agent6/loss=119.527, agent7/loss=132.145, agent8/loss=124.098, agent9/loss=104.927, env_step=131000, len=300, n/ep=3, n/st=500, rew=-3468.67]                          


Epoch #131: test_reward: -8731.848764 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #132: 1001it [01:00, 16.59it/s, agent0/loss=115.120, agent1/loss=144.001, agent2/loss=114.928, agent3/loss=126.688, agent4/loss=122.022, agent5/loss=123.318, agent6/loss=126.584, agent7/loss=131.467, agent8/loss=113.663, agent9/loss=128.765, env_step=132000, len=294, n/ep=0, n/st=500, rew=-2970.77]                          


Epoch #132: test_reward: -8725.168452 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #133: 1001it [01:01, 16.18it/s, agent0/loss=115.995, agent1/loss=140.083, agent2/loss=112.817, agent3/loss=109.594, agent4/loss=103.680, agent5/loss=105.119, agent6/loss=125.846, agent7/loss=110.264, agent8/loss=114.876, agent9/loss=96.123, env_step=133000, len=300, n/ep=2, n/st=500, rew=-3262.94]                          


Epoch #133: test_reward: -8698.205940 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #134: 1001it [01:00, 16.49it/s, agent0/loss=121.990, agent1/loss=132.013, agent2/loss=125.688, agent3/loss=131.992, agent4/loss=120.372, agent5/loss=124.559, agent6/loss=118.723, agent7/loss=127.444, agent8/loss=107.269, agent9/loss=134.342, env_step=134000, len=300, n/ep=3, n/st=500, rew=-3575.51]                          


Epoch #134: test_reward: -8725.491940 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #135: 1001it [01:01, 16.40it/s, agent0/loss=113.470, agent1/loss=146.985, agent2/loss=125.118, agent3/loss=140.064, agent4/loss=112.734, agent5/loss=111.628, agent6/loss=127.057, agent7/loss=133.182, agent8/loss=118.454, agent9/loss=147.341, env_step=135000, len=294, n/ep=0, n/st=500, rew=-3656.02]                          


Epoch #135: test_reward: -8720.106939 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #136: 1001it [01:01, 16.39it/s, agent0/loss=137.692, agent1/loss=119.227, agent2/loss=131.500, agent3/loss=130.927, agent4/loss=158.603, agent5/loss=123.362, agent6/loss=121.168, agent7/loss=141.151, agent8/loss=127.091, agent9/loss=115.232, env_step=136000, len=300, n/ep=2, n/st=500, rew=-4194.79]                          


Epoch #136: test_reward: -8708.994016 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #137: 1001it [01:02, 16.02it/s, agent0/loss=143.388, agent1/loss=125.273, agent2/loss=113.405, agent3/loss=117.713, agent4/loss=106.450, agent5/loss=131.797, agent6/loss=129.254, agent7/loss=136.215, agent8/loss=121.812, agent9/loss=155.422, env_step=137000, len=300, n/ep=3, n/st=500, rew=-3885.72]                          


Epoch #137: test_reward: -8723.601422 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #138: 1001it [01:01, 16.38it/s, agent0/loss=146.522, agent1/loss=136.172, agent2/loss=129.697, agent3/loss=123.473, agent4/loss=132.068, agent5/loss=127.299, agent6/loss=134.901, agent7/loss=128.041, agent8/loss=133.328, agent9/loss=135.594, env_step=138000, len=300, n/ep=0, n/st=500, rew=-4095.90]                          


Epoch #138: test_reward: -8735.914136 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #139: 1001it [01:01, 16.17it/s, agent0/loss=124.796, agent1/loss=146.497, agent2/loss=123.174, agent3/loss=121.726, agent4/loss=132.054, agent5/loss=120.213, agent6/loss=122.631, agent7/loss=122.390, agent8/loss=170.833, agent9/loss=129.569, env_step=139000, len=300, n/ep=2, n/st=500, rew=-4218.13]                          


Epoch #139: test_reward: -8716.625671 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #140: 1001it [01:01, 16.38it/s, agent0/loss=143.378, agent1/loss=155.130, agent2/loss=144.247, agent3/loss=156.972, agent4/loss=139.993, agent5/loss=114.392, agent6/loss=135.089, agent7/loss=128.594, agent8/loss=130.118, agent9/loss=131.091, env_step=140000, len=300, n/ep=3, n/st=500, rew=-4404.33]                          


Epoch #140: test_reward: -8746.441166 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #141: 1001it [00:59, 16.71it/s, agent0/loss=159.640, agent1/loss=133.846, agent2/loss=150.212, agent3/loss=150.739, agent4/loss=144.159, agent5/loss=149.768, agent6/loss=135.123, agent7/loss=124.369, agent8/loss=127.344, agent9/loss=127.494, env_step=141000, len=300, n/ep=0, n/st=500, rew=-4284.12]                          


Epoch #141: test_reward: -8741.137637 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #142: 1001it [01:02, 16.03it/s, agent0/loss=143.326, agent1/loss=150.606, agent2/loss=156.794, agent3/loss=132.047, agent4/loss=140.126, agent5/loss=144.633, agent6/loss=122.209, agent7/loss=146.985, agent8/loss=132.464, agent9/loss=128.076, env_step=142000, len=300, n/ep=2, n/st=500, rew=-4069.66]                          


Epoch #142: test_reward: -8719.107259 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #143: 1001it [01:00, 16.60it/s, agent0/loss=149.296, agent1/loss=145.198, agent2/loss=131.279, agent3/loss=137.226, agent4/loss=136.318, agent5/loss=120.578, agent6/loss=130.047, agent7/loss=135.138, agent8/loss=147.886, agent9/loss=148.840, env_step=143000, len=300, n/ep=3, n/st=500, rew=-4486.75]                          


Epoch #143: test_reward: -8726.537360 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #144: 1001it [01:04, 15.57it/s, agent0/loss=165.993, agent1/loss=164.796, agent2/loss=125.940, agent3/loss=149.899, agent4/loss=132.051, agent5/loss=146.533, agent6/loss=132.742, agent7/loss=138.520, agent8/loss=130.658, agent9/loss=154.142, env_step=144000, len=300, n/ep=0, n/st=500, rew=-3818.45]                          


Epoch #144: test_reward: -8715.574926 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #145: 1001it [01:02, 15.97it/s, agent0/loss=133.446, agent1/loss=140.890, agent2/loss=142.993, agent3/loss=127.640, agent4/loss=138.266, agent5/loss=140.449, agent6/loss=149.081, agent7/loss=169.159, agent8/loss=148.746, agent9/loss=137.023, env_step=145000, len=300, n/ep=2, n/st=500, rew=-4275.78]                          


Epoch #145: test_reward: -8749.237886 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #146: 1001it [01:04, 15.55it/s, agent0/loss=155.348, agent1/loss=155.513, agent2/loss=152.160, agent3/loss=126.346, agent4/loss=137.858, agent5/loss=138.161, agent6/loss=130.953, agent7/loss=143.682, agent8/loss=136.213, agent9/loss=135.807, env_step=146000, len=300, n/ep=3, n/st=500, rew=-4846.40]                          


Epoch #146: test_reward: -8751.016355 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #147: 1001it [01:04, 15.52it/s, agent0/loss=158.075, agent1/loss=147.698, agent2/loss=150.440, agent3/loss=142.794, agent4/loss=135.185, agent5/loss=135.256, agent6/loss=133.127, agent7/loss=125.797, agent8/loss=119.167, agent9/loss=168.618, env_step=147000, len=300, n/ep=0, n/st=500, rew=-4111.30]                          


Epoch #147: test_reward: -8696.649254 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #148: 1001it [01:02, 16.13it/s, agent0/loss=163.870, agent1/loss=146.575, agent2/loss=152.873, agent3/loss=149.201, agent4/loss=147.956, agent5/loss=160.515, agent6/loss=174.333, agent7/loss=156.243, agent8/loss=158.873, agent9/loss=163.561, env_step=148000, len=300, n/ep=2, n/st=500, rew=-4332.85]                          


Epoch #148: test_reward: -8735.542513 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #149: 1001it [01:01, 16.34it/s, agent0/loss=163.402, agent1/loss=148.695, agent2/loss=154.998, agent3/loss=151.608, agent4/loss=138.255, agent5/loss=148.826, agent6/loss=178.152, agent7/loss=158.783, agent8/loss=162.346, agent9/loss=150.866, env_step=149000, len=300, n/ep=3, n/st=500, rew=-4468.80]                          


Epoch #149: test_reward: -8735.642339 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #150: 1001it [01:03, 15.67it/s, agent0/loss=163.652, agent1/loss=159.089, agent2/loss=168.611, agent3/loss=160.566, agent4/loss=132.946, agent5/loss=160.169, agent6/loss=158.015, agent7/loss=139.230, agent8/loss=132.920, agent9/loss=141.128, env_step=150000, len=300, n/ep=0, n/st=500, rew=-4199.30]                          


Epoch #150: test_reward: -8699.082405 ± 0.000000, best_reward: -260.124235 ± 0.000000 in #23


Epoch #151:  50%|#####     | 500/1000 [00:19<00:19, 25.85it/s] 


KeyboardInterrupt: 

In [None]:

from typing import Optional, Tuple

import numpy as np
import torch
from tianshou.env import DummyVectorEnv
from tianshou.trainer import offpolicy_trainer
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger
import torch
#from Custom_Classes import CustomCollector

# Create a new instance of the policy with the same architecture as the saved policy
policy, optim, _ = _get_agents()
model_save_path = os.path.join("dqn_Custom", save_policy_name)        

# Load the saved checkpoint
policy_test = policy.policies['agent0']
policy_test.load_state_dict(torch.load(model_save_path ))

envs = DummyVectorEnv([_get_env for _ in range(1)])

envs.max_time_steps = 200
#policy_test.policies['agent0'].eval()
#policy.policies['agent0'].set_eps(0.9)

policy_test.eval()
policy_test.set_eps(0.00)

#collector = CustomCollector(policy.policies['agent0'], envs, exploration_noise=True)
#collector = CustomCollector(policy_test, envs, exploration_noise=False)
collector = CustomCollector(policy, envs, exploration_noise=True)

results = collector.collect(n_episode=10)
#collector.collect(n_episode=1, render=1 / 5000)




In [None]:
results

In [None]:
max(results['rews'])
print(np.mean(results['rews'][results['rews'] > -10]))


#create a function  to print a histogram of the results['rews']
import matplotlib.pyplot as plt
plt.hist(results['rews'][results['rews'] > -10], bins=100)
plt.show()


In [None]:
from turtle import st
import torch
from tianshou.data import Batch

# load policy as in your original code
policy, optim, _ = _get_agents()
model_save_path = os.path.join("dqn_Custom", save_policy_name)        
policy_test = policy.policies['agent0']
state_saved = torch.load(model_save_path )
print(policy_test)
policy_test.load_state_dict(state_saved)
policy_test.eval()
policy_test.set_eps(0.00)

# initialize your environment
#env = DummyVectorEnv([_get_env for _ in range(1)])
env = MultiDroneEnv(None)
env.max_time_steps = 100

# simulate the interaction with the environment manually
for episode in range(1):  # simulate 10 episodes
    
    obs, _  = env.reset(seed=episode)         
    info         = env.get_initial_state()
    
    drones = info["drones"]
    tasks = info["tasks"]
        
    done = {0 : False}
    truncations = {0 : False}
    
    episodo_reward = 0
    #obs, reward, done, truncations, info = env.step(action)

    while not all(done.values()) and not all(truncations.values()):
        
        agent_id = "agent" + str(env.agent_selector._current_agent)
        # Create a Batch of observations
        obs_batch = Batch(obs=obs[agent_id], info=[{}])  # add empty info for each observation
        
        #print(obs_batch)
        # Forward the batch of observations through the policy to get the actions
        action = policy_test(obs_batch).act
        action = {agent_id : action[0]}
       
        obs, reward, done, truncations, info = env.step(action)
          
        episodo_reward += sum(reward.values())/env.n_agents

       

print(episodo_reward)
