In [1]:
import os
from typing import Optional, Tuple

import gym
import numpy as np
import torch
from tianshou.data import Collector, VectorReplayBuffer, PrioritizedVectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import offpolicy_trainer
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger

from Custom_Classes import CustomNet
from Custom_Classes import CustomCollector
from Custom_Classes import CustomParallelToAECWrapper

#from CustomClass_multi_head import CustomNet
from Custom_Classes_simplified import CustomNetSimple
#from Custom_Classes_simplified import CustomCollectorSimple
#from Custom_Classes_simplified import CustomParallelToAECWrapperSimple

from CustomClasses_Transformer_Reduced import CustomNetReduced
from CustomClass_MultiHead_Transformer import CustomNetMultiHead
import importlib

from DroneEnv import MultiDroneEnv
from tianshou_DQN import train


model = "CustomNetMultiHead" # "CustomNet" or "CustomNetSimple" or "CustomNetReduced" or "CustomNetMultiHead"
test_num = "Eval_TBTA_03_pre_process"

train_env_num = 5
test_env_num = 5

name = model + test_num

load_policy_name = f'policy_CustomNetSimple1605_01_1_Priorized_1605_01_1_Priorized.pth'
save_policy_name = f'policy_{name}.pth'
policy_path = "dqn_Custom"
load_model = False

log_path = os.path.join('./', "Logs", "dqn", name)

dqn_params = {"discount_factor": 0.98, 
              "estimation_step": 5, 
              "target_update_freq": 100,
              "optminizer": "Adam",
              "lr": 1e-4  }

trainer_params = {"max_epoch": 200,
                  "step_per_epoch": 500 * train_env_num,
                  "step_per_collect": 100 * train_env_num,
                  "episode_per_test": 10 * test_env_num,
                  "batch_size" : 32,
                  "update_per_step": 0.1,
                  "tn_eps_max": 0.85,
                  "ts_eps_max": 0.001,
                  }

Run_Data = f'{name}\n\
        Loaded_Model: {load_policy_name if load_model == True else "no"} \n\
        log_path: {log_path} \n\
        train/test_env_num: {train_env_num} / {test_env_num} \n\
        model: {model} \n\
        dqn_params: {dqn_params} \n\
        trainer_params: {trainer_params} \n\
        obs: Task Info -> Dist / Quality for own drone \
            Agents_info -> Post_next / Time_next / Type \
            Scene:  F1:6, R1:6 | Rec:16, Att:4'

model_load_path = os.path.join(policy_path, load_policy_name)  
model_save_path = os.path.join(policy_path, save_policy_name)        
os.makedirs(os.path.join(policy_path), exist_ok=True)  
os.makedirs(os.path.join(log_path), exist_ok=True)


def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
    policy_load_path = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    
    env = _get_env()
    agent_name = env.agents[0]  # Get the name of the first agent

    #print(env.observation_space )
    agent_observation_space = env.observation_space # assuming 'agent0' is a valid agent name
    state_shape_agent_position = agent_observation_space["agent_position"].shape[0]
    state_shape_agent_state = agent_observation_space["agent_state"].shape[0]
    state_shape_agent_type = agent_observation_space["agent_type"].shape[0]
    state_shape_next_free_time = agent_observation_space["next_free_time"].shape[0]
    state_shape_position_after_last_task = agent_observation_space["position_after_last_task"].shape[0]       
    #state_shape_agent_relay_area = agent_observation_space["agent_relay_area"].shape[0]
        
    state_shape_agent = (state_shape_agent_position + state_shape_agent_state +
                     state_shape_agent_type+ state_shape_next_free_time + state_shape_position_after_last_task #+                     
                     #state_shape_agent_relay_area
                     )                 
    

    state_shape_task = env.observation_space["tasks_info"].shape[0]
                  
    action_shape = env.action_space[agent_name].shape[0]
    #action_shape = env.action_space[agent_name].n
               
    if agent_learn is None:
        # model
        if model == "CustomNet":        
            net = CustomNet(
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
        
        if model == "CustomNetSimple":
            net = CustomNetSimple(            
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
        
        if model == "CustomNetReduced":
            net = CustomNetReduced(            
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
        
        if model == "CustomNetMultiHead":
            net = CustomNetMultiHead(
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")

    
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=dqn_params["lr"])
    
        agent_learn = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor= dqn_params["discount_factor"],
            estimation_step=dqn_params["estimation_step"],
            target_update_freq=dqn_params["target_update_freq"],
        )  
        
        if load_model == True:
            # Load the saved checkpoint             
            agent_learn.load_state_dict(torch.load(model_load_path))
            print(f'Loaded-> {model_load_path}')
            
        
        agents = [agent_learn for _ in range(len(env.agents))]
        
    policy = MultiAgentPolicyManager(agents, env)    
        
    return policy, optim, env.agents


def _get_env():
    """This function is needed to provide callables for DummyVectorEnv."""
    env_paralell = MultiDroneEnv()
    #env = parallel_to_aec_wrapper(env_paralell)    
    env = CustomParallelToAECWrapper(env_paralell)
    
    return PettingZooEnv(env)

print(Run_Data)

  from .autonotebook import tqdm as notebook_tqdm


CustomNetMultiHeadEval_TBTA_03_pre_process
        Loaded_Model: no 
        log_path: ./Logs\dqn\CustomNetMultiHeadEval_TBTA_03_pre_process 
        train/test_env_num: 5 / 5 
        model: CustomNetMultiHead 
        dqn_params: {'discount_factor': 0.98, 'estimation_step': 5, 'target_update_freq': 100, 'optminizer': 'Adam', 'lr': 0.0001} 
        trainer_params: {'max_epoch': 200, 'step_per_epoch': 2500, 'step_per_collect': 500, 'episode_per_test': 50, 'batch_size': 32, 'update_per_step': 0.1, 'tn_eps_max': 0.85, 'ts_eps_max': 0.001} 
        obs: Task Info -> Dist / Quality for own drone             Agents_info -> Post_next / Time_next / Type             Scene:  F1:6, R1:6 | Rec:16, Att:4


In [2]:
if __name__ == "__main__":
                        
    torch.set_grad_enabled(True) 
    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(train_env_num)])
    test_envs = DummyVectorEnv([_get_env for _ in range(test_env_num)]) 

    # seed
    seed = 1
    np.random.seed(seed)
    
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)

    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents()
    

    # ======== Step 3: Collector setup =========
    train_collector = CustomCollector(
        policy,
        train_envs,
        #VectorReplayBuffer(100_000, len(train_envs)),
        PrioritizedVectorReplayBuffer( 100_000, len(train_envs), alpha=0.6, beta=0.4) ,       
        exploration_noise=True        
    )
    test_collector = CustomCollector(policy, test_envs, exploration_noise=True)
     
    train_collector.collect(n_step=trainer_params['batch_size'] * train_env_num)
    #test_collector.collect(n_step=trainer_params['batch size'] * train_env_num)
    
    # ======== tensorboard logging setup =========
    #         
    writer = SummaryWriter(log_path)
    writer.add_text(name, str(Run_Data))
    logger = TensorboardLogger(writer)
        
    # ======== Step 4: Callback functions setup =========
    def save_best_fn(policy):                
        torch.save(policy.policies[agents[0]].state_dict(), model_save_path)

    def stop_fn(mean_rewards):
        return mean_rewards >= 9939.0

    def train_fn(epoch, env_step):
        epsilon = trainer_params['tn_eps_max'] - (trainer_params['tn_eps_max'] - trainer_params['tn_eps_max']/100)*(epoch/trainer_params['max_epoch'])  
        policy.policies[agents[0]].set_eps(epsilon)

    def test_fn(epoch, env_step):
        epsilon = trainer_params['ts_eps_max']#0.01#max(0.001, 0.1 - epoch * 0.001)
        policy.policies[agents[0]].set_eps(epsilon)
        
    def reward_metric(rews):       
        #print(rews)
        return rews.mean()#[:,0]
                           
    # ======== Step 5: Run the trainer =========
    result = offpolicy_trainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,        
        max_epoch=trainer_params['max_epoch'],
        step_per_epoch=trainer_params['step_per_epoch'],
        step_per_collect=trainer_params['step_per_collect'],
        episode_per_test= trainer_params['episode_per_test'],
        batch_size=trainer_params['batch_size'],
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        update_per_step=trainer_params['update_per_step'],
        logger=logger,
        test_in_train=False,
        reward_metric=reward_metric,
        show_progress = True
        )

    # return result, policy.policies[agents[1]]
    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[0]])")



.


Epoch #1: 2501it [00:30, 83.32it/s, agent0/loss=29.720, agent1/loss=52.311, agent2/loss=51.521, agent3/loss=39.067, agent4/loss=26.282, agent5/loss=26.414, agent6/loss=33.183, agent7/loss=31.051, agent8/loss=25.065, agent9/loss=23.676, env_step=2500, len=289, n/ep=0, n/st=500, rew=-4.08]                                 


Epoch #1: test_reward: -6.143507 ± 0.000000, best_reward: 20.523214 ± 0.000000 in #0


Epoch #2: 2501it [00:28, 87.60it/s, agent0/loss=30.851, agent1/loss=19.788, agent2/loss=25.881, agent3/loss=21.793, agent4/loss=25.367, agent5/loss=18.865, agent6/loss=20.734, agent7/loss=19.437, agent8/loss=13.678, agent9/loss=13.422, env_step=5000, len=300, n/ep=2, n/st=500, rew=-45.04]                           


Epoch #2: test_reward: 16.829512 ± 0.000000, best_reward: 20.523214 ± 0.000000 in #0


Epoch #3: 2501it [00:27, 90.12it/s, agent0/loss=20.575, agent1/loss=22.126, agent2/loss=13.895, agent3/loss=11.113, agent4/loss=16.467, agent5/loss=17.629, agent6/loss=20.351, agent7/loss=12.857, agent8/loss=20.530, agent9/loss=6.619, env_step=7500, len=300, n/ep=2, n/st=500, rew=-52.16]                            


Epoch #3: test_reward: 7.673075 ± 0.000000, best_reward: 20.523214 ± 0.000000 in #0


Epoch #4: 2501it [00:29, 84.57it/s, agent0/loss=14.288, agent1/loss=29.550, agent2/loss=13.517, agent3/loss=12.127, agent4/loss=16.255, agent5/loss=15.339, agent6/loss=11.917, agent7/loss=13.037, agent8/loss=12.980, agent9/loss=12.780, env_step=10000, len=300, n/ep=1, n/st=500, rew=-57.23]                          


Epoch #4: test_reward: 25.293285 ± 0.000000, best_reward: 25.293285 ± 0.000000 in #4


Epoch #5: 2501it [00:28, 87.92it/s, agent0/loss=17.410, agent1/loss=21.158, agent2/loss=16.720, agent3/loss=13.578, agent4/loss=18.994, agent5/loss=14.512, agent6/loss=11.468, agent7/loss=16.093, agent8/loss=10.639, agent9/loss=11.860, env_step=12500, len=300, n/ep=2, n/st=500, rew=-35.13]                          


Epoch #5: test_reward: 32.574004 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #6: 2501it [00:28, 87.40it/s, agent0/loss=16.687, agent1/loss=15.882, agent2/loss=9.291, agent3/loss=12.213, agent4/loss=10.851, agent5/loss=8.453, agent6/loss=7.663, agent7/loss=8.365, agent8/loss=9.454, agent9/loss=9.605, env_step=15000, len=300, n/ep=2, n/st=500, rew=-37.40]                                


Epoch #6: test_reward: 12.433095 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #7: 2501it [00:29, 85.69it/s, agent0/loss=20.354, agent1/loss=18.953, agent2/loss=15.159, agent3/loss=11.125, agent4/loss=12.725, agent5/loss=11.630, agent6/loss=7.557, agent7/loss=9.831, agent8/loss=10.846, agent9/loss=9.705, env_step=17500, len=300, n/ep=1, n/st=500, rew=-34.87]                           


Epoch #7: test_reward: 24.610077 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #8: 2501it [00:28, 86.83it/s, agent0/loss=20.341, agent1/loss=14.037, agent2/loss=12.679, agent3/loss=11.997, agent4/loss=11.460, agent5/loss=6.365, agent6/loss=10.575, agent7/loss=10.088, agent8/loss=8.359, agent9/loss=11.312, env_step=20000, len=300, n/ep=2, n/st=500, rew=-72.74]                           


Epoch #8: test_reward: 26.268071 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #9: 2501it [00:28, 86.71it/s, agent0/loss=22.331, agent1/loss=12.773, agent2/loss=10.724, agent3/loss=11.468, agent4/loss=8.456, agent5/loss=8.824, agent6/loss=8.986, agent7/loss=10.983, agent8/loss=10.270, agent9/loss=7.844, env_step=22500, len=300, n/ep=1, n/st=500, rew=-79.37]                            


Epoch #9: test_reward: 0.809599 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #10: 2501it [00:28, 88.71it/s, agent0/loss=16.408, agent1/loss=12.194, agent2/loss=13.695, agent3/loss=14.086, agent4/loss=12.377, agent5/loss=7.580, agent6/loss=7.820, agent7/loss=8.459, agent8/loss=8.608, agent9/loss=8.134, env_step=25000, len=300, n/ep=2, n/st=500, rew=-65.86]                            


Epoch #10: test_reward: 23.896640 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #11: 2501it [00:28, 86.32it/s, agent0/loss=17.863, agent1/loss=13.855, agent2/loss=9.787, agent3/loss=11.824, agent4/loss=6.596, agent5/loss=4.993, agent6/loss=11.099, agent7/loss=9.587, agent8/loss=12.176, agent9/loss=7.768, env_step=27500, len=300, n/ep=2, n/st=500, rew=-66.19]                           


Epoch #11: test_reward: 31.383985 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #12: 2501it [00:29, 86.08it/s, agent0/loss=15.468, agent1/loss=16.554, agent2/loss=10.255, agent3/loss=12.346, agent4/loss=8.464, agent5/loss=10.108, agent6/loss=9.526, agent7/loss=7.973, agent8/loss=9.667, agent9/loss=11.289, env_step=30000, len=300, n/ep=1, n/st=500, rew=-64.74]                            


Epoch #12: test_reward: 19.004681 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #13: 2501it [00:28, 88.05it/s, agent0/loss=16.479, agent1/loss=12.893, agent2/loss=10.090, agent3/loss=9.791, agent4/loss=7.128, agent5/loss=7.515, agent6/loss=8.873, agent7/loss=11.664, agent8/loss=8.311, agent9/loss=10.820, env_step=32500, len=248, n/ep=1, n/st=500, rew=44.60]                             


Epoch #13: test_reward: 17.433123 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #14: 2501it [00:28, 86.89it/s, agent0/loss=17.822, agent1/loss=10.666, agent2/loss=11.380, agent3/loss=8.294, agent4/loss=16.450, agent5/loss=10.486, agent6/loss=15.762, agent7/loss=9.917, agent8/loss=8.952, agent9/loss=8.805, env_step=35000, len=244, n/ep=3, n/st=500, rew=18.77]                            


Epoch #14: test_reward: 5.697079 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #15: 2501it [00:28, 87.99it/s, agent0/loss=16.955, agent1/loss=16.273, agent2/loss=12.682, agent3/loss=11.367, agent4/loss=8.460, agent5/loss=6.897, agent6/loss=6.726, agent7/loss=12.567, agent8/loss=13.148, agent9/loss=8.538, env_step=37500, len=272, n/ep=3, n/st=500, rew=5.57]                              


Epoch #15: test_reward: 16.291907 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #16: 2501it [00:28, 87.32it/s, agent0/loss=11.639, agent1/loss=14.408, agent2/loss=11.831, agent3/loss=14.740, agent4/loss=8.415, agent5/loss=9.078, agent6/loss=13.384, agent7/loss=11.546, agent8/loss=14.070, agent9/loss=9.058, env_step=40000, len=300, n/ep=3, n/st=500, rew=-12.47]                          


Epoch #16: test_reward: 17.810285 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #17: 2501it [00:29, 85.12it/s, agent0/loss=15.347, agent1/loss=15.491, agent2/loss=17.013, agent3/loss=12.480, agent4/loss=8.414, agent5/loss=8.304, agent6/loss=11.060, agent7/loss=9.947, agent8/loss=9.817, agent9/loss=11.510, env_step=42500, len=276, n/ep=2, n/st=500, rew=-11.41]                            


Epoch #17: test_reward: 26.728760 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #18: 2501it [00:27, 89.95it/s, agent0/loss=13.599, agent1/loss=8.531, agent2/loss=10.713, agent3/loss=9.285, agent4/loss=8.034, agent5/loss=11.054, agent6/loss=11.982, agent7/loss=12.841, agent8/loss=8.660, agent9/loss=14.934, env_step=45000, len=246, n/ep=2, n/st=500, rew=14.22]                              


Epoch #18: test_reward: -5.930292 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #19: 2501it [00:28, 86.52it/s, agent0/loss=12.646, agent1/loss=12.397, agent2/loss=15.254, agent3/loss=14.662, agent4/loss=8.438, agent5/loss=8.272, agent6/loss=7.188, agent7/loss=9.719, agent8/loss=10.635, agent9/loss=12.324, env_step=47500, len=300, n/ep=2, n/st=500, rew=-76.30]                            


Epoch #19: test_reward: 4.889419 ± 0.000000, best_reward: 32.574004 ± 0.000000 in #5


Epoch #20:  40%|####      | 1000/2500 [00:07<00:11, 125.36it/s, agent0/loss=11.780, agent1/loss=13.585, agent2/loss=15.255, agent3/loss=13.443, agent4/loss=9.154, agent5/loss=10.052, agent6/loss=9.608, agent7/loss=8.805, agent8/loss=12.715, agent9/loss=14.932, env_step=48000, len=300, n/ep=2, n/st=500, rew=-51.23]


KeyboardInterrupt: 

In [None]:

from typing import Optional, Tuple

import numpy as np
import torch
from tianshou.env import DummyVectorEnv
from tianshou.trainer import offpolicy_trainer
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger
import torch
#from Custom_Classes import CustomCollector

# Create a new instance of the policy with the same architecture as the saved policy
policy, optim, _ = _get_agents()
model_save_path = os.path.join("dqn_Custom", save_policy_name)        

# Load the saved checkpoint
policy_test = policy.policies['agent0']
policy_test.load_state_dict(torch.load(model_save_path ))

envs = DummyVectorEnv([_get_env for _ in range(1)])

envs.max_time_steps = 200
#policy_test.policies['agent0'].eval()
#policy.policies['agent0'].set_eps(0.9)

policy_test.eval()
policy_test.set_eps(0.00)

#collector = CustomCollector(policy.policies['agent0'], envs, exploration_noise=True)
#collector = CustomCollector(policy_test, envs, exploration_noise=False)
collector = CustomCollector(policy, envs, exploration_noise=True)

results = collector.collect(n_episode=10)
#collector.collect(n_episode=1, render=1 / 5000)




In [None]:
results

In [None]:
max(results['rews'])
print(np.mean(results['rews'][results['rews'] > -10]))


#create a function  to print a histogram of the results['rews']
import matplotlib.pyplot as plt
plt.hist(results['rews'][results['rews'] > -10], bins=100)
plt.show()


In [None]:
from turtle import st
import torch
from tianshou.data import Batch

# load policy as in your original code
policy, optim, _ = _get_agents()
model_save_path = os.path.join("dqn_Custom", save_policy_name)        
policy_test = policy.policies['agent0']
state_saved = torch.load(model_save_path )
print(policy_test)
policy_test.load_state_dict(state_saved)
policy_test.eval()
policy_test.set_eps(0.00)

# initialize your environment
#env = DummyVectorEnv([_get_env for _ in range(1)])
env = MultiDroneEnv(None)
env.max_time_steps = 100

# simulate the interaction with the environment manually
for episode in range(1):  # simulate 10 episodes
    
    obs, _  = env.reset(seed=episode)         
    info         = env.get_initial_state()
    
    drones = info["drones"]
    tasks = info["tasks"]
        
    done = {0 : False}
    truncations = {0 : False}
    
    episodo_reward = 0
    #obs, reward, done, truncations, info = env.step(action)

    while not all(done.values()) and not all(truncations.values()):
        
        agent_id = "agent" + str(env.agent_selector._current_agent)
        # Create a Batch of observations
        obs_batch = Batch(obs=obs[agent_id], info=[{}])  # add empty info for each observation
        
        #print(obs_batch)
        # Forward the batch of observations through the policy to get the actions
        action = policy_test(obs_batch).act
        action = {agent_id : action[0]}
       
        obs, reward, done, truncations, info = env.step(action)
          
        episodo_reward += sum(reward.values())/env.n_agents

       

print(episodo_reward)
