In [1]:
import os
from typing import Optional, Tuple

import gym
import numpy as np
import torch
from tianshou.data import Collector, VectorReplayBuffer, PrioritizedVectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import offpolicy_trainer
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger

from Custom_Classes import CustomNet
from Custom_Classes import CustomCollector
from Custom_Classes import CustomParallelToAECWrapper

#from CustomClass_multi_head import CustomNet
from Custom_Classes_simplified import CustomNetSimple
#from Custom_Classes_simplified import CustomCollectorSimple
#from Custom_Classes_simplified import CustomParallelToAECWrapperSimple

from CustomClasses_Transformer_Reduced import CustomNetReduced
from CustomClass_MultiHead_Transformer import CustomNetMultiHead
import importlib

from DroneEnv import MultiDroneEnv
from tianshou_DQN import train


model = "CustomNetMultiHead" # "CustomNet" or "CustomNetSimple" or "CustomNetReduced" or "CustomNetMultiHead"
test_num = "_Eval_TBTA_01_simplified_UCF1"

train_env_num = 5
test_env_num = 5

name = model + test_num

load_policy_name = f'policy_CustomNetMultiHead_Eval_TBTA_03_pre_process_F_reward.pth'
save_policy_name = f'policy_{name}.pth'
policy_path = "dqn_Custom"
load_model = False

log_path = os.path.join('./', "Logs", "dqn", name)

dqn_params = {"discount_factor": 0.98, 
              "estimation_step": 200, 
              "target_update_freq": 750,
              "optminizer": "Adam",
              "lr": 1e-4  }

trainer_params = {"max_epoch": 200,
                  "step_per_epoch": 1500 * train_env_num,
                  "step_per_collect": 300 * train_env_num,
                  "episode_per_test": 5 * test_env_num,
                  "batch_size" : 32,
                  "update_per_step": 0.1,
                  "tn_eps_max": 0.85,
                  "ts_eps_max": 0.001,
                  }

Run_Data = f'{name}\n\
        Loaded_Model: {load_policy_name if load_model == True else "no"} \n\
        log_path: {log_path} \n\
        train/test_env_num: {train_env_num} / {test_env_num} \n\
        model: {model} \n\
        dqn_params: {dqn_params} \n\
        trainer_params: {trainer_params} \n\
        obs: Task Info -> Dist / Quality for own drone \
            Agents_info -> Post_next / Time_next / Type \
            Scene:  F1:6, R1:6 | Rec:16, Att:4'

model_load_path = os.path.join(policy_path, load_policy_name)  
model_save_path = os.path.join(policy_path, save_policy_name)        
os.makedirs(os.path.join(policy_path), exist_ok=True)  
os.makedirs(os.path.join(log_path), exist_ok=True)


def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
    policy_load_path = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    
    env = _get_env()
    agent_name = env.agents[0]  # Get the name of the first agent

    #print(env.observation_space )
    agent_observation_space = env.observation_space # assuming 'agent0' is a valid agent name
    state_shape_agent_position = agent_observation_space["agent_position"].shape[0]
    state_shape_agent_state = agent_observation_space["agent_state"].shape[0]
    state_shape_agent_type = agent_observation_space["agent_type"].shape[0]
    state_shape_next_free_time = agent_observation_space["next_free_time"].shape[0]
    state_shape_position_after_last_task = agent_observation_space["position_after_last_task"].shape[0]       
    #state_shape_agent_relay_area = agent_observation_space["agent_relay_area"].shape[0]
        
    state_shape_agent = (state_shape_agent_position + state_shape_agent_state +
                     state_shape_agent_type+ state_shape_next_free_time + state_shape_position_after_last_task #+                     
                     #state_shape_agent_relay_area
                     )                 
    

    state_shape_task = env.observation_space["tasks_info"].shape[0]
                  
    action_shape = env.action_space[agent_name].shape[0]
    #action_shape = env.action_space[agent_name].n
               
    if agent_learn is None:
        # model
        if model == "CustomNet":        
            net = CustomNet(
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
        
        if model == "CustomNetSimple":
            net = CustomNetSimple(            
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
        
        if model == "CustomNetReduced":
            net = CustomNetReduced(            
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
        
        if model == "CustomNetMultiHead":
            net = CustomNetMultiHead(
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")

    
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=dqn_params["lr"])
    
        agent_learn = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor= dqn_params["discount_factor"],
            estimation_step=dqn_params["estimation_step"],
            target_update_freq=dqn_params["target_update_freq"],
        )  
        
        if load_model == True:
            # Load the saved checkpoint             
            agent_learn.load_state_dict(torch.load(model_load_path))
            print(f'Loaded-> {model_load_path}')
            
        
        agents = [agent_learn for _ in range(len(env.agents))]
        
    policy = MultiAgentPolicyManager(agents, env)    
        
    return policy, optim, env.agents


def _get_env():
    """This function is needed to provide callables for DummyVectorEnv."""
    env_paralell = MultiDroneEnv()
    #env = parallel_to_aec_wrapper(env_paralell)    
    env = CustomParallelToAECWrapper(env_paralell)
    
    return PettingZooEnv(env)

print(Run_Data)

CustomNetMultiHead_Eval_TBTA_01_simplified_UCF1
        Loaded_Model: no 
        log_path: ./Logs\dqn\CustomNetMultiHead_Eval_TBTA_01_simplified_UCF1 
        train/test_env_num: 5 / 5 
        model: CustomNetMultiHead 
        dqn_params: {'discount_factor': 0.98, 'estimation_step': 200, 'target_update_freq': 750, 'optminizer': 'Adam', 'lr': 0.0001} 
        trainer_params: {'max_epoch': 200, 'step_per_epoch': 7500, 'step_per_collect': 1500, 'episode_per_test': 25, 'batch_size': 32, 'update_per_step': 0.1, 'tn_eps_max': 0.85, 'ts_eps_max': 0.001} 
        obs: Task Info -> Dist / Quality for own drone             Agents_info -> Post_next / Time_next / Type             Scene:  F1:6, R1:6 | Rec:16, Att:4


In [2]:
if __name__ == "__main__":
                        
    torch.set_grad_enabled(True) 
    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(train_env_num)])
    test_envs = DummyVectorEnv([_get_env for _ in range(test_env_num)]) 

    # seed
    seed = 1
    np.random.seed(seed)
    
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)

    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents()
    

    # ======== Step 3: Collector setup =========
    train_collector = CustomCollector(
        policy,
        train_envs,
        #VectorReplayBuffer(100_000, len(train_envs)),
        PrioritizedVectorReplayBuffer( 100_000, len(train_envs), alpha=0.6, beta=0.4) ,       
        exploration_noise=True        
    )
    test_collector = CustomCollector(policy, test_envs, exploration_noise=True)
     
    train_collector.collect(n_step=trainer_params['batch_size'] * train_env_num)
    #test_collector.collect(n_step=trainer_params['batch size'] * train_env_num)
    
    # ======== tensorboard logging setup =========
    #         
    writer = SummaryWriter(log_path)
    writer.add_text(name, str(Run_Data))
    logger = TensorboardLogger(writer)
        
    # ======== Step 4: Callback functions setup =========
    def save_best_fn(policy):                
        torch.save(policy.policies[agents[0]].state_dict(), model_save_path)

    def stop_fn(mean_rewards):
        return mean_rewards >= 9939.0

    def train_fn(epoch, env_step):
        epsilon = trainer_params['tn_eps_max'] - (trainer_params['tn_eps_max'] - trainer_params['tn_eps_max']/100)*(epoch/trainer_params['max_epoch'])  
        policy.policies[agents[0]].set_eps(epsilon)

    def test_fn(epoch, env_step):
        epsilon = trainer_params['ts_eps_max']#0.01#max(0.001, 0.1 - epoch * 0.001)
        policy.policies[agents[0]].set_eps(epsilon)
        
    def reward_metric(rews):       
        #print(rews)
        return rews.mean()#[:,0]
                           
    # ======== Step 5: Run the trainer =========
    result = offpolicy_trainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,        
        max_epoch=trainer_params['max_epoch'],
        step_per_epoch=trainer_params['step_per_epoch'],
        step_per_collect=trainer_params['step_per_collect'],
        episode_per_test= trainer_params['episode_per_test'],
        batch_size=trainer_params['batch_size'],
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        update_per_step=trainer_params['update_per_step'],
        logger=logger,
        test_in_train=False,
        reward_metric=reward_metric,
        show_progress = True
        )

    # return result, policy.policies[agents[1]]
    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[0]])")

Epoch #1: 7501it [04:06, 30.43it/s, agent0/loss=202.745, agent1/loss=176.510, agent2/loss=227.083, agent3/loss=165.875, agent4/loss=175.410, agent5/loss=156.104, agent6/loss=165.100, agent7/loss=154.850, agent8/loss=162.167, agent9/loss=109.210, env_step=7500, len=291, n/ep=5, n/st=1500, rew=27.88]                            


Epoch #1: test_reward: 63.358118 ± 0.000000, best_reward: 76.897085 ± 0.000000 in #0


Epoch #2: 7501it [04:31, 27.60it/s, agent0/loss=251.076, agent1/loss=191.125, agent2/loss=103.811, agent3/loss=175.709, agent4/loss=96.223, agent5/loss=117.376, agent6/loss=87.782, agent7/loss=202.300, agent8/loss=88.334, agent9/loss=86.776, env_step=15000, len=270, n/ep=6, n/st=1500, rew=35.93]                              


Epoch #2: test_reward: 70.831266 ± 0.000000, best_reward: 76.897085 ± 0.000000 in #0


Epoch #3: 7501it [03:38, 34.35it/s, agent0/loss=104.597, agent1/loss=142.363, agent2/loss=124.864, agent3/loss=113.521, agent4/loss=120.109, agent5/loss=62.407, agent6/loss=113.991, agent7/loss=100.579, agent8/loss=76.597, agent9/loss=98.108, env_step=22500, len=300, n/ep=5, n/st=1500, rew=19.93]                            


Epoch #3: test_reward: 77.166958 ± 0.000000, best_reward: 77.166958 ± 0.000000 in #3


Epoch #4: 7501it [02:46, 45.07it/s, agent0/loss=116.628, agent1/loss=154.446, agent2/loss=123.692, agent3/loss=97.668, agent4/loss=86.238, agent5/loss=93.230, agent6/loss=110.529, agent7/loss=82.426, agent8/loss=74.141, agent9/loss=97.557, env_step=30000, len=250, n/ep=7, n/st=1500, rew=46.12]                              


Epoch #4: test_reward: 74.724018 ± 0.000000, best_reward: 77.166958 ± 0.000000 in #3


Epoch #5: 7501it [03:44, 33.45it/s, agent0/loss=145.037, agent1/loss=128.029, agent2/loss=123.441, agent3/loss=167.628, agent4/loss=91.070, agent5/loss=113.797, agent6/loss=87.886, agent7/loss=81.396, agent8/loss=88.307, agent9/loss=88.424, env_step=37500, len=294, n/ep=5, n/st=1500, rew=35.46]                            


Epoch #5: test_reward: 70.164816 ± 0.000000, best_reward: 77.166958 ± 0.000000 in #3


Epoch #6: 7501it [03:03, 40.99it/s, agent0/loss=91.402, agent1/loss=136.242, agent2/loss=123.338, agent3/loss=95.307, agent4/loss=93.078, agent5/loss=94.190, agent6/loss=73.058, agent7/loss=130.764, agent8/loss=81.267, agent9/loss=77.927, env_step=45000, len=288, n/ep=5, n/st=1500, rew=22.41]                               


Epoch #6: test_reward: 56.894299 ± 0.000000, best_reward: 77.166958 ± 0.000000 in #3


Epoch #7: 7501it [02:59, 41.77it/s, agent0/loss=72.972, agent1/loss=124.697, agent2/loss=97.995, agent3/loss=96.156, agent4/loss=99.622, agent5/loss=83.793, agent6/loss=95.254, agent7/loss=89.180, agent8/loss=67.832, agent9/loss=65.165, env_step=52500, len=289, n/ep=5, n/st=1500, rew=20.16]                                


Epoch #7: test_reward: 72.333677 ± 0.000000, best_reward: 77.166958 ± 0.000000 in #3


Epoch #8: 7501it [03:32, 35.22it/s, agent0/loss=96.703, agent1/loss=115.158, agent2/loss=89.686, agent3/loss=107.346, agent4/loss=76.070, agent5/loss=88.503, agent6/loss=85.304, agent7/loss=97.203, agent8/loss=77.316, agent9/loss=100.553, env_step=60000, len=296, n/ep=5, n/st=1500, rew=26.68]                               


Epoch #8: test_reward: 68.148650 ± 0.000000, best_reward: 77.166958 ± 0.000000 in #3


Epoch #9: 7501it [03:15, 38.29it/s, agent0/loss=81.703, agent1/loss=118.379, agent2/loss=128.190, agent3/loss=108.019, agent4/loss=78.083, agent5/loss=109.969, agent6/loss=85.873, agent7/loss=89.613, agent8/loss=65.191, agent9/loss=88.362, env_step=67500, len=269, n/ep=5, n/st=1500, rew=40.43]                             


Epoch #9: test_reward: 72.812904 ± 0.000000, best_reward: 77.166958 ± 0.000000 in #3


Epoch #10: 7501it [03:39, 34.23it/s, agent0/loss=84.897, agent1/loss=107.156, agent2/loss=115.417, agent3/loss=94.817, agent4/loss=66.726, agent5/loss=81.853, agent6/loss=107.360, agent7/loss=82.941, agent8/loss=63.847, agent9/loss=77.769, env_step=75000, len=237, n/ep=6, n/st=1500, rew=53.46]                            


Epoch #10: test_reward: 78.571223 ± 0.000000, best_reward: 78.571223 ± 0.000000 in #10


Epoch #11:  60%|######    | 4500/7500 [02:18<01:32, 32.53it/s, agent0/loss=96.015, agent1/loss=88.327, agent2/loss=132.681, agent3/loss=117.221, agent4/loss=73.111, agent5/loss=106.449, agent6/loss=129.470, agent7/loss=77.314, agent8/loss=80.508, agent9/loss=82.871, env_step=78000, len=270, n/ep=7, n/st=1500, rew=55.69] 


KeyboardInterrupt: 

In [None]:

from typing import Optional, Tuple

import numpy as np
import torch
from tianshou.env import DummyVectorEnv
from tianshou.trainer import offpolicy_trainer
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger
import torch
#from Custom_Classes import CustomCollector

# Create a new instance of the policy with the same architecture as the saved policy
policy, optim, _ = _get_agents()
model_save_path = os.path.join("dqn_Custom", save_policy_name)        

# Load the saved checkpoint
policy_test = policy.policies['agent0']
policy_test.load_state_dict(torch.load(model_save_path ))

envs = DummyVectorEnv([_get_env for _ in range(1)])

envs.max_time_steps = 200
#policy_test.policies['agent0'].eval()
#policy.policies['agent0'].set_eps(0.9)

policy_test.eval()
policy_test.set_eps(0.00)

#collector = CustomCollector(policy.policies['agent0'], envs, exploration_noise=True)
#collector = CustomCollector(policy_test, envs, exploration_noise=False)
collector = CustomCollector(policy, envs, exploration_noise=True)

results = collector.collect(n_episode=10)
#collector.collect(n_episode=1, render=1 / 5000)




In [None]:
results

In [None]:
max(results['rews'])
print(np.mean(results['rews'][results['rews'] > -10]))


#create a function  to print a histogram of the results['rews']
import matplotlib.pyplot as plt
plt.hist(results['rews'][results['rews'] > -10], bins=100)
plt.show()


In [None]:
from turtle import st
import torch
from tianshou.data import Batch

# load policy as in your original code
policy, optim, _ = _get_agents()
model_save_path = os.path.join("dqn_Custom", save_policy_name)        
policy_test = policy.policies['agent0']
state_saved = torch.load(model_save_path )
print(policy_test)
policy_test.load_state_dict(state_saved)
policy_test.eval()
policy_test.set_eps(0.00)

# initialize your environment
#env = DummyVectorEnv([_get_env for _ in range(1)])
env = MultiDroneEnv(None)
env.max_time_steps = 100

# simulate the interaction with the environment manually
for episode in range(1):  # simulate 10 episodes
    
    obs, _  = env.reset(seed=episode)         
    info         = env.get_initial_state()
    
    drones = info["drones"]
    tasks = info["tasks"]
        
    done = {0 : False}
    truncations = {0 : False}
    
    episodo_reward = 0
    #obs, reward, done, truncations, info = env.step(action)

    while not all(done.values()) and not all(truncations.values()):
        
        agent_id = "agent" + str(env.agent_selector._current_agent)
        # Create a Batch of observations
        obs_batch = Batch(obs=obs[agent_id], info=[{}])  # add empty info for each observation
        
        #print(obs_batch)
        # Forward the batch of observations through the policy to get the actions
        action = policy_test(obs_batch).act
        action = {agent_id : action[0]}
       
        obs, reward, done, truncations, info = env.step(action)
          
        episodo_reward += sum(reward.values())/env.n_agents

       

print(episodo_reward)
