In [None]:
from gc import collect
import os
from typing import Optional, Tuple

import gym
import numpy as np
import torch
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import offpolicy_trainer
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger


from Custom_Classes import CustomNet
from Custom_Classes import CustomCollector
from Custom_Classes import CustomParallelToAECWrapper

#from CustomClass_multi_head import CustomNet
from Custom_Classes_simplified import CustomNetSimple
from Custom_Classes_simplified import CustomCollectorSimple
from Custom_Classes_simplified import CustomParallelToAECWrapperSimple

from DroneEnv import MultiDroneEnv
from tianshou_DQN import train


model = "CustomNetSimple" # "CustomNet" or "CustomNetSimple"
test_num = "1505_03_3"

train_env_num = 10
test_env_num = 10

name = model + test_num

load_policy_name = f'policy_Simple_1505_02.pth'
save_policy_name = f'policy_{name}_{test_num}'
policy_path = "dqn_Custom"
load_model = False

log_path = os.path.join('./', "Logs", "dqn", name)

dqn_params = {"discount_factor": 0.99, 
              "estimation_step": 1, 
              "target_update_freq": 100,
              "optminizer": "Adam",
              "lr": 1e-4  }

trainer_params = {"max_epoch": 10000,
                  "step_per_epoch": 300 * train_env_num,
                  "step_per_collect": 100 * train_env_num,
                  "episode_per_test": 10 * test_env_num,
                  "batch_size" : 32,
                  "update_per_step": 0.1,
                  "tn_eps_max": 0.2,
                  "ts_eps_max": 0.01,
                  }

Run_Data = f'{name}\n\
        Loaded_Model: {load_policy_name if load_model == True else "no"} \n\
        log_path: {log_path} \n\
        train/test_env_num: {train_env_num} / {test_env_num} \n\
        model: {model} \n\
        dqn_params: {dqn_params} \n\
        trainer_params: {trainer_params} \n'

model_load_path = os.path.join(policy_path, load_policy_name)  
model_save_path = os.path.join(policy_path, save_policy_name)        
os.makedirs(os.path.join(policy_path), exist_ok=True)  
os.makedirs(os.path.join(log_path), exist_ok=True)


def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
    policy_load_path = None
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    
    env = _get_env()
    agent_name = env.agents[0]  # Get the name of the first agent
    agent_observation_space = env.observation_space[agent_name]  # Get the observation_space for that agent
       
    agent_observation_space = env.observation_space["agent0"]  # assuming 'agent0' is a valid agent name
    state_shape_agent_position = agent_observation_space["agent_position"].shape[0]
    state_shape_agent_state = agent_observation_space["agent_state"].shape[0]
    state_shape_agent_type = agent_observation_space["agent_type"].shape[0]
    state_shape_next_free_time = agent_observation_space["next_free_time"].shape[0]
    state_shape_position_after_last_task = agent_observation_space["position_after_last_task"].shape[0]       
    #state_shape_agent_relay_area = agent_observation_space["agent_relay_area"].shape[0]
    
    
    state_shape_agent = (state_shape_agent_position + state_shape_agent_state +
                     state_shape_agent_type+ state_shape_next_free_time + state_shape_position_after_last_task #+                     
                     #state_shape_agent_relay_area
                     )                 
    

    state_shape_task = env.observation_space["agent0"]["tasks_info"].shape[0]
                  
    action_shape = env.action_space[agent_name].shape[0]
    #action_shape = env.action_space[agent_name].n
               
    if agent_learn is None:
        # model
        if model == "CustomNet":        
            net = CustomNet(
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
        
        if model == "CustomNetSimple":
            net = CustomNetSimple(            
                state_shape_agent=state_shape_agent,
                state_shape_task=state_shape_task,
                action_shape=action_shape,
                hidden_sizes=[128,128],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")

    
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=dqn_params["lr"])
    
        agent_learn = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor= dqn_params["discount_factor"],
            estimation_step=dqn_params["estimation_step"],
            target_update_freq=dqn_params["target_update_freq"],
        )  
        
        if load_model == True:
            # Load the saved checkpoint             
            agent_learn.load_state_dict(torch.load(model_load_path))
            print(f'Loaded-> {model_load_path}')
            
        
        agents = [agent_learn for _ in range(len(env.agents))]
        
    policy = MultiAgentPolicyManager(agents, env)    
        
    return policy, optim, env.agents


def _get_env():
    """This function is needed to provide callables for DummyVectorEnv."""
    env_paralell = MultiDroneEnv()
    #env = parallel_to_aec_wrapper(env_paralell)    
    env = CustomParallelToAECWrapper(env_paralell)
    
    return PettingZooEnv(env)

print(Run_Data)

In [27]:
if __name__ == "__main__":
                        
    torch.set_grad_enabled(True) 
    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(train_env_num)])
    test_envs = DummyVectorEnv([_get_env for _ in range(test_env_num)]) 

    # seed
    seed = 1
    np.random.seed(seed)
    
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)

    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents()
    

    # ======== Step 3: Collector setup =========
    train_collector = CustomCollector(
        policy,
        train_envs,
        VectorReplayBuffer(100_000, len(train_envs)),
        exploration_noise=True,
    )
    test_collector = CustomCollector(policy, test_envs, exploration_noise=True)
     
    train_collector.collect(n_step=trainer_params['batch_size'] * train_env_num)
    #test_collector.collect(n_step=trainer_params['batch size'] * train_env_num)
    
    # ======== tensorboard logging setup =========
    #         
    writer = SummaryWriter(log_path)
    writer.add_text(name, str(Run_Data))
    logger = TensorboardLogger(writer)
        
    # ======== Step 4: Callback functions setup =========
    def save_best_fn(policy):                
        torch.save(policy.policies[agents[0]].state_dict(), model_save_path)

    def stop_fn(mean_rewards):
        return mean_rewards >= 9939.0

    def train_fn(epoch, env_step):
        epsilon = max(trainer_params['tn_eps_max']/10, trainer_params['tn_eps_max'] - epoch * trainer_params['tn_eps_max']/1000)
        policy.policies[agents[0]].set_eps(epsilon)

    def test_fn(epoch, env_step):
        epsilon = trainer_params['ts_eps_max']#0.01#max(0.001, 0.1 - epoch * 0.001)
        policy.policies[agents[0]].set_eps(epsilon)
        
    def reward_metric(rews):       
        #print(rews)
        return rews.mean()#[:,0]
                           
    # ======== Step 5: Run the trainer =========
    result = offpolicy_trainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,        
        max_epoch=trainer_params['max_epoch'],
        step_per_epoch=trainer_params['step_per_epoch'],
        step_per_collect=trainer_params['step_per_collect'],
        episode_per_test= trainer_params['episode_per_test'],
        batch_size=trainer_params['batch_size'],
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        update_per_step=trainer_params['update_per_step'],
        logger=logger,
        test_in_train=False,
        reward_metric=reward_metric,
        show_progress = True
        )

    # return result, policy.policies[agents[1]]
    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[0]])")

Epoch #315: 3001it [00:05, 576.34it/s, agent0/loss=9.179, env_step=945000, len=10, n/ep=100, n/st=1000, rew=4.02]                           


Epoch #315: test_reward: 2.986403 ± 0.000000, best_reward: 4.202308 ± 0.000000 in #300


Epoch #316: 3001it [00:03, 757.45it/s, agent0/loss=8.804, env_step=948000, len=10, n/ep=100, n/st=1000, rew=4.01]                           


Epoch #316: test_reward: 2.401333 ± 0.000000, best_reward: 4.202308 ± 0.000000 in #300


Epoch #317: 3001it [00:03, 805.42it/s, agent0/loss=8.788, env_step=951000, len=10, n/ep=100, n/st=1000, rew=4.69]                           


Epoch #317: test_reward: 4.848735 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #318: 3001it [00:03, 753.07it/s, agent0/loss=8.981, env_step=954000, len=10, n/ep=100, n/st=1000, rew=4.54]                           


Epoch #318: test_reward: 3.629503 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #319: 3001it [00:04, 630.86it/s, agent0/loss=9.133, env_step=957000, len=10, n/ep=100, n/st=1000, rew=4.25]                           


Epoch #319: test_reward: 3.128085 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #320: 3001it [00:03, 790.36it/s, agent0/loss=8.684, env_step=960000, len=10, n/ep=100, n/st=1000, rew=3.35]                           


Epoch #320: test_reward: 3.001444 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #321: 3001it [00:04, 690.04it/s, agent0/loss=8.716, env_step=963000, len=10, n/ep=100, n/st=1000, rew=4.68]                           


Epoch #321: test_reward: 4.088928 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #322: 3001it [00:03, 803.05it/s, agent0/loss=8.395, env_step=966000, len=10, n/ep=100, n/st=1000, rew=3.66]                           


Epoch #322: test_reward: 3.592046 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #323: 3001it [00:03, 770.47it/s, agent0/loss=8.592, env_step=969000, len=10, n/ep=100, n/st=1000, rew=4.34]                           


Epoch #323: test_reward: 4.143204 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #324: 3001it [00:03, 820.17it/s, agent0/loss=8.741, env_step=972000, len=10, n/ep=100, n/st=1000, rew=3.16]                           


Epoch #324: test_reward: 3.174953 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #325: 3001it [00:04, 704.79it/s, agent0/loss=8.651, env_step=975000, len=10, n/ep=100, n/st=1000, rew=3.78]                           


Epoch #325: test_reward: 3.107093 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #326: 3001it [00:03, 820.17it/s, agent0/loss=8.733, env_step=978000, len=10, n/ep=100, n/st=1000, rew=5.53]                           


Epoch #326: test_reward: 4.329717 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #327: 3001it [00:03, 835.24it/s, agent0/loss=8.648, env_step=981000, len=10, n/ep=100, n/st=1000, rew=3.44]                           


Epoch #327: test_reward: 3.523904 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #328: 3001it [00:03, 760.13it/s, agent0/loss=8.661, env_step=984000, len=10, n/ep=100, n/st=1000, rew=2.42]                           


Epoch #328: test_reward: 2.800879 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #329: 3001it [00:03, 798.35it/s, agent0/loss=8.652, env_step=987000, len=10, n/ep=100, n/st=1000, rew=4.05]                           


Epoch #329: test_reward: -3.583122 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #330: 3001it [00:03, 769.29it/s, agent0/loss=8.813, env_step=990000, len=10, n/ep=100, n/st=1000, rew=3.03]                           


Epoch #330: test_reward: 2.836843 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #331: 3001it [00:03, 752.89it/s, agent0/loss=9.258, env_step=993000, len=10, n/ep=100, n/st=1000, rew=0.23]                           


Epoch #331: test_reward: 3.803474 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #332: 3001it [00:03, 795.39it/s, agent0/loss=8.936, env_step=996000, len=10, n/ep=100, n/st=1000, rew=3.31]                           


Epoch #332: test_reward: 3.218160 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #333: 3001it [00:04, 739.71it/s, agent0/loss=9.120, env_step=999000, len=10, n/ep=100, n/st=1000, rew=4.49]                           


Epoch #333: test_reward: 4.412267 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #334: 3001it [00:03, 762.26it/s, agent0/loss=8.800, env_step=1002000, len=10, n/ep=100, n/st=1000, rew=4.15]                           


Epoch #334: test_reward: 4.386932 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #335: 3001it [00:04, 698.39it/s, agent0/loss=8.852, env_step=1005000, len=10, n/ep=100, n/st=1000, rew=3.03]                           


Epoch #335: test_reward: 4.197777 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #336: 3001it [00:04, 726.46it/s, agent0/loss=8.701, env_step=1008000, len=10, n/ep=100, n/st=1000, rew=4.22]                           


Epoch #336: test_reward: 3.023814 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #337: 3001it [00:03, 758.79it/s, agent0/loss=8.902, env_step=1011000, len=10, n/ep=100, n/st=1000, rew=4.79]                           


Epoch #337: test_reward: 3.910824 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #338: 3001it [00:03, 830.84it/s, agent0/loss=8.394, env_step=1014000, len=10, n/ep=100, n/st=1000, rew=4.91]                           


Epoch #338: test_reward: 0.943138 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #339: 3001it [00:03, 750.44it/s, agent0/loss=8.749, env_step=1017000, len=10, n/ep=100, n/st=1000, rew=1.70]                           


Epoch #339: test_reward: 4.180132 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #340: 3001it [00:03, 760.32it/s, agent0/loss=8.876, env_step=1020000, len=10, n/ep=100, n/st=1000, rew=6.87]                           


Epoch #340: test_reward: 4.574014 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #341: 3001it [00:04, 675.44it/s, agent0/loss=8.802, env_step=1023000, len=10, n/ep=100, n/st=1000, rew=3.69]                           


Epoch #341: test_reward: 4.020876 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #342: 3001it [00:04, 695.64it/s, agent0/loss=8.907, env_step=1026000, len=10, n/ep=100, n/st=1000, rew=2.16]                           


Epoch #342: test_reward: 4.133421 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #343: 3001it [00:04, 729.99it/s, agent0/loss=8.743, env_step=1029000, len=10, n/ep=100, n/st=1000, rew=2.45]                           


Epoch #343: test_reward: 3.986018 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #344: 3001it [00:03, 768.90it/s, agent0/loss=8.780, env_step=1032000, len=10, n/ep=100, n/st=1000, rew=3.17]                           


Epoch #344: test_reward: 2.975484 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #345: 3001it [00:03, 778.47it/s, agent0/loss=8.652, env_step=1035000, len=10, n/ep=100, n/st=1000, rew=3.40]                           


Epoch #345: test_reward: 3.065603 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #346: 3001it [00:03, 777.66it/s, agent0/loss=8.965, env_step=1038000, len=10, n/ep=100, n/st=1000, rew=3.78]                           


Epoch #346: test_reward: 3.706402 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #347: 3001it [00:03, 753.64it/s, agent0/loss=8.886, env_step=1041000, len=10, n/ep=100, n/st=1000, rew=3.73]                           


Epoch #347: test_reward: 1.805432 ± 0.000000, best_reward: 4.848735 ± 0.000000 in #317


Epoch #348:  33%|###3      | 1000/3000 [00:00<00:01, 1414.43it/s]

In [None]:

from typing import Optional, Tuple

import numpy as np
import torch
from tianshou.env import DummyVectorEnv
from tianshou.trainer import offpolicy_trainer
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger
import torch
#from Custom_Classes import CustomCollector

# Create a new instance of the policy with the same architecture as the saved policy
policy, optim, _ = _get_agents()
model_save_path = os.path.join("dqn_Custom", save_policy_name)        

# Load the saved checkpoint
policy_test = policy.policies['agent0']
policy_test.load_state_dict(torch.load(model_save_path ))

envs = DummyVectorEnv([_get_env for _ in range(1)])

#policy_test.policies['agent0'].eval()
#policy.policies['agent0'].set_eps(0.9)

policy_test.eval()
policy_test.set_eps(0.00)

#collector = CustomCollector(policy.policies['agent0'], envs, exploration_noise=True)
collector = CustomCollector(policy_test, envs, exploration_noise=True)

results = collector.collect(n_episode=100)
#collector.collect(n_episode=2, render=1 / 5000)




In [None]:
max(results['rews'])
print(np.mean(results['rews'][results['rews'] > -10]))


#create a function  to print a histogram of the results['rews']
import matplotlib.pyplot as plt
plt.hist(results['rews'][results['rews'] > -10], bins=100)
plt.show()
