In [1]:
import os
import datetime
from typing import Optional, Tuple
import json
import numpy as np
import torch
from gymnasium.spaces import Box, Discrete
import pandas as pd 

import random

from torch.distributions import Normal, Distribution

from tianshou.data import Collector, VectorReplayBuffer, PrioritizedVectorReplayBuffer
from tianshou.env import SubprocVectorEnv, DummyVectorEnv
#from tianshou.env.pettingzoo_env_parallel import PettingZooParallelEnv
#from tianshou.env.pettingzoo_env import PettingZooEnv
#from PettingZooParallelEnv import PettingZooParallelEnv


from tianshou.policy import PPOPolicy
from tianshou.trainer import OnpolicyTrainer

from tianshou.utils.net.common import ActorCritic, DataParallelNet, Net
from tianshou.utils.net.continuous import Actor, Critic

from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, DDPGPolicy
from tianshou.trainer import OffpolicyTrainer
from torch.utils.tensorboard import SummaryWriter
from DNN_B_ACE_ACTOR import DNN_B_ACE_ACTOR
from DNN_B_ACE_CRITIC import DNN_B_ACE_CRITIC
from Task_MHA_B_ACE import Task_MHA_B_ACE
from Task_DNN_B_ACE import Task_DNN_B_ACE
from Task_B_ACE_Env import B_ACE_TaskEnv

from CollectorMA import CollectorMA
from MAParalellPolicy import MAParalellPolicy


####---------------------------#######
#Tianshou Adjustment
import wandb
# os.environ["WANDB_NOTEBOOK_NAME"] = "Tianshow_Training_GoDot.ipybn"
from tianshou.utils import WandbLogger
# from tianshou.utils.logger.base import LOG_DATA_TYPE
# def new_write(self, step_type: str, step: int, data: LOG_DATA_TYPE) -> None:
#      data[step_type] = step
#      wandb.log(data)   
# WandbLogger.write = new_write 
####---------------------------#######


# --- Run identification -------------------------------------------------
# `model` names the network class built in _get_agents and `policyModel`
# selects its DQN branch.
# Other ids left by the author: "SISL_Task_MultiHead", "CNN_ATT_SISL", "MultiHead_SISL".
model = "Task_MHA_B_ACE"
test_num = "_B_ACE03"
policyModel = "DQN"
name = f"{model}{test_num}"

# Environment counts; both are copied into trainer_params further down.
train_env_num, test_env_num = 4, 15

# --- Log and checkpoint locations ---------------------------------------
# The timestamp makes every run write to its own log folder / policy file.
now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
log_name = f"{name}{now}"
log_path = os.path.join('./', "Logs", "dqn_sisl", log_name)

policy_path = f"{model}{policyModel}"
load_policy_name = 'policy_Task_MHA_B_ACE_B_ACE02240721-151049_1261_BestRew.pth'
save_policy_name = 'policy_' + log_name

model_load_path = os.path.join(policy_path, load_policy_name)
model_save_path = os.path.join(policy_path, save_policy_name)

# Create the checkpoint folder first, then the log folder.
for _required_dir in (policy_path, log_path):
    os.makedirs(_required_dir, exist_ok=True)

# Switches read by _get_agents ("same_policy", "load_model"); "freeze_CNN"
# is not read anywhere in the visible part of the notebook.
Policy_Config = dict(
    same_policy=True,
    load_model=False,
    freeze_CNN=False,
)

# Settings for the B_ACE simulation environment.
# The two top-level keys are unpacked as keyword arguments of B_ACE_TaskEnv in
# _get_env, which also overwrites EnvConfig["seed"] on every call.
B_ACE_Config = {
    "EnvConfig": {
        "task": "b_ace_v1",
        # Raw string: the backslashes are literal path separators. Without the
        # r-prefix "\." and "\B" are invalid escape sequences, which newer
        # Python versions warn about. The resulting value is unchanged.
        "env_path": r"..\..\BVR_AirCombat/bin/B_ACE_v13.exe",
        "port": 12500,
        "renderize": 1,
        "debug_view": 0,
        "phy_fps": 20,
        "speed_up": 50000,
        "max_cycles": 36000,
        "experiment_mode": 0,
        "parallel_envs": 1,
        "seed": 1,
        "action_repeat": 20,
        "action_type": "Low_Level_Continuous",
        "stop_mission": 1,

        "RewardsConfig": {
            "mission_factor": 0.001,
            "missile_fire_factor": -0.1,
            "missile_no_fire_factor": -0.001,
            "missile_miss_factor": -0.5,
            "detect_loss_factor": -0.1,
            "keep_track_factor": 0.001,
            "hit_enemy_factor": 3.0,
            "hit_own_factor": -5.0,
            "mission_accomplished_factor": 10.0,
        },
    },

    "AgentsConfig": {
        "blue_agents": {
            "num_agents": 1,
            "mission": "DCA",
            "beh_config": {
                "dShot": [1.04, 0.50, 1.09],
                "lCrank": [1.06, 0.98, 0.98],
                "lBreak": [1.05, 1.17, 0.45],
            },
            "base_behavior": "external",
            "init_position": {"x": 0.0, "y": 25000.0, "z": 30.0},
            "offset_pos": {"x": 0.0, "y": 0.0, "z": 0.0},
            "init_hdg": 0.0,
            "target_position": {"x": 0.0, "y": 25000.0, "z": 30.0},
            "rnd_offset_range": {"x": 10.0, "y": 10000.0, "z": 5.0},
            "rnd_shot_dist_var": 0.025,
            "rnd_crank_var": 0.025,
            "rnd_break_var": 0.025,
            "wez_models": "res://assets/wez/Default_Wez_params.json",
        },
        "red_agents": {
            "num_agents": 1,
            "base_behavior": "baseline1",
            "mission": "striker",
            # Alternative red behaviour kept by the author for reference:
            # "beh_config" : {
            #                 "dShot" : [1.04, 1.04, 1.04], #[1.04, 0.50, 1.09]
            #                 "lCrank": [1.06, 1.06, 1.06], #1.06, 0.98, 0.98
            #                 "lBreak": [1.05, 1.05, 1.05], #1.05, 1.17, 0.45
            #             },
            "beh_config": {
                "dShot": [1.04, 0.50, 1.09],
                "lCrank": [1.06, 0.98, 0.98],
                "lBreak": [1.05, 1.17, 0.45],
            },
            "init_position": {"x": 0.0, "y": 25000.0, "z": -30.0},
            "offset_pos": {"x": 0.0, "y": 0.0, "z": 0.0},
            "init_hdg": 180.0,
            "target_position": {"x": 0.0, "y": 25000.0, "z": 30.0},
            "rnd_offset_range": {"x": 10.0, "y": 10000.0, "z": 5.0},
            "rnd_shot_dist_var": 0.025,
            "rnd_crank_var": 0.025,
            "rnd_break_var": 0.025,
            "wez_models": "res://assets/wez/Default_Wez_params.json",
        },
    },
}
#max_cycles = B_ACE_Config["max_cycles"]
#n_agents = 1#B_ACE_Config["n_pursuers"]

# Hyper-parameters consumed by the DQN branch of _get_agents.
# "max_tasks" is used both as the network's num_tasks and as the size of the
# Discrete action space; "lr" feeds the Adam optimizers.
# The "optminizer" key (spelled as in the original) is not read in the visible code.
dqn_params = dict(
    discount_factor=0.99,
    estimation_step=180,
    target_update_freq=3 * 6000,  # author's note: max_cycles * n_agents
    reward_normalization=False,
    clip_loss_grad=False,
    optminizer="Adam",
    lr=0.00005,
    max_tasks=30,
)

# Keyword values forwarded to PPOPolicy by the PPO branch of _get_agents.
# 'dual_clip', 'value_clip' and 'lr_scheduler' are defined here but not
# forwarded in the visible code.
PPO_params = dict(
    action_scaling=True,
    discount_factor=0.98,
    max_grad_norm=0.5,
    eps_clip=0.2,
    vf_coef=0.5,
    ent_coef=0.01,
    gae_lambda=0.95,
    reward_normalization=False,
    dual_clip=None,
    value_clip=False,
    deterministic_eval=True,
    advantage_normalization=False,
    recompute_advantage=False,
    action_bound_method="clip",
    lr_scheduler=None,
)


# Training-loop settings. They are only merged into runConfig in the visible
# part of the notebook; the trainer that consumes them is not shown here.
trainer_params = dict(
    max_epoch=500,
    step_per_epoch=18000,   # author's note: 5 * (150 * n_agents)
    step_per_collect=6000,  # author's note: * (10 * n_agents)

    batch_size=1024,

    # Off-policy only: applied after each collect finishes, repeated as many
    # times as needed to reach this ratio (per the author's note).
    update_per_step=1 / 100,

    # On-policy only.
    repeat_per_collect=32,

    episode_per_test=30,
    tn_eps_max=0.20,
    ts_eps_max=0.01,
    warmup_size=1,
    train_envs=train_env_num,
    test_envs=test_env_num,
)
#agent_learn = PPOPolicy(**policy_params)


# Flatten all configuration dicts into one record describing this run
# (presumably for experiment logging -- runConfig is not read in the visible code).
#
# A new dict is built on purpose: assigning `runConfig = dqn_params` and then
# calling update() would alias the two names and inject the policy, environment
# and trainer keys into dqn_params itself.
# Later sources win on key clashes, and key order is dqn -> policy -> env -> trainer.
# The merge is shallow, so nested dicts such as B_ACE_Config["EnvConfig"] stay
# shared with their source.
runConfig = {
    **dqn_params,
    **Policy_Config,
    **B_ACE_Config,
    **trainer_params,
}


def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
    policy_load_path = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    """Build the learning policies and wrap them in a MultiAgentPolicyManager.

    A fresh environment is created through _get_env() and handed to the policy
    manager. The policy type comes from the module-level `policyModel` / `model`
    names: `policyModel == "DQN"` builds a DQNPolicy around the network class
    named by `model`; otherwise `model == "PPO_DNN"` builds a PPOPolicy.

    Args:
        agent_learn: Used only as a fallback policy when neither branch above
            applies; otherwise it is replaced by the policy built here.
        agent_opponent: Not used by this function.
        optim: Overwritten by the optimizer created here whenever a policy is built.
        policy_load_path: Not used by this function; checkpoints are read from
            the module-level `model_load_path`.

    Returns:
        (policy, optim, agents): the MultiAgentPolicyManager, the optimizer of
        the last policy built, and `env.agents`.

    Raises:
        ValueError: if `model` names no known DQN network, or if no policy can
            be built for the current `policyModel` / `model` combination and no
            `agent_learn` was supplied.
    """
    env = _get_env()
    agent_observation_space = env.observation_space("agent_0")

    print("ActionSPACE: ", env.action_space)
    action_space = env.action_space
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Q-networks selectable through `model`; both take the same constructor
    # arguments.
    dqn_networks = {
        "Task_MHA_B_ACE": Task_MHA_B_ACE,
        "Task_DNN_B_ACE": Task_DNN_B_ACE,
    }

    # With a shared policy a single instance is built and reused for every agent.
    policies_number = 1 if Policy_Config["same_policy"] else len(env.agents)

    agents = []
    for _ in range(policies_number):

        if policyModel == "DQN":
            if model not in dqn_networks:
                # Fail with a clear message instead of a NameError on `net`.
                raise ValueError(
                    f"Unknown DQN model {model!r}; expected one of {sorted(dqn_networks)}"
                )

            net = dqn_networks[model](
                num_tasks=dqn_params["max_tasks"],
                num_features_per_task=14,
                nhead=4,
                device=device,
            ).to(device)

            optim = torch.optim.Adam(
                net.parameters(), lr=dqn_params["lr"], weight_decay=0.0, amsgrad=True
            )

            agent_learn = DQNPolicy(
                model=net,
                optim=optim,
                action_space=Discrete(dqn_params["max_tasks"]),
                discount_factor=dqn_params["discount_factor"],
                estimation_step=dqn_params["estimation_step"],
                target_update_freq=dqn_params["target_update_freq"],
                reward_normalization=dqn_params["reward_normalization"],
                clip_loss_grad=dqn_params["clip_loss_grad"],
            )

        elif model == "PPO_DNN":
            actor = DNN_B_ACE_ACTOR(
                obs_shape=agent_observation_space.shape[0],
                action_shape=4,
                device=device,
            ).to(device)

            critic = DNN_B_ACE_CRITIC(
                obs_shape=agent_observation_space.shape[0],
                action_shape=4,
                device=device,
            ).to(device)

            actor_critic = ActorCritic(actor, critic)

            # Distribution factory handed to PPOPolicy as dist_fn.
            def dist(mu, sigma) -> Distribution:
                return Normal(mu, sigma)

            # A single optimizer covers actor and critic; it reuses the DQN
            # learning rate.
            optim = torch.optim.Adam(actor_critic.parameters(), lr=dqn_params["lr"])

            agent_learn = PPOPolicy(
                actor=actor,
                critic=critic,
                optim=optim,
                dist_fn=dist,
                action_scaling=PPO_params['action_scaling'],
                discount_factor=PPO_params['discount_factor'],
                max_grad_norm=PPO_params['max_grad_norm'],
                eps_clip=PPO_params['eps_clip'],
                vf_coef=PPO_params['vf_coef'],
                ent_coef=PPO_params['ent_coef'],
                gae_lambda=PPO_params['gae_lambda'],
                reward_normalization=PPO_params['reward_normalization'],
                action_space=action_space,
                deterministic_eval=PPO_params['deterministic_eval'],
                advantage_normalization=PPO_params['advantage_normalization'],
                recompute_advantage=PPO_params['recompute_advantage'],
                action_bound_method=PPO_params['action_bound_method'],
                lr_scheduler=None,
            )

        elif agent_learn is None:
            # Nothing was built and the caller gave no policy: stop here rather
            # than registering None with the policy manager.
            raise ValueError(
                f"No policy can be built for policyModel={policyModel!r}, model={model!r}"
            )

        if Policy_Config["load_model"] is True:
            # Load the saved checkpoint. map_location lets a checkpoint saved on
            # a GPU machine be restored on a CPU-only one.
            agent_learn.load_state_dict(torch.load(model_load_path, map_location=device))
            print(f'Loaded-> {model_load_path}')

        agents.append(agent_learn)

    if Policy_Config["same_policy"]:
        # Every agent id shares the one policy object.
        agents = [agents[0] for _ in range(len(env.agents))]

    policy = MultiAgentPolicyManager(policies=agents, env=env)

    return policy, optim, env.agents

def _get_env():
    """Create one B_ACE_TaskEnv; serves as the factory callable for DummyVectorEnv.

    Side effect: a new random integer in [0, 1000000] is written into the
    shared B_ACE_Config["EnvConfig"]["seed"] before each construction.
    """
    new_seed = random.randint(0, 1000000)
    B_ACE_Config["EnvConfig"].update(seed=new_seed)

    # The top-level keys of B_ACE_Config are forwarded as keyword arguments.
    return B_ACE_TaskEnv(
        convert_action_space=True,
        device="cpu",
        **B_ACE_Config,
    )


  self.hub = sentry_sdk.Hub(client)


In [2]:


def _get_envT():
    """Build a TaskPursuitEnv from SISL_Config and wrap it in PettingZooEnv.

    NOTE(review): TaskPursuitEnv, SISL_Config and PettingZooEnv are neither
    defined nor imported in the visible part of this notebook (the
    PettingZooEnv imports are commented out), so calling this will presumably
    raise NameError unless they are provided elsewhere -- confirm or remove.
    """
    make_env = TaskPursuitEnv.env

    # Options copied verbatim from SISL_Config, split around obs_range so the
    # lookups happen in the same order as the keyword list of the call.
    keys_before_obs_range = (
        "max_cycles",
        "x_size",
        "y_size",
        "shared_reward",
        "n_evaders",
        "n_pursuers",
    )
    keys_after_obs_range = (
        "n_catch",
        "freeze_evaders",
        "tag_reward",
        "catch_reward",
        "urgency_reward",
        "surround",
        "constraint_window",
    )

    env_kwargs = {key: SISL_Config[key] for key in keys_before_obs_range}
    env_kwargs["obs_range"] = 7  # fixed value; SISL_Config["obs_range"] is not used
    for key in keys_after_obs_range:
        env_kwargs[key] = SISL_Config[key]
    env_kwargs["render_mode"] = None  # the author's alternative was "human"

    env = make_env(**env_kwargs)
    return PettingZooEnv(env)


# ======== Evaluate a saved checkpoint on the test environments ========
policy, optim, agents = _get_agents()

# ======== Step 1: Environment setup =========
test_env_num = 5
test_envs = DummyVectorEnv([_get_env for _ in range(test_env_num)])
# SubprocVectorEnv is the alternative the author noted here.

# seed
# NOTE(review): seeding runs after the policy and the environments were built,
# and the stdlib `random` generator that _get_env uses to draw each environment
# seed is never seeded, so evaluation runs are presumably not reproducible --
# confirm whether that is intended before changing it.
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)

episodes = 30
render = False  # not read anywhere in this cell

policy_name = "policy_Task_MHA_B_ACE_B_ACE03241018-082438_425_BestRew.pth"
# The checkpoint location is the same whether or not the policy is shared.
model_path = os.path.join("Task_MHA_B_ACEDQN", policy_name)

# Read the checkpoint once; map_location lets a checkpoint saved on a GPU
# machine be restored on a CPU-only one.
checkpoint = torch.load(
    model_path,
    map_location="cuda" if torch.cuda.is_available() else "cpu",
)

for agent in agents:
    agent_policy = policy.policies[agent]
    agent_policy.set_eps(0.00)
    agent_policy.load_state_dict(checkpoint)
    agent_policy.eval()

# exploration_noise stays enabled as before; eps was set to 0.00 above.
test_collector = Collector(policy, test_envs, exploration_noise=True)

# Kept under its own name so `results` below only ever refers to the
# per-environment records.
collect_stats = test_collector.collect(n_episode=episodes)

print("Mean: ", np.mean(collect_stats.returns))
print("Std:  " , np.std (collect_stats.returns))
print("Max:  " , np.max( collect_stats.returns))
print("Min:  " , np.min(collect_stats.returns))

# Gets final stats: get_env_attr returns one bound `call_results` method per
# environment, and each is called to fetch that environment's records.
methods = test_envs.get_env_attr("call_results")
results = [method() for method in methods]
df = pd.DataFrame(results)


waiting for remote GODOT connection on port 11836
connection established
action space {'input': {'action_type': 'continuous', 'size': 4}}
observation space {'obs': {'size': [22], 'space': 'box'}}
ActionSPACE:  Discrete(30)
waiting for remote GODOT connection on port 12392
connection established
action space {'input': {'action_type': 'continuous', 'size': 4}}
observation space {'obs': {'size': [22], 'space': 'box'}}
waiting for remote GODOT connection on port 11397
connection established
action space {'input': {'action_type': 'continuous', 'size': 4}}
observation space {'obs': {'size': [22], 'space': 'box'}}
waiting for remote GODOT connection on port 13383
connection established
action space {'input': {'action_type': 'continuous', 'size': 4}}
observation space {'obs': {'size': [22], 'space': 'box'}}
waiting for remote GODOT connection on port 12741
connection established
action space {'input': {'action_type': 'continuous', 'size': 4}}
observation space {'obs': {'size': [22], 'space': '

  policy.policies[agent].load_state_dict(torch.load(model_path))


Mean:  7.73477341169898
Std:   8.744787774644204
Max:   16.127168835667927
Min:   -10.653523812569645


In [3]:
import pandas as pd
# `results` holds one list per test environment (built in the evaluation cell).
# Each entry inside is indexed as [0] Blue-team dict, [1] Red-team dict and
# [2] general simulation dict.
data = results


def _rows_with_end_cond(frame):
    """Keep only the rows whose `end_cond` column is not missing."""
    return frame.loc[frame.end_cond.notna()]


# Flatten the nested lists into a single sequence of entries.
_entries = [entry for episode in data for entry in episode]

blue_team_data = [entry[0] for entry in _entries]  # First dictionary: Blue team
red_team_data = [entry[1] for entry in _entries]   # Second dictionary: Red team
general_data = [entry[2] for entry in _entries]    # Third dictionary: General sim data

# Convert to DataFrames, dropping rows that carry no end condition.
df_blue = _rows_with_end_cond(pd.DataFrame(blue_team_data))
df_red = _rows_with_end_cond(pd.DataFrame(red_team_data))
df_general = pd.DataFrame(general_data)

# Merge DataFrames for a complete view (optional). The concat aligns on the row
# index, so the merged frame is filtered on the Blue end condition afterwards.
_merged_all = pd.concat(
    [df_general, df_blue.add_prefix('blue_'), df_red.add_prefix('red_')],
    axis=1,
)
df_merged = _merged_all.loc[_merged_all.blue_end_cond.notna()]

In [4]:
import numpy as np
from scipy.stats import bootstrap

# Function to compute mean and bootstrap confidence interval (95% by default)
def compute_mean_and_ci(data, confidence_level=0.95, n_resamples=10000,
                        method='basic', random_state=None):
    """Return the sample mean and a bootstrap confidence interval for it.

    Args:
        data: One-dimensional numeric sample (list, ndarray or pandas Series).
        confidence_level: Confidence level of the interval.
        n_resamples: Number of bootstrap resamples.
        method: Interval type passed to scipy.stats.bootstrap.
        random_state: Optional seed or generator for the resampling. Leave as
            None for the previous unseeded behaviour; pass a fixed value to get
            the same interval on every run.

    Returns:
        (mean_value, ci_lower, ci_upper)

    Raises:
        ValueError: raised by scipy.stats.bootstrap for samples it rejects.
    """
    sample = np.asarray(data, dtype=float)
    mean_value = np.mean(sample)

    # Only forward the seed when one was given, so the default call is exactly
    # the one this function always made.
    seed_kwargs = {} if random_state is None else {"random_state": random_state}

    # Perform bootstrap resampling to compute confidence interval
    res = bootstrap(
        (sample,),
        np.mean,
        confidence_level=confidence_level,
        n_resamples=n_resamples,
        method=method,
        **seed_kwargs,
    )
    ci_lower, ci_upper = res.confidence_interval
    return mean_value, ci_lower, ci_upper

# Compute mean and confidence intervals for Blue and Red team metrics.
# Every metric must be a column of both df_blue and df_red.
metrics = ['killed', 'missile', 'mission', 'reward']

# Column-oriented accumulator: one list per output column, one element per
# (metric, team) pair.
final_results = {'team': [], 'metric': [], 'mean': [], 'ci_lower': [], 'ci_upper': []}

for metric in metrics:
    # The loop variable is deliberately not called `df`, so the `df` built in
    # the evaluation cell is not overwritten.
    for team, team_frame in [('Blue', df_blue), ('Red', df_red)]:
        mean_value, ci_lower, ci_upper = compute_mean_and_ci(team_frame[metric])
        final_results['team'].append(team)
        final_results['metric'].append(metric)
        final_results['mean'].append(mean_value)
        final_results['ci_lower'].append(ci_lower)
        final_results['ci_upper'].append(ci_upper)

# Convert the results into a DataFrame for display
df_final_results = pd.DataFrame(final_results)
print(df_final_results)


   team   metric      mean  ci_lower   ci_upper
0  Blue   killed  0.000000  0.000000   0.000000
1   Red   killed  0.333333  0.166667   0.500000
2  Blue  missile  1.500000  0.900000   2.033333
3   Red  missile  0.166667  0.033333   0.266667
4  Blue  mission  0.000000  0.000000   0.000000
5   Red  mission  0.000000  0.000000   0.000000
6  Blue   reward  7.734773  4.831504  11.060951
7   Red   reward  0.000000  0.000000   0.000000
