In [3]:
import inspect
import time
from statistics import mean, stdev
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
import os

from ray.tune.registry import register_env
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper
import warnings
import numpy as np
from ray import air, tune
warnings.filterwarnings('ignore')

In [4]:
def env_creator(env_config: dict):
    """Build the CybORG Scenario2 simulation wrapped for training the Blue agent.

    Args:
        env_config: Configuration dict supplied by whoever invokes the
            registered creator. It is accepted for interface compatibility
            but not read here.

    Returns:
        An ``RLlibWrapper`` around a simulated ``CybORG`` instance in which Red
        is played by ``B_lineAgent``, Green by ``GreenAgent``, and the
        controlled agent is "Blue" with ``max_steps=100``.
    """
    # Resolve the scenario file relative to the directory of the module that
    # defines CybORG. Slicing a fixed number of characters off the file path
    # only works while that module's filename is exactly 9 characters long.
    package_dir = os.path.dirname(inspect.getfile(CybORG))
    path = os.path.join(package_dir, 'Shared', 'Scenarios', 'Scenario2.yaml')
    agents = {"Red": B_lineAgent, "Green": GreenAgent}
    cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
    env = RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
    return env

def print_results(results_dict):
    """Print a one-line reward summary for a single training iteration.

    Args:
        results_dict: Mapping that provides "training_iteration",
            "episode_reward_mean", "episode_reward_max" and
            "episode_reward_min".

    The iteration is right-aligned in four columns and the rewards are shown
    with one decimal place; the minimum reward uses the space sign flag.
    """
    wanted_keys = (
        "training_iteration",
        "episode_reward_mean",
        "episode_reward_max",
        "episode_reward_min",
    )
    # Look the keys up in the listed order so a missing key raises in the same
    # sequence as reading them one by one.
    iteration, reward_mean, reward_max, reward_min = (
        results_dict[key] for key in wanted_keys
    )
    template = "{:4d} \tr_mean: {:.1f} \tr_max: {:.1f} \tr_min: {: .1f}"
    print(template.format(iteration, reward_mean, reward_max, reward_min))

# Register env_creator under the name "CybORG"; the training config selects the
# environment through this same string in its "env" entry.
register_env("CybORG", env_creator)

In [8]:

# Train PPO on the registered "CybORG" environment with Ray Tune until 2e6
# timesteps have been sampled.
# NOTE(review): the results directory is named "APPO" although the algorithm
# trained here is "PPO" -- confirm this is intentional.
tune.Tuner(
    "PPO",
    run_config=air.RunConfig(
        stop={"timesteps_total": 2e6},
        local_dir='results/APPO', name="explore2",
        checkpoint_config=air.CheckpointConfig(
            checkpoint_frequency=500,
            # Each iteration trains on 3000 steps, so the 2e6-step budget ends
            # around iteration 667 -- between periodic checkpoints. Without a
            # final checkpoint the last ~167 iterations would never be saved.
            checkpoint_at_end=True,
        ),
    ),
    param_space={
        # CC3 specific. The name must match the one passed to register_env.
        "env": "CybORG",
        # General
        "num_gpus": 1,
        "num_workers": 30,
        # Same episode cap as max_steps=100 used when wrapping the environment.
        "horizon": 100,
        "num_envs_per_worker": 1,
        # Disabled alternative exploration setup, kept for reference:
        #"exploration_config": tune.grid_search([{"type": "RE3",
        #    "embeds_dim": 128,
        #    "beta_schedule": "constant",
        #    "sub_exploration": {
        #        "type": "StochasticSampling",
        #},}, 
        "exploration_config": {"type": "StochasticSampling"},
        #algo params
        "train_batch_size": 3000,
        "lr": 0.0005,
        "gamma": 0.95,
        "framework": 'tf',
        "model": {
            "fcnet_hiddens": [512, 512],
            "fcnet_activation": "relu",
            #'use_lstm': True,
            #'max_seq_len': 5,
        },
        # Experience output settings.
        # NOTE(review): "/logs/" is an absolute path at the filesystem root;
        # the worker processes presumably need write access to it -- confirm.
        "output": "dataset",
        "output_config": {
            "format": "json",
            "path": "/logs/"},
        "output_compress_columns": ['prev_actions', 'prev_rewards', 'dones', 't', 'eps_id', 'unroll_id', 'agent_index', 'action_prob', 'action_logp', 'action_dist_inputs', 'advantages', 'value_targets']
    },
).fit()

0,1
Current time:,2022-12-16 12:30:40
Running for:,00:01:01.76
Memory:,37.2/125.8 GiB

Trial name,status,loc,iter,total time (s),ts,reward,num_recreated_worker s,episode_reward_max,episode_reward_min
PPO_CybORG_4ea3f_00000,RUNNING,172.28.0.2:33208,6,32.7645,18000,-520.983,0,-109.8,-1145.8


[2m[36m(PPO pid=33208)[0m 2022-12-16 12:29:44,930	INFO algorithm.py:2303 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(PPO pid=33208)[0m 2022-12-16 12:29:44,930	INFO ppo.py:379 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPO pid=33208)[0m 2022-12-16 12:29:44,932	INFO algorithm.py:457 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(PPO pid=33208)[0m 2022-12-16 12:30:02,105	INFO trainable.py:164 -- Trainable.setup took 17.177 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Trial name,agent_timesteps_total,counters,custom_metrics,date,done,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,episodes_total,experiment_id,hostname,info,iterations_since_restore,node_ip,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_trained,num_env_steps_trained_this_iter,num_faulty_episodes,num_healthy_workers,num_recreated_workers,num_steps_trained_this_iter,perf,pid,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,time_since_restore,time_this_iter_s,time_total_s,timers,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
PPO_CybORG_4ea3f_00000,18000,"{'num_env_steps_sampled': 18000, 'num_env_steps_trained': 18000, 'num_agent_steps_sampled': 18000, 'num_agent_steps_trained': 18000}",{},2022-12-16_12-30-35,False,100,{},-109.8,-520.983,-1145.8,30,180,d733ef48241744f583e57d5b6c039e0d,01589170c3ff,"{'learner': {'default_policy': {'learner_stats': {'cur_kl_coeff': 0.20000000298023224, 'cur_lr': 0.0005000000237487257, 'total_loss': 8.900472, 'policy_loss': -0.2132334, 'vf_loss': 9.108389, 'vf_explained_var': -0.025767138, 'kl': 0.026581803, 'entropy': 4.824505, 'entropy_coeff': 0.0, 'model': {}}, 'train': None}}, 'num_env_steps_sampled': 18000, 'num_env_steps_trained': 18000, 'num_agent_steps_sampled': 18000, 'num_agent_steps_trained': 18000}",6,172.28.0.2,18000,18000,18000,3000,18000,3000,0,30,0,3000,"{'cpu_util_percent': 32.17142857142857, 'ram_util_percent': 29.599999999999998}",33208,{},{},{},"{'mean_raw_obs_processing_ms': 1.70938626683753, 'mean_inference_ms': 1.874125006161448, 'mean_action_processing_ms': 0.1553102814265939, 'mean_env_wait_ms': 11.082980257571654, 'mean_env_render_ms': 0.0}","{'episode_reward_max': -109.79999999999987, 'episode_reward_min': -1145.8000000000006, 'episode_reward_mean': -520.9830000000003, 'episode_len_mean': 100.0, 'episode_media': {}, 'episodes_this_iter': 30, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [-370.2, -1035.5, -277.6999999999998, -151.79999999999978, -782.6000000000007, -473.8000000000005, -1034.9000000000005, -288.79999999999995, -1039.8000000000013, -792.8000000000013, -472.8000000000008, -283.6, -719.8000000000014, -1091.2000000000007, -378.80000000000064, -790.7000000000014, -289.7999999999999, -870.7000000000008, -317.8000000000002, -1037.7000000000005, -1015.9000000000001, -1136.4, -122.79999999999977, -1145.8000000000006, -1011.2000000000008, -697.8000000000013, -243.6, -293.7999999999999, -303.8, -1074.8000000000009, -151.3, -223.2999999999998, 
-1138.8999999999999, -454.1, -218.59999999999968, -290.79999999999995, -295.8000000000005, -109.79999999999987, -514.7000000000014, -1017.0999999999999, -298.2, -1061.800000000001, -357.70000000000044, -266.7999999999998, -245.5, -1142.8, -282.2, -1066.0000000000005, -283.6999999999999, -799.5000000000013, -471.8000000000013, -822.8000000000014, -337.79999999999984, -868.5, -394.80000000000075, -1103.7000000000007, -274.39999999999986, -275.99999999999994, -182.7, -494.6, -236.9, -219.7999999999996, -448.70000000000147, -370.8000000000013, -498.8000000000013, -898.8000000000011, -1065.700000000001, -233.29999999999995, -199.69999999999956, -386.0, -637.7, -309.80000000000035, -469.0, -495.0000000000001, -231.4, -560.8000000000005, -369.8000000000007, -264.8999999999999, -361.8000000000009, -199.79999999999967, -253.79999999999964, -237.79999999999964, -563.0, -262.1, -979.7000000000013, -1061.700000000001, -858.8000000000014, -235.09999999999982, -319.80000000000035, -462.80000000000143, -542.8000000000014, -306.3, -199.79999999999967, -289.60000000000036, -473.70000000000147, -288.4999999999998, -267.6999999999999, -506.8000000000011, -274.7999999999998, -262.7999999999997], 'episode_lengths': [100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 1.70938626683753, 'mean_inference_ms': 1.874125006161448, 'mean_action_processing_ms': 0.1553102814265939, 'mean_env_wait_ms': 11.082980257571654, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 
0}",32.7645,5.44944,32.7645,"{'training_iteration_time_ms': 5446.38, 'load_time_ms': 7.924, 'load_throughput': 378600.445, 'learn_time_ms': 3752.862, 'learn_throughput': 799.39, 'synch_weights_time_ms': 11.747}",1671193835,0,18000,6,4ea3f_00000,17.1936


[2m[36m(PPO pid=33208)[0m 2022-12-16 12:30:40,302	ERROR worker.py:763 -- Worker exits with an exit code 1.
[2m[36m(PPO pid=33208)[0m Traceback (most recent call last):
[2m[36m(PPO pid=33208)[0m   File "python/ray/_raylet.pyx", line 1032, in ray._raylet.task_execution_handler
[2m[36m(PPO pid=33208)[0m   File "python/ray/_raylet.pyx", line 812, in ray._raylet.execute_task
[2m[36m(PPO pid=33208)[0m   File "python/ray/_raylet.pyx", line 852, in ray._raylet.execute_task
[2m[36m(PPO pid=33208)[0m   File "python/ray/_raylet.pyx", line 859, in ray._raylet.execute_task
[2m[36m(PPO pid=33208)[0m   File "python/ray/_raylet.pyx", line 863, in ray._raylet.execute_task
[2m[36m(PPO pid=33208)[0m   File "python/ray/_raylet.pyx", line 810, in ray._raylet.execute_task.function_executor
[2m[36m(PPO pid=33208)[0m   File "/usr/local/lib/python3.8/dist-packages/ray/_private/function_manager.py", line 674, in actor_method_executor
[2m[36m(PPO pid=33208)[0m     return method(__ra

<ray.tune.result_grid.ResultGrid at 0x7efc60c33160>

In [5]:
# Shell escape: run the nvidia-smi command-line tool to show GPU status.
!nvidia-smi

Mon Dec  5 12:03:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.03   Driver Version: 450.119.03   CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:83:00.0 Off |                    0 |
| N/A   39C    P0    38W / 250W |   2167MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip show ray

In [None]:
# Build a standalone instance of the training environment for interactive
# inspection (same construction as env_creator).
# The scenario file is resolved relative to the directory of the module that
# defines CybORG, rather than by trimming a fixed number of characters from its
# file path, which only works for a 9-character module filename.
path = os.path.join(
    os.path.dirname(inspect.getfile(CybORG)),
    'Shared', 'Scenarios', 'Scenario2.yaml',
)
agents = {"Red": B_lineAgent, "Green": GreenAgent}
cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
env = RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)

In [None]:
# Display the observation space exposed by the wrapped environment.
env.observation_space

In [18]:
# Shannon entropy (natural log) of a sharply peaked 4-outcome distribution:
# almost all probability mass sits on the first outcome, so the value is near 0.
dist = np.array([0.9997] + [0.0001] * 3)
entropy = -(dist * np.log(dist)).sum()
entropy

0.0030630571070921465

In [19]:
# Shannon entropy (natural log) when the mass is split between two outcomes.
# NOTE: these values sum to 1.0002, so this is not a strictly normalised
# probability distribution.
dist = np.array([0.5] * 2 + [0.0001] * 2)
entropy = -(dist * np.log(dist)).sum()
entropy

0.6949892486343404

In [20]:
# Shannon entropy (natural log) of the uniform distribution over 4 outcomes;
# this equals ln(4), the maximum attainable with four outcomes.
dist = np.full(4, 0.25)
entropy = -(dist * np.log(dist)).sum()
entropy

1.3862943611198906