In [1]:
import inspect
import time
from statistics import mean, stdev
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
import os

from ray.tune.registry import register_env
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper
import warnings
import numpy as np
from ray import air, tune
warnings.filterwarnings('ignore')

In [2]:
def env_creator(env_config: dict):
    """Create the CybORG Scenario2 simulation wrapped for use with RLlib.

    Args:
        env_config: Configuration mapping supplied by the caller. It is
            accepted so the function matches the env-creator signature, but
            it is not read here.

    Returns:
        An ``RLlibWrapper`` around a simulated CybORG instance, exposing the
        "Blue" agent with ``max_steps=100``.
    """
    # Resolve the scenario file relative to the module that defines CybORG.
    # os.path.dirname/join replaces the previous fixed-width slice
    # (path[:-10]), which only worked while that module's file name was
    # exactly "CybORG.py" and which hard-coded "/" as the separator.
    package_dir = os.path.dirname(inspect.getfile(CybORG))
    path = os.path.join(package_dir, 'Shared', 'Scenarios', 'Scenario2.yaml')

    # Opponent (Red) and background (Green) agents are passed as classes.
    agents = {"Red": B_lineAgent, "Green": GreenAgent}
    cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
    env = RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
    return env

def print_results(results_dict):
    """Print a one-line reward summary for a single training iteration.

    Args:
        results_dict: Mapping that must contain the keys
            "training_iteration", "episode_reward_mean",
            "episode_reward_max" and "episode_reward_min".

    Returns:
        None. The summary line is written to stdout.
    """
    # Look up every value first (in this order) so a missing key fails
    # before anything is formatted or printed.
    iteration, avg_reward, best_reward, worst_reward = (
        results_dict[key]
        for key in (
            "training_iteration",
            "episode_reward_mean",
            "episode_reward_max",
            "episode_reward_min",
        )
    )
    # Columns are separated by " \t"; the minimum uses the " " sign flag.
    columns = [
        f"{iteration:4d}",
        f"r_mean: {avg_reward:.1f}",
        f"r_max: {best_reward:.1f}",
        f"r_min: {worst_reward: .1f}",
    ]
    print(" \t".join(columns))

# Register env_creator under the name "CybORG" so the algorithm config can
# refer to the environment by that string.
register_env("CybORG", env_creator)

In [3]:
# Start from an empty offline-output directory. shutil.rmtree with
# ignore_errors=True replaces the shell `rm -r`, which printed an error when
# the directory did not exist yet (e.g. on the first run) and only worked on
# POSIX shells.
import shutil

shutil.rmtree("logs/APPO/StochasticSampling", ignore_errors=True)

In [4]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.policy.policy import PolicySpec

# PPO configuration for the registered 'CybORG' environment.
# NOTE(review): the offline output directory is named "APPO" although the
# algorithm configured here is PPOConfig — confirm the name is intentional.
config = (
    PPOConfig()
    # Each rollout worker uses a single cpu
    .rollouts(
        num_rollout_workers=30,
        num_envs_per_worker=1,
        horizon=100,
    )
    .training(
        train_batch_size=3000,
        gamma=0.85,
        lr=0.00005,
        model={
            "fcnet_hiddens": [256, 256],
            "fcnet_activation": "tanh",
        },
    )
    .environment(
        disable_env_checking=True,
        env='CybORG',
    )
    .resources(num_gpus=1)
    .framework('tf')
    .offline_data(
        output="logs/APPO/StochasticSampling",
        output_compress_columns=[
            'prev_actions',
            'prev_rewards',
            'dones',
            't',
            'eps_id',
            'unroll_id',
            'agent_index',
            'action_prob',
            'action_logp',
            'action_dist_inputs',
            'advantages',
            'value_targets',
        ],
        output_config={"format": "json"},
    )
)

# Build the algorithm instance from the finished configuration.
trainer = config.build()


2022-12-16 14:21:36,591	INFO worker.py:1528 -- Started a local Ray instance.
2022-12-16 14:21:53,842	INFO trainable.py:164 -- Trainable.setup took 19.508 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [5]:
def print_results(results_dict):
    """Print iteration number and mean/max/min episode reward on one line.

    This re-declares the ``print_results`` helper that is already defined in
    an earlier cell of this notebook; the behaviour is the same.

    Args:
        results_dict: Mapping providing "training_iteration",
            "episode_reward_mean", "episode_reward_max" and
            "episode_reward_min".

    Returns:
        None. The formatted line is written to stdout.
    """
    iteration = results_dict["training_iteration"]
    mean_reward, max_reward, min_reward = (
        results_dict["episode_reward_mean"],
        results_dict["episode_reward_max"],
        results_dict["episode_reward_min"],
    )
    # The last field keeps the " " sign flag so non-negative minima are
    # padded with a leading space.
    template = "{:4d} \tr_mean: {:.1f} \tr_max: {:.1f} \tr_min: {: .1f}"
    print(template.format(iteration, mean_reward, max_reward, min_reward))

In [6]:
# Run 200 training iterations, printing the reward summary after each one.
for i in range(200):
    iteration_result = trainer.train()
    print_results(iteration_result)

   1 	r_mean: -583.4 	r_max: -46.5 	r_min: -1107.8
   2 	r_mean: -634.3 	r_max: -46.5 	r_min: -1146.8
   3 	r_mean: -575.7 	r_max: -46.5 	r_min: -1149.8
   4 	r_mean: -503.2 	r_max: -46.5 	r_min: -1149.8
   5 	r_mean: -397.4 	r_max: -90.0 	r_min: -1149.8
   6 	r_mean: -335.3 	r_max: -17.0 	r_min: -1100.8
   7 	r_mean: -283.3 	r_max: -17.0 	r_min: -1100.8
   8 	r_mean: -255.3 	r_max: -17.0 	r_min: -1100.8
   9 	r_mean: -224.5 	r_max: -127.5 	r_min: -558.8
  10 	r_mean: -218.4 	r_max: -56.8 	r_min: -567.8
  11 	r_mean: -204.4 	r_max: -56.8 	r_min: -567.8
  12 	r_mean: -186.6 	r_max: -41.7 	r_min: -567.8
  13 	r_mean: -177.3 	r_max: -28.5 	r_min: -284.2
  14 	r_mean: -165.0 	r_max: -28.5 	r_min: -274.8
  15 	r_mean: -161.1 	r_max: -28.5 	r_min: -274.8
  16 	r_mean: -151.5 	r_max: -34.8 	r_min: -247.2
  17 	r_mean: -141.4 	r_max: -17.3 	r_min: -247.2
  18 	r_mean: -134.1 	r_max: -17.3 	r_min: -206.8
  19 	r_mean: -123.1 	r_max: -17.3 	r_min: -193.7
  20 	r_mean: -119.0 	r_max: -24.8 	r_min

 170 	r_mean: -32.7 	r_max: -9.6 	r_min: -162.1
 171 	r_mean: -29.9 	r_max: -9.6 	r_min: -162.1
 172 	r_mean: -26.2 	r_max: -9.7 	r_min: -146.8
 173 	r_mean: -26.2 	r_max: -9.7 	r_min: -146.8
 174 	r_mean: -22.1 	r_max: -9.7 	r_min: -140.7
 175 	r_mean: -24.1 	r_max: -9.8 	r_min: -141.8
 176 	r_mean: -23.4 	r_max: -9.8 	r_min: -141.8
 177 	r_mean: -30.8 	r_max: -9.7 	r_min: -483.6
 178 	r_mean: -40.4 	r_max: -9.0 	r_min: -892.7
 179 	r_mean: -40.9 	r_max: -9.0 	r_min: -892.7
 180 	r_mean: -36.1 	r_max: -9.0 	r_min: -892.7
 181 	r_mean: -40.4 	r_max: -8.9 	r_min: -892.7
 182 	r_mean: -30.6 	r_max: -8.9 	r_min: -299.8
 183 	r_mean: -30.3 	r_max: -8.9 	r_min: -299.8
 184 	r_mean: -37.4 	r_max: -8.9 	r_min: -749.8
 185 	r_mean: -31.6 	r_max: -8.5 	r_min: -749.8
 186 	r_mean: -41.7 	r_max: -8.5 	r_min: -766.8
 187 	r_mean: -34.5 	r_max: -8.5 	r_min: -766.8
 188 	r_mean: -33.0 	r_max: -9.7 	r_min: -766.8
 189 	r_mean: -34.6 	r_max: -9.7 	r_min: -766.8
 190 	r_mean: -26.3 	r_max: -9.7 	r_min: