In [1]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import numpy as np
import ray

import tensorflow as tf
# Turn on memory growth for every GPU TensorFlow can see, so memory is
# allocated as needed instead of being reserved up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, Input
from keras.layers import Bidirectional
from ray import tune
from ray.tune.registry import register_env
from keras.models import Model

In [7]:
# Dimensions shared by the learned-environment class defined below.
sequence_length=3  # only referenced from commented-out code in this cell
state_len = 91  # length of the observation vector (Box shape)
num_actions = 41  # size of the Discrete action space
encoding_len = state_len + num_actions  # only referenced from commented-out code in this cell
NUM_NODES = 13  # number of nodes encoded in one state
NODE_CLASSES = [3, 4]  # sizes of the one-hot groups per node: 13 * (3 + 4) == state_len

class WorldMovelEnv(gym.Env):
    """Gym environment whose transitions and rewards come from learned Keras models.

    An observation is a ``state_len``-long vector made of consecutive one-hot
    groups: for each of ``NUM_NODES`` nodes there is one group per entry of
    ``NODE_CLASSES``. Actions are integers in ``[0, num_actions)``.

    Every ``step`` samples the next state from the softmax heads of the
    next-state model and samples a reward class from the reward model. Both
    draws use NumPy's global random state, so rollouts are stochastic.

    Model weights and the reward lookup table are read from the current working
    directory (``RewardModel``, ``NextStateModel_MulitLabel``, ``reward_map.npy``).
    """

    # ``step`` reports ``done`` once this many steps have been taken in an episode.
    MAX_STEPS = 99
    # Divisor turning the step counter into the time feature fed to both models.
    TIME_SCALE = 100
    # Width of the reward model's softmax output.
    NUM_REWARD_CLASSES = 11

    def __init__(self):
        """Create the spaces, build both models and load their weights from disk."""
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(state_len,))
        self.action_space = gym.spaces.Discrete(num_actions)

        self.step_count = 0

        # Reward model: input is (state, time, next_state, time) -> reward class.
        self.r_model = self._build_reward_model()
        self.r_model.load_weights('RewardModel')

        # Next-state model: input is (state, time, one-hot action) -> one softmax per group.
        self.ns_model_multi_model = self._build_next_state_model()
        self.ns_model_multi_model.load_weights('NextStateModel_MulitLabel')

        # Maps a sampled reward-class index to the reward value returned by ``step``.
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load a trusted file.
        self.reward_map = np.load('reward_map.npy', allow_pickle=True).item()

        # Initial state: the same [0, 0, 1 | 0, 0, 0, 1] pair of one-hot groups for every node.
        self.init_state = np.tile([0., 0., 1., 0., 0., 0., 1.], NUM_NODES)
        # Work on a copy so nothing handed to callers can alias ``init_state``.
        self.state = self.init_state.copy()

    @staticmethod
    def _build_reward_model():
        """Return the reward classifier: three 512-unit ReLU layers and a softmax head."""
        model = Sequential()
        model.add(Input(shape=(state_len*2 + 2,)))
        model.add(Dense(512, activation='relu'))
        model.add(Dense(512, activation='relu'))
        model.add(Dense(512, activation='relu'))
        model.add(Dense(WorldMovelEnv.NUM_REWARD_CLASSES, activation='softmax'))
        return model

    @staticmethod
    def _build_next_state_model():
        """Return the multi-head next-state model: one softmax head per one-hot group."""
        input_ = Input(shape=(state_len+num_actions+1,))
        outs = []
        # Layer creation order and names are kept stable for weight loading.
        for i in range(NUM_NODES):
            for n in NODE_CLASSES:
                x_ = Dense(128, activation='relu')(input_)
                outs.append(Dense(n, activation='softmax', name=str(i)+str(n))(x_))
        return Model(input_, outs)

    @staticmethod
    def _sample_next_state(probs):
        """Draw one class per head and assemble the concatenated one-hot state.

        ``probs`` is the list returned by the next-state model for a batch of
        one: entry ``k`` holds the class probabilities of head ``k`` in row 0.
        """
        next_state = np.zeros(state_len)
        offset = 0  # start of the current one-hot group inside next_state
        head = 0    # index of the model output feeding the current group
        for _ in range(NUM_NODES):
            for n in NODE_CLASSES:
                next_state[offset + np.random.choice(np.arange(n), p=probs[head][0])] = 1
                offset += n
                head += 1
        return next_state

    def step(self, action):
        """Advance the simulated environment by one step.

        Args:
            action: integer index in ``[0, num_actions)``.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is the
            sampled next state, ``reward`` is looked up in ``reward_map``,
            ``done`` is True once ``MAX_STEPS`` steps have been taken and
            ``info`` is an empty dict.
        """
        self.step_count += 1
        time_feature = [self.step_count / self.TIME_SCALE]

        action_vec = np.zeros(num_actions)
        action_vec[action] = 1

        state_action = np.concatenate([self.state, time_feature, action_vec], axis=-1)
        probs = self.ns_model_multi_model.predict(np.array([state_action]), verbose=0)
        next_state = self._sample_next_state(probs)

        reward_input = np.concatenate([self.state, time_feature, next_state, time_feature])
        # verbose=0 matches the next-state call and avoids a progress bar on every step.
        reward_probs = self.r_model.predict(np.array([reward_input]), verbose=0)
        reward_index = np.random.choice(np.arange(self.NUM_REWARD_CLASSES), p=reward_probs[0])
        reward = self.reward_map[reward_index]

        self.state = next_state

        # ">=" rather than "==" so an overshooting counter can never skip the episode end.
        done = self.step_count >= self.MAX_STEPS
        if done:
            self.step_count = 0
        return self.state, reward, done, {}

    def reset(self):
        """Start a new episode and return the initial observation.

        The step counter is reset on the instance (the previous code assigned a
        local variable, so a mid-episode reset kept the old counter), and the
        state is a fresh copy so callers cannot mutate ``init_state`` in place.
        """
        self.step_count = 0
        self.state = self.init_state.copy()
        return self.state

    def render(self, mode='human', close=False):
        """Rendering is not supported; this is a no-op."""
        pass

    def close(self):
        """Nothing to release; this is a no-op."""
        pass
    
def env_creator(config):
    """Factory handed to ``register_env``: build a new ``WorldMovelEnv``.

    ``config`` is accepted to satisfy the factory signature but is not used.
    """
    env = WorldMovelEnv()
    return env

In [8]:
# Smoke test: build the learned environment and take one random step.
test = WorldMovelEnv()
test.reset()
for i in range(1):
    # Sample over the whole action range; the earlier hard-coded 20 only
    # covered the first 20 of the num_actions available actions.
    print(test.step(np.random.randint(num_actions)))

  updates=self.state_updates,


(array([0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 0., 1.]), 0.0, False, {})


  updates=self.state_updates,


In [9]:
# Register the learned environment under the name the PPO/DQN configs pass as `env=`.
register_env(name="DreamCybORG", env_creator=env_creator)

  and should_run_async(code)


In [10]:
def print_results(results_dict):
    """Print a one-line reward summary for one training iteration.

    Args:
        results_dict: mapping that contains "training_iteration" and the
            "episode_reward_mean" / "episode_reward_max" / "episode_reward_min"
            entries; a missing key raises ``KeyError``.
    """
    wanted_keys = (
        "training_iteration",
        "episode_reward_mean",
        "episode_reward_max",
        "episode_reward_min",
    )
    # Look the four values up in order and unpack them in one go.
    train_iter, r_mean, r_max, r_min = (results_dict[key] for key in wanted_keys)
    summary_line = f"{train_iter:4d} \tr_mean: {r_mean:.1f} \tr_max: {r_max:.1f} \tr_min: {r_min: .1f}"
    print(summary_line)

In [11]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.policy.policy import PolicySpec

# PPO on the learned 'DreamCybORG' environment, configured one section at a
# time (each builder call returns the config object it was called on).
config = PPOConfig()
# Each rollout worker uses a single cpu
config = config.rollouts(num_rollout_workers=20, num_envs_per_worker=1, horizon=100)
config = config.training(
    sgd_minibatch_size=100,
    train_batch_size=2000,
    gamma=0.99,
    lr=0.00005,
    model={"fcnet_hiddens": [256, 256], "fcnet_activation": "tanh"},
)
config = config.environment(disable_env_checking=True, env='DreamCybORG')
config = config.resources(num_gpus=1)
config = config.framework('torch')
trainer = config.build()


[2m[36m(pid=37866)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=37863)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=37870)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=38162)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=37879)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=37868)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=37864)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=37874)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=37871)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=37859)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=37875)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=37877)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=37860)[0m   if (distutils

2023-02-13 17:30:41,742	INFO trainable.py:172 -- Trainable.setup took 15.606 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [16]:
# Run up to 500 training iterations, printing the reward summary after each one.
for i in range(500):
    print_results(trainer.train())

  and should_run_async(code)


 221 	r_mean: -36.1 	r_max: -17.8 	r_min: -128.8
 222 	r_mean: -36.5 	r_max: -17.8 	r_min: -128.8
 223 	r_mean: -36.5 	r_max: -19.6 	r_min: -128.8
 224 	r_mean: -36.4 	r_max: -19.6 	r_min: -128.8


KeyboardInterrupt: 

In [19]:
# Write a checkpoint; with no argument the destination is chosen by the trainer
# and the returned checkpoint path is echoed as the cell output.
trainer.save()

  and should_run_async(code)


'/root/ray_results/PPO_CybORG_2023-02-13_18-39-51n6m9jq1i/checkpoint_000016'

In [24]:
# NOTE(review): this run failed with a size-mismatch error. The checkpoint holds
# weights for 42 inputs / 20 action logits, the current model has 91 / 41.
# The checkpoint must come from a policy built with the current
# state_len / num_actions. TODO: confirm which checkpoint was intended.
trainer.load_checkpoint('results/checkpoint_000268')

  and should_run_async(code)


RuntimeError: Error(s) in loading state_dict for FullyConnectedNetwork:
	size mismatch for _logits._model.0.weight: copying a param with shape torch.Size([20, 256]) from checkpoint, the shape in current model is torch.Size([41, 256]).
	size mismatch for _logits._model.0.bias: copying a param with shape torch.Size([20]) from checkpoint, the shape in current model is torch.Size([41]).
	size mismatch for _hidden_layers.0._model.0.weight: copying a param with shape torch.Size([256, 42]) from checkpoint, the shape in current model is torch.Size([256, 91]).
	size mismatch for _value_branch_separate.0._model.0.weight: copying a param with shape torch.Size([256, 42]) from checkpoint, the shape in current model is torch.Size([256, 91]).

In [7]:
# Scratch check of a single step in the real CybORG simulator.
# NOTE: `CybORG`, `path`, `B_lineAgent` and `wrap` are only defined by a cell that
# appears later in this notebook, so this cell fails on a fresh kernel.
for i in range(1):

    cyborg = CybORG(path, 'sim', agents={'Red': B_lineAgent})
    wrapped_cyborg = wrap(cyborg)

    observation = wrapped_cyborg.reset()
    # The environment is reset a second time here just to print the observation.
    print(wrapped_cyborg.reset())
    # Element 3 of the step result is what the evaluation cell unpacks as `info`;
    # its 'observation' entry is kept in `obs`.
    obs = wrapped_cyborg.step(5)[3]['observation']
    print(obs)

  and should_run_async(code)


NameError: name 'CybORG' is not defined

In [9]:
!pip install tdqm

  and should_run_async(code)


E0213 15:06:02.796053617   39484 backup_poller.cc:136]       Run client channel backup poller: UNKNOWN:pollset_work {created_time:"2023-02-13T15:06:02.795903359+00:00", children:[UNKNOWN:Bad file descriptor {syscall:"epoll_wait", os_error:"Bad file descriptor", errno:9, created_time:"2023-02-13T15:06:02.795795923+00:00"}]}
Collecting tdqm
  Downloading tdqm-0.0.1.tar.gz (1.4 kB)
Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.6 MB/s eta 0:00:011
[?25hBuilding wheels for collected packages: tdqm
  Building wheel for tdqm (setup.py) ... [?25ldone
[?25h  Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1321 sha256=532c668caa6999b215925d5953c295f7a5e1eb4482762ce6287cf4b107a9e93b
  Stored in directory: /root/.cache/pip/wheels/86/cd/38/f96ed05dd8049e95d8fbeaa0587664eb001a1848979636b771
Successfully built tdqm
Installing collected packages: tqdm, tdqm
Successfully installed tdqm-0.0.1 tqdm-4.64.1
You

In [21]:
# Step the learned environment once with action 5 and show the element-wise
# difference between `obs` and the resulting observation.
# NOTE: needs `test` (a WorldMovelEnv) and `obs` from other cells; this run
# failed with NameError because `test` did not exist in the kernel.
test.reset()
obs2, r, d, i = test.step(5)
obs - obs2

  and should_run_async(code)


NameError: name 'test' is not defined

In [2]:
import inspect
import time
from statistics import mean, stdev
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
import os
from tqdm import trange

MAX_EPS = 5  # number of evaluation episodes run per red agent
agent_name = 'Blue'  # agent whose action space is queried in the evaluation loop

def wrap(env):
    """Return ``env`` wrapped in an ``RLlibWrapper`` for the "Blue" agent."""
    wrapped_env = RLlibWrapper(env=env, agent_name="Blue")
    return wrapped_env

def env_creator(env_config: dict):
    """Build the CybORG simulator wrapped for RLlib, with a Meander red agent.

    Args:
        env_config: accepted to satisfy the ``register_env`` factory signature;
            not used.

    Returns:
        An ``RLlibWrapper`` around a simulated CybORG running
        ``Scenario2_No_Decoy.yaml`` for the "Blue" agent, capped at 100 steps.
    """
    # The scenario lives next to the file that defines the CybORG class. Take
    # that file's directory instead of slicing a fixed number of characters
    # off its path, which silently breaks if the file name ever changes.
    package_dir = os.path.dirname(str(inspect.getfile(CybORG)))
    path = package_dir + '/Shared/Scenarios/Scenario2_No_Decoy.yaml'
    agents = {"Red": RedMeanderAgent, "Green": GreenAgent}
    cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
    env = RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
    return env

# Register the real CybORG simulator with RLlib under the name "CybORG".
register_env(name="CybORG", env_creator=env_creator)


# Scenario file, located relative to the file that defines the CybORG class
# (the [:-10] slice drops the last 10 characters of that file's path).
path = str(inspect.getfile(CybORG))
path = path[:-10] + '/Shared/Scenarios/Scenario2_No_Decoy.yaml'
# Every observation seen during evaluation is accumulated here.
obs = []
#print(f'using CybORG v{cyborg_version}, {scenario}\n')
# Evaluate the policy held by `trainer` against each listed red agent.
# NOTE: `trainer` is not created in this cell; it must already exist in the
# kernel (this run failed with NameError because it did not).
for red_agent in [B_lineAgent]:#, RedMeanderAgent]:

    cyborg = CybORG(path, 'sim', agents={'Red': red_agent})
    wrapped_cyborg = wrap(cyborg)

    observation = wrapped_cyborg.reset()
    obs.append(observation)
    # observation = cyborg.reset().observation

    # `action_space` and `actions` are only used by commented-out code below.
    action_space = wrapped_cyborg.get_action_space(agent_name)
    # action_space = cyborg.get_action_space(agent_name)
    total_reward = []
    actions = []
    for i in trange(MAX_EPS):
        # Per-step rewards of the current episode.
        r = []
        #a = []
        # A fresh simulator is built for every episode.
        cyborg = CybORG(path, 'sim', agents={'Red': red_agent})
        wrapped_cyborg = wrap(cyborg)

        observation = wrapped_cyborg.reset()
        obs.append(observation)
        # observation = cyborg.reset().observation

        # cyborg.env.env.tracker.render()
        # Each episode runs exactly 100 steps; the `done` flag is not checked.
        for j in range(100):
            
            
            # explore=False is passed so the policy is queried without exploration.
            action = trainer.compute_single_action(observation, explore=False)
            #action_vec = np.zeros(145)
            #action_vec[int(action)] = 1
            #action = agent.get_action(observation, action_space)
            observation, rew, done, info = wrapped_cyborg.step(action)
            obs.append(observation)
            #actions.append(action_vec)
            # result = cyborg.step(agent_name, action)
            r.append(rew)
            # r.append(result.reward)
           # a.append((str(cyborg.get_last_action('Blue')), str(cyborg.get_last_action('Red'))))
        total_reward.append(sum(r))
        print(sum(r))
        # observation = cyborg.reset().observation
        observation = wrapped_cyborg.reset()
    # stdev() needs at least two episode totals, so MAX_EPS must be >= 2.
    print(f'Average reward for red agent {red_agent.__name__} and steps {100} is: {mean(total_reward):.1f} with a standard deviation of {stdev(total_reward):.1f}')
    #return mean(total_reward), np.mean(np.array(obs), axis=0),  np.mean(np.array(actions), axis=0)
    #return mean(total_reward), np.mean(np.array(obs), axis=0),  np.mean(np.array(actions), axis=0)

  and should_run_async(code)
  if (distutils.version.LooseVersion(tf.__version__) <
  0%|          | 0/5 [00:00<?, ?it/s]


NameError: name 'trainer' is not defined

In [55]:
from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.policy.policy import PolicySpec

# DQN on the learned 'DreamCybORG' environment, configured one section at a
# time (each builder call returns the config object it was called on).
config = DQNConfig()
# Each rollout worker uses a single cpu
config = config.rollouts(num_rollout_workers=10, num_envs_per_worker=1, horizon=100)
config = config.training(
    gamma=0.99,
    lr=0.005,
    model={"fcnet_hiddens": [256, 256], "fcnet_activation": "tanh"},
)
config = config.environment(disable_env_checking=True, env='DreamCybORG')
config = config.resources(num_gpus=1)
config = config.framework('torch')
trainer = config.build()

# Train for up to 500 iterations, printing a reward summary after each.
for i in range(500):
    print_results(trainer.train())

  and should_run_async(code)
[2m[36m(pid=25909)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25826)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25829)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25824)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25822)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25828)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25820)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25911)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25819)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25818)[0m   if (distutils.version.LooseVersion(tf.__version__) <
2023-02-08 12:36:23,387	INFO trainable.py:172 -- Trainable.setup took 12.251 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce



   1 	r_mean: -81.2 	r_max: -67.7 	r_min: -97.9


  """Adds a new policy to this Algorithm.
  """Defines a configuration class from which a DQN Algorithm can be built.

KeyboardInterrupt



In [None]:
# Time a 100-step rollout of the learned environment.
test = WorldMovelEnv()
import time 

t = time.time()
print(test.reset())
for i in range(100):
    # Sample a valid action: the previous hard-coded 55 lies outside the
    # Discrete(num_actions) space and indexes past the one-hot action vector.
    print(test.step(test.action_space.sample()))
# Elapsed seconds (end minus start; the operands were previously swapped,
# which printed a negative duration).
print(time.time()-t)

In [53]:
# Write a checkpoint under the relative directory 'results'; the returned
# checkpoint path is echoed as the cell output.
trainer.save('results')

[2m[36m(pid=22797)[0m   if (distutils.version.LooseVersion(tf.__version__) <
  and should_run_async(code)

[2m[36m(pid=22790)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22787)[0m   if (distutils.version.LooseVersion(tf.__version__) <


'results/checkpoint_000119'

[2m[36m(pid=22786)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22795)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22800)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22793)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22794)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22792)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22796)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22788)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22789)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22805)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22802)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22798)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=22803)[0m   if (distutils

In [9]:
!pip install tqdm

  and should_run_async(code)


Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.5 MB/s eta 0:00:01
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.64.1
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [49]:
# Scratch array used to check the shift-left loop in the next cell.
count = np.arange(10)
count

  and should_run_async(code)


array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [50]:
# Shift every element one slot to the left, then write a new value into the
# last slot. This is the same pattern as the commented-out history-buffer update
# in WorldMovelEnv.step.
for i in range(0,9):
    count[i] = count[i+1]
count[-1] = 10
count

  and should_run_async(code)


array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])









In [3]:
# Selects the red agent in env_creator: "B_Line" -> B_lineAgent, any other value -> RedMeanderAgent.
RED_AGENT = "B_Line"
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.policy.policy import PolicySpec
import inspect
import time
from statistics import mean, stdev
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
import os
from ray.tune.registry import register_env
from tqdm import trange

def print_results(results_dict):
    """Print a one-line reward summary for one training iteration.

    Args:
        results_dict: mapping that contains "training_iteration" and the
            "episode_reward_mean" / "episode_reward_max" / "episode_reward_min"
            entries; a missing key raises ``KeyError``.
    """
    wanted_keys = (
        "training_iteration",
        "episode_reward_mean",
        "episode_reward_max",
        "episode_reward_min",
    )
    # Look the four values up in order and unpack them in one go.
    train_iter, r_mean, r_max, r_min = (results_dict[key] for key in wanted_keys)
    summary_line = f"{train_iter:4d} \tr_mean: {r_mean:.1f} \tr_max: {r_max:.1f} \tr_min: {r_min: .1f}"
    print(summary_line)

def env_creator(env_config: dict):
    """Build the CybORG simulator wrapped for RLlib, with the configured red agent.

    The red agent follows the module-level ``RED_AGENT`` setting: "B_Line"
    selects ``B_lineAgent``, any other value selects ``RedMeanderAgent``.

    Args:
        env_config: accepted to satisfy the ``register_env`` factory signature;
            not used.

    Returns:
        An ``RLlibWrapper`` around a simulated CybORG running
        ``Scenario2_No_Decoy.yaml`` for the "Blue" agent, capped at 100 steps.
    """
    # The scenario lives next to the file that defines the CybORG class. Take
    # that file's directory instead of slicing a fixed number of characters
    # off its path, which silently breaks if the file name ever changes.
    package_dir = os.path.dirname(str(inspect.getfile(CybORG)))
    path = package_dir + '/Shared/Scenarios/Scenario2_No_Decoy.yaml'

    red_agent = B_lineAgent if RED_AGENT == "B_Line" else RedMeanderAgent
    agents = {"Red": red_agent, "Green": GreenAgent}

    cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
    env = RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
    return env


# Make the real CybORG simulator available to RLlib as "CybORG".
register_env(name="CybORG", env_creator=env_creator)

# PPO on the registered 'CybORG' environment, configured one section at a
# time (each builder call returns the config object it was called on).
config = PPOConfig()
# Each rollout worker uses a single cpu
config = config.rollouts(num_rollout_workers=20, num_envs_per_worker=1, horizon=100)
config = config.training(
    sgd_minibatch_size=100,
    train_batch_size=2000,
    gamma=0.99,
    lr=0.0001,
    model={"fcnet_hiddens": [256, 256], "fcnet_activation": "tanh"},
)
config = config.environment(disable_env_checking=True, env='CybORG')
config = config.resources(num_gpus=1)
config = config.framework('torch')
trainer = config.build()

# Start from previously saved weights before training further.
trainer.load_checkpoint('/root/ray_results/PPO_CybORG_2023-02-13_18-39-51n6m9jq1i/checkpoint_000016')

# Train for up to 500 iterations, printing a reward summary after each.
for i in range(500):
    print_results(trainer.train())

  and should_run_async(code)
2023-02-14 12:40:28,248	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
[2m[36m(pid=3800)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=3804)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=3810)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=3811)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=3798)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=3801)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=3813)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=3802)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=3824)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=3806)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=3817)[0m   if (distutils.

   1 	r_mean: -136.8 	r_max: -80.8 	r_min: -324.8
   2 	r_mean: -127.2 	r_max: -73.7 	r_min: -324.8
   3 	r_mean: -142.3 	r_max: -71.7 	r_min: -646.5


  """Adds a new policy to this Algorithm.


KeyboardInterrupt: 

In [5]:
# Write a checkpoint and keep the returned checkpoint path in `t` so it can be echoed.
t = trainer.save()
t

  and should_run_async(code)


'/root/ray_results/PPO_CybORG_2023-02-14_12-40-25931iq2wl/checkpoint_000003'

In [7]:
import numpy as np
# Directory of logged data, relative to the notebook's working directory.
LOGS_PATH = 'logs/APPO/Insomnia0'
# Load the logged state array and inspect its dimensions.
# NOTE(review): the 92 columns are presumably state_len (91) plus one extra
# feature; confirm against the code that wrote the log.
states_t = np.load(LOGS_PATH + '/data/states_t.npy')
states_t.shape

(79200, 92)