In [1]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import numpy as np

import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, Input
from keras.layers import Bidirectional
from ray import tune
from ray.tune.registry import register_env
from keras.models import Model

In [8]:
sequence_length=3
state_len = 91
num_actions = 41
encoding_len = state_len + num_action
NUM_NODES = 6
NODE_CLASSES = [3, 4]

class WorldMovelEnv(gym.Env):

    def __init__(self):
        
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(42,))
        self.action_space = gym.spaces.Discrete(20)
        
        self.step_count = 0

        #Reward Model
        self.r_model = Sequential()
        self.r_model.add(Input(shape=(state_len+num_actions+1,)))
        self.r_model.add(Dense(512, activation='relu'))
        self.r_model.add(Dense(512, activation='relu'))
        self.r_model.add(Dense(512, activation='relu'))
        self.r_model.add(Dense(17, activation='softmax'))
        #self.r_model.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(), metrics=[tf.keras.metrics.CategoricalAccuracy()])
        self.r_model.load_weights('RewardModel')
        
        
        input_ = Input(shape=(state_len+num_actions+1,))
        x = Dense(512, activation='relu')(input_)
        x = Dense(512, activation='relu')(x)
        outs = []
        for i in range(NUM_NODES):
            for n in NODE_CLASSES:
                x_ = Dense(128, activation='relu')(x)
                outs.append(Dense(n, activation='softmax', name=str(i)+str(n))(x_))

        self.ns_model_multi_model = Model(input_, outs)
        self.ns_model_multi_model.load_weights('AfterStateModel_MulitLabel')
     
        '''
        input_ = Input(shape=(10,42+20,))
        x = Bidirectional(LSTM(256))(input_)
        x = Flatten()(x)
        x = Dense(512, activation='relu')(x)

        outs = []
        for i in range(NUM_NODES):
            for n in NODE_CLASSES:
                x_ = Dense(256, activation='relu')(x)
                outs.append(Dense(n, activation='softmax', name=str(i)+str(n))(x_))
        #x_ = Dense(256, activation='relu')(x)
        #outs.append(Dense(6, activation='softmax', name='reward')(x_))
        self.ns_model_multi_model = Model(input_, outs)
        self.ns_model_multi_model.load_weights('NextStateModel_MulitLabel_rnn')
        '''
        self.reward_map = np.load('reward_map.npy', allow_pickle=True).item()
        #self.state = np.zeros((10, 42))
        self.init_state = np.array([0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 
       0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1.
       0., 0., 1., 0., 0., 0., 1.])
        self.state = self.init_state
        #self.action = np.zeros((10, 20))

    def step(self, action):
        self.step_count += 1
        
        #self.state_action[0,52] = action / 145
        #obs_index_probs = self.st_model.predict(np.array([self.state_action]))
        #obs_index = np.argmax(obs_index_probs)
        #obs_index = np.random.choice(np.arange(9674), p=obs_index_probs[0])
        #new_state = self.index_to_state[obs_index]
        #new_state = np.frombuffer(new_state)
        
        action_vec = np.zeros(num_actions)
        action_vec[action] = 1
        #self.action[-1,:] = action_vec
        
        state_action = np.concatenate([self.state, [self.step_count/100], action_vec], axis=-1)
        #afterstate_probs = self.as_model.predict(np.array([state_action]), verbose=0)
        #afterstate = np.array(np.random.rand(1, 42) < afterstate_probs, dtype=np.int8)
        
        #while True:
        #    new_state_probs = self.ns_model.predict(np.array([state_action]), verbose=0)
        #    new_state = np.array(np.random.rand(1, 42) < new_state_probs, dtype=np.int8)
        #    if int(np.sum(self.state-new_state)) == 0:
        #        break
        #self.state = new_state[0]
        probs = self.ns_model_multi_model.predict(np.array([state_action]), verbose=0)
        self.state = np.zeros(state_len)
        index_state = 0; index = 0
        for i in range(NUM_NODES):
            for n in NODE_CLASSES:
                self.state[index_state+np.random.choice(np.arange(n), p=probs[index][0])] = 1
                index_state += n; index += 1
        
       # reward_probs = self.r_model.predict(np.array([self.state]), verbose=0)
        #reward = np.random.choice(np.arange(6), p=reward_probs[0])
        #reward = self.reward_map[reward]
        
        #for i in range(1,3):
        #    self.state_action[i,:] = self.state_action[i-1,:]
        #
        #self.state_action[0,:] = np.concatenate([new_state, [0]])
        
        reward_probs = self.r_model.predict(np.array([state_action]))
        reward_index = np.random.choice(np.arange(6), p=reward_probs[0])
        reward = self.reward_map[reward_index]
        
        #for i in range(0,9):
        #    self.state[i,:] = self.state[i+1,:]
        #    self.action[i,:] = self.action[i+1,:]
            
        #self.state[-1,:] = state
        
        done = self.step_count == 99
        if done:
            self.step_count = 0
        return state, reward, done, {}

    def reset(self):
        step_count = 0
        
        #self.state_action = np.zeros((sequence_length,encoding_len))
        #self.state = np.zeros((sequence_length,state_len))
        
        #obs_index_probs = self.st_model.predict(np.array([self.state_action]))
        #obs_index = np.argmax(obs_index_probs)
        #obs_index = np.random.choice(np.arange(9674), p=obs_index_probs[0])
        #new_state = self.index_to_state[obs_index]
        #new_state = np.frombuffer(new_state)
       
        #self.state = np.zeros((10, 42))
        #self.state[0,:] = init_state
        self.state = self.init_state
      
        return self.init_state
    

    def render(self, mode='human', close=False):
        pass

    def close(self):
        pass
    
def env_creator(config):
    return WorldMovelEnv() 

In [9]:
test = WorldMovelEnv()
test.reset()
for i in range(100):
    print(test.step(np.random.randint(20)))

ValueError: in user code:

    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 2137, in predict_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 2123, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 2111, in run_step  **
        outputs = model.predict_step(data)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 2079, in predict_step
        return self(x, training=False)
    File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model_3" is incompatible with the layer: expected shape=(None, 133), found shape=(None, 84)


In [53]:
register_env(name="DreamCybORG", env_creator=env_creator)

  and should_run_async(code)


In [54]:
def print_results(results_dict):
    train_iter = results_dict["training_iteration"]
    r_mean = results_dict["episode_reward_mean"]
    r_max = results_dict["episode_reward_max"]
    r_min = results_dict["episode_reward_min"]
    print(f"{train_iter:4d} \tr_mean: {r_mean:.1f} \tr_max: {r_max:.1f} \tr_min: {r_min: .1f}")

In [55]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.policy.policy import PolicySpec

config = (
    PPOConfig()
    #Each rollout worker uses a single cpu
    .rollouts(num_rollout_workers=20, num_envs_per_worker=1, horizon=100)\
    .training(sgd_minibatch_size = 100, train_batch_size=2000, gamma=0.99, lr=0.00005, 
              model={"fcnet_hiddens": [256, 256], "fcnet_activation": "tanh",})\
    .environment(disable_env_checking=True, env = 'DreamCybORG')\
    .resources(num_gpus=1)\
    .framework('torch')\
    #.exploration(explore=True, exploration_config={"type": "RE3", "embeds_dim": 128, "beta_schedule": "constant", "sub_exploration": {"type": "StochasticSampling",},})\
)
trainer = config.build()


[2m[36m(pid=38023)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=38015)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=38013)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=38014)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=38016)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=38436)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=38017)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=38161)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=38019)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(RolloutWorker pid=38023)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38023)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38023)[0m Instructions for updating:
[2m[36m(RolloutW

[2m[36m(pid=38424)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(RolloutWorker pid=38013)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38013)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38013)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38013)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38013)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38013)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(pid=38433)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(RolloutWorker pid=38425)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38425)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38425)[0m Instructions for 

[2m[36m(RolloutWorker pid=38429)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38429)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38429)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38429)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38429)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38429)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38021)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38021)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38021)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38021)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWor

[2m[36m(RolloutWorker pid=38424)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38424)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38424)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38424)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38424)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38424)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38433)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38433)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWorker pid=38433)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=38433)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(RolloutWor

In [56]:
for i in range(500):
    print_results(trainer.train())

  and should_run_async(code)
[2m[36m(RolloutWorker pid=38023)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38015)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38013)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38014)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38016)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38436)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38017)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38161)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38019)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38426)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38429)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38425)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38021)[0m   updates=self.state_updates,
[2m[36m(RolloutWorker pid=38437)[0m   updates=self.state_updates,
[2m

   1 	r_mean: -93.7 	r_max: -48.0 	r_min: -131.4
   2 	r_mean: -87.9 	r_max: -48.0 	r_min: -131.4
   3 	r_mean: -82.0 	r_max: -45.5 	r_min: -131.4
   4 	r_mean: -79.9 	r_max: -44.6 	r_min: -131.4
   5 	r_mean: -78.5 	r_max: -44.6 	r_min: -131.4
   6 	r_mean: -72.8 	r_max: -44.6 	r_min: -117.3
   7 	r_mean: -69.1 	r_max: -43.7 	r_min: -117.3
   8 	r_mean: -67.1 	r_max: -27.4 	r_min: -114.7
   9 	r_mean: -64.2 	r_max: -27.4 	r_min: -114.7
  10 	r_mean: -61.2 	r_max: -27.4 	r_min: -114.7
  11 	r_mean: -59.8 	r_max: -27.4 	r_min: -114.7
  12 	r_mean: -58.7 	r_max: -27.4 	r_min: -122.4
  13 	r_mean: -57.9 	r_max: -27.7 	r_min: -122.4
  14 	r_mean: -58.7 	r_max: -27.7 	r_min: -122.4
  15 	r_mean: -57.0 	r_max: -27.7 	r_min: -122.4
  16 	r_mean: -56.2 	r_max: -26.9 	r_min: -122.4
  17 	r_mean: -55.8 	r_max: -26.9 	r_min: -108.6
  18 	r_mean: -55.1 	r_max: -26.9 	r_min: -108.6




  19 	r_mean: -53.6 	r_max: -26.9 	r_min: -101.5
  20 	r_mean: -54.3 	r_max: -26.9 	r_min: -101.5
  21 	r_mean: -54.5 	r_max: -29.9 	r_min: -107.0
  22 	r_mean: -53.8 	r_max: -29.9 	r_min: -107.0
  23 	r_mean: -53.2 	r_max: -30.9 	r_min: -107.0
  24 	r_mean: -52.1 	r_max: -30.9 	r_min: -107.0
  25 	r_mean: -51.9 	r_max: -31.7 	r_min: -107.0
  26 	r_mean: -50.6 	r_max: -29.6 	r_min: -108.7
  27 	r_mean: -49.9 	r_max: -22.6 	r_min: -108.7
  28 	r_mean: -48.7 	r_max: -22.6 	r_min: -108.7
  29 	r_mean: -48.0 	r_max: -22.6 	r_min: -108.7
  30 	r_mean: -46.4 	r_max: -22.6 	r_min: -108.7
  31 	r_mean: -46.8 	r_max: -22.6 	r_min: -92.4
  32 	r_mean: -46.7 	r_max: -25.0 	r_min: -92.4
  33 	r_mean: -46.0 	r_max: -25.0 	r_min: -92.4
  34 	r_mean: -46.8 	r_max: -25.0 	r_min: -92.4
  35 	r_mean: -46.8 	r_max: -25.0 	r_min: -92.4
  36 	r_mean: -45.5 	r_max: -25.0 	r_min: -80.0
  37 	r_mean: -44.4 	r_max: -25.4 	r_min: -80.0
  38 	r_mean: -45.5 	r_max: -26.4 	r_min: -80.0
  39 	r_mean: -44.6 	r_max: 


KeyboardInterrupt



In [7]:
trainer.load_checkpoint('results/checkpoint_000268')

  and should_run_async(code)


In [25]:
for i in range(10):

    cyborg = CybORG(path, 'sim', agents={'Red': RedMeanderAgent})
    wrapped_cyborg = wrap(cyborg)

    observation = wrapped_cyborg.reset()
    wrapped_cyborg.reset()
    obs = wrapped_cyborg.step(5)[3]['observation']
    print(obs)

  and should_run_async(code)


NameError: name 'CybORG' is not defined

In [51]:
test.reset()
obs2, r, d, i = test.step(5)
obs - obs2

  and should_run_async(code)




array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -1.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0., -1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.])

In [57]:
import inspect
import time
from statistics import mean, stdev
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
import os
from tqdm import trange

MAX_EPS = 5
agent_name = 'Blue'

def wrap(env):
    return RLlibWrapper(agent_name="Blue", env=env)

def env_creator(env_config: dict):
    path = str(inspect.getfile(CybORG))
    path = path[:-10] + '/Shared/Scenarios/Scenario2Small.yaml'
    agents = {"Red": RedMeanderAgent, "Green": GreenAgent}
    cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
    env = RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
    return env

register_env(name="CybORG", env_creator=env_creator)


path = str(inspect.getfile(CybORG))
path = path[:-10] + '/Shared/Scenarios/Scenario2Small.yaml'
obs = []
#print(f'using CybORG v{cyborg_version}, {scenario}\n')
for red_agent in [RedMeanderAgent]:#, RedMeanderAgent]:

    cyborg = CybORG(path, 'sim', agents={'Red': red_agent})
    wrapped_cyborg = wrap(cyborg)

    observation = wrapped_cyborg.reset()
    obs.append(observation)
    # observation = cyborg.reset().observation

    action_space = wrapped_cyborg.get_action_space(agent_name)
    # action_space = cyborg.get_action_space(agent_name)
    total_reward = []
    actions = []
    for i in trange(MAX_EPS):
        r = []
        #a = []
        cyborg = CybORG(path, 'sim', agents={'Red': red_agent})
        wrapped_cyborg = wrap(cyborg)

        observation = wrapped_cyborg.reset()
        obs.append(observation)
        # observation = cyborg.reset().observation

        # cyborg.env.env.tracker.render()
        for j in range(100):
            
            
            action = trainer.compute_single_action(observation, explore=False)
            #action_vec = np.zeros(145)
            #action_vec[int(action)] = 1
            #action = agent.get_action(observation, action_space)
            observation, rew, done, info = wrapped_cyborg.step(action)
            obs.append(observation)
            #actions.append(action_vec)
            # result = cyborg.step(agent_name, action)
            r.append(rew)
            # r.append(result.reward)
           # a.append((str(cyborg.get_last_action('Blue')), str(cyborg.get_last_action('Red'))))
        total_reward.append(sum(r))
        # observation = cyborg.reset().observation
        observation = wrapped_cyborg.reset()
    print(f'Average reward for red agent {red_agent.__name__} and steps {100} is: {mean(total_reward):.1f} with a standard deviation of {stdev(total_reward):.1f}')
    #return mean(total_reward), np.mean(np.array(obs), axis=0),  np.mean(np.array(actions), axis=0)

  and should_run_async(code)
100%|██████████| 5/5 [00:26<00:00,  5.37s/it]

Average reward for red agent RedMeanderAgent and steps 100 is: -98.1 with a standard deviation of 3.3





In [55]:
from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.policy.policy import PolicySpec

config = (
    DQNConfig()
    #Each rollout worker uses a single cpu
    .rollouts(num_rollout_workers=10, num_envs_per_worker=1, horizon=100)\
    .training(gamma=0.99, lr=0.005, 
              model={"fcnet_hiddens": [256, 256], "fcnet_activation": "tanh",})\
    .environment(disable_env_checking=True, env = 'DreamCybORG')\
    .resources(num_gpus=1)\
    .framework('torch')\
    #.exploration(explore=True, exploration_config={"type": "RE3", "embeds_dim": 128, "beta_schedule": "constant", "sub_exploration": {"type": "StochasticSampling",},})\
)
trainer = config.build()
for i in range(500):
    print_results(trainer.train())

  and should_run_async(code)
[2m[36m(pid=25909)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25826)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25829)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25824)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25822)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25828)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25820)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25911)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25819)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=25818)[0m   if (distutils.version.LooseVersion(tf.__version__) <
2023-02-08 12:36:23,387	INFO trainable.py:172 -- Trainable.setup took 12.251 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce



   1 	r_mean: -81.2 	r_max: -67.7 	r_min: -97.9


  """Adds a new policy to this Algorithm.
  """Defines a configuration class from which a DQN Algorithm can be built.

KeyboardInterrupt



In [None]:
test = WorldMovelEnv()
import time 

t = time.time()
print(test.reset())
for i in range(100):
    print(test.step(55))
print(t-time.time())

In [14]:
trainer.save('results')

  and should_run_async(code)


'results/checkpoint_000268'

In [9]:
!pip install tqdm

  and should_run_async(code)


Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.5 MB/s eta 0:00:01
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.64.1
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [49]:
count = np.arange(10)
count

  and should_run_async(code)


array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [50]:
for i in range(0,9):
    count[i] = count[i+1]
count[-1] = 10
count

  and should_run_async(code)


array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])







