In [3]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import numpy as np
import ray

import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, Input
from keras.layers import Bidirectional
from ray import tune
from ray.tune.registry import register_env
from keras.models import Model
from keras import backend as K

import tensorflow as tf
from ray.rllib.offline.json_reader import JsonReader
import numpy as np
import numpy_indexed as npi
import pandas as pd
from true_state_viewer import TrueStateTreeGraphViz, display_tree_pairs
from tqdm import trange
import os

from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.policy.policy import PolicySpec
import inspect
import time
from statistics import mean, stdev
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
import os
from ray.tune.registry  import register_env
from tqdm import trange

import warnings
warnings.filterwarnings('ignore')

  if (distutils.version.LooseVersion(tf.__version__) <


In [4]:
class WorldMovelEnv(gym.Env):
    """Gym environment backed by learned models instead of the real simulator.

    Transitions are sampled from a multi-head next-state network and rewards
    from a categorical reward network; both are restored from checkpoints in
    the working directory ('AfterStateModel_Insomnia', 'RewardModel_Insomnia').
    Observations are vectors of length STATE_LEN and actions are integers in
    [0, ACTION_LEN). An episode ends after 99 steps.
    """

    def __init__(self):
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(STATE_LEN,))
        self.action_space = gym.spaces.Discrete(ACTION_LEN)
        self.step_count = 0

        reward_to_index = np.load('reward_to_index.npy', allow_pickle=True).item()
        self.number_rewards = int(len(reward_to_index.keys()))

        # Reward model. The layer stack has to mirror the network built in
        # train_reward_model (two 512-unit hidden layers + softmax), because
        # that function is what writes the 'RewardModel_Insomnia' checkpoint.
        self.r_model = Sequential()
        self.r_model.add(Input(shape=(STATE_LEN*2+1,)))
        self.r_model.add(Dense(512, activation='relu'))
        self.r_model.add(Dense(512, activation='relu'))
        self.r_model.add(Dense(self.number_rewards, activation='softmax'))
        self.r_model.load_weights('RewardModel_Insomnia')

        # Next-state model: one softmax head per (node, class-group) pair, each
        # with its own 128-unit hidden layer fed by [state, time, one-hot action].
        input_ = Input(shape=(STATE_LEN+ACTION_LEN+1,))
        outs = []
        for i in range(NUM_NODES):
            for n in NODE_CLASSES:
                x_ = Dense(128, activation='relu')(input_)
                outs.append(Dense(n, activation='softmax', name=str(i)+str(n))(x_))

        self.ns_model_multi_model = Model(input_, outs)
        self.ns_model_multi_model.load_weights('AfterStateModel_Insomnia')

        # Maps a reward class index (as predicted by r_model) to its reward value.
        self.reward_map = np.load('index_to_reward.npy', allow_pickle=True).item()
        self.init_state = np.array([0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
         0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
         1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
         1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1.])
        # Work on a copy so the stored initial state can never be mutated
        # through an observation handed out to a caller.
        self.state = self.init_state.copy()

    def step(self, action):
        """Sample one transition from the learned models.

        Args:
            action: integer index of the chosen action, in [0, ACTION_LEN).

        Returns:
            (next_state, reward, done, info) where info is an empty dict.
        """
        self.step_count += 1
        time_feature = [self.step_count/100]

        action_vec = np.zeros(ACTION_LEN)
        action_vec[action] = 1

        state_action = np.concatenate([self.state, time_feature, action_vec], axis=-1)

        probs = self.ns_model_multi_model.predict(np.array([state_action]), verbose=0)
        next_state = np.zeros(STATE_LEN)
        index_state = 0; index = 0
        for i in range(NUM_NODES):
            for n in NODE_CLASSES:
                # Each head is a categorical over n classes; sample one and set
                # its one-hot position inside this head's slice of the state.
                next_state[index_state+np.random.choice(np.arange(n), p=probs[index][0])] = 1
                index_state += n; index += 1

        # Feature order must be [state, time, next_state]: train_reward_model
        # concatenates states_t (state followed by the time column) with
        # next_states, so the time feature sits between the two states.
        reward_input = np.concatenate([self.state, time_feature, next_state])
        reward_probs = self.r_model.predict(np.array([reward_input]), verbose=0)
        reward_index = np.random.choice(np.arange(self.number_rewards), p=reward_probs[0])
        reward = self.reward_map[reward_index]

        self.state = next_state

        done = self.step_count == 99
        if done:
            self.step_count = 0
        return self.state, reward, done, {}

    def reset(self):
        """Start a new episode and return a fresh copy of the initial state."""
        # Must reset the instance counter (not a local) so an episode that is
        # reset early does not inherit the previous episode's step count.
        self.step_count = 0
        self.state = self.init_state.copy()
        return self.state

    def render(self, mode='human', close=False):
        """No-op: this environment has nothing to render."""
        pass

    def close(self):
        """No-op: there are no resources to release."""
        pass
    
def dream_env_creator(config):
    """Environment factory: build and return a new WorldMovelEnv.

    ``config`` is accepted so the function has the one-argument creator
    signature, but its contents are not read.
    """
    dream_env = WorldMovelEnv()
    return dream_env

In [5]:
def process_data(iteration):
    """Convert logged rollouts into world-model training arrays and cache them.

    Reads JSON sample batches from LOGS_PATH, slices them into episodes
    (the code assumes every episode occupies 100 consecutive rows of a batch),
    and builds 99 transitions per episode:

      * states_t       - observation plus a time column (step/100)
      * next_states    - the following observation
      * actions_onehot - one-hot encoded action
      * rewards        - reward, with everything below -10 collapsed to -11

    If cached arrays already exist under LOGS_PATH/data the new transitions are
    appended to them; otherwise the reward <-> class-index maps are created and
    written to 'index_to_reward.npy' / 'reward_to_index.npy'. Finally the
    processed *.json logs in LOGS_PATH are removed.

    Args:
        iteration: index of the outer training round; currently unused.
    """
    input_reader = JsonReader(LOGS_PATH)
    num_episodes = int((CAGE_Rollout_Batch_Size/100)*CAGE_Rollout_Iters)
    num_data_points = num_episodes * 99
    print(num_data_points)
    states_t = np.ones((num_data_points, STATE_LEN+1))
    actions_onehot = np.zeros((num_data_points, ACTION_LEN))
    rewards = np.ones(num_data_points)
    next_states = np.ones((num_data_points, STATE_LEN))

    # This first batch is read only to learn how many episodes a batch holds.
    data = input_reader.next()
    episodes_per_batch = int(data['obs'].shape[0]//100)

    # Time feature for the 99 transitions of an episode: 0.01, 0.02, ..., 0.99.
    time_column = np.arange(1, 100).reshape(99, 1)/100

    data_index = 0
    for b in trange(int(num_episodes/episodes_per_batch)):
        data = input_reader.next()
        batch_index = 0
        for e in range(episodes_per_batch):
            rows = slice(data_index, data_index+99)
            steps = slice(batch_index, batch_index+99)
            states_t[rows, :] = np.concatenate([data['obs'][steps], time_column], axis=1)
            next_states[rows, :] = data['obs'][batch_index+1:batch_index+100]
            rewards[rows] = data['rewards'][steps]
            actions_onehot[np.arange(data_index, data_index+99), data['actions'][steps]] = 1
            data_index += 99
            batch_index += 100

    # Collapse all large penalties into one class. (The original used `==`,
    # a comparison, so the clamp silently never happened.)
    rewards[rewards < -10] = -11

    print(states_t.shape)
    data_dir = LOGS_PATH + '/data'
    # Get data already processed
    if os.path.isfile(data_dir + '/states_t.npy'):
        states_t_, next_states_, actions_, rewards_ = load_data()
        states_t = np.concatenate([states_t_, states_t])
        actions_onehot = np.concatenate([actions_, actions_onehot])
        rewards = np.concatenate([rewards_, rewards])
        next_states = np.concatenate([next_states_, next_states])
    else:
        # Set up reward map (only on the first call, from the first data set).
        labels, encoding = np.unique(rewards, return_inverse=True)
        index_to_reward = {}; reward_to_index = {}
        for i in range(labels.shape[0]):
            index_to_reward[i] = labels[i]
            reward_to_index[labels[i]] = i
        np.save('index_to_reward.npy', index_to_reward)
        np.save('reward_to_index.npy', reward_to_index)
    print(states_t.shape)

    os.makedirs(data_dir, exist_ok=True)
    # states_t carries the fractional time column, so it must stay floating
    # point: an int8 cast would truncate every time value to 0.
    states_t = np.array(states_t, dtype=np.float32)
    next_states = np.array(next_states, dtype=np.int8)
    actions_onehot = np.array(actions_onehot, dtype=np.int8)
    np.save(data_dir + '/states_t.npy', states_t)
    np.save(data_dir + '/next_states.npy', next_states)
    np.save(data_dir + '/rewards.npy', rewards)
    np.save(data_dir + '/actions_onehot.npy', actions_onehot)

    # Delete the logs that were just processed so the next call only sees new
    # rollouts. This must target LOGS_PATH, the directory the reader used.
    for fname in os.listdir(LOGS_PATH):
        if fname.endswith(".json"):
            os.remove(os.path.join(LOGS_PATH, fname))


def load_data():
    """Load the cached transition arrays stored under LOGS_PATH/data.

    Returns:
        Tuple ``(states_t, next_states, actions, rewards)`` of numpy arrays,
        read from states_t.npy, next_states.npy, actions_onehot.npy and
        rewards.npy respectively.
    """
    cached_files = (
        '/data/states_t.npy',
        '/data/next_states.npy',
        '/data/actions_onehot.npy',
        '/data/rewards.npy',
    )
    states_t, next_states, actions, rewards = [
        np.load(LOGS_PATH + suffix) for suffix in cached_files
    ]
    return states_t, next_states, actions, rewards


def train_state_tranistion_model():
    """Fit the multi-head next-state model on the cached transitions.

    Input features are [state, time, one-hot action]. The model has one
    softmax head per (node, class-group) pair, named ``str(i)+str(n)``, each
    trained with categorical cross-entropy against the matching slice of the
    next state. Existing weights in 'AfterStateModel_Insomnia' are used as a
    warm start when present, and the updated weights are saved back there.

    The head layout is taken from the module-level NUM_NODES / NODE_CLASSES so
    it always matches the network that WorldMovelEnv rebuilds to load this
    checkpoint.
    """
    states_t = np.load(LOGS_PATH + '/data/states_t.npy')
    next_states = np.load(LOGS_PATH + '/data/next_states.npy')
    actions = np.load(LOGS_PATH + '/data/actions_onehot.npy')

    states_actions_t = np.concatenate([states_t, actions], axis=1)

    input_ = Input(shape=(STATE_LEN+ACTION_LEN+1,))
    data_map = {}
    losses = []
    outs = []
    index = 0
    for i in range(NUM_NODES):
        for n in NODE_CLASSES:
            head_name = str(i)+str(n)
            # Target for this head: its n-wide one-hot slice of the next state.
            data_map[head_name] = next_states[:,index:index+n]
            losses.append(tf.keras.losses.CategoricalCrossentropy())
            index += n

            x_ = Dense(128, activation='relu')(input_)
            outs.append(Dense(n, activation='softmax', name=head_name)(x_))

    model = Model(input_, outs)
    # TF-format checkpoints are stored as '<prefix>.index' + data shards.
    if os.path.isfile('AfterStateModel_Insomnia.index'):
        model.load_weights('AfterStateModel_Insomnia')
    model.compile(optimizer='adam', loss=losses, metrics=[tf.keras.metrics.CategoricalAccuracy()])
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.01)
    K.set_value(model.optimizer.learning_rate, 0.001)
    model.fit(states_actions_t, data_map, epochs=WM_Training_Iters, validation_split=0.15, verbose=0, callbacks=[callback], batch_size=256)
    model.save_weights('AfterStateModel_Insomnia')
    
def train_reward_model():
    """Fit the categorical reward model on the cached transitions.

    Rewards are converted to class indices through 'reward_to_index.npy' and
    one-hot encoded. The network input is ``[states_t, next_states]``, i.e.
    state, time feature, next state (width STATE_LEN*2+1). Existing weights in
    'RewardModel_Insomnia' are used as a warm start when present, and the
    updated weights are saved back there.

    Transitions whose reward value is not in the reward map cannot be encoded
    as a class, so they are left out of training and their count is printed.
    """
    rewards = np.load(LOGS_PATH + '/data/rewards.npy')
    reward_to_index = np.load('reward_to_index.npy', allow_pickle=True).item()
    num_classes = int(len(reward_to_index.keys()))

    # -1 marks a reward that is missing from the map. A plain dict lookup is
    # used because np.vectorize(dict.get) fails as soon as one lookup returns
    # None, and an int8 index would overflow beyond 127 classes.
    reward_classes = np.array([reward_to_index.get(r, -1) for r in rewards], dtype=np.int64)
    known = reward_classes >= 0
    if not known.all():
        print(f"train_reward_model: skipping {int((~known).sum())} transitions with rewards missing from reward_to_index")
    reward_onehot = np.eye(num_classes)[reward_classes[known]]

    states_t = np.load(LOGS_PATH + '/data/states_t.npy')
    next_states = np.load(LOGS_PATH + '/data/next_states.npy')
    state_concate = np.concatenate([states_t, next_states], axis=1)[known]

    model = Sequential()
    model.add(Input(shape=(STATE_LEN*2+1,)))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    # TF-format checkpoints are stored as '<prefix>.index' + data shards.
    if os.path.isfile('RewardModel_Insomnia.index'):
        model.load_weights('RewardModel_Insomnia')
    model.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(), metrics=[tf.keras.metrics.CategoricalAccuracy()])
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_categorical_accuracy', patience=3, min_delta=0.01)
    K.set_value(model.optimizer.learning_rate, 0.001)
    model.fit(state_concate, reward_onehot, epochs=WM_Training_Iters, validation_split=0.15, verbose=0, callbacks=[callback], batch_size=256)
    model.save_weights('RewardModel_Insomnia')
    

def print_results(results_dict):
    """Print a one-line reward summary for a training iteration.

    Also appends the iteration's mean episode reward to the module-level
    ``reward`` list.

    Args:
        results_dict: mapping with the keys "training_iteration",
            "episode_reward_mean", "episode_reward_max" and
            "episode_reward_min".
    """
    wanted_keys = (
        "training_iteration",
        "episode_reward_mean",
        "episode_reward_max",
        "episode_reward_min",
    )
    # Look everything up first so a missing key fails before anything is recorded.
    iteration, mean_r, max_r, min_r = [results_dict[key] for key in wanted_keys]
    reward.append(mean_r)
    print(f"{iteration:4d} \tr_mean: {mean_r:.1f} \tr_max: {max_r:.1f} \tr_min: {min_r: .1f}")

def env_creator(env_config: dict):
    """Build the simulated CybORG environment wrapped for RLlib.

    The scenario file 'Shared/Scenarios/Scenario2_No_Decoy.yaml' is located
    relative to the directory of the file that defines ``CybORG``. The red
    agent is ``B_lineAgent`` when RED_AGENT == "B_Line" and ``RedMeanderAgent``
    otherwise; the green agent is always ``GreenAgent``.

    Args:
        env_config: creator config; not read.

    Returns:
        An ``RLlibWrapper`` around the CybORG simulation for agent "Blue" with
        max_steps=100.
    """
    # Derive the package directory from the defining file instead of slicing
    # a fixed number of characters off the path, which silently breaks if the
    # file name ever differs.
    cyborg_dir = os.path.dirname(inspect.getfile(CybORG))
    scenario_path = os.path.join(cyborg_dir, 'Shared', 'Scenarios', 'Scenario2_No_Decoy.yaml')

    red_agent = B_lineAgent if RED_AGENT == "B_Line" else RedMeanderAgent
    agents = {"Red": red_agent, "Green": GreenAgent}

    cyborg = CybORG(scenario_file=scenario_path, environment='sim', agents=agents)
    env = RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
    return env

def rollout_in_CAGE(agent_checkpoint, iters):
    """Train the DQN agent in the CybORG environment and log its rollouts.

    Registers the "CybORG" environment, builds a DQN trainer whose sampled
    experience is written as JSON to LOGS_PATH, optionally restores a previous
    checkpoint, trains for ``iters`` iterations (printing and recording the
    reward summary of each), and saves a new checkpoint.

    Args:
        agent_checkpoint: checkpoint to resume from, or '' to start fresh.
        iters: number of ``trainer.train()`` iterations to run.

    Returns:
        The value returned by ``trainer.save(LOGS_PATH)``.
    """
    register_env(name="CybORG", env_creator=env_creator)

    config = (
        DQNConfig()
        # Each rollout worker uses a single cpu
        .rollouts(num_rollout_workers=10, num_envs_per_worker=1, horizon=100)
        .training(num_atoms=21, v_min=-50, v_max=0, noisy=True, dueling=True,
                  double_q=True, gamma=0.99, lr=0.00005, n_step=5,
                  train_batch_size=2000,
                  model={"fcnet_hiddens": [256, 256], "fcnet_activation": "tanh",})
        .environment(disable_env_checking=True, env = 'CybORG')
        .resources(num_gpus=1)
        .framework('torch')
        .offline_data(output=LOGS_PATH, output_compress_columns=['prev_actions', 'prev_rewards', 'dones', 't', 'action_prob', 'action_logp', 'action_dist_inputs', 'advantages', 'value_targets'], #'eps_id', 'unroll_id', 'agent_index',
                 output_config={"format": "json"},)
    )

    trainer = config.build()
    try:
        if not agent_checkpoint == '':
            trainer.load_checkpoint(agent_checkpoint)

        for i in range(iters):
            print_results(trainer.train())
        agent_checkpoint = trainer.save(LOGS_PATH)
    finally:
        # A new trainer (with its rollout workers) is built on every call, so
        # release this one explicitly instead of relying on garbage collection.
        trainer.stop()
    return agent_checkpoint
    
def dream(agent_checkpoint):
    """Continue training the DQN agent inside the learned world model.

    Registers the "DreamCybORG" environment (built by ``dream_env_creator``),
    builds a DQN trainer on it, restores ``agent_checkpoint``, trains for
    Learning_In_Dream_Iters iterations (printing and recording the reward
    summary of each), and saves a new checkpoint.

    Experience sampled here is NOT written to LOGS_PATH: process_data treats
    every JSON log in that directory as real-environment data for fitting the
    world model, so logging dream rollouts would feed the model its own output.

    Args:
        agent_checkpoint: checkpoint to resume from.

    Returns:
        The value returned by ``trainer.save(LOGS_PATH)``.
    """
    register_env(name="DreamCybORG", env_creator=dream_env_creator)

    config = (
        DQNConfig()
        # Each rollout worker uses a single cpu
        .rollouts(num_rollout_workers=10, num_envs_per_worker=1, horizon=100)
        .training(num_atoms=21, v_min=-50, v_max=0, noisy=True, dueling=True,
                  double_q=True, gamma=0.99, lr=0.00005, n_step=5,
                  train_batch_size=2000,
                  model={"fcnet_hiddens": [256, 256], "fcnet_activation": "tanh",})
        # Train on the environment registered above; the original pointed at
        # 'CybORG', so the dream environment was registered but never used.
        .environment(disable_env_checking=True, env = 'DreamCybORG')
        .resources(num_gpus=1)
        .framework('torch')
    )

    trainer = config.build()
    try:
        trainer.load_checkpoint(agent_checkpoint)
        for i in range(Learning_In_Dream_Iters):
            print_results(trainer.train())
        agent_checkpoint = trainer.save(LOGS_PATH)
    finally:
        # A new trainer (with its rollout workers) is built on every call, so
        # release this one explicitly instead of relying on garbage collection.
        trainer.stop()
    return agent_checkpoint

In [6]:
# ---- Experiment configuration (read as globals by the functions above) ----
LOGS_PATH = 'logs/APPO/InsomniaDQN'   # rollout logs, cached arrays and checkpoints
CAGE_Rollout_Iters_Inital = 200       # trainer iterations in the first CAGE round
CAGE_Rollout_Iters_After = 20         # trainer iterations in every later CAGE round
CAGE_Rollout_Batch_Size = 2000        # used by process_data to size its arrays
WM_Training_Iters = 25                # max epochs for each world-model fit
Learning_In_Dream_Iters = 20          # trainer iterations per dream phase
STATE_LEN = 91                        # observation vector length
ACTION_LEN = 41                       # number of discrete actions
RED_AGENT = 'B_Line'                  # 'B_Line' selects B_lineAgent, anything else RedMeanderAgent
NUM_NODES = 13                        # next-state model: outer loop count over heads
NODE_CLASSES = [3, 4]                 # next-state model: head widths per node
reward = []  # mean reward per iteration; alternative start: np.load('insomnia_r.npy').tolist()

agent_checkpoint = ''
CAGE_Rollout_Iters = CAGE_Rollout_Iters_Inital

# ---- Alternate between real rollouts, world-model fitting and dreaming ----
for i in range(1, 15):
    print("ROLLOUT: ", i)
    # Round 1 keeps the initial iteration count set above; later rounds are shorter.
    if i != 1:
        CAGE_Rollout_Iters = CAGE_Rollout_Iters_After
    agent_checkpoint = rollout_in_CAGE(agent_checkpoint, CAGE_Rollout_Iters)

    print("PROCESS DATA: ", i)
    process_data(i)

    print("TRAIN STATE TRANISION MODEL: ", i)
    train_state_tranistion_model()
    K.clear_session()

    print("TRAIN REWARD MODEL: ", i)
    train_reward_model()
    K.clear_session()

    print("DREAM: ", i)
    agent_checkpoint = dream(agent_checkpoint)

    # Persist the reward history after every full round.
    np.save('insomnia_r.npy', np.array(reward))

ROLLOUT:  1


2023-02-19 14:43:41,268	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m
[2m[36m(pid=9467)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=9470)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=9472)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=9476)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=9479)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=9466)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=9468)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=9482)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=9477)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=9473)[0m   if (distutils.version.LooseVersion(tf.__version__) <
2023-02-19 14:44:00,729	INFO trainable.py:172 -- Trainable.setup took

   1 	r_mean: -422.2 	r_max: -194.4 	r_min: -878.9




   2 	r_mean: -468.5 	r_max: -192.2 	r_min: -1078.1
   3 	r_mean: -488.0 	r_max: -153.5 	r_min: -1078.1
   4 	r_mean: -456.1 	r_max: -142.7 	r_min: -1078.1
   5 	r_mean: -463.9 	r_max: -142.7 	r_min: -1078.1
   6 	r_mean: -480.6 	r_max: -142.7 	r_min: -1078.1
   7 	r_mean: -474.1 	r_max: -142.7 	r_min: -1078.1
   8 	r_mean: -464.1 	r_max: -142.7 	r_min: -1078.1
   9 	r_mean: -465.1 	r_max: -142.7 	r_min: -1078.1
  10 	r_mean: -466.4 	r_max: -142.7 	r_min: -1136.7
  11 	r_mean: -451.2 	r_max: -142.7 	r_min: -1136.7
  12 	r_mean: -443.4 	r_max: -142.7 	r_min: -1136.7
  13 	r_mean: -418.8 	r_max: -142.7 	r_min: -1136.7
  14 	r_mean: -429.0 	r_max: -148.6 	r_min: -1136.7
  15 	r_mean: -421.5 	r_max: -148.6 	r_min: -1136.7
  16 	r_mean: -405.8 	r_max: -145.7 	r_min: -1136.7
  17 	r_mean: -407.5 	r_max: -145.7 	r_min: -1136.7
  18 	r_mean: -400.2 	r_max: -145.7 	r_min: -1136.7
  19 	r_mean: -386.9 	r_max: -141.9 	r_min: -1136.7
  20 	r_mean: -369.5 	r_max: -141.9 	r_min: -1079.2
  21 	r_mean


KeyboardInterrupt



In [None]:
import matplotlib.pyplot as plt

# Plot the recorded mean rewards: solid segments are labelled as learning in
# CAGE, dotted segments as learning in the dream. Segment boundaries are
# hard-coded index ranges into `reward`.
interval = 20
fig = plt.figure(figsize=(12, 8), dpi=100)
plt.title('PPO Training With World Model - CAGEv2 No Decoy')

# Labelled segments (these two supply the legend entries).
x = 0
plt.plot(np.arange(x, x + 100), reward[x:x + 100],
         linestyle='solid', color='black', label='Learning In CAGE')
x = interval
plt.plot(np.arange(x + interval, x + (interval * 2)), reward[x + interval:x + (interval * 2)],
         linestyle='dotted', color='black', label='Learning In The Dream')

# Remaining unlabelled solid/dotted pairs, each `interval` wide.
for i in range(1, 6):
    x = interval * (i * 2)
    for _offset, _style in ((0, 'solid'), (interval, 'dotted')):
        plt.plot(np.arange(x + _offset, x + _offset + interval),
                 reward[x + _offset:x + _offset + interval],
                 linestyle=_style, color='black')

plt.ylabel('Mean Reward On Batch')
plt.xlabel('Batch Number')
plt.show()


In [None]:
# Show one slice of the reward history; `x` and `interval` are whatever the
# plotting cell left behind, so this only works after that cell has run.
reward[x+interval:x+(interval*2)]

In [6]:
 # Persist the reward history collected so far to 'insomnia_r.npy'.
 np.save('insomnia_r.npy', np.array(reward))

In [None]:
# Display the current checkpoint value (last one returned by rollout_in_CAGE or dream).
agent_checkpoint

In [None]:
# Continue the rollout / world-model / dream cycle from round 11 using the
# current `agent_checkpoint`. Every round here is a follow-up round, so the
# shorter CAGE_Rollout_Iters_After count always applies (the former `i == 1`
# branch could never run for this range).
for i in range(11,31):
    print("ROLLOUT: ", i)
    CAGE_Rollout_Iters = CAGE_Rollout_Iters_After
    agent_checkpoint = rollout_in_CAGE(agent_checkpoint, CAGE_Rollout_Iters)
    print("PROCESS DATA: ", i)
    process_data(i)
    print("TRAIN STATE TRANISION MODEL: ", i)
    train_state_tranistion_model()
    # Drop Keras global state between fits, as the first training loop does.
    K.clear_session()
    print("TRAIN REWARD MODEL: ", i)
    train_reward_model()
    K.clear_session()
    print("DREAM: ", i)
    agent_checkpoint = dream(agent_checkpoint)
    # Persist the reward history after every full round, as the first loop does.
    np.save('insomnia_r.npy', np.array(reward))

In [None]:
# Check the size of the cached state array: rows are stored transitions,
# columns are the state features plus the time column.
states_t = np.load(LOGS_PATH + '/data/states_t.npy')
states_t.shape

In [None]:
# Scratch arithmetic; what 1052 and 128 stand for is not recorded in this notebook.
1052 * 128

In [None]:
# Shell escape: list the contents of /root/ray_results (absolute path, machine-specific).
!ls /root/ray_results