In [1]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import numpy as np

import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, Input
from keras.layers import Bidirectional
from ray import tune
from ray.tune.registry import register_env
from keras.models import Model

In [2]:
sequence_length=3
state_len = 52
num_action = 1
encoding_len = state_len + num_action
NUM_NODES = 6
NODE_CLASSES = [3, 4]

class WorldMovelEnv(gym.Env):

    def __init__(self):
        
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(42,))
        self.action_space = gym.spaces.Discrete(20)
        
        self.step_count = 0
        '''
        #Afterstate Model
        self.as_model = Sequential()
        self.as_model.add(Input(shape=(42+20,)))
        self.as_model.add(Dense(512, activation='relu'))
        self.as_model.add(Dense(512, activation='relu'))
        self.as_model.add(Dense(512, activation='relu'))
        self.as_model.add(Dense(42, activation='sigmoid'))
        #self.as_model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=[tf.keras.metrics.BinaryAccuracy()])
        self.as_model.load_weights('AfterStateModel')
        
        #Next State Model
        self.ns_model = Sequential()
        self.ns_model.add(Input(shape=(42+20,)))
        self.ns_model.add(Dense(512, activation='relu'))
        self.ns_model.add(Dense(512, activation='relu'))
        self.ns_model.add(Dense(512, activation='relu'))
        self.ns_model.add(Dense(42, activation='sigmoid'))
        #self.ns_model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=[tf.keras.metrics.BinaryAccuracy()])
        self.ns_model.load_weights('NextStateModel')
        '''
        #Reward Model
        self.r_model = Sequential()
        self.r_model.add(Input(shape=(43+20,)))
        self.r_model.add(Dense(512, activation='relu'))
        self.r_model.add(Dense(512, activation='relu'))
        self.r_model.add(Dense(512, activation='relu'))
        self.r_model.add(Dense(6, activation='softmax'))
        #self.r_model.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(), metrics=[tf.keras.metrics.CategoricalAccuracy()])
        self.r_model.load_weights('RewardModel')
        
        input_ = Input(shape=(43+20,))
        x = Dense(512, activation='relu')(input_)
        x = Dense(512, activation='relu')(x)
        outs = []
        for i in range(NUM_NODES):
            for n in NODE_CLASSES:
                x_ = Dense(128, activation='relu')(x)
                outs.append(Dense(n, activation='softmax', name=str(i)+str(n))(x_))

        self.ns_model_multi_model = Model(input_, outs)
        self.ns_model_multi_model.load_weights('AfterStateModel_MulitLabel')
        
        self.reward_map = np.load('reward_map.npy', allow_pickle=True).item()
        self.state = np.array([0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 0., 1.])
        self.action = np.zeros(20)

    def step(self, action):
        self.step_count += 1
        
        #self.state_action[0,52] = action / 145
        #obs_index_probs = self.st_model.predict(np.array([self.state_action]))
        #obs_index = np.argmax(obs_index_probs)
        #obs_index = np.random.choice(np.arange(9674), p=obs_index_probs[0])
        #new_state = self.index_to_state[obs_index]
        #new_state = np.frombuffer(new_state)
        
        
        #for i in range(1,3):
        #    self.state[i,:] = self.state[i-1,:]
        
        self.action = np.zeros(20)
        self.action[action] = 1
        state_action = np.concatenate([self.state, [self.step_count/100], self.action])
        #afterstate_probs = self.as_model.predict(np.array([state_action]), verbose=0)
        #afterstate = np.array(np.random.rand(1, 42) < afterstate_probs, dtype=np.int8)
        
        #while True:
        #    new_state_probs = self.ns_model.predict(np.array([state_action]), verbose=0)
        #    new_state = np.array(np.random.rand(1, 42) < new_state_probs, dtype=np.int8)
        #    if int(np.sum(self.state-new_state)) == 0:
        #        break
        #self.state = new_state[0]
        probs = self.ns_model_multi_model.predict(np.array([state_action]), verbose=0)
        self.state = np.zeros(42)
        index_state = 0; index = 0
        for i in range(NUM_NODES):
            for n in NODE_CLASSES:
                self.state[index_state+np.random.choice(np.arange(n), p=probs[index][0])] = 1
                index_state += n; index += 1
        
       # reward_probs = self.r_model.predict(np.array([self.state]), verbose=0)
        #reward = np.random.choice(np.arange(6), p=reward_probs[0])
        #reward = self.reward_map[reward]
        
        #for i in range(1,3):
        #    self.state_action[i,:] = self.state_action[i-1,:]
        #
        #self.state_action[0,:] = np.concatenate([new_state, [0]])
        
        reward_probs = self.r_model.predict(np.array([state_action]))
        reward_index = np.random.choice(np.arange(6), p=reward_probs[0])
        reward = self.reward_map[reward_index]
        
        done = self.step_count == 99
        if done:
            self.step_count = 0
        return self.state, reward, done, {}

    def reset(self):
        step_count = 0
        
        #self.state_action = np.zeros((sequence_length,encoding_len))
        #self.state = np.zeros((sequence_length,state_len))
        
        #obs_index_probs = self.st_model.predict(np.array([self.state_action]))
        #obs_index = np.argmax(obs_index_probs)
        #obs_index = np.random.choice(np.arange(9674), p=obs_index_probs[0])
        #new_state = self.index_to_state[obs_index]
        #new_state = np.frombuffer(new_state)
        self.state = np.array([0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
           0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
           1., 0., 0., 1., 0., 0., 0., 1.])
      
        return self.state
    

    def render(self, mode='human', close=False):
        pass

    def close(self):
        pass
    
def env_creator(config):
    return WorldMovelEnv() 

  and should_run_async(code)


In [20]:
test = WorldMovelEnv()
test.reset()
for i in range(100):
    print(test.step(np.random.randint(20)))

  and should_run_async(code)


(array([0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 0., 1.]), 0.0, False, {})
(array([0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 1.]), 0.0, False, {})
(array([0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 0., 1.]), 0.0, False, {})
(array([0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 0., 1.]), -1.0, False, {})
(array([0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       1., 0., 0


KeyboardInterrupt



In [70]:
class SearchTree():

    def __init__(self):
        self.r_model = Sequential()
        self.r_model.add(Input(shape=(43+20,)))
        self.r_model.add(Dense(512, activation='relu'))
        self.r_model.add(Dense(512, activation='relu'))
        self.r_model.add(Dense(512, activation='relu'))
        self.r_model.add(Dense(6, activation='softmax'))
        #self.r_model.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(), metrics=[tf.keras.metrics.CategoricalAccuracy()])
        self.r_model.load_weights('RewardModel')
        
        self.reward_map = np.load('reward_map.npy', allow_pickle=True).item()
        self.reward_map = np.fromiter(reward_map.values(), dtype=np.float32)

        input_ = Input(shape=(43+20,))
        x = Dense(512, activation='relu')(input_)
        x = Dense(512, activation='relu')(x)
        outs = []
        for i in range(NUM_NODES):
            for n in NODE_CLASSES:
                x_ = Dense(128, activation='relu')(x)
                outs.append(Dense(n, activation='softmax', name=str(i)+str(n))(x_))

        self.ns_model_multi_model = Model(input_, outs)
        self.ns_model_multi_model.load_weights('AfterStateModel_MulitLabel')
        
    
    def select_action(self, state, t):
        values = np.zeros(20)
        actions_to_state = {}
        for i in range(20):
            action = np.zeros(20)
            action[i] = 1
            probs = self.r_model(np.array([np.concatenate([state, [t], action])]))
            values[i] = (probs[0].numpy() * self.reward_map).sum()
            next_state = self.next_state(np.array([np.concatenate([state, [t], action])]))
            values_ = np.zeros(20)
            for k in range(20):
                action = np.zeros(20)
                action[k] = 1
                probs = self.r_model(np.array([np.concatenate([next_state, [t], action])]))
                values_[k] = (probs[0].numpy() * self.reward_map).sum()
            values[i] += values_[k].max()
        return values.argmax()
    
    def next_state(self, state_action):
        probs = self.ns_model_multi_model.predict(state_action, verbose=0)
        state = np.zeros(42)
        index_state = 0; index = 0
        for i in range(NUM_NODES):
            for n in NODE_CLASSES:
                state[index_state+np.random.choice(np.arange(n), p=probs[index][0])] = 1
                index_state += n; index += 1
        return state
    
    

In [72]:
import inspect
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
import os
from tqdm import trange

MAX_EPS = 100
agent_name = 'Blue'

path = str(inspect.getfile(CybORG))
path = path[:-10] + '/Shared/Scenarios/Scenario2Small.yaml'
cyborg = CybORG(path, 'sim', agents={'Red': RedMeanderAgent})
wrapped_cyborg = RLlibWrapper(agent_name="Blue", env=cyborg)
obs = wrapped_cyborg.reset()

tree = SearchTree()   

r = []
for j in trange(100):

    action = tree.select_action(obs,j)
    obs, rew, done, info = wrapped_cyborg.step(action)
    r.append(rew)
    print(rew)

print(sum(r))

  1%|          | 1/100 [00:03<05:54,  3.58s/it]

0.0


  2%|▏         | 2/100 [00:07<06:20,  3.88s/it]

0.0


  3%|▎         | 3/100 [00:10<05:50,  3.61s/it]

0.0


  4%|▍         | 4/100 [00:15<06:05,  3.80s/it]

0.0


  5%|▌         | 5/100 [00:18<05:42,  3.61s/it]

0.0












  6%|▌         | 6/100 [00:23<06:17,  4.01s/it]

-0.1


  7%|▋         | 7/100 [00:26<06:07,  3.95s/it]

-0.1


  8%|▊         | 8/100 [00:30<05:57,  3.89s/it]

-0.1


  9%|▉         | 9/100 [00:34<05:55,  3.91s/it]

-0.1


 10%|█         | 10/100 [00:38<05:39,  3.78s/it]

-1.1


 11%|█         | 11/100 [00:42<05:40,  3.82s/it]

-1.1


 12%|█▏        | 12/100 [00:45<05:28,  3.73s/it]

-1.1


 13%|█▎        | 13/100 [00:49<05:33,  3.84s/it]

-1.1


 14%|█▍        | 14/100 [00:52<05:15,  3.67s/it]

-1.1


 15%|█▌        | 15/100 [00:56<05:17,  3.74s/it]

-1.1


 16%|█▌        | 16/100 [01:00<05:07,  3.66s/it]

-1.1


 17%|█▋        | 17/100 [01:04<05:17,  3.83s/it]

-1.1


 18%|█▊        | 18/100 [01:07<05:01,  3.67s/it]

-1.1


 19%|█▉        | 19/100 [01:12<05:11,  3.85s/it]

-1.1


 20%|██        | 20/100 [01:15<04:54,  3.69s/it]

-1.1


 21%|██        | 21/100 [01:19<05:00,  3.80s/it]

-1.1


 22%|██▏       | 22/100 [01:22<04:46,  3.67s/it]

-1.1


 23%|██▎       | 23/100 [01:27<04:54,  3.82s/it]

-1.1


 24%|██▍       | 24/100 [01:30<04:40,  3.69s/it]

-1.1


 25%|██▌       | 25/100 [01:34<04:41,  3.76s/it]

-1.1


 25%|██▌       | 25/100 [01:35<04:45,  3.81s/it]

KeyboardInterrupt



In [63]:
from abc import ABC, abstractmethod
from collections import defaultdict
import math
DEPTH = 1
STOCHASITC_SAMPLES = 10


class MCTS:
    "Monte Carlo tree searcher. First rollout the tree then choose a move."

    def __init__(self, exploration_weight=1):
        self.Q = defaultdict(int)  # total reward of each node
        self.N = defaultdict(int)  # total visit count for each node
        self.children = dict()  # children of each node
        self.exploration_weight = exploration_weight

    def choose(self, node):
        "Choose the best successor of node. (Choose a move in the game)"

        if node not in self.children:
            return node.find_random_child()

        def score(n):
            if self.N[n] == 0:
                return float("-inf")  # avoid unseen moves
            return self.Q[n] / self.N[n]  # average reward

        return max(self.children[node], key=score)

    def do_rollout(self, node):
        "Make the tree one layer better. (Train for one iteration.)"
        path = self._select(node)
        leaf = path[-1]
        self._expand(leaf)
        reward = self._simulate(leaf)
        self._backpropagate(path, reward)

    def _select(self, node):
        "Find an unexplored descendent of `node`"
        path = []
        while True:
            path.append(node)
            if node not in self.children or not self.children[node]:
                # node is either unexplored or terminal
                return path
            unexplored = self.children[node] - self.children.keys()
            if unexplored:
                n = unexplored.pop()
                path.append(n)
                return path
            node = self._uct_select(node)  # descend a layer deeper

    def _expand(self, node):
        "Update the `children` dict with the children of `node`"
        if node in self.children:
            return  # already expanded
        self.children[node] = node.find_children()

    def _simulate(self, node):
        "Returns the reward for a random simulation (to completion) of `node`"
        invert_reward = True
        while True:
            if node.is_terminal():
                reward = node.reward()
                return 1 - reward if invert_reward else reward
            node = node.find_random_child()
            invert_reward = not invert_reward

    def _backpropagate(self, path, reward):
        "Send the reward back up to the ancestors of the leaf"
        for node in reversed(path):
            self.N[node] += 1
            self.Q[node] += reward
            reward = 1 - reward  # 1 for me is 0 for my enemy, and vice versa

    def _uct_select(self, node):
        "Select a child of node, balancing exploration & exploitation"

        # All children of node should already be expanded:
        assert all(n in self.children for n in self.children[node])

        log_N_vertex = math.log(self.N[node])

        def uct(n):
            "Upper confidence bound for trees"
            return self.Q[n] / self.N[n] + self.exploration_weight * math.sqrt(
                log_N_vertex / self.N[n]
            )

        return max(self.children[node], key=uct)


class Node(ABC):
    """
    A representation of a single board state.
    MCTS works by constructing a tree of these Nodes.
    Could be e.g. a chess or checkers board state.
    """

    @abstractmethod
    def find_children(self):
        "All possible successors of this board state"
        return set()

    @abstractmethod
    def find_random_child(self):
        "Random successor of this board state (for more efficient simulation)"
        return None

    @abstractmethod
    def is_terminal(self):
        "Returns True if the node has no children"
        return True

    @abstractmethod
    def reward(self):
        "Assumes `self` is terminal node. 1=win, 0=loss, .5=tie, etc"
        return 0

    @abstractmethod
    def __hash__(self):
        "Nodes must be hashable"
        return 123456789

    @abstractmethod
    def __eq__(node1, node2):
        "Nodes must be comparable"
        return True

In [None]:
class WorldModel(_TTTB, Node):
    
    self.r_model = Sequential()
    self.r_model.add(Input(shape=(43+20,)))
    self.r_model.add(Dense(512, activation='relu'))
    self.r_model.add(Dense(512, activation='relu'))
    self.r_model.add(Dense(512, activation='relu'))
    self.r_model.add(Dense(6, activation='softmax'))
    #self.r_model.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(), metrics=[tf.keras.metrics.CategoricalAccuracy()])
    self.r_model.load_weights('RewardModel')

    input_ = Input(shape=(43+20,))
    x = Dense(512, activation='relu')(input_)
    x = Dense(512, activation='relu')(x)
    outs = []
    for i in range(NUM_NODES):
        for n in NODE_CLASSES:
            x_ = Dense(128, activation='relu')(x)
            outs.append(Dense(n, activation='softmax', name=str(i)+str(n))(x_))

    self.ns_model_multi_model = Model(input_, outs)
    self.ns_model_multi_model.load_weights('AfterStateModel_MulitLabel')

    self.reward_map = np.load('reward_map.npy', allow_pickle=True).item()
    
    def find_children(env):
        if board.terminal:  # If the game is finished then no moves can be made
            return set()
        # Otherwise, you can make a move in each of the empty spots
        return {
            env.take_action(i) for i, value in enumerate(20) if value is None
        }

    def find_random_child(board):
        if board.terminal:
            return None  # If the game is finished then no moves can be made
        empty_spots = [i for i, value in enumerate(board.tup) if value is None]
        return board.make_move(choice(empty_spots))

    def reward(board):
        if not board.terminal:
            raise RuntimeError(f"reward called on nonterminal board {board}")
        if board.winner is board.turn:
            # It's your turn and you've already won. Should be impossible.
            raise RuntimeError(f"reward called on unreachable board {board}")
        if board.turn is (not board.winner):
            return 0  # Your opponent has just won. Bad.
        if board.winner is None:
            return 0.5  # Board is a tie
        # The winner is neither True, False, nor None
        raise RuntimeError(f"board has unknown winner type {board.winner}")

    def is_terminal(board):
        return board.terminal

    def take_action(board, index):
        tup = board.tup[:index] + (board.turn,) + board.tup[index + 1 :]
        turn = not board.turn
        winner = _find_winner(tup)
        is_terminal = (winner is not None) or not any(v is None for v in tup)
        return TicTacToeBoard(tup, turn, winner, is_terminal)

    def to_pretty_string(board):
        to_char = lambda v: ("X" if v is True else ("O" if v is False else " "))
        rows = [
            [to_char(board.tup[3 * row + col]) for col in range(3)] for row in range(3)
        ]
        return (
            "\n  1 2 3\n"
            + "\n".join(str(i + 1) + " " + " ".join(row) for i, row in enumerate(rows))
            + "\n"
        )

In [None]:
def solve():
    tree = MCTS()
    env = WorldModel()
    while True:
        for _ in range(50):
            tree.do_rollout(board)
        board = tree.choose(board)
        print(board.to_pretty_string())
        if board.terminal:
            break