In [1]:
import gym
from gym import spaces
import numpy as np
import tensorflow as tf
from tensorflow import keras

from collections import deque
import numpy as np

chain_mdp

In [2]:
class ChainMDP(gym.Env):
    """Chain MDP
    The environment consists of a chain of N states and the agent always starts in state s2,
    from where it can either move left or right.
    In state s1, the agent receives a small reward of r = 0.001 by moving left.
    A larger reward r = 1 is recived when moving right from state sN.
    This environment is described in
    Deep Exploration via Bootstrapped DQN(https://papers.nips.cc/paper/6501-deep-exploration-via-bootstrapped-dqn.pdf)
    """
    def __init__(self, n):
        self.n = n
        self.state = 1  # start at s2
        self.action_space = spaces.Discrete(2)  # {0, 1}
        self.observation_space = spaces.Discrete(self.n)  # {0, 1, ... n-1}
        self.max_nsteps = n + 8

    def step(self, action):
        assert self.action_space.contains(action)  # assert 뒤의 조건이 True가 아니면 AssertError 발생
        v = np.arange(self.n)  # [0, 1, ... n-1]
        reward = lambda s, a: 1.0 if (s == (self.n - 1) and a == 1) else (0.001 if (s == 0 and a == 0) else 0)
        is_done = lambda nsteps: nsteps >= self.max_nsteps  # True/False

        r = reward(self.state, action)
        if action:    # right
            if self.state != self.n - 1:
                self.state += 1
        else:   # left
            if self.state != 0:
                self.state -= 1
        self.nsteps += 1
        return (v <= self.state).astype('float32'), r, is_done(self.nsteps), None

    def reset(self):
        v = np.arange(self.n)
        self.state = 1
        self.nsteps = 0
        return (v <= self.state).astype('float32')

agent _chainMDP

In [3]:
class Qfunction(keras.Model):
    
    def __init__(self, obssize, actsize, hidden_dims):
        """
        obssize: dimension of state space
        actsize: dimension of action space
        hidden_dims: list containing output dimension of hidden layers 
        """
        super(Qfunction, self).__init__()

        # Layer weight initializer
        initializer = keras.initializers.RandomUniform(minval=-1., maxval=1.)

        # Input Layer
        self.input_layer = keras.layers.InputLayer(input_shape=(obssize,))
        
        # Hidden Layer
        self.hidden_layers = []
        for hidden_dim in hidden_dims:
            # TODO: define each hidden layers
            layer = keras.layers.Dense(hidden_dim, activation='relu',
                                      kernel_initializer=initializer)
            self.hidden_layers.append(layer) 
        
        # Output Layer : 
        # TODO: Define the output layer.
        self.output_layer = keras.layers.Dense(actsize)

    @tf.function
    def call(self, states):
        ########################################################################
        # TODO: You SHOULD implement the model's forward pass
        x = self.input_layer(states)
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
        return self.output_layer(x)
        ########################################################################

# Wrapper class for training Qfunction and updating weights (target network) 

class DQN(object):
    
    def __init__(self, obssize, actsize, hidden_dims, optimizer):
        """
        obssize: dimension of state space
        actsize: dimension of action space
        optimizer: 
        """
        self.qfunction = Qfunction(obssize, actsize, hidden_dims)
        self.optimizer = optimizer
        self.obssize = obssize
        self.actsize = actsize

    def _predict_q(self, states, actions):
        """
        states represent s_t
        actions represent a_t
        """
        ########################################################################
        # TODO: Define the logic for calculate  Q_\theta(s,a)
        q = []
        for j in range(len(actions)):
            q.append(self.qfunction(states)[j][actions[j]])
        return tf.convert_to_tensor(q, dtype=tf.float32)
        ########################################################################
        

    def _loss(self, Qpreds, targets):
        """
        Qpreds represent Q_\theta(s,a)
        targets represent the terms E[r+gamma Q] in Bellman equations
        This function is OBJECTIVE function
        """
        l = tf.math.reduce_mean(tf.square(Qpreds - targets))
        return l

    
    def compute_Qvalues(self, states):
        """
        states: numpy array as input to the neural net, states should have
        size [numsamples, obssize], where numsamples is the number of samples
        output: Q values for these states. The output should have size 
        [numsamples, actsize] as numpy array
        """
        inputs = np.atleast_2d(states.astype('float32'))
        return self.qfunction(inputs)


    def train(self, states, actions, targets):
        """
        states: numpy array as input to compute loss (s)
        actions: numpy array as input to compute loss (a)
        targets: numpy array as input to compute loss (Q targets)
        """
        with tf.GradientTape() as tape:
            Qpreds = self._predict_q(states, actions)
            loss = self._loss(Qpreds, targets)
        variables = self.qfunction.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        return loss

    def update_weights(self, from_network):
        """
        We need a subroutine to update target network 
        i.e. to copy from principal network to target network. 
        This function is for copying  theta -> theta target 
        """
        
        from_var = from_network.qfunction.trainable_variables
        to_var = self.qfunction.trainable_variables
        
        # soft assign
        for v1, v2 in zip(from_var, to_var):
            v2.assign(0.8*v1+0.2*v2)

# Implement replay buffer
class ReplayBuffer(object):
    
    def __init__(self, maxlength):
        """
        maxlength: max number of tuples to store in the buffer
        if there are more tuples than maxlength, pop out the oldest tuples
        """
        self.buffer = deque()
        self.number = 0
        self.maxlength = maxlength
    
    def append(self, experience):
        """
        this function implements appending new experience tuple
        experience: a tuple of the form (s,a,r,s^\prime)
        """
        self.buffer.append(experience)
        self.number += 1
        if(self.number > self.maxlength):
            self.pop()
        
    def pop(self):
        """
        pop out the oldest tuples if self.number > self.maxlength
        """
        while self.number > self.maxlength:
            self.buffer.popleft()
            self.number -= 1
    
    def sample(self, batchsize):
        """
        this function samples 'batchsize' experience tuples
        batchsize: size of the minibatch to be sampled
        return: a list of tuples of form (s,a,r,s^\prime)
        """
        inds = np.random.choice(len(self.buffer), batchsize, replace=False)
        return [self.buffer[idx] for idx in inds]

In [4]:
# class agent():
    
#     def __init__(self):
        
#         return
    
#     def action(self):
        
#         return 1

In [5]:
### DQN implementation ###
class agent():
    
    def __init__(self, e, head_num):

        self.env = e
        self.state = self.env.reset()

        ### For Q value training ###        
        self.episode_length = 100 #10000
        self.hidden_dim = [8, 4]
        self.lr = 5e-4
        
        self.bernoulli_prob = 0.9
        self.ensemble_num = head_num
        
        self.Qprin = DQN(self.env.n, self.env.action_space.n, self.hidden_dim, optimizer = keras.optimizers.Adam(learning_rate=self.lr))
        self.Qtarg = DQN(self.env.n, self.env.action_space.n, self.hidden_dim, optimizer = keras.optimizers.Adam(learning_rate=self.lr))
        self.Qs = []
        for _ in range(self.ensemble_num):
            self.Qs.append([self.Qprin, self.Qtarg])
        ############################

        return
    
    def action(self, s):
        self.s = s
        voting_paper = np.zeros(self.env.action_space.n)
        
        for n in range(self.ensemble_num):
            Q = self.Qs[n][0].compute_Qvalues(np.array(self.s))
            action = np.argmax(Q)   # always max action choose
            # voting_paper[action] += 1
            voting_paper[action] += Q[0][action] - np.mean(Q[0])
        

        return np.argmax(voting_paper)

    def train(self):

        ### For Q value training ###
        totalstep = 0
        initialize = 500
        eps = 1; eps_minus = 1e-4
        tau = 100
        gamma = 0.99
        batchsize = 64
        buff_max_size = 10000
        buffer = ReplayBuffer(buff_max_size)
        ############################

        self.r_record = []
        self.AUC = []

        for ite in range(self.episode_length):
            self.state = self.env.reset()
            done = False
            rsum = 0

            # Action :
            # - train : head fixed for each epoch
            # - eval : vote
            head4action_train = np.random.randint(0, self.ensemble_num)
            
            while not done:
                totalstep += 1

                if eps > 0.05 and totalstep > initialize: eps -= eps_minus
                elif eps < 0.05 and totalstep > initialize: eps = 0.05

                ##################
                ### Get Action ###
                ##################
                if np.random.rand() < eps or totalstep <= initialize:
                    action = np.random.choice([0, 1])
                else:
                    Q = self.Qs[head4action_train][0].compute_Qvalues(np.array(self.state)) # Qprin
                    action = np.argmax(Q)   # always max action choose
                ##################

                ##################
                ###  ONE STEP  ###
                ##################
                curr_state = self.state
                next_state, reward, done, _ = self.env.step(action)
                rsum += reward
                ##################
                
                # === ensemble DQN === #
                heads = np.random.binomial(1, self.bernoulli_prob, self.ensemble_num)
                if np.sum(heads) == 0:
                    heads[np.random.randint(0, self.ensemble_num)] = 1
                # ==================== #
                
                #####################
                ### Update Buffer ###
                #####################
                buffer.append((curr_state, action, reward, next_state, done, heads))
                #####################

                #############################
                ### N Samples from Buffer ###
                ###         and           ###
                ### Update theta of Qprin ###
                #############################
                if totalstep > initialize:

                    # sample
                    s = buffer.sample(batchsize)

                    d = []
                    for j in range(len(s)): # for each s[j]s
                        ## if head 0 : append 0 at K, head 1 : append value of k to K
                        ## iterate for all samples
                        K=[]
                        cS = s[j][0]; A = s[j][1]; R = s[j][2]; nS = s[j][3]; DONE = s[j][4]; heads = s[j][5];
                        if not DONE:
                            for n in range(len(heads)):
                                if heads[n] == 0 : k = 0
                                else: k = R + gamma*np.max(self.Qs[n][1].compute_Qvalues(nS)) #Qtarg_n
                                K.append(k)
                        elif DONE:
                            for n in range(len(heads)):
                                if heads[n] == 0 : k = 0
                                else: k = R
                                K.append(k)
                        d.append(K)
                    
                    # update Qprins
                    for n in range(self.ensemble_num):
                        set_of_S = np.array([s[x][0] for x in range(len(s)) if s[x][5][n] == 1])
                        set_of_A = np.array([s[x][1] for x in range(len(s)) if s[x][5][n] == 1])
                        D = [d[x][n] for x in range(len(s)) if s[x][5][n] == 1]
                        
                        self.Qs[n][0].train(set_of_S, set_of_A, tf.convert_to_tensor(D, dtype=tf.float32))  # Qprin
                #############################


                #############################
                ### Update theta of Qtarg ###
                #############################
                if totalstep % tau == 0:
#                     print("")
#                     print("epsilon : ", eps)
#                     print("target updated, totalstep : ", totalstep)
                    for n in range(self.ensemble_num):
                        self.Qs[n][1].update_weights(self.Qs[n][0])

                #############################

                pass
            

            self.r_record.append(rsum)
#             if ite % 10 == 0:
#                 print('iteration {} ave reward {}'.format(ite, np.mean(self.r_record[-10:])))
            
            #########################
            ### Sample Efficiency ###
            #########################
            done = False
            cum_reward = 0.0
            s = self.env.reset()
            while not done:
                
                voting_paper = np.zeros(self.env.action_space.n)

                for n in range(self.ensemble_num):
                    Q = self.Qs[n][0].compute_Qvalues(np.array(s))
                    action = np.argmax(Q)   # always max action choose
                    # voting_paper[action] += 1
                    voting_paper[action] += Q[0][action] - np.mean(Q[0])
                    
                action = np.argmax(voting_paper)
                ns, reward, done, _ = env.step(action)
                cum_reward += reward
                s = ns
            self.AUC.append(cum_reward)

        return self.AUC

chain_test

In [6]:
def seed_everything(seed): # seed 고정
    np.random.seed(seed)
#    random.seed(seed)
#    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)

head_num = 5
seeds = 20
for n in [10, 50, 100]:
    print("chain length:", n)
    agents = []
    
    for seed in range(seeds):
        seed_everything(seed)
        
        env = ChainMDP(n)
        s = env.reset()
        agents.append(agent(env, head_num))
        AUC = agents[seed].train() #iteration: 100
        print("seed:", seed)
        print("AUC: ", np.sum(AUC))

chain length: 10
seed: 0
AUC:  0.0
seed: 1
AUC:  0.5100000000000002
seed: 2
AUC:  0.5950000000000003
seed: 3
AUC:  1000.0
seed: 4
AUC:  1000.0
seed: 5
AUC:  560.0
seed: 6
AUC:  890.0
seed: 7
AUC:  1.2070000000000003
seed: 8
AUC:  910.0
seed: 9
AUC:  0.4930000000000002
seed: 10
AUC:  190.11899999999997
seed: 11
AUC:  0.5100000000000002
seed: 12
AUC:  0.017000000000000008
seed: 13
AUC:  590.0
seed: 14
AUC:  411.00300000000016
seed: 15
AUC:  950.0
seed: 16
AUC:  150.204
seed: 17
AUC:  280.0
seed: 18
AUC:  480.0
seed: 19
AUC:  300.0
chain length: 50
seed: 0
AUC:  0.0
seed: 1
AUC:  0.5130000000000003
seed: 2
AUC:  0.6840000000000005
seed: 3
AUC:  1000.0
seed: 4
AUC:  980.0
seed: 5
AUC:  0.0
seed: 6
AUC:  1000.0
seed: 7
AUC:  4.845000000000004
seed: 8
AUC:  850.0
seed: 9
AUC:  0.5130000000000003
seed: 10
AUC:  770.171
seed: 11
AUC:  0.7980000000000006
seed: 12
AUC:  0.0
seed: 13
AUC:  640.0
seed: 14
AUC:  150.0
seed: 15
AUC:  890.0
seed: 16
AUC:  790.2280000000001
seed: 17
AUC:  0.0
seed: 18

StagingError: in user code:

    <ipython-input-3-65a570de21a5>:35 call  *
        x = hidden_layer(x)
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:1012 __call__  **
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\keras\layers\core.py:1207 call
        return core_ops.dense(
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\keras\layers\ops\core.py:53 dense
        outputs = gen_math_ops.mat_mul(inputs, kernel)
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py:5547 mat_mul
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py:517 _apply_op_helper
        values = ops.convert_to_tensor(
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\profiler\trace.py:163 wrapped
        return func(*args, **kwargs)
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py:1540 convert_to_tensor
        ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py:1992 _dense_var_to_tensor
        return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py:1393 _dense_var_to_tensor
        return self.value()
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py:565 value
        return self._read_variable_op()
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py:672 _read_variable_op
        result = read_and_set_handle()
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py:662 read_and_set_handle
        result = gen_resource_variable_ops.read_variable_op(
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\ops\gen_resource_variable_ops.py:483 read_variable_op
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py:517 _apply_op_helper
        values = ops.convert_to_tensor(
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\profiler\trace.py:163 wrapped
        return func(*args, **kwargs)
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py:1501 convert_to_tensor
        return graph.capture(value, name=name)
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\framework\func_graph.py:626 capture
        return self._capture_helper(tensor, name, shape)
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\framework\func_graph.py:658 _capture_helper
        tape.record_operation("captured_value", [placeholder], [tensor],
    C:\Users\seongmin\anaconda3\lib\site-packages\tensorflow\python\eager\tape.py:184 record_operation
        pywrap_tfe.TFE_Py_TapeSetRecordOperation(op_type, output_tensors,

    OverflowError: Python int too large to convert to C long


single

In [None]:
# chain length
n = 10
head_num = 5

In [None]:
# from chain_mdp import ChainMDP
# from agent_chainMDP import agent


# recieve 1 at rightmost stae and recieve small reward at leftmost state
env = ChainMDP(n)
s = env.reset()

""" Your agent"""
agent = agent(env, head_num)     # agent call
agent.train()          # train policy of the agent

In [None]:
##### eval code #####
done = False
cum_reward = 0.0
# always move right left: 0, right: 1

env = ChainMDP(n)
s = env.reset()

while not done:
    action = agent.action(s)
    print("state: ", s)
    print("action: ", action)
    ns, reward, done, _ = env.step(action)
    cum_reward += reward
    s = ns
    
print(f"total reward: {cum_reward}")