In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn import preprocessing
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)

In [164]:
np.random.seed(1)
tf.set_random_seed(1)

class SumTree(object):
    """
    This SumTree code is modified version and the original code is from: 
    https://github.com/jaara/AI-blog/blob/master/SumTree.py
    Story the data with it priority in tree and data frameworks.
    """
    data_pointer = 0

    def __init__(self, capacity):
        self.capacity = capacity  # for all priority values
        self.tree = np.zeros(2 * capacity - 1)
        # [--------------Parent nodes-------------][-------leaves to recode priority-------]
        #             size: capacity - 1                       size: capacity
        self.data = np.zeros(capacity, dtype=object)  # for all transitions
        # [--------------data frame-------------]
        #             size: capacity

    def add(self, p, data):
        tree_idx = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data  # update data_frame
        self.update(tree_idx, p)  # update tree_frame

        self.data_pointer += 1
        if self.data_pointer >= self.capacity:  # replace when exceed the capacity
            self.data_pointer = 0

    def update(self, tree_idx, p):
        change = p - self.tree[tree_idx]
        self.tree[tree_idx] = p
        # then propagate the change through tree
        while tree_idx != 0:    # this method is faster than the recursive loop in the reference code
            tree_idx = (tree_idx - 1) // 2
            self.tree[tree_idx] += change

    def get_leaf(self, v):
        """
        Tree structure and array storage:
        Tree index:
             0         -> storing priority sum
            / \
          1     2
         / \   / \
        3   4 5   6    -> storing priority for transitions
        Array type for storing:
        [0,1,2,3,4,5,6]
        """
        parent_idx = 0
        while True:     # the while loop is faster than the method in the reference code
            cl_idx = 2 * parent_idx + 1         # this leaf's left and right kids
            cr_idx = cl_idx + 1
            if cl_idx >= len(self.tree):        # reach bottom, end search
                leaf_idx = parent_idx
                break
            else:       # downward search, always search for a higher priority node
                if v <= self.tree[cl_idx]:
                    parent_idx = cl_idx
                else:
                    v -= self.tree[cl_idx]
                    parent_idx = cr_idx

        data_idx = leaf_idx - self.capacity + 1
        return leaf_idx, self.tree[leaf_idx], self.data[data_idx]

    @property
    def total_p(self):
        return self.tree[0]  # the root


class Memory(object):  # stored as ( s, a, r, s_ ) in SumTree
    """
    This SumTree code is modified version and the original code is from:
    https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
    """
    epsilon = 0.01  # small amount to avoid zero priority
    alpha = 0.6  # [0~1] convert the importance of TD error to priority
    beta = 0.4  # importance-sampling, from initial value increasing to 1
    beta_increment_per_sampling = 0.001
    abs_err_upper = 1.  # clipped abs error

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, transition):
        max_p = np.max(self.tree.tree[-self.tree.capacity:])
        if max_p == 0:
            max_p = self.abs_err_upper
        self.tree.add(max_p, transition)   # set the max p for new p

    def sample(self, n):
        b_idx, b_memory, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, self.tree.data[0].size)), np.empty((n, 1))
        pri_seg = self.tree.total_p / n       # priority segment
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])  # max = 1

        min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p     # for later calculate ISweight
        for i in range(n):
            a, b = pri_seg * i, pri_seg * (i + 1)
            v = np.random.uniform(a, b)
            idx, p, data = self.tree.get_leaf(v)
            prob = p / self.tree.total_p
            ISWeights[i, 0] = np.power(prob/min_prob, -self.beta)
            b_idx[i], b_memory[i, :] = idx, data
        return b_idx, b_memory, ISWeights

    def batch_update(self, tree_idx, abs_errors):
        abs_errors += self.epsilon  # convert to abs and avoid 0
        clipped_errors = np.minimum(abs_errors, self.abs_err_upper)
        ps = np.power(clipped_errors, self.alpha)
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)

In [862]:
class UniversalACN:
    def __init__(
            self,
            n_actions,
            n_features,
            n_weights,
            n_skills,
            learning_rate=0.0005,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=1,
            memory_size=3000,
            batch_size=1,
            e_greedy_increment=None,
            output_graph=False,
            double_q=False,
            dueling=False,
            a2c=False,
            prioritized=False,
            sess=None       
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.n_l1 = n_weights
        self.n_skills = n_skills
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.double_q = double_q    # decide to use double q or not
        self.dueling = dueling # dueling DQN
        self.a2c = a2c # advantage actor critic
        self.prioritized = prioritized    # decide to use double q or not

        self.learn_step_counter = 0
        
        # build NN layers
        self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        if self.prioritized:
            self.memory = Memory(capacity=memory_size)
        else:
            self.memory = np.zeros((self.memory_size, n_features*2+2))
            
        if sess is None:
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess
        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)
        self.cost_his = []
        self.disc_cost_his = []

    def _build_net(self):
        def build_discriminator(s, c_names, n_l1, w_initializer, b_initializer):
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(s, w1) + b1)                              
                    
            with tf.variable_scope('Discriminator'):                    
                w2 = tf.get_variable('w2', [n_l1, self.n_skills], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_skills], initializer=b_initializer, collections=c_names)
                out = tf.matmul(l1, w2) + b2                         
                
            return out

        def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(s, w1) + b1)                              
                                
            if self.dueling: # Dueling DQN
                with tf.variable_scope('Value'):
                    w2 = tf.get_variable('w2', [n_l1, 1], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, 1], initializer=b_initializer, collections=c_names)
                    self.V = tf.matmul(l1, w2) + b2

                with tf.variable_scope('Advantage'):
                    w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                    self.A = tf.matmul(l1, w2) + b2           
                    
                with tf.variable_scope('Q'):     # Q = V(s) + A(s,a)                    
                    out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True)) 

            elif self.a2c: # A2C
                with tf.variable_scope('Critic'):
                    w2 = tf.get_variable('w2', [n_l1, 1], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, 1], initializer=b_initializer, collections=c_names)
                    self.V = tf.matmul(l1, w2) + b2

                with tf.variable_scope('Actor'):
                    w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                    self.A = tf.nn.softmax(tf.matmul(l1, w2) + b2)
                                                            
                    out = (self.V, self.A)
                                        
            else:
                with tf.variable_scope('Q'):
                    w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                    out = tf.matmul(l1, w2) + b2

            return out
        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_action = tf.placeholder(tf.int32, [None], name='Q_action')  # action by agent
        self.q_skill = tf.placeholder(tf.int32, [None], name='Q_skill')  # for z skill
        _p_z = np.full(self.n_skills, 1.0 / self.n_skills)
        
        if self.a2c:
            self.q_target = tf.placeholder(tf.float32, [None, 1], name='Q_target')  # for calculating loss                    
        else:
            self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss                    
        
        if self.prioritized:
            self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
            
        with tf.variable_scope('disc_net'):
            c_names, n_l1, w_initializer, b_initializer = \
                ['disc_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
                tf.random_normal_initializer(0., 1), tf.constant_initializer(0.1)  # config of layers
            
            self.disc = build_discriminator(self.s, c_names, self.n_l1, w_initializer, b_initializer)   
            self.disc_z = tf.one_hot(self.q_skill, depth=self.n_skills)            
            self.p_z = tf.reduce_sum(_p_z * self.disc_z, axis=1)
            self.disc_reward = tf.nn.softmax_cross_entropy_with_logits(labels=self.disc_z, logits=self.disc) - self.p_z
            
        with tf.variable_scope('eval_net'):
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
                tf.random_normal_initializer(0., 1), tf.constant_initializer(0.1)  # config of layers
                        
            self.q_eval = build_layers(self.s, c_names, self.n_l1, w_initializer, b_initializer)            
            
            if self.a2c:
                self.action_probs = self.q_eval[1]
                self.q_eval = self.q_eval[0]

        with tf.variable_scope('loss'):
            self.advantage = self.q_target - self.q_eval            
            
            if self.a2c:                                
                self.entropy_loss = -tf.reduce_sum(self.action_probs * tf.log(self.action_probs + 1e-10), axis=1, keepdims=True)
                self.policy_loss = -tf.log(tf.reduce_max(self.action_probs * 
                                                     tf.squeeze(tf.one_hot(self.q_action, depth=self.n_actions)), 
                                                     axis=1, keepdims=True) + 1e-10) * self.advantage

                self.loss1 = self.policy_loss + 0.5 * self.advantage - 0.01 * self.entropy_loss
            else:
                self.loss1 = self.advantage
            
            if self.prioritized:
                self.abs_errors = tf.reduce_sum(tf.abs(self.advantage), axis=1)    # for updating Sumtree
                self.loss = tf.reduce_mean(self.ISWeights * self.loss1)
            else:            
                self.loss = tf.reduce_mean(self.loss1)
                
            self.disc_loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=self.disc_z, logits=self.disc))
            
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
            self._train_disc_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.disc_loss)

        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')    # input
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            self.q_next = build_layers(self.s_, c_names, self.n_l1, w_initializer, b_initializer)
            if self.a2c:
                self.q_next = self.q_next[0]

            
    def reset_store(self):
        if self.prioritized:
            self.memory = Memory(capacity=self.memory_size)
        else:
            self.memory = np.zeros((self.memory_size, self.n_features*2+2))
        
        self.memory_counter = 0
        
    def store_transition(self, s, a, r, s_):
        if self.prioritized:    # prioritized replay
            transition = np.hstack((s, [a, r], s_))
            self.memory.store(transition)    # have high priority for newly arrived transition
        else:
            if not hasattr(self, 'memory_counter'):
                self.memory_counter = 0
            transition = np.hstack((s, [a, r], s_))
            index = self.memory_counter % self.memory_size
            self.memory[index, :] = transition
            self.memory_counter += 1

    def choose_action(self, observation):
        observation = observation[np.newaxis, :]
        if self.a2c:            
            actions = self.sess.run(self.action_probs, feed_dict={self.s: observation})
            action = np.random.choice(self.n_actions, p=actions[0])
        else:
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)

            if not hasattr(self, 'q'):  # record action value it gets
                self.q = []
                self.running_q = 0
            self.running_q = self.running_q*0.99 + 0.01 * np.max(actions_value)
            self.q.append(self.running_q)

            if np.random.uniform() > self.epsilon:  # choosing random action
                action = np.random.randint(0, self.n_actions)

        return action

    def learn(self):
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            #print('\ntarget_params_replaced\n')

        if self.prioritized:
            tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
        else:
            if self.memory_counter > self.memory_size:
                sample_index = np.random.choice(self.memory_size, size=self.batch_size)
            else:
                sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
            batch_memory = self.memory[sample_index, :]

        q_next, q_eval4next = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={self.s_: batch_memory[:, -self.n_features:],    # next observation
                       self.s: batch_memory[:, -self.n_features:]})    # next observation
        
        q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})        
        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        
        self.skills = batch_memory[:, self.n_features + 1]                   
        self.reward = self.sess.run(self.disc_reward, feed_dict={
                                            self.s: batch_memory[:, :self.n_features], 
                                            self.q_skill:self.skills})
        
        if self.double_q:
            max_act4next = np.argmax(q_eval4next, axis=1)        # the action that brings the highest value is evaluated by q_eval
            selected_q_next = q_next[batch_index, max_act4next]  # Double DQN, select q_next depending on above actions
        else:
            selected_q_next = np.max(q_next, axis=1)    # the natural DQN

        if self.a2c:
            q_target[batch_index, 0] = self.reward + self.gamma * selected_q_next
        else:
            q_target[batch_index, eval_act_index] = self.reward + self.gamma * selected_q_next
        
        if self.prioritized: # use prioritized replay
            _, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss],
                                         feed_dict={self.s: batch_memory[:, :self.n_features],
                                                    self.q_target: q_target,
                                                    self.q_action: eval_act_index,
                                                    self.ISWeights: ISWeights})
            self.memory.batch_update(tree_idx, abs_errors)     # update priority        
        else:
            _, self.cost, _, self.disc_cost = self.sess.run([self._train_op, 
                                                             self.loss, 
                                                             self._train_disc_op, 
                                                             self.disc_loss],
                                        feed_dict={self.s: batch_memory[:, :self.n_features],
                                                   self.q_action: eval_act_index,
                                                   self.q_target: q_target,
                                                   self.q_skill:self.skills})

        self.cost_his.append(self.cost)
        self.disc_cost_his.append(self.disc_cost)

        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1
        
    def reward_info(self, obs, skill):
        return self.sess.run([self.disc_reward], 
                             feed_dict={self.s: obs, self.q_skill:skill})


In [863]:
tf.reset_default_graph()
rl = UniversalACN(n_actions=1,
                  n_features=3,
                  n_weights=7,
                  n_skills=2, 
                  batch_size=3)

In [881]:
observation = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4], [5, 5, 5], [6, 6, 6]])

rl.reset_store()
rl.store_transition(observation[0], rl.choose_action(observation[0]), 0, observation[1])
rl.store_transition(observation[1], rl.choose_action(observation[1]), 0, observation[2])
rl.store_transition(observation[2], rl.choose_action(observation[2]), 0, observation[1])
rl.learn()
print("cost=", rl.cost, "disc_cost=", rl.disc_cost)

rl.reset_store()
rl.store_transition(observation[3], rl.choose_action(observation[3]), 1, observation[4])
rl.store_transition(observation[4], rl.choose_action(observation[4]), 1, observation[5])
rl.store_transition(observation[5], rl.choose_action(observation[5]), 1, observation[4])
rl.learn()
print("cost=", rl.cost, "disc_cost=", rl.disc_cost)

rl.reset_store()
rl.store_transition(observation[0], rl.choose_action(observation[0]), 1, observation[4])
rl.store_transition(observation[4], rl.choose_action(observation[4]), 1, observation[1])
rl.store_transition(observation[1], rl.choose_action(observation[1]), 1, observation[2])
rl.learn()
print("cost=", rl.cost, "disc_cost=", rl.disc_cost)

rl.reset_store()
rl.store_transition(observation[5], rl.choose_action(observation[0]), 0, observation[1])
rl.store_transition(observation[1], rl.choose_action(observation[1]), 0, observation[2])
rl.store_transition(observation[2], rl.choose_action(observation[2]), 0, observation[1])
rl.learn()
print("cost=", rl.cost, "disc_cost=", rl.disc_cost)

print("skill=0 >>> ", rl.reward_info(observation, np.repeat(0, len(observation))))
print("skill=1 >>> ", rl.reward_info(observation, np.repeat(1, len(observation))))


cost= -0.76721114 disc_cost= 0.024358863
cost= 16.17832 disc_cost= 16.631063
cost= 4.963382 disc_cost= 6.0382285
cost= -0.09931046 disc_cost= 2.2649509e-05
skill=0 >>>  [array([-0.47505087, -0.49924487, -0.49997735, -0.49999928, -0.5       ,
       -0.5       ], dtype=float32)]
skill=1 >>>  [array([ 3.2033668,  6.6889606, 10.198015 , 13.70778  , 17.217566 ,
       20.727352 ], dtype=float32)]


In [87]:
rl.learn()

print("action:", rl.choose_action(observation[1]))

#l, l1, a1, p1, e1 = rl.sess.run([rl.loss, rl.loss1, rl.advantage, rl.policy_loss, rl.entropy_loss],
#                      feed_dict={rl.q_target: qe2, rl.s:observation, rl.q_action:[0, 2, 0]})

#print("loss: ", l)
#print("advantage: ", np.mean(a1))
#print("policy loss: ", np.mean(p1))
#print("entropy loss: ", np.mean(e1))


target_params_replaced

action: 0
