In [1]:
import gym
import keras.layers as layers
import keras.optimizers as optimizers
from keras.models import Model
from keras import backend as K
import tensorflow as tf

import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
class Value:
    def __init__(self,input_shape):
        self.input_shape = input_shape
        self.__make_network()
        self.__make_loss_function()
        
    def __make_network(self):
        input_layer = layers.Input(shape=(self.input_shape,))
        x = layers.Dense(256,activation = 'relu')(input_layer)
        #x = layers.Dense(256,activation = 'relu')(x)
        x = layers.Dense(1)(x)
        self.model = Model(inputs = input_layer, outputs = x)
    def get_value(self,state):
        return self.model.predict(state)
    
    def __make_loss_function(self):
        
        value_output = self.model.output
        reward_placeholder = K.placeholder(shape=(None,1),name = 'reward')
        #HUBER_DELTA = 0.5
        #loss = K.abs(reward_placeholder - value_output)
        #loss = K.switch(loss < HUBER_DELTA, 0.5 * loss ** 2 , HUBER_DELTA * (loss - 0.5 * HUBER_DELTA))
        #loss = K.sum(loss)
        loss = K.mean(K.square(reward_placeholder - value_output))
        
        optimizer = optimizers.Adam(learning_rate)
        update = optimizer.get_updates(loss =loss, params = self.model.trainable_weights)
        
        self.update_function = K.function(inputs = [self.model.input,\
                                                   reward_placeholder],\
                                         outputs = [] , updates = update)

In [3]:
class Actor:
    def __init__(self,input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        
        self.__make_network()
        self.__make_loss_function()
        
    def __make_network(self):
        input_layer = layers.Input(shape=(self.input_shape,))    
        x = layers.Dense(256, activation = 'relu')(input_layer)
        #x = layers.Dense(256, activation = 'relu')(x)
        x = layers.Dense(self.output_shape, activation='softmax')(x)
        self.model = Model(inputs = input_layer, outputs = x)
    
    def get_action(self,state):
        return self.model.predict(state)
    
    def __make_loss_function(self):
        before_action_prob = K.placeholder(shape = (None, 1),\
                                          name = 'before_action_prob')
        before_action = K.placeholder(shape = (None, 1),\
                                          name = 'before_action',dtype = 'int64') ########

        advantage = K.placeholder(shape = (None,1), name ='advantage')
        
        now_action_prob = self.model.output
        now_action_select = K.sum(tf.squeeze(tf.one_hot(before_action,depth=2),axis=1) * now_action_prob,axis=-1)
        now_action_select = K.reshape(now_action_select,(-1,1))
        
        ratio = (K.exp(K.log(now_action_select) - K.log(before_action_prob)))
        
        #return -K.mean(K.minimum(r * advantage, K.clip(r, min_value=1 - LOSS_CLIPPING, \
        #max_value=1 + LOSS_CLIPPING) * advantage) + ENTROPY_LOSS * -(prob * K.log(prob + 1e-10)))
        
        surr_1 = ratio * advantage
        surr_2 = K.clip(ratio, 1-eps_clip, 1+eps_clip) * advantage
        loss = - K.mean(K.minimum(surr_1,surr_2))
        optimizer = optimizers.Adam(lr = learning_rate)
        updates = optimizer.get_updates(loss = loss, params = self.model.trainable_weights)
        #[state,action,prob,advantage]
        self.update_function = K.function(inputs = [self.model.input,before_action,\
                                       before_action_prob,advantage],\
                            outputs = [ratio],\
                            updates = updates)
        

In [4]:
class Agent:
    def __init__(self, input_shape,output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        
        self.actor = Actor(input_shape,output_shape)
        self.value = Value(input_shape)
    
        self.memory = []
    def put_data(self,data):
        self.memory.append(data)
    
    def get_action(self,state):
        return self.actor.model.predict(state)
    def memory_to_trainable(self):
        state_list, action_list, reward_list, next_state_list, prob_list, done_list = [],\
        [], [], [], [], []
        
        for data in self.memory:
            state, action, reward, next_state, prob, done = data
            
            state_list.append(state)
            action_list.append([action])
            reward_list.append([reward])
            next_state_list.append(next_state)
            prob_list.append([prob])
            done = 0 if done else 1
            done_list.append([done])
        return np.array(state_list), np.array(action_list), np.array(reward_list),\
                np.array(next_state_list), np.array(prob_list), np.array(done_list)     
        
    def train(self):
        state,action,reward,next_state,prob,done_mask = self.memory_to_trainable()
        done_mask = done_mask.reshape(-1,1)
        #print('state',state.shape)
        #print('action',action.shape)
        #print('reward',reward.shape)
        #print('next_state',next_state.shape)
        #print('prob',prob.shape)
        #print('done_mask',done_mask.shape)
        for i in range(env_iteration_number):
            #print('train iterate : ',i)
            td_error = reward + gamma * self.value.get_value(next_state) * done_mask
            delta = np.array(td_error - self.value.get_value(state))
            advantage_list = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_list.append(advantage)
            advantage_list.reverse()
            advantage = np.array(advantage_list).reshape(-1,1)
            #print('advantage',advantage)
            (self.actor.update_function([state,action,prob,advantage]))
            self.value.update_function([state,advantage])
        self.memory = []

In [5]:
env = gym.make("CartPole-v1")
env_iteration_number = 3
learning_rate = 0.0005
lmbda =0.95
eps_clip = 0.1
epochs = 100
T_horizon = 20
gamma = 0.98

In [6]:
model = Agent(env.observation_space.shape[0],env.action_space.n)

In [13]:
check = 20
score = 0

for iterate in range(2000):
    #print(iterate)
    state = env.reset()
    done = False

    while not done:
        for t in range(T_horizon):
            action_prob = model.get_action(state.reshape(1,-1))

            action = np.random.choice([x for x in range(env.action_space.n)], p = action_prob[0])
            
            next_state, reward, done, info = env.step(action)
            model.put_data((state,action,reward/100.0, next_state,action_prob[0][action],done))
            state = next_state
            
            score += reward
            if done:
                break
        model.train()
    if (iterate % 20 == 0) & (iterate != 0) :
        print(iterate, " episode average : ", score / check) 
        score =0

20  episode average :  21.6
40  episode average :  29.6
60  episode average :  31.8
80  episode average :  33.65
100  episode average :  44.75
120  episode average :  63.9
140  episode average :  61.0
160  episode average :  147.85


KeyboardInterrupt: 

In [15]:
env.close()

In [14]:
###test
state = env.reset()
done = False
score = 0
while not done:
    env.render()
    action_prob = model.get_action(state.reshape(1,-1))
    action = np.random.choice([x for x in range(env.action_space.n)], p = action_prob[0])
    next_state, reward, done, info = env.step(action)
    print(action_prob[0],action,done)
    state = next_state     
    score += reward


[0.7360866  0.26391342] 0 False
[0.4725775 0.5274225] 1 False
[0.74555475 0.25444523] 0 False
[0.4822357 0.5177644] 0 False
[0.24546207 0.75453794] 1 False
[0.46971887 0.5302812 ] 0 False
[0.23232603 0.767674  ] 1 False
[0.4454279 0.5545721] 1 False
[0.7149244 0.2850756] 1 False
[0.87187034 0.12812963] 0 False
[0.7154518 0.2845482] 0 False
[0.4318987 0.5681012] 0 False
[0.20463656 0.7953635 ] 0 False
[0.08510637 0.9148937 ] 1 False
[0.16999276 0.8300072 ] 0 False
[0.06805086 0.9319491 ] 1 False
[0.13081755 0.8691824 ] 1 False
[0.25328594 0.74671406] 1 False
[0.46573368 0.5342663 ] 0 False
[0.20069928 0.7993007 ] 1 False
[0.3809905 0.6190095] 1 False
[0.62185293 0.37814713] 0 False
[0.31133687 0.6886631 ] 1 False
[0.547242 0.452758] 0 False
[0.24823552 0.75176454] 0 False
[0.09568042 0.9043196 ] 1 False
[0.18162803 0.81837195] 1 False
[0.35103127 0.6489687 ] 1 False
[0.5680176  0.43198237] 1 False
[0.74437106 0.2556289 ] 0 False
[0.5013852  0.49861476] 0 False
[0.22405942 0.77594054] 1 

In [24]:
test = Actor(env.observation_space.shape[0],env.action_space.n) #model = Agent()

In [26]:
now_action_select = K.sum(tf.one_hot(before_action,depth=2) * now_action_prob,axis=-1)

ratio = K.exp(K.log(now_action_select) - K.log(before_action_prob))

array([[0.5016776, 0.4983225]], dtype=float32)

In [7]:
test_index = K.constant([1,0,1,0,0],dtype = 'int64')
test_index = K.reshape(test_index,(-1,1))

In [8]:
test_action = K.constant([[0.1,0.2],[0.3,0.4],[0.5,0.6],[0.7,0.8],[0.9,1.0]])

In [11]:
test_index = K.constant([0,0,0,0,0],dtype = 'int64')
test_index = K.reshape(test_index,(-1,1))

<tf.Tensor 'one_hot_6:0' shape=(5, 1, 3) dtype=float32>

In [24]:
K.eval(K.sum(tf.squeeze(tf.one_hot(test_index,depth=2),axis=1)* test_action,axis=-1))

array([0.2, 0.3, 0.6, 0.7, 0.9], dtype=float32)

In [36]:
K.eval(test_index)

array([[0],
       [1],
       [0],
       [0],
       [1]], dtype=int64)