In [1]:
import gym
import keras.layers as layers
import keras.optimizers as optimizers
from keras.models import Model
from keras import backend as K
import tensorflow as tf

import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
class Value:
    def __init__(self,input_shape):
        self.input_shape = input_shape
        self.__make_network()
        self.__make_loss_function()
        
    def __make_network(self):
        input_layer = layers.Input(shape=(self.input_shape,))
        x = layers.Dense(256,activation = 'relu')(input_layer)
        x = layers.Dense(1)(x)
        self.model = Model(inputs = input_layer, outputs = x)
    def get_value(self,state):
        return self.model.predict(state)
    
    def __make_loss_function(self):
        HUBER_DELTA = 0.5
        value_output = self.model.output
        reward_placeholder = K.placeholder(shape=(None,1),name = 'reward')
        #loss = K.abs(reward_placeholder - value_output)
        #loss = K.switch(loss < HUBER_DELTA, 0.5 * loss ** 2 , HUBER_DELTA * (loss - 0.5 * HUBER_DELTA))
        #loss = K.sum(loss)
        loss = K.mean(K.square(reward_placeholder - value_output))
        
        optimizer = optimizers.Adam(learning_rate)
        update = optimizer.get_updates(loss =loss, params = self.model.trainable_weights)
        
        self.update_function = K.function(inputs = [self.model.input,\
                                                   reward_placeholder],\
                                         outputs = [] , updates = update)

In [8]:
class Actor:
    def __init__(self,input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        
        self.__make_network()
        self.__make_loss_function()
        
    def __make_network(self):
        input_layer = layers.Input(shape=(self.input_shape,))    
        x = layers.Dense(256, activation = 'relu')(input_layer)
        x = layers.Dense(self.output_shape, activation='softmax')(x)
        self.model = Model(inputs = input_layer, outputs = x)
    
    def get_action(self,state):
        return self.model.predict(state)
    
    def __make_loss_function(self):
        before_action_prob = K.placeholder(shape = (None, 1),\
                                          name = 'before_action_prob')
        before_action = K.placeholder(shape = (None, 1),\
                                          name = 'before_action',dtype = 'int64') ########

        advantage = K.placeholder(shape = (None,1), name ='advantage')
        
        now_action_prob = self.model.output
        #now_action_prob = tf.gather_nd(now_action_prob,before_action)
        now_action_prob = tf.gather(now_action_prob,before_action,axis=1)
        
        ratio = K.exp(K.log(before_action_prob) - K.log(now_action_prob))
        
        surr_1 = ratio * advantage
        surr_2 = K.clip(ratio, 1-eps_clip, 1+eps_clip) * advantage
        loss = - K.min((surr_1,surr_2))
        optimizer = optimizers.Adam(lr = learning_rate)
        updates = optimizer.get_updates(loss = loss, params = self.model.trainable_weights)
        
        self.update_function = K.function(inputs = [self.model.input,before_action,\
                                       before_action_prob,advantage],\
                            outputs = [],\
                            updates = updates)
        

In [9]:
class Agent:
    def __init__(self, input_shape,output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        
        self.actor = Actor(input_shape,output_shape)
        self.value = Value(input_shape)
    
        self.memory = []
    def put_data(self,data):
        self.memory.append(data)
    
    def get_action(self,state):
        return self.actor.model.predict(state)
    def memory_to_trainable(self):
        state_list, action_list, reward_list, next_state_list, prob_list, done_list = [],\
        [], [], [], [], []
        
        for data in self.memory:
            state, action, reward, next_state, prob, done = data
            
            state_list.append(state)
            action_list.append([action])
            reward_list.append([reward])
            next_state_list.append(next_state)
            #print(prob) 이거 리스트인지확인해야함
            prob_list.append(prob)
            done = 0 if done else 1
            done_list.append([done])
        return np.array(state_list), np.array(action_list), np.array(reward_list),\
                np.array(next_state_list), np.array(prob_list), np.array(done_list)     
        
    def train(self):
        state,action,reward,next_state,done_mask,prob = self.memory_to_trainable()
        done_mask = done_mask.reshape(-1,1)
        for i in range(env_iteration_number):
            td_error = reward + gamma * self.value.get_value(next_state) * done_mask
            delta = np.array(td_error - self.value.get_value(state))
            advantage_list = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_list.append(advantage)
            advantage_list.reverse()
            advantage = np.array(advantage_list).reshape(-1,1)
            self.actor.update_function([state,action,prob,advantage])
            self.value.update_function([state,advantage])
        self.memory = []

In [10]:
env = gym.make("CartPole-v1")
env_iteration_number = 3
learning_rate = 0.0005
lmbda =0.95
eps_clip = 0.1
epochs = 100
T_horizon = 20
gamma         = 0.98

In [11]:
model = Agent(env.observation_space.shape[0],env.action_space.n)

In [None]:
check = 20
score = 0

for iterate in range(2000):
    state = env.reset()
    done = False

    while not done:
        for t in range(T_horizon):
            action_prob = model.get_action(state.reshape(1,-1))

            action = np.random.choice([x for x in range(env.action_space.n)], p = action_prob[0])
            next_state, reward, done, info = env.step(action)
            
            model.put_data((state,action,reward/100.0, next_state,action_prob[0][action],done))
            state = next_state
            
            score += reward
            if done:
                break
        model.train()
    if (iterate % 20 == 0) & (iterate != 0) :
        print(check, " episode average : ", score / check) 
        score =0

20  episode average :  24.5
20  episode average :  23.4
20  episode average :  20.5
20  episode average :  24.25
20  episode average :  19.85
20  episode average :  20.9
20  episode average :  22.15
20  episode average :  27.2
20  episode average :  26.05
20  episode average :  18.3
20  episode average :  24.15
20  episode average :  22.4
20  episode average :  16.8
20  episode average :  23.6
20  episode average :  20.55
20  episode average :  26.0
20  episode average :  22.0
20  episode average :  16.7
20  episode average :  19.4
20  episode average :  21.85
20  episode average :  24.2
20  episode average :  23.05
20  episode average :  22.9
