In [1]:
from keras import backend as K
from keras.layers import Activation, Dense, Input
from keras.models import Model
from keras.optimizers import Adam
import numpy as np

class Agent(object):
    """Actor-critic agent for a discrete action space, built with Keras.

    The actor (policy) head and the critic (state-value) head share the same
    two fully connected hidden layers. Three Keras models are built over that
    shared graph:

    * ``actor``  - trainable policy; takes ``[state, delta]`` and is fitted
      with a log-likelihood loss weighted by ``delta``.
    * ``critic`` - trainable state-value estimate, fitted with MSE.
    * ``policy`` - same outputs as ``actor`` but takes only the state; it is
      not compiled and is used for action selection.

    Args:
        alpha: learning rate passed to the actor's Adam optimizer.
        beta: learning rate passed to the critic's Adam optimizer.
        gamma: discount factor used in the TD target.
        n_actions: number of discrete actions.
        layer1_size: units in the first shared hidden layer.
        layer2_size: units in the second shared hidden layer.
        input_dims: length of the 1-D observation vector.
    """

    def __init__(self, alpha, beta, gamma=0.99, n_actions=4,
                 layer1_size=1024, layer2_size=512, input_dims=8):
        self.gamma = gamma
        self.alpha = alpha
        self.beta = beta
        self.input_dims = input_dims
        self.fc1_dims = layer1_size
        self.fc2_dims = layer2_size
        self.n_actions = n_actions

        self.actor, self.critic, self.policy = self.build_actor_critic_network()
        self.action_space = list(range(n_actions))

    def build_actor_critic_network(self):
        """Build the shared-trunk networks.

        Returns:
            Tuple ``(actor, critic, policy)`` of Keras models; ``actor`` and
            ``critic`` are compiled, ``policy`` is not.
        """
        # Named `state_input` rather than `input` so the builtin is not shadowed.
        state_input = Input(shape=(self.input_dims,))
        # Extra actor input carrying the per-sample weight used by the loss.
        delta = Input(shape=[1])
        dense1 = Dense(self.fc1_dims, activation='relu')(state_input)
        dense2 = Dense(self.fc2_dims, activation='relu')(dense1)
        probs = Dense(self.n_actions, activation='softmax')(dense2)
        values = Dense(1, activation='linear')(dense2)

        def custom_loss(y_true, y_pred):
            """Sum of ``-y_true * log(y_pred) * delta``.

            ``delta`` is the symbolic actor input captured from the enclosing
            scope. Predictions are clipped away from 0 and 1 before the log.
            """
            out = K.clip(y_pred, 1e-8, 1-1e-8)
            log_lik = y_true*K.log(out)
            return K.sum(-log_lik*delta)

        # Inputs/outputs are passed positionally so the call does not rely on
        # the legacy `input=` / `output=` keyword spellings.
        actor = Model([state_input, delta], [probs])
        actor.compile(optimizer=Adam(lr=self.alpha), loss=custom_loss)

        critic = Model([state_input], [values])
        critic.compile(optimizer=Adam(lr=self.beta), loss='mean_squared_error')

        policy = Model([state_input], [probs])

        return actor, critic, policy

    def choose_random_action(self, observation):
        """Return an action drawn uniformly from ``self.action_space``.

        ``observation`` is accepted for signature parity with
        ``choose_action`` but is not used.
        """
        return np.random.choice(self.action_space)

    def choose_action(self, observation):
        """Sample an action from the policy network's output distribution.

        Args:
            observation: 1-D array; a leading batch axis is added before it
                is passed to ``self.policy.predict``.
        """
        state = observation[np.newaxis, :]
        probabilities = self.policy.predict(state)
        return np.random.choice(self.action_space, p=probabilities[0])

    def _td_target(self, reward, state_, done):
        """Return ``reward + gamma * V(state_) * (1 - done)``.

        ``state_`` must already carry the batch axis; ``V`` is
        ``self.critic.predict``.
        """
        critic_value_ = self.critic.predict(state_)
        return reward + self.gamma*critic_value_*(1-int(done))

    def learnValue(self, state, action, reward, state_, done):
        """Fit only the critic on one transition's TD target.

        ``action`` is unused; it is kept so the signature matches ``learn``.
        """
        state = state[np.newaxis, :]
        state_ = state_[np.newaxis, :]

        # Only the next-state value is needed here, so the current state is
        # not evaluated.
        target = self._td_target(reward, state_, done)

        self.critic.fit(state, target, verbose=0)

    def learn(self, state, action, reward, state_, done):
        """Fit the actor and then the critic on one transition.

        The actor is fitted on a one-hot encoding of ``action`` with
        ``delta = target - V(state)`` supplied as its second input; the critic
        is fitted on the TD target.
        """
        state = state[np.newaxis, :]
        state_ = state_[np.newaxis, :]

        target = self._td_target(reward, state_, done)
        delta = target - self.critic.predict(state)

        actions = np.zeros([1, self.n_actions])
        actions[0, action] = 1

        self.actor.fit([state, delta], actions, verbose=0)
        self.critic.fit(state, target, verbose=0)



Using TensorFlow backend.


In [1]:
import gym
import random
# from utils import plotLearning

# --- Configuration -----------------------------------------------------------
agent = Agent(alpha=0.00001, beta=0.00005)
env = gym.make('LunarLander-v2')
score_history = []
num_epis = 2000
mem = []          # replay buffer of (state, action, reward, next_state, done)
batch_size = 32
filename = 'lander_plot.png'

# --- Training ----------------------------------------------------------------
# Schedule, by episode index i:
#   i <= 500          random actions, critic-only updates
#   500 < i <= 1000   policy actions, critic-only updates, batch size 64
#   i > 1000          policy actions, actor + critic updates
# Updates run once every 10 episodes on a batch sampled (with replacement)
# from the replay buffer.
for i in range(1, num_epis+1):
    done = False
    score = 0
    state = env.reset()

    while not done:
        env.render()
        if i <= 500:
            action = agent.choose_random_action(state)
        else:
            action = agent.choose_action(state)
        next_state, reward, done, info = env.step(action)
        # Record the transition BEFORE advancing `state`; otherwise the stored
        # "state" would be identical to "next_state".
        mem.append((state, action, reward, next_state, done))
        state = next_state
        score += reward

    if 500 < i <= 1000:
        batch_size = 64

    if i % 10 == 0:
        update = agent.learnValue if i <= 1000 else agent.learn
        # Unpack each sampled transition straight into the update call so the
        # episode-level names (state, done, ...) are not overwritten.
        for transition in random.choices(mem, k=batch_size):
            update(*transition)

    score_history.append(score)
    avg_score = np.mean(score_history[-100:])
    print('episode %d, score %.2f, avg score %.2f' % (i, score, avg_score))
#     plotLearning(score_history, filename=filename, window=100)