In [1]:
import gym
import numpy as np
from collections import deque
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.optimizers as optimizers
from IPython.display import clear_output

In [89]:
# Demonstration of frame stacking: a fixed-length deque keeps the most recent
# `buffer_size` observations, and a snapshot of it is taken after each push.
buffer_size = 5
state_size = 8

raw_states = [
    [1,2,3,4,5,6,7,8],
    [11,22,33,44,55,66,77,88],
    [2,3,4,5,6,7,8,9],
    [22,33,44,55,66,77,88,99],
    [12,23,34,45,56,67,78,89],
    [23,34,45,56,67,78,89,90],
]

# The deque starts out filled with zero rows; because of `maxlen`, every
# append drops the oldest row.
buffer = deque(np.zeros((buffer_size, state_size)), maxlen=buffer_size)

snapshots = []
for raw_state in raw_states:
    buffer.append(raw_state)
    # np.array(buffer) copies the current window, so later appends do not
    # alter snapshots that were already taken.
    snapshots.append(np.array(buffer))

state1, state2, state3, state4, state5, state6 = snapshots

# After six pushes into a five-slot buffer the first observation is gone.
state6.flatten()

array([11, 22, 33, 44, 55, 66, 77, 88,  2,  3,  4,  5,  6,  7,  8,  9, 22,
       33, 44, 55, 66, 77, 88, 99, 12, 23, 34, 45, 56, 67, 78, 89, 23, 34,
       45, 56, 67, 78, 89, 90])

In [96]:
class ActorModel(tf.keras.Model):
    """Policy network: maps a state vector to a probability per action.

    Layers, in order: BatchNormalization -> Dense(512, relu) ->
    BatchNormalization -> Dense(action_size, softmax).
    """

    def __init__(self, state_size, action_size):
        """Create the layers and build the model for inputs of width `state_size`.

        Args:
            state_size: Number of features in one input row.
            action_size: Number of discrete actions (width of the output).
        """
        super().__init__()
        self.norm1 = layers.BatchNormalization()
        self.dense1 = layers.Dense(512, activation='relu')
        self.norm2 = layers.BatchNormalization()
        self.out = layers.Dense(action_size, activation='softmax')
        # Build eagerly so the weights exist as soon as the object is created.
        self.build(input_shape=(None, state_size))

    def call(self, x, training=None):
        """Return the action probabilities for a batch of states.

        Args:
            x: Batch of states with `state_size` features per row.
            training: Optional flag forwarded to the BatchNormalization
                layers. The default (None) leaves the decision to Keras,
                exactly as when the argument did not exist.
                NOTE(review): the agent in this notebook calls the model with
                one state per call; batch statistics of a single sample are
                presumably degenerate, so only pass True with real
                mini-batches - confirm before changing callers.

        Returns:
            Tensor with one softmax distribution over actions per input row.
        """
        x = self.norm1(x, training=training)
        x = self.dense1(x)
        x = self.norm2(x, training=training)
        policy = self.out(x)
        return policy

    
class CriticModel(tf.keras.Model):
    """Value network: maps a state vector to a single scalar estimate.

    Layers, in order: BatchNormalization -> Dense(512, relu) ->
    BatchNormalization -> Dense(512, relu) -> BatchNormalization -> Dense(1).
    """

    def __init__(self, state_size):
        """Create the layers and build the model for inputs of width `state_size`.

        Args:
            state_size: Number of features in one input row.
        """
        super().__init__()
        self.norm1 = layers.BatchNormalization()
        self.dense1 = layers.Dense(512, activation='relu')
        self.norm2 = layers.BatchNormalization()
        self.dense2 = layers.Dense(512, activation='relu')
        self.norm3 = layers.BatchNormalization()
        self.out = layers.Dense(1)
        # Build eagerly so the weights exist as soon as the object is created.
        self.build(input_shape=(None, state_size))

    def call(self, x, training=None):
        """Return the value estimate for a batch of states.

        Args:
            x: Batch of states with `state_size` features per row.
            training: Optional flag forwarded to the BatchNormalization
                layers. The default (None) leaves the decision to Keras,
                exactly as when the argument did not exist.
                NOTE(review): the agent in this notebook calls the model with
                one state per call; batch statistics of a single sample are
                presumably degenerate, so only pass True with real
                mini-batches - confirm before changing callers.

        Returns:
            Tensor with one value (last dimension of size 1) per input row.
        """
        x = self.norm1(x, training=training)
        x = self.dense1(x)
        x = self.norm2(x, training=training)
        x = self.dense2(x)
        x = self.norm3(x, training=training)
        value = self.out(x)
        return value
    

class A2CAgent:
    """Advantage actor-critic agent with a policy network and a value network.

    Transitions are collected with `store`; `train` consumes them once
    `memory_size` transitions are buffered or the episode has ended, then
    clears the buffer.
    """

    def __init__(self, state_size, action_size):
        """Create both networks, the optimizer and an empty transition buffer.

        Args:
            state_size: Number of features in the (flattened) state vector.
            action_size: Number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = 0.99
        self.learning_rate = 0.0001
        # Number of stored transitions that triggers an update.
        self.memory_size = 1
        self.memory = []
        self.actor_model = ActorModel(self.state_size, self.action_size)
        self.critic_model = CriticModel(self.state_size)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        # NOTE(review): this single Adam instance is applied to both the actor
        # and the critic variables in `train`.
        self.optimizer = optimizers.Adam(learning_rate=self.learning_rate)

    @staticmethod
    def _as_batch(state):
        """Return `state` as a float32 array with a leading batch axis of 1."""
        # Casting here hands the float32 layers a float32 input instead of
        # relying on Keras to autocast float64 observations (and warn).
        return np.asarray(state, dtype=np.float32).reshape(1, -1)

    def get_action(self, state):
        """Sample an action index from the actor's distribution for `state`.

        Args:
            state: Flat state vector.

        Returns:
            Integer action index in `[0, action_size)`.
        """
        policy = np.array(self.actor_model(self._as_batch(state))[0])
        return np.random.choice(self.action_size, 1, p=policy)[0]

    def store(self, state, action, reward, next_state):
        """Append one `(state, action, reward, next_state)` transition."""
        self.memory.append((state, action, reward, next_state))

    def calculate_G(self, done):
        """Compute the discounted return for every buffered transition.

        Args:
            done: True when the last buffered transition ended the episode.
                Otherwise the return is bootstrapped from the critic's value
                of the last stored `next_state`.

        Returns:
            1-D float64 array `G` with `G[t]` the discounted return from
            transition `t` onwards.
        """
        rewards = [item[2] for item in self.memory]
        # Allocate floats explicitly: np.zeros_like(rewards) would inherit an
        # integer dtype from integer rewards and truncate the returns.
        G = np.zeros(len(rewards), dtype=np.float64)
        next_value = 0.0
        if not done and self.memory:
            next_state = self.memory[-1][3]
            bootstrap = self.critic_model(self._as_batch(next_state))
            # Reduce the critic output to a plain Python float so that
            # G[t] receives a scalar rather than a one-element array.
            next_value = float(np.array(bootstrap).reshape(-1)[0])
        for t in reversed(range(len(rewards))):
            next_value = rewards[t] + self.discount_factor * next_value
            G[t] = next_value
        return G

    def train(self, done):
        """Update actor and critic from the buffered transitions, then clear them.

        Does nothing unless `done` is true or at least `memory_size`
        transitions are buffered.

        Args:
            done: Whether the episode has just ended.
        """
        if not (done or len(self.memory) >= self.memory_size):
            return

        G = self.calculate_G(done)
        # The variable lists do not change between transitions.
        actor_model_params = self.actor_model.trainable_variables
        critic_model_params = self.critic_model.trainable_variables

        for t, (state, action, _, _) in enumerate(self.memory):
            batch = self._as_batch(state)
            # Persistent tape: two separate gradient() calls are made below.
            with tf.GradientTape(persistent=True) as tape:
                policy = self.actor_model(batch)
                value = self.critic_model(batch)
                advantage = G[t] - value

                # Probability the policy assigned to the action actually taken.
                one_hot_action = tf.one_hot([action], self.action_size, axis=1)
                action_prob = tf.reduce_sum(one_hot_action * policy, axis=1)
                cross_entropy = -tf.math.log(action_prob)

                critic_loss = tf.reduce_mean(tf.square(advantage))
                # Differentiated only w.r.t. the actor variables below, so the
                # advantage acts as a constant weight in this loss.
                actor_loss = 0.1 * tf.reduce_mean(cross_entropy * advantage)
            actor_grads = tape.gradient(actor_loss, actor_model_params)
            critic_grads = tape.gradient(critic_loss, critic_model_params)
            self.optimizer.apply_gradients(zip(actor_grads, actor_model_params))
            self.optimizer.apply_gradients(zip(critic_grads, critic_model_params))
            del tape
        self.memory = []

In [98]:
# Train the A2C agent on LunarLander, feeding it the last BUFFER_SIZE
# observations stacked into one flat state vector.
SEED = 111

env = gym.make('LunarLander-v2')
env.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

STATE_SIZE = 8
ACTION_SIZE = 4
BUFFER_SIZE = 10

# Constant added to every reward before it is stored for training. It was
# previously undefined in this notebook, so the cell failed with NameError on
# a fresh kernel; 0.0 matches the "bonus: 0.00" printed by the recorded run.
bonus = 0.0

agent = A2CAgent(state_size=BUFFER_SIZE * STATE_SIZE, 
                 action_size=ACTION_SIZE)
scores = []
steps = []

for episode in range(1, 3001):
    score = 0
    step = 0
    # Start each episode with a window of zero observations.
    buffer = deque(np.zeros((BUFFER_SIZE, STATE_SIZE), dtype='float32'), maxlen=BUFFER_SIZE)
    
    done = False
    state = env.reset()
    buffer.append(state)
    state = np.array(buffer).flatten()
    
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        buffer.append(next_state)
        next_state = np.array(buffer).flatten()
        # `score` tracks the raw environment reward; only the stored reward
        # includes the bonus.
        score += reward
        step += 1
        agent.store(state, action, reward + bonus, next_state)
        agent.train(done)
        state = next_state
    
    scores.append(score)
    steps.append(step)
        
    print("episode: {:4d} | score: {:4.2f} | step: {:4d} | bonus: {:3.2f} | 10_avg_score: {:4.2f} | 10_avg_step: {:4.1f} | total score: {}".format(
        episode,
        scores[-1],
        steps[-1],
        bonus,
        np.mean(scores[-10:]),
        np.mean(steps[-10:]),
        np.sum(scores),
    ))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

episode:    1 | score: -192.65 | step:   68 | bonus: 0.00 | 10_avg_score: -192.65 | 10_avg_step: 68.0 | total score: -192.65054687030636
episode:    2 | score: -253.83 | step:  114 | bonus: 0.00 | 10_avg_score: -223.24 | 10_avg_step: 91.0 | total score: -446.4800747790084
episode:    3 | score: -151.55 | step:   59 | bonus: 0.00 | 10_avg_score: -199.34 | 10_avg_step: 80.3 | total score: -598.034

In [99]:
import pickle

# Persist the per-episode training curves.
for pickle_path, history in (
    ('a2c_final_scores.pickle', scores),
    ('a2c_final_steps.pickle', steps),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(history, f)

# Store the weights of both networks in TensorFlow checkpoint format.
for checkpoint_prefix, network in (
    ("./a2c_final_save_model/actor_model", agent.actor_model),
    ("./a2c_final_save_model/critic_model", agent.critic_model),
):
    network.save_weights(checkpoint_prefix, save_format="tf")

In [101]:
# Watch the policy play ten rendered episodes, always taking the most
# probable action instead of sampling.
env = gym.make('LunarLander-v2')

STATE_SIZE = 8
ACTION_SIZE = 4
BUFFER_SIZE = 10

# This cell uses the `agent` created in the training cell. To evaluate the
# saved weights in a fresh kernel instead, first rebuild the agent with
# A2CAgent(state_size=BUFFER_SIZE * STATE_SIZE, action_size=ACTION_SIZE) and
# call agent.actor_model.load_weights('./a2c_final_save_model/actor_model').

try:
    for episode in range(1, 11):
        score = 0
        step = 0
        buffer = deque(np.zeros((BUFFER_SIZE, STATE_SIZE), dtype='float32'), maxlen=BUFFER_SIZE)

        done = False
        state = env.reset()
        buffer.append(state)
        state = np.array(buffer).flatten()

        while not done:
            env.render()
            # Greedy action: index of the largest policy probability.
            action = np.array(agent.actor_model(state.reshape(1, -1))[0]).argmax()
            next_state, reward, done, _ = env.step(action)
            buffer.append(next_state)
            next_state = np.array(buffer).flatten()
            score += reward
            step += 1
            state = next_state

        print("episode: {:4d} | score: {:4.2f} | step: {:4d}".format(
            episode,
            score,
            step,
        ))
finally:
    # Close the environment even if an episode raises or is interrupted.
    env.close()

episode:    1 | score: 261.96 | step:  240
episode:    2 | score: 266.19 | step:  202
episode:    3 | score: 267.87 | step:  242
episode:    4 | score: 231.01 | step:  214
episode:    5 | score: 268.04 | step:  238
episode:    6 | score: 267.11 | step:  205
episode:    7 | score: 257.04 | step:  242
episode:    8 | score: 285.53 | step:  253
episode:    9 | score: 268.88 | step:  215
episode:   10 | score: 276.94 | step:  220
