In [26]:
import gym
import random
import numpy as np
from collections import deque
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.optimizers as optimizers
from IPython.display import clear_output

In [241]:
class ActorModel(tf.keras.Model):
    def __init__(self, state_size, action_size):
        super(__class__, self).__init__()
        self.dense1 = layers.Dense(64, activation='relu')
        self.out = layers.Dense(action_size, activation='softmax')
        self.build(input_shape=(None, state_size))
    
    def call(self, x):
        x = self.dense1(x)
        policy = self.out(x)
        return policy

    
class CriticModel(tf.keras.Model):
    def __init__(self, state_size):
        super(__class__, self).__init__()
        self.dense1 = layers.Dense(64, activation='relu')
        self.dense2 = layers.Dense(64, activation='relu')
        self.out = layers.Dense(1)
        self.build(input_shape=(None, state_size))
    
    def call(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        value = self.out(x)
        return value
    

class A2CAgent:
    def __init__(self, env):
        self.env = env
        self.state_size = 8
        self.action_size = 4
        
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.memory_size = 1
        self.batch_size = 1
        self.train_start = 1
        
        self.memory = deque(maxlen=self.memory_size)
        
        self.actor_model = ActorModel(self.state_size, self.action_size)
        self.critic_model = CriticModel(self.state_size)
        
        self.optimizer = optimizers.Adam(lr=self.learning_rate)
        
    def get_action(self, state):
        policy = self.actor_model(state.reshape(1, -1))
        policy = np.array(policy[0])
        return np.random.choice(self.action_size, 1, p=policy)[0]
    
    def store(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    def train(self):
        if len(self.memory) >= self.train_start:
            samples = samples = random.sample(self.memory, self.batch_size)

            state = np.array([sample[0] for sample in samples])
            action = np.array([sample[1] for sample in samples])
            reward = np.array([[sample[2]] for sample in samples])
            next_state = np.array([sample[3] for sample in samples])
            done = np.array([[sample[4]] for sample in samples])
            
            actor_model_params = self.actor_model.trainable_variables
            critic_model_params = self.critic_model.trainable_variables

            with tf.GradientTape(persistent=True) as tape:
                policy = self.actor_model(state)
                value = self.critic_model(state)
                next_value = self.critic_model(next_state)
                target = reward + (1 - done) * self.discount_factor * next_value
                advantage = target - value

                one_hot_action = tf.one_hot(action, self.action_size, axis=1)
                action_prob = tf.reduce_sum(one_hot_action * policy, axis=1, keepdims=True)
                cross_entropy = -tf.math.log(action_prob)

                actor_loss = tf.reduce_mean(cross_entropy * advantage)
                critic_loss = tf.reduce_mean(tf.square(advantage))
                loss = 0.1 * actor_loss + critic_loss

            actor_grads = tape.gradient(loss, actor_model_params)
            critic_grads = tape.gradient(loss, critic_model_params)
            del tape

            self.optimizer.apply_gradients(zip(actor_grads, actor_model_params))
            self.optimizer.apply_gradients(zip(critic_grads, critic_model_params))

            return np.array(cross_entropy).mean()
    

In [242]:
SEED = 123

env = gym.make('LunarLander-v2')

env.seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

agent = A2CAgent(env)

scores = []
steps = []

for episode in range(1, 3001):
    score = 0
    step = 0
    cross_entropy_sum = 0
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.store(state, action, reward, next_state, done)
        agent.train()
        state = next_state
        score += reward
        step += 1
    scores.append(score)
    steps.append(step)
        
    print("episode: {:4d} | score: {:4.2f} | step: {:4d} | 100_avg_score: {:4.2f} | 100_avg_step: {:4.1f} | total score: {}".format(
        episode,
        scores[-1],
        steps[-1],
        np.mean(scores[-100:]),
        np.mean(steps[-100:]),
        np.sum(scores),
    ))

episode:    1 | score: -386.50 | step:  124 | 100_avg_score: -386.50 | 100_avg_step: 124.0 | total score: -386.5048345167437
episode:    2 | score: -332.04 | step:  100 | 100_avg_score: -359.27 | 100_avg_step: 112.0 | total score: -718.543244344527
episode:    3 | score: -163.94 | step:   76 | 100_avg_score: -294.16 | 100_avg_step: 100.0 | total score: -882.4839390990832
episode:    4 | score: -114.53 | step:   54 | 100_avg_score: -249.25 | 100_avg_step: 88.5 | total score: -997.0112224346905
episode:    5 | score: -167.77 | step:   77 | 100_avg_score: -232.96 | 100_avg_step: 86.2 | total score: -1164.7816381275268
episode:    6 | score: -97.49 | step:   65 | 100_avg_score: -210.38 | 100_avg_step: 82.7 | total score: -1262.2725993015897
episode:    7 | score: 24.74 | step:  105 | 100_avg_score: -176.79 | 100_avg_step: 85.9 | total score: -1237.5287189898215
episode:    8 | score: -70.11 | step:  146 | 100_avg_score: -163.45 | 100_avg_step: 93.4 | total score: -1307.6358885245336
episod

In [243]:
import pickle

with open('a2c_scores.pickle', 'wb') as f:
    pickle.dump(scores, f)
    
with open('a2c_steps.pickle', 'wb') as f:
    pickle.dump(steps, f)
    
agent.actor_model.save_weights("./a2c_save_model/actor_model", save_format="tf")
agent.critic_model.save_weights("./a2c_save_model/critic_model", save_format="tf")

In [None]:
class A2CModel(tf.keras.Model):
    
    def __init__(self, action_size):
        super(__class__, self).__init__()
        self.input_layer = layers.Dense(128, activation='relu')
        self.actor_out = layers.Dense(action_size, activation='softmax')
        self.critic_out = layers.Dense(1)
        
    def call(self, x):
        x = self.input_layer(x)
        policy = self.actor_out(x)
        value = self.critic_out(x)
        return policy, value
    
    
class Agent:
    
    def __init__(self, env, action_size, state_size, gamma=.99, lr=0.02):
        self.env = env
        self.action_size = action_size
        self.state_size = state_size
        self.gamma = gamma
        self.model = A2CModel(action_size)
        self.model.build(input_shape=(None, state_size))
        self.optimizer = optimizers.Adam(lr=lr)
        
    def get_action(self, state, train=False):
        policy, _ = self.model(state.reshape(1, -1))
        policy = np.array(policy).flatten()
        if train:
            return np.random.choice(range(self.action_size), p=policy)
        else:
            return policy.argmax()
    
    def learn(self, state, action, reward, next_state, done):
        states = state.reshape(1, -1)
        actions = action.reshape(1, -1)
        rewards = np.array([reward])
        next_states = next_state.reshape(1, -1)
        dones = np.array([done])
        
        def loss_fn():
            policies, values = self.model(states)
            _, next_values = self.model(next_states)
            y = rewards + (1-dones) * self.gamma * next_values
            
            critic_loss = tf.reduce_mean(0.5 * tf.square(y - values))
            
            one_hot_actions = tf.one_hot(actions, self.action_size)
            action_probs = tf.reduce_sum(one_hot_actions * policies, axis=1)
            cross_entropy = -tf.math.log(action_probs + 1e-5)
            advantages = y - values
            actor_loss = tf.reduce_mean(cross_entropy * advantages)
            
            return critic_loss + actor_loss
        
        self.optimizer.minimize(loss_fn, var_list=self.model.trainable_variables)

In [30]:
env = gym.make('LunarLander-v2')
action_size = env.action_space.n
state_size = 8
agent = Agent(env, action_size, state_size)
scores = []
steps = []
for episode in range(1, 3001):
    done = False
    samples = []
    state = env.reset()
    score = 0
    step = 0
    while not done:
        action = agent.get_action(state, train=True)
        next_state, reward, done, _ = env.step(action)
        score += reward
        step += 1
        agent.learn(state, action, reward, next_state, done)
        state = next_state
    scores.append(score)
    steps.append(step)
    if episode % 10 == 0:
        print('Episode: {} Total score: {} Average score: {} Average step: {}'.format(episode, np.sum(scores), np.mean(scores[-10:]), np.mean(steps[-10:])))
env.close()

Episode: 10 Total score: -8211.831013163122 Average score: -821.1831013163122 Average step: 108.5
Episode: 20 Total score: -13537.533924309997 Average score: -532.5702911146875 Average step: 66.8
Episode: 30 Total score: -14891.603053624263 Average score: -135.40691293142675 Average step: 75.2
Episode: 40 Total score: -16245.016850508184 Average score: -135.34137968839178 Average step: 71.4
Episode: 50 Total score: -17300.647306007366 Average score: -105.5630455499183 Average step: 69.5
Episode: 60 Total score: -18467.00354309124 Average score: -116.63562370838736 Average step: 64.6
Episode: 70 Total score: -19938.654580338287 Average score: -147.16510372470503 Average step: 73.7
Episode: 80 Total score: -21303.80848925999 Average score: -136.5153908921701 Average step: 67.9
Episode: 90 Total score: -22702.023303896472 Average score: -139.82148146364813 Average step: 63.7
Episode: 100 Total score: -23952.782474460128 Average score: -125.07591705636534 Average step: 67.6
Episode: 110 To

In [247]:
env = gym.make('LunarLander-v2')

for episode in range(1, 12):
    done = False
    state = env.reset()
    score = 0
    step = 0
    while not done:
        env.render()
        policy = agent.actor_model(state.reshape(1, -1))
        policy = np.array(policy[0])
        action = policy.argmax()
#         action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        score += reward
        step += 1
        state = next_state
    print('Episode: {} Score: {} Step: {}'.format(episode, score, step))
env.close()

Episode: 1 Score: 293.94395889006734 Step: 172
Episode: 2 Score: 230.31418937421978 Step: 321
Episode: 3 Score: 228.41756152305467 Step: 345
Episode: 4 Score: 222.1624415291472 Step: 354
Episode: 5 Score: 24.85952595571024 Step: 100
Episode: 6 Score: 50.71778988012895 Step: 115
Episode: 7 Score: 142.44957185674087 Step: 1000
Episode: 8 Score: 101.39582803246273 Step: 96
Episode: 9 Score: 8.521440730778721 Step: 99
Episode: 10 Score: 122.46365335860594 Step: 1000
Episode: 11 Score: 259.35707298061345 Step: 199
