# Advantage Actor Critic

This is a very simple implementation of a Deep Reinforcement Learning Advantage Actor-Critic. It uses 2 independent Artificial Neuaral Networks to approximate the Policy function (Actor) and the State-value function (Critic). To test the implementation, I use the Moon Lander environment provided by OpenAI-Gym.

If you want to have a deeper understanding of the Actor-Critic algorithm, I strongly recomend you to take a look into the document `References/A2C_Summary/A2C_Summary.pdf` and `References/A2C_Presentation.pdf`. In the directory `References/A2C_Summary/` you can also find the original $\LaTeX$ document used to create the symmary.

In [1]:
# For Development and debugging:
# Reload modul without restarting the kernel
#%load_ext autoreload
#%autoreload 2

import tensorflow as tf
import os
import gym
import numpy as np
import time
from libs.Utils import plot_history
from libs.Utils import test_agent

# Uncomment the next line if you want tf to ignore your GPU
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print('Physical Devices: {}'.format(physical_devices))
try:
    #tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print('GPU memory limitated successfuly!')
except:
    print('Warning! GPU memory could not be limitated!')
    
# Define the places to save the models
path = './'
model_path = os.path.join(path, 'Models')
# Create dirs to save models
os.makedirs(model_path, exist_ok=True)
best_model_path = os.path.join(model_path, 'Best_model')
os.makedirs(best_model_path, exist_ok=True)
last_model_path = os.path.join(model_path, 'Last_model')
os.makedirs(last_model_path, exist_ok=True)

Physical Devices: []
GPU memory limitated successfuly!


# 1.- Load Lunar Lander environment

In [2]:
# Load environment
env = gym.make('LunarLander-v2')
# Number of actions
n_actions = env.action_space.n
vars(env)

{'env': <gym.envs.box2d.lunar_lander.LunarLander at 0x7f0e24c5b610>,
 'action_space': Discrete(4),
 'observation_space': Box(-inf, inf, (8,), float32),
 'reward_range': (-inf, inf),
 'metadata': {'render.modes': ['human', 'rgb_array'],
  'video.frames_per_second': 50},
 '_max_episode_steps': 1000,
 '_elapsed_steps': None}

Take a look into the LunarLander environment:

In [3]:
test_agent(env, heuristic=True, render=True, n_episodes=5)

Total Episode Reward:  265.3090217637236
Total Episode Reward:  287.0427874891634
Total Episode Reward:  257.1252867981133
Total Episode Reward:  299.9156735822128
Total Episode Reward:  273.23138272232137


# 2.1.- The Actor and the Critic

In [3]:
import tensorflow as tf
import tensorflow_probability as tfp

class Actor(tf.keras.Model):
    def __init__(self):
        super().__init__()
        
        self.dense1 = tf.keras.layers.Dense(1024, activation='relu')
        self.dense2 = tf.keras.layers.Dense(512, activation='relu')
        self.out = tf.keras.layers.Dense(n_actions, activation='softmax')
        
    def call(self, input_data):
        x = self.dense1(input_data)
        x = self.dense2(x)
        value = self.out(x)
        
        return value

class Critic(tf.keras.Model):
    def __init__(self):
        super().__init__()
        
        self.dense1 = tf.keras.layers.Dense(1024, activation='relu')
        self.dense2 = tf.keras.layers.Dense(512, activation='relu')
        self.out = tf.keras.layers.Dense(1)
        
    def call(self, input_data):
        x = self.dense1(input_data)
        x = self.dense2(x)
        value = self.out(x)
        
        return value

# 2.2 The Agent

In [4]:
class Agent():
    def __init__(self, gamma=0.99, actor_lr=5e-6, critic_lr=5e-6):
        self.gamma = gamma
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)
        
        self.actor = Actor()
        self.critic = Critic()
        
    def choose_action(self, state):
        probs = self.actor(np.array([state]))
        # tfp.distributions returns a prob dist
        # same as using np.random.choice([0,1,2,3], p=probs.numpy())
        dist = tfp.distributions.Categorical(probs=probs.numpy(), dtype=tf.float32)
        action = dist.sample().numpy()
        
        return int(action[0])
    
    # Note this is actually the performance measure J(theta)
    def actor_loss(self, probs, action, advantage):
        dist = tfp.distributions.Categorical(probs=probs, dtype=tf.float32)
        log_probs = dist.log_prob(action)
        # Since we are maximizing the agent's performance,  we need to add a minus -
        # to actually maximize instead of minimize
        loss = -log_probs * advantage
        
        return loss
    
    def learn(self, state, action, reward, next_state, done):
        # tf needs a bidimensional array as input:
        state = np.array([state])
        next_state = np.array([next_state])
        
        # Set costum losses for the actor and the critic
        with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
            action_probs = self.actor(state, training=True)
            state_value = self.critic(state, training=True)
            if done:
                next_state_value = 0
            else:
                next_state_value = self.critic(next_state, training=True)
            
            advantage = reward + self.gamma * next_state_value - state_value
            
            # Agents Performance measure J(theta)
            agent_loss = self.actor_loss(action_probs, action, advantage)
            # Critic loss (MSE for one example) is basically an 
            # approximation of (v - v_hat)^2
            critic_loss = advantage**2
        
        # Apply learning rule
        actor_grads = actor_tape.gradient(agent_loss, 
                                          self.actor.trainable_variables)
        critic_grads = critic_tape.gradient(critic_loss, 
                                            self.critic.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grads, 
                                                 self.actor.trainable_variables))
        self.critic_optimizer.apply_gradients(zip(critic_grads, 
                                                  self.critic.trainable_variables))
        
        return agent_loss, critic_loss

# 3.- Agent training

## 3.1.- Initialize Agent

In [5]:
# Discount Factor
gamma = 0.99
# Actor learning rate
actor_lr = 1e-6
# Critic learning rate
critic_lr = 5e-6

# Init agent
agent = Agent(gamma=0.99, actor_lr=5e-6, critic_lr=5e-6)

Lets see how our agent performs without training:

In [7]:
test_agent(env, agent, render=True, n_episodes=5)

Total Episode Reward:  -89.99154013467287
Total Episode Reward:  -229.3687217007442
Total Episode Reward:  -99.52781041097954
Total Episode Reward:  -123.32629389526207
Total Episode Reward:  -182.571081715751


## 3.1.- Train the Agent

In [9]:
# Training loop
total_episode_reward_history = []
avg_episode_reward_history = []
best_avg_episode_reward = -9e6
num_episodes = 5200
total_n_inter = 0

tic = time.time()
for i in range(num_episodes):
    done = False
    state = env.reset()
    total_episode_reward = 0

    #print('Starting episode: ', i)
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        actor_loss, critic_loss = agent.learn(state, action, reward, next_state, done)
        state = next_state
        total_episode_reward += reward
        total_n_inter += total_n_inter + 1
        
    total_episode_reward_history.append(total_episode_reward)
    avg_episode_reward = np.mean(total_episode_reward_history[-64:])
    avg_episode_reward_history.append(avg_episode_reward)
    
    # Save best policy so far
    if best_avg_episode_reward < avg_episode_reward:
        print('Saving best model so far...')
        print('Episode ', i, 'total episode reward %.2f, avg episode reward %.2f' % \
              (total_episode_reward, avg_episode_reward))
        best_avg_episode_reward = avg_episode_reward
        agent.actor.save(best_model_path)
        
    
    if (i % 100 == 0) | (i == (num_episodes-1)):
        plot_history(total_episode_reward_history, avg_episode_reward_history,
                     os.path.join(model_path, 'plot'))
        print('episode ', i, 'total episode reward %.2f, avg episode reward %.2f' % \
          (total_episode_reward, avg_episode_reward))
print('Train time (in mins): ',  ((time.time()-tic) / 60))
print(total_n_inter)

# Save history
np.savez(os.path.join(model_path, 'history'), 
         total_episode_reward_history=total_episode_reward_history, 
         avg_episode_reward_history=avg_episode_reward_history)

# Save last model
agent.actor.save(last_model_path)

Saving best model so far...
Episode  0 total episode reward -301.39, avg episode reward -301.39
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: /home/hhughes/Documents/TUM_Subjects/S5/Seminar/Seminar_Project/Models/Advantage_Actor_Critic/Best_model/assets
episode  0 total episode reward -301.39, avg episode reward -301.39
Saving best model so far...
Episode  1 total episode reward -91.36, avg episode reward -196.37
INFO:tensorflow:Assets written to: /home/hhughes/Documents/TUM_Subjects/S5/Seminar/Seminar_Project/Models/Advantage_Actor_Critic/Best_model/assets
Saving best model so far...
Episode  2 total episode reward -93.06, avg episode reward -161.93
INFO:tensorflow:Assets written to: /home/hhughes/Documents/TUM_Subjects/S5/Seminar/Seminar_Project/Models/Advantage_Actor_Critic/Best_model/assets
Saving best model so far...
Episode  6 total episode reward -76.39, avg episode reward -156.15
INFO:tensorflow:Assets writte

# 4.- Test the results

## 4.1.- Test last model

In [13]:
# Uncomment next lines if you want to reload the last model
#del(agent)
#agent = Agent()
#agent.actor = tf.keras.models.load_model(last_model_path)
test_agent(env, agent, render=True, n_episodes=5)

Total Episode Reward:  -157.83038452377954
Total Episode Reward:  -99.72781729128329
Total Episode Reward:  -135.60894452858483
Total Episode Reward:  -130.16759846478797
Total Episode Reward:  -156.41825497118842


## 4.2.- Test best model

In [8]:
# Test best model
del(agent)
agent = Agent()
agent.actor = tf.keras.models.load_model(best_model_path)
test_agent(env, agent, render=True, n_episodes=5)

Total Episode Reward:  274.0378887860743
Total Episode Reward:  221.85726019993933
Total Episode Reward:  -14.467176654759323
Total Episode Reward:  250.0321871721154
Total Episode Reward:  1.122796094880158


# 5.-References

- https://towardsdatascience.com/actor-critic-with-tensorflow-2-x-part-1-of-2-d1e26a54ce97