# Advantage Actor Critic

In [1]:
# For Development and debugging:
# Reload modules without restarting the kernel
%load_ext autoreload
%autoreload 2

import tensorflow as tf

# List the GPUs visible to TensorFlow (the list is empty when none is found).
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print('Physical Devices: {}'.format(physical_devices))
try:
    # Let TensorFlow grow its GPU memory on demand instead of reserving it all.
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print('GPU memory limitated successfuly!')
except (IndexError, ValueError, RuntimeError):
    # IndexError: no GPU in the list. ValueError / RuntimeError: raised by
    # set_memory_growth for an invalid device or an already initialized runtime.
    # Only these expected failures are downgraded to a warning; anything else
    # (e.g. KeyboardInterrupt) propagates instead of being silently swallowed.
    print('Warning! GPU memory could not be limitated!')

Physical Devices: []


In [2]:
# TODO: first introduce the environment, show the baseline, explain it, and then the algorithm


What is an Environment and an Agent?<br>
Take a look at the Lunar Lander agent

In [3]:
import gym
import numpy as np
from libs.Utils import plot_history
from libs.Utils import test_agent

# Load environment
env = gym.make('LunarLander-v2')
# Number of actions
# (read from the discrete action space; later used as the width of the actor's
# softmax output layer)
n_actions = env.action_space.n
# Display the attributes of the returned environment object as the cell output
vars(env)

{'env': <gym.envs.box2d.lunar_lander.LunarLander at 0x7f2a3c01af70>,
 'action_space': Discrete(4),
 'observation_space': Box(-inf, inf, (8,), float32),
 'reward_range': (-inf, inf),
 'metadata': {'render.modes': ['human', 'rgb_array'],
  'video.frames_per_second': 50},
 '_max_episode_steps': 1000,
 '_elapsed_steps': None}

Take a look into the LunarLander environment

In [4]:
# Baseline run on the environment before any agent is trained.
# NOTE(review): heuristic=True presumably makes test_agent drive the lander with a
# hand-coded controller instead of a learned agent, for n_episodes rendered
# episodes — confirm in libs.Utils.test_agent.
test_agent(env, heuristic=True, render=True, n_episodes=5)

Total Episode Reward:  284.2599077715691
Total Episode Reward:  290.69490397375694
Total Episode Reward:  265.8646263073205
Total Episode Reward:  240.74516870263656
Total Episode Reward:  293.8572048127621


In [5]:
import tensorflow as tf
import tensorflow_probability as tfp

class Actor(tf.keras.Model):
    """Policy network: maps a batch of states to action probabilities.

    Two ReLU hidden layers (1024 and 512 units) feed a softmax layer with one
    unit per action; the number of units is read from the notebook-level
    ``n_actions``.
    """

    def __init__(self):
        super().__init__()

        # Hidden feature extractor
        self.dense1 = tf.keras.layers.Dense(1024, activation='relu')
        self.dense2 = tf.keras.layers.Dense(512, activation='relu')
        # Softmax head: each output row is a probability distribution over actions
        self.out = tf.keras.layers.Dense(n_actions, activation='softmax')

    def call(self, input_data):
        """Return the softmax action probabilities for ``input_data``."""
        hidden = self.dense2(self.dense1(input_data))
        return self.out(hidden)

class Critic(tf.keras.Model):
    """State-value network: maps a batch of states to one scalar per state.

    Two ReLU hidden layers (1024 and 512 units) feed a single linear output
    unit.
    """

    def __init__(self):
        super().__init__()

        # Hidden feature extractor
        self.dense1 = tf.keras.layers.Dense(1024, activation='relu')
        self.dense2 = tf.keras.layers.Dense(512, activation='relu')
        # Linear head (no activation) so the output is unbounded
        self.out = tf.keras.layers.Dense(1)

    def call(self, input_data):
        """Return the critic's value output for ``input_data``."""
        hidden = self.dense2(self.dense1(input_data))
        return self.out(hidden)

In [6]:
class Agent():
    """One-step advantage actor-critic agent with separate actor and critic networks.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        actor_lr: Learning rate of the actor's Adam optimizer.
        critic_lr: Learning rate of the critic's Adam optimizer.
    """

    def __init__(self, gamma=0.99, actor_lr=5e-6, critic_lr=5e-6):
        self.gamma = gamma
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

        self.actor = Actor()
        self.critic = Critic()

    def choose_action(self, state):
        """Sample an action for a single state from the actor's current policy.

        Args:
            state: One (unbatched) observation.

        Returns:
            The sampled action index as a Python int.
        """
        # The networks expect a batch, so wrap the state into a batch of one.
        probs = self.actor(np.array([state]))
        # tfp.distributions returns a prob dist
        # same as using np.random.choice(n_actions, p=probs.numpy()[0])
        # (probs is a batch of one row, hence the [0])
        dist = tfp.distributions.Categorical(probs=probs.numpy(), dtype=tf.float32)
        action = dist.sample().numpy()

        return int(action[0])

    # Note this is actually the performance measure J(theta)
    def actor_loss(self, probs, action, advantage):
        """Return the policy-gradient loss ``-log pi(action | state) * advantage``.

        Args:
            probs: Action probabilities produced by the actor.
            action: The action that was taken.
            advantage: Advantage estimate used to weight the log-probability.
        """
        dist = tfp.distributions.Categorical(probs=probs, dtype=tf.float32)
        log_probs = dist.log_prob(action)
        # Since we are maximizing the agent's performance,  we need to add a minus -
        # to actually maximize instead of minimize
        loss = -log_probs * advantage

        return loss

    def learn(self, state, action, reward, next_state, done):
        """Run one online actor and critic update from a single transition.

        Args:
            state: Observation before the action was taken.
            action: Action that was taken in ``state``.
            reward: Reward received for the transition.
            next_state: Observation after the action was taken.
            done: True when the episode ended with this transition; the
                next-state value is then taken to be 0.

        Returns:
            Tuple ``(agent_loss, critic_loss)`` with the actor and critic losses
            computed before the gradient step.
        """
        # tf needs a bidimensional array as input:
        state = np.array([state])
        next_state = np.array([next_state])

        # Set custom losses for the actor and the critic
        with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
            action_probs = self.actor(state, training=True)
            state_value = self.critic(state, training=True)
            if done:
                next_state_value = 0
            else:
                # The bootstrapped value is part of the TD *target*; treat it as a
                # constant so the critic gradient only flows through V(state)
                # (semi-gradient TD(0)) and does not also move the target.
                next_state_value = tf.stop_gradient(
                    self.critic(next_state, training=True))

            advantage = reward + self.gamma * next_state_value - state_value

            # Agents Performance measure J(theta)
            # The advantage only weights the log-probability, so it is passed as a
            # constant to make explicit that it is not differentiated here.
            agent_loss = self.actor_loss(action_probs, action,
                                         tf.stop_gradient(advantage))
            # Critic loss (MSE for one example) is basically an
            # approximation of (v - v_hat)^2
            critic_loss = advantage**2

        # Apply learning rule
        actor_grads = actor_tape.gradient(agent_loss,
                                          self.actor.trainable_variables)
        critic_grads = critic_tape.gradient(critic_loss,
                                            self.critic.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grads,
                                                 self.actor.trainable_variables))
        self.critic_optimizer.apply_gradients(zip(critic_grads,
                                                  self.critic.trainable_variables))

        return agent_loss, critic_loss

In [7]:
# Discount Factor
gamma = 0.99
# Actor learning rate
actor_lr = 5e-6
# Critic learning rate
critic_lr = 5e-6

# Init agent
# Pass the hyperparameters defined above (instead of repeating the literals) so
# that editing them in this cell actually changes the agent.
agent = Agent(gamma=gamma, actor_lr=actor_lr, critic_lr=critic_lr)

Let's see how our agent performs without training:

In [9]:
# Run the freshly initialised agent (no learning step has been applied yet) to get
# a reference point for the training results.
test_agent(env, agent, render=True, n_episodes=5)

Total Episode Reward:  -114.73277826257134
Total Episode Reward:  -221.0962736520064
Total Episode Reward:  -305.23594311146314
Total Episode Reward:  -81.8531241754936
Total Episode Reward:  -153.59670515214492


In [10]:
import os
# Project root: taken from the SEMINAR_PROJECT_DIR environment variable when it is
# set, otherwise the current working directory. This replaces a hard-coded
# absolute home-directory path that only existed on one machine.
path = os.environ.get('SEMINAR_PROJECT_DIR', os.getcwd())
# Everything produced by this notebook goes to <path>/Models/Advantage_Actor_Critic
model_path = os.path.join(path, 'Models', 'Advantage_Actor_Critic')
# makedirs also creates the intermediate 'Models' directory, so one call suffices
os.makedirs(model_path, exist_ok=True)

In [6]:
# Training loop
num_episodes = 2000
# Per-episode return and its running mean over (at most) the last 100 episodes
total_episode_reward_history, avg_episode_reward_history = [], []

for i in range(num_episodes):
    state = env.reset()
    done, total_episode_reward = False, 0

    print('Starting episode: ', i)
    # The agent is updated after every single environment step
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        actor_loss, critic_loss = agent.learn(state, action, reward, next_state, done)
        total_episode_reward += reward
        state = next_state

    # Book-keeping for the learning curves
    total_episode_reward_history.append(total_episode_reward)
    avg_episode_reward = np.mean(total_episode_reward_history[-100:])
    avg_episode_reward_history.append(avg_episode_reward)

    # Refresh the plot stored under the model directory after every episode
    plot_history(total_episode_reward_history, avg_episode_reward_history,
                 os.path.join(model_path, 'plot'))
    print('episode ', i,
          'total episode reward %.2f, avg episode reward %.2f'
          % (total_episode_reward, avg_episode_reward))
    

Starting episode:  0
episode  0 total episode reward -125.16, avg episode reward -125.16
Starting episode:  1
episode  1 total episode reward -92.40, avg episode reward -108.78
Starting episode:  2
episode  2 total episode reward -173.11, avg episode reward -130.22
Starting episode:  3
episode  3 total episode reward -102.62, avg episode reward -123.32
Starting episode:  4
episode  4 total episode reward -99.42, avg episode reward -118.54
Starting episode:  5
episode  5 total episode reward -439.67, avg episode reward -172.06
Starting episode:  6
episode  6 total episode reward -203.14, avg episode reward -176.50
Starting episode:  7
episode  7 total episode reward -290.72, avg episode reward -190.78
Starting episode:  8
episode  8 total episode reward -210.41, avg episode reward -192.96
Starting episode:  9
episode  9 total episode reward -300.16, avg episode reward -203.68
Starting episode:  10
episode  10 total episode reward -376.36, avg episode reward -219.38
Starting episode:  11

In [None]:
# Save history
# The training loop stores its results in total_episode_reward_history and
# avg_episode_reward_history; the previously referenced score_history /
# avg_score_history names are never defined and raised a NameError. The keys
# inside the .npz archive are kept unchanged.
np.savez(os.path.join(model_path, 'history'),
         score_history=total_episode_reward_history,
         avg_score_history=avg_episode_reward_history)

# save models

References:
https://towardsdatascience.com/actor-critic-with-tensorflow-2-x-part-1-of-2-d1e26a54ce97