# Advantage Actor Critic

In [1]:
# For Development and debugging:
# Reload modul without restarting the kernel
#%load_ext autoreload
#%autoreload 2

import tensorflow as tf

physical_devices = tf.config.experimental.list_physical_devices('GPU')
print('Physical Devices: {}'.format(physical_devices))
try:
    #tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print('GPU memory limitated successfuly!')
except:
    print('Warning! GPU memory could not be limitated!')

Physical Devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU memory limitated successfuly!


In [2]:
#primero poner el ambiente, mostrar el baseline, explicarlo y luego el algoritmo


What is an Environment and an Agent?<br>
Take a look into the lunar Lander Agent

In [2]:
import gym
import numpy as np
import time
from libs.Utils import plot_history
from libs.Utils import test_agent

# Load environment
env = gym.make('LunarLander-v2')
# Number of actions
n_actions = env.action_space.n
vars(env)

{'env': <gym.envs.box2d.lunar_lander.LunarLander at 0x7f30e81f1ac0>,
 'action_space': Discrete(4),
 'observation_space': Box(-inf, inf, (8,), float32),
 'reward_range': (-inf, inf),
 'metadata': {'render.modes': ['human', 'rgb_array'],
  'video.frames_per_second': 50},
 '_max_episode_steps': 1000,
 '_elapsed_steps': None}

Take a look into the LunarLander environment

In [5]:
test_agent(env, heuristic=True, render=True, n_episodes=5)

Total Episode Reward:  265.79505142923756
Total Episode Reward:  273.82849267405766
Total Episode Reward:  298.8136812676555
Total Episode Reward:  288.49102776714193
Total Episode Reward:  277.5043662396249


In [3]:
import tensorflow as tf
import tensorflow_probability as tfp

class Actor(tf.keras.Model):
    def __init__(self):
        super().__init__()
        
        self.dense1 = tf.keras.layers.Dense(1024, activation='relu')
        self.dense2 = tf.keras.layers.Dense(512, activation='relu')
        self.out = tf.keras.layers.Dense(n_actions, activation='softmax')
        
    def call(self, input_data):
        x = self.dense1(input_data)
        x = self.dense2(x)
        value = self.out(x)
        
        return value

class Critic(tf.keras.Model):
    def __init__(self):
        super().__init__()
        
        self.dense1 = tf.keras.layers.Dense(1024, activation='relu')
        self.dense2 = tf.keras.layers.Dense(512, activation='relu')
        self.out = tf.keras.layers.Dense(1)
        
    def call(self, input_data):
        x = self.dense1(input_data)
        x = self.dense2(x)
        value = self.out(x)
        
        return value

In [4]:
class Agent():
    def __init__(self, gamma=0.99, actor_lr=5e-6, critic_lr=5e-6):
        self.gamma = gamma
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)
        
        self.actor = Actor()
        self.critic = Critic()
        
    def choose_action(self, state):
        probs = self.actor(np.array([state]))
        # tfp.distributions returns a prob dist
        # same as using np.random.choice([0,1,2,3], p=probs.numpy())
        dist = tfp.distributions.Categorical(probs=probs.numpy(), dtype=tf.float32)
        action = dist.sample().numpy()
        
        return int(action[0])
    
    # Note this is actually the performance measure J(theta)
    def actor_loss(self, probs, action, advantage):
        dist = tfp.distributions.Categorical(probs=probs, dtype=tf.float32)
        log_probs = dist.log_prob(action)
        # Since we are maximizing the agent's performance,  we need to add a minus -
        # to actually maximize instead of minimize
        loss = -log_probs * advantage
        
        return loss
    
    def learn(self, state, action, reward, next_state, done):
        # tf needs a bidimensional array as input:
        state = np.array([state])
        next_state = np.array([next_state])
        
        # Set costum losses for the actor and the critic
        with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
            action_probs = self.actor(state, training=True)
            state_value = self.critic(state, training=True)
            if done:
                next_state_value = 0
            else:
                next_state_value = self.critic(next_state, training=True)
            
            advantage = reward + self.gamma * next_state_value - state_value
            
            # Agents Performance measure J(theta)
            agent_loss = self.actor_loss(action_probs, action, advantage)
            # Critic loss (MSE for one example) is basically an 
            # approximation of (v - v_hat)^2
            critic_loss = advantage**2
        
        # Apply learning rule
        actor_grads = actor_tape.gradient(agent_loss, 
                                          self.actor.trainable_variables)
        critic_grads = critic_tape.gradient(critic_loss, 
                                            self.critic.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grads, 
                                                 self.actor.trainable_variables))
        self.critic_optimizer.apply_gradients(zip(critic_grads, 
                                                  self.critic.trainable_variables))
        
        return agent_loss, critic_loss

In [5]:
# Discount Factor
gamma = 0.99
# Actor learning rate
#actor_lr = 5e-6
actor_lr = 1e-6
# Critic learning rate
critic_lr = 5e-6
#critic_lr = 5e-5

# Init agent
agent = Agent(gamma=0.99, actor_lr=5e-6, critic_lr=5e-6)

Lets see how our agent performs without training:

In [9]:
test_agent(env, agent, render=True, n_episodes=5)

Total Episode Reward:  -253.90722700399797
Total Episode Reward:  -375.25820864155787
Total Episode Reward:  -90.04640272231845
Total Episode Reward:  -311.3934819786125
Total Episode Reward:  -395.2617817137771


In [6]:
import os
path = '/home/hhughes/Documents/TUM_Subjects/S5/Seminar/Seminar_Project'
os.makedirs(os.path.join(path, 'Models'), exist_ok=True)
model_path = os.path.join(path, 'Models', 'Advantage_Actor_Critic')
os.makedirs(model_path, exist_ok=True)
best_model_path = os.path.join(model_path, 'Best_model')
os.makedirs(best_model_path, exist_ok=True)

In [7]:
# Training loop
total_episode_reward_history = []
avg_episode_reward_history = []
best_avg_episode_reward = -9e6
num_episodes = 5200
total_n_inter = 0

tic = time.time()
for i in range(num_episodes):
    done = False
    state = env.reset()
    total_episode_reward = 0

    #print('Starting episode: ', i)
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        actor_loss, critic_loss = agent.learn(state, action, reward, next_state, done)
        state = next_state
        total_episode_reward += reward
        total_n_inter += total_n_inter + 1
        
    total_episode_reward_history.append(total_episode_reward)
    avg_episode_reward = np.mean(total_episode_reward_history[-64:])
    avg_episode_reward_history.append(avg_episode_reward)
    
    # Save best policy so far
    if best_avg_episode_reward < avg_episode_reward:
        print('Saving best model so far...')
        print('Episode ', i, 'total episode reward %.2f, avg episode reward %.2f' % \
              (total_episode_reward, avg_episode_reward))
        best_avg_episode_reward = avg_episode_reward
        agent.actor.save(best_model_path)
        
    
    if (i % 100 == 0) | (i == (num_episodes-1)):
        plot_history(total_episode_reward_history, avg_episode_reward_history,
                     os.path.join(model_path, 'plot'))
        print('episode ', i, 'total episode reward %.2f, avg episode reward %.2f' % \
          (total_episode_reward, avg_episode_reward))
print('Train time (in mins): ',  ((time.time()-tic) / 60))

# Save history
np.savez(os.path.join(model_path, 'history'), 
         total_episode_reward_history=total_episode_reward_history, 
         avg_episode_reward_history=avg_episode_reward_history)

# Save last model
last_model_path = os.path.join(model_path, 'Last_model')
os.makedirs(last_model_path, exist_ok=True)
agent.actor.save(last_model_path)



Saving best model so far...
Episode  0 total episode reward -65.91, avg episode reward -65.91
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: /home/hhughes/Documents/TUM_Subjects/S5/Seminar/Seminar_Project/Models/Advantage_Actor_Critic/Best_model/assets
episode  0 total episode reward -65.91, avg episode reward -65.91
episode  100 total episode reward -432.43, avg episode reward -180.71
episode  200 total episode reward -76.72, avg episode reward -191.46
episode  300 total episode reward -175.84, avg episode reward -221.58
episode  400 total episode reward -187.59, avg episode reward -189.52
episode  500 total episode reward -15.84, avg episode reward -143.19
episode  600 total episode reward -124.11, avg episode reward -134.27
episode  700 total episode reward -97.08, avg episode reward -119.83
episode  800 total episode reward 40.08, avg episode reward -77.10
Saving best model so far...
Episode  817 total episode rew

In [8]:
# Test last model
#del(agent)
#agent = Agent()
#agent.actor = tf.keras.models.load_model(os.path.join(model_path, 'Last_model'))
test_agent(env, agent, render=True, n_episodes=5)

Total Episode Reward:  -136.3167456977772
Total Episode Reward:  -154.437767639191
Total Episode Reward:  -51.45886287316006
Total Episode Reward:  -102.11646554520831
Total Episode Reward:  -92.1034108990026


In [9]:
# Test best model
del(agent)
agent = Agent()
agent.actor = tf.keras.models.load_model(best_model_path)
test_agent(env, agent, render=True, n_episodes=5)

Total Episode Reward:  233.39374415338392
Total Episode Reward:  13.873871520545634
Total Episode Reward:  2.346754614134184
Total Episode Reward:  148.43230568809872
Total Episode Reward:  296.1235679190527


In [21]:
# Test intermediant model
inter_model_path = os.path.join(model_path, 'Intermediant_model')
del(agent)
agent = Agent()
agent.actor = tf.keras.models.load_model(inter_model_path)
test_agent(env, agent, render=True, n_episodes=5)

Total Episode Reward:  -167.72769755387748
Total Episode Reward:  -30.884272412830853
Total Episode Reward:  47.111053623166754
Total Episode Reward:  201.73836078023513
Total Episode Reward:  -184.62839979410555


In [25]:
# Test model without training
del(agent)
agent = Agent()
test_agent(env, agent, render=True, n_episodes=5)

Total Episode Reward:  -74.6144368701255
Total Episode Reward:  -234.27694124610758
Total Episode Reward:  -186.26175254617286
Total Episode Reward:  -297.392797095341
Total Episode Reward:  -215.51888556515956


References:
https://towardsdatascience.com/actor-critic-with-tensorflow-2-x-part-1-of-2-d1e26a54ce97