# Advantage Actor Critic

In [1]:
# For Development and debugging:
# Reload modules without restarting the kernel
#%load_ext autoreload
#%autoreload 2

import tensorflow as tf
import os

# Intended to hide all CUDA devices from TensorFlow so the notebook runs on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print('Physical Devices: {}'.format(physical_devices))
if not physical_devices:
    # Nothing to configure; claiming success here would be misleading.
    print('No GPU visible, running on CPU.')
else:
    try:
        # Let TensorFlow allocate GPU memory on demand on every visible GPU
        # instead of reserving all of it up front.
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        print('GPU memory limitated successfuly!')
    except (RuntimeError, ValueError) as err:
        # set_memory_growth raises RuntimeError when the device was already
        # initialized and ValueError for an invalid device. Report and go on.
        print('Warning! GPU memory could not be limitated!')
        print(err)

# Output directory layout: <path>/Models/For_presentation/Best_model
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path = '/home/hhughes/Documents/TUM_Subjects/S5/Seminar/Seminar_Project'
os.makedirs(os.path.join(path, 'Models'), exist_ok=True)
model_path = os.path.join(path, 'Models', 'For_presentation')
os.makedirs(model_path, exist_ok=True)
best_model_path = os.path.join(model_path, 'Best_model')
os.makedirs(best_model_path, exist_ok=True)

Physical Devices: []
GPU memory limitated successfuly!


In [2]:
import gym
import numpy as np
import time
from libs.Utils import plot_history
from libs.Utils import test_agent

# Load environment
env = gym.make('LunarLander-v2')
# Number of actions; read as a module-level global by the Actor network
# to size its output layer.
n_actions = env.action_space.n
# Last expression of the cell: display the attributes of the env object.
vars(env)

{'env': <gym.envs.box2d.lunar_lander.LunarLander at 0x7fb2802094f0>,
 'action_space': Discrete(4),
 'observation_space': Box(-inf, inf, (8,), float32),
 'reward_range': (-inf, inf),
 'metadata': {'render.modes': ['human', 'rgb_array'],
  'video.frames_per_second': 50},
 '_max_episode_steps': 1000,
 '_elapsed_steps': None}

In [None]:
from IPython.display import Video
# Embed the video file 'Moon_Lander.mp4' (path relative to the notebook)
# in the cell output.
Video('Moon_Lander.mp4')

## LunarLander-v2 Environment description

- Landing pad is always at coordinates (0,0). 
- Coordinates are the first two numbers in state vector.
- Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points. If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main engine is -0.3 points each frame. Solved is 200 points.
- Landing outside landing pad is possible. 
- Fuel is infinite, so an agent can learn to fly and then land on its first attempt. 
- Four discrete actions available: do nothing, fire left orientation engine, fire main engine, fire right orientation engine.

Take a look at the LunarLander environment:

In [3]:
# Run 5 rendered episodes with heuristic=True and no agent argument.
# test_agent is imported from libs.Utils; presumably it drives the lander
# with a scripted controller here and prints each episode's total reward.
test_agent(env, heuristic=True, render=True, n_episodes=5)

Total Episode Reward:  272.21685030411015
Total Episode Reward:  251.08950599307943
Total Episode Reward:  257.95846565069485
Total Episode Reward:  194.3517504190541
Total Episode Reward:  23.156433943354656


In [4]:
import tensorflow as tf
import tensorflow_probability as tfp

class Actor(tf.keras.Model):
    """Policy network: two ReLU hidden layers followed by a softmax head.

    The width of the output layer is taken from the module-level name
    ``n_actions``, which therefore has to exist before instantiation.
    """

    def __init__(self):
        super().__init__()

        # Hidden stack (1024 -> 512 units) and one softmax unit per action.
        self.dense1 = tf.keras.layers.Dense(units=1024, activation='relu')
        self.dense2 = tf.keras.layers.Dense(units=512, activation='relu')
        self.out = tf.keras.layers.Dense(units=n_actions, activation='softmax')

    def call(self, input_data):
        """Return the softmax output of the network for ``input_data``."""
        hidden = self.dense2(self.dense1(input_data))
        action_probs = self.out(hidden)
        return action_probs

class Critic(tf.keras.Model):
    """Value network: two ReLU hidden layers and a single linear output unit."""

    def __init__(self):
        super().__init__()

        # Hidden stack (1024 -> 512 units); the head has no activation.
        self.dense1 = tf.keras.layers.Dense(units=1024, activation='relu')
        self.dense2 = tf.keras.layers.Dense(units=512, activation='relu')
        self.out = tf.keras.layers.Dense(units=1)

    def call(self, input_data):
        """Return the scalar output of the network for ``input_data``."""
        hidden = self.dense2(self.dense1(input_data))
        state_value = self.out(hidden)
        return state_value

In [5]:
class Agent():
    """One-step advantage actor-critic agent.

    Holds an ``Actor`` (policy) and a ``Critic`` (state value) network, each
    with its own Adam optimizer, and updates both after every transition.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        actor_lr: Learning rate of the actor's Adam optimizer.
        critic_lr: Learning rate of the critic's Adam optimizer.
    """

    def __init__(self, gamma=0.99, actor_lr=5e-6, critic_lr=5e-6):
        self.gamma = gamma
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

        self.actor = Actor()
        self.critic = Critic()

    def choose_action(self, state):
        """Sample an action index from the actor's output for one state.

        Args:
            state: A single observation (no batch axis); one is added here.

        Returns:
            The sampled action as a Python ``int``.
        """
        probs = self.actor(np.array([state]))
        # tfp.distributions returns a prob dist
        # same as using np.random.choice([0,1,2,3], p=probs.numpy())
        dist = tfp.distributions.Categorical(probs=probs.numpy(), dtype=tf.float32)
        action = dist.sample().numpy()

        return int(action[0])

    # Note this is actually the performance measure J(theta)
    def actor_loss(self, probs, action, advantage):
        """Return ``-log pi(action) * advantage`` for the given probabilities.

        Args:
            probs: Action probabilities produced by the actor.
            action: Index of the action that was taken.
            advantage: Advantage estimate weighting the log-probability.
        """
        dist = tfp.distributions.Categorical(probs=probs, dtype=tf.float32)
        log_probs = dist.log_prob(action)
        # Since we are maximizing the agent's performance, we need to add a
        # minus sign so that the optimizer's minimization maximizes it.
        loss = -log_probs * advantage

        return loss

    def learn(self, state, action, reward, next_state, done):
        """Perform one actor and one critic update from a single transition.

        Args:
            state: Observation before the action (no batch axis).
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: Observation after the action (no batch axis).
            done: Whether the episode ended with this transition; if so the
                next-state value is taken as 0.

        Returns:
            Tuple ``(agent_loss, critic_loss)`` as computed for this step.
        """
        # tf needs a bidimensional array as input:
        state = np.array([state])
        next_state = np.array([next_state])

        # Set custom losses for the actor and the critic
        with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
            action_probs = self.actor(state, training=True)
            state_value = self.critic(state, training=True)
            if done:
                next_state_value = 0
            else:
                # The bootstrapped value is part of the regression *target*,
                # so it is held constant: the critic gradient must flow only
                # through state_value (semi-gradient TD(0)).
                next_state_value = tf.stop_gradient(
                    self.critic(next_state, training=True))

            # One-step TD error, used as the advantage estimate.
            advantage = reward + self.gamma * next_state_value - state_value

            # Agents Performance measure J(theta)
            agent_loss = self.actor_loss(action_probs, action, advantage)
            # Critic loss (MSE for one example) is basically an
            # approximation of (v - v_hat)^2
            critic_loss = advantage**2

        # Apply learning rule
        actor_grads = actor_tape.gradient(agent_loss,
                                          self.actor.trainable_variables)
        critic_grads = critic_tape.gradient(critic_loss,
                                            self.critic.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grads,
                                                 self.actor.trainable_variables))
        self.critic_optimizer.apply_gradients(zip(critic_grads,
                                                  self.critic.trainable_variables))

        return agent_loss, critic_loss

## Initialize the Agent

In [6]:
# Hyperparameters. These names are passed to Agent below, so editing them
# here actually changes the agent (previously the call used literals and
# silently ignored them). Values match what the call effectively used.
# Discount Factor
gamma = 0.99
# Actor learning rate
actor_lr = 5e-6
#actor_lr = 1e-6
# Critic learning rate
critic_lr = 5e-6
#critic_lr = 5e-5

# Init agent
agent = Agent(gamma=gamma, actor_lr=actor_lr, critic_lr=critic_lr)

Let's see how our agent performs without training:

In [7]:
# Baseline: run 10 rendered episodes with the freshly initialised agent,
# i.e. before any call to agent.learn().
test_agent(env, agent, render=True, n_episodes=10)

Total Episode Reward:  -135.17463301430473
Total Episode Reward:  -296.78184664710193
Total Episode Reward:  -146.20941745252622
Total Episode Reward:  -132.83840163979653
Total Episode Reward:  -209.81610378393535
Total Episode Reward:  -94.2150388613217
Total Episode Reward:  -65.00133809390107
Total Episode Reward:  -122.33731847804714
Total Episode Reward:  -348.6521250885303
Total Episode Reward:  -348.5806769148139


## Main training loop:

```python
# Training loop: the agent learns online, one update per environment step.
total_episode_reward_history = []
avg_episode_reward_history = []
best_avg_episode_reward = -9e6
num_episodes = 10000
total_n_inter = 0

tic = time.time()
for i in range(num_episodes):
    done = False
    state = env.reset()
    total_episode_reward = 0

    #print('Starting episode: ', i)
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        actor_loss, critic_loss = agent.learn(state, action, reward, next_state, done)
        state = next_state
        total_episode_reward += reward
        # Count one interaction per step. (Adding the counter to itself
        # doubled it every step and produced an ever-growing big integer.)
        total_n_inter += 1

    # Running average over the last 64 episodes (fewer at the start).
    total_episode_reward_history.append(total_episode_reward)
    avg_episode_reward = np.mean(total_episode_reward_history[-64:])
    avg_episode_reward_history.append(avg_episode_reward)

    # Save best policy so far
    if best_avg_episode_reward < avg_episode_reward:
        print('Saving best model so far...')
        print('Episode ', i, 'total episode reward %.2f, avg episode reward %.2f' %
              (total_episode_reward, avg_episode_reward))
        best_avg_episode_reward = avg_episode_reward
        agent.actor.save(best_model_path)

    # Refresh the plot and log progress every 100 episodes and at the very end.
    if i % 100 == 0 or i == num_episodes - 1:
        plot_history(total_episode_reward_history, avg_episode_reward_history,
                     os.path.join(model_path, 'plot'))
        print('episode ', i, 'total episode reward %.2f, avg episode reward %.2f' %
              (total_episode_reward, avg_episode_reward))
print('Train time (in mins): ',  ((time.time()-tic) / 60))

# Save last model
last_model_path = os.path.join(model_path, 'Last_model')
os.makedirs(last_model_path, exist_ok=True)
agent.actor.save(last_model_path)

# Save history
np.savez(os.path.join(model_path, 'history'),
         total_episode_reward_history=total_episode_reward_history,
         avg_episode_reward_history=avg_episode_reward_history)




```

Training took about 8 hours and more than 5,000 episodes.

In [8]:
# Evaluate the checkpoint stored in best_model_path.
# A fresh Agent is created and only its actor is replaced by the loaded
# model; the critic stays untrained.
del agent
agent = Agent()
agent.actor = tf.keras.models.load_model(best_model_path)
test_agent(env, agent, n_episodes=5, render=True)

Total Episode Reward:  255.08862152046314
Total Episode Reward:  -86.13646123476502
Total Episode Reward:  261.39396068880734
Total Episode Reward:  65.91627534398958
Total Episode Reward:  250.30124504959284


In [9]:
# Evaluate the intermediate checkpoint stored under 'Intermediant_model'.
# NOTE(review): the training loop shown in this notebook never writes this
# directory - confirm where the checkpoint comes from.
del agent
inter_model_path = os.path.join(model_path, 'Intermediant_model')
agent = Agent()
agent.actor = tf.keras.models.load_model(inter_model_path)
test_agent(env, agent, n_episodes=5, render=True)

Total Episode Reward:  262.97689049859935
Total Episode Reward:  211.47017145061747
Total Episode Reward:  263.0573789450133
Total Episode Reward:  283.58904973223537
Total Episode Reward:  257.7805683820527


## But what happened at the end?

![title](./Models/For_presentation/plot.jpg)

In [10]:
# Test last model
del agent
agent = Agent()
agent.actor = tf.keras.models.load_model(os.path.join(model_path, 'Last_model'))
try:
    test_agent(env, agent, render=True, n_episodes=5)
except AssertionError as err:
    # The failure is what this cell demonstrates, so show it instead of
    # leaving the notebook with an uncaught exception.
    # NOTE(review): presumably the environment rejects action index 4, which
    # is outside Discrete(4); a diverged actor emitting invalid (e.g. NaN)
    # probabilities would explain it - confirm.
    print('AssertionError:', err)



AssertionError: 4 (<class 'int'>) invalid 

References:
https://towardsdatascience.com/actor-critic-with-tensorflow-2-x-part-1-of-2-d1e26a54ce97