https://www.freecodecamp.org/news/an-intro-to-advantage-actor-critic-methods-lets-play-sonic-the-hedgehog-86d6240171d/
https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f
http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/lecture_5_actor_critic_pdf.pdf
https://danieltakeshi.github.io/2017/03/28/going-deeper-into-reinforcement-learning-fundamentals-of-policy-gradients/
https://danieltakeshi.github.io/2018/06/28/a2c-a3c/
https://danieltakeshi.github.io/2017/04/02/notes-on-the-generalized-advantage-estimation-paper/
https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/algo/a2c_acktr.py
https://github.com/higgsfield/RL-Adventure-2

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as distributions

import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import numpy as np
import gym

from utils import SubprocVecEnv

In [2]:
SEED = 1234

np.random.seed(SEED);
torch.manual_seed(SEED);

In [3]:
N_ENVS = 16

def make_env(env_name, seed):
    def _thunk():
        env = gym.make(env_name)
        env.seed(seed)
        return env
    return _thunk

#all environments need different random seed!
envs = [make_env('CartPole-v1', SEED+i) for i in range(N_ENVS)]
envs = SubprocVecEnv(envs)

env = gym.make('CartPole-v1')
env.seed(SEED)

assert isinstance(envs.observation_space, gym.spaces.Box)
assert isinstance(envs.action_space, gym.spaces.Discrete)

In [4]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout = 0.25):
        super().__init__()

        self.fc_1 = nn.Linear(input_dim, hidden_dim)
        self.fc_2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.fc_1(x)
        x = self.dropout(x)
        x = F.relu(x)
        x = self.fc_2(x)
        return x

In [5]:
INPUT_DIM = env.observation_space.shape[0]
HIDDEN_DIM = 128
OUTPUT_DIM = env.action_space.n

actor = MLP(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM)
critic = MLP(INPUT_DIM, HIDDEN_DIM, 1)

In [6]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_normal_(m.weight)
        m.bias.data.fill_(0)
        
actor.apply(init_weights)
critic.apply(init_weights)

MLP(
  (fc_1): Linear(in_features=4, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.25)
)

In [7]:
LEARNING_RATE = 3e-4

actor_optimizer = optim.Adam(actor.parameters(), lr = LEARNING_RATE)
critic_optimizer = optim.Adam(critic.parameters(), lr = LEARNING_RATE)

In [8]:
def train(envs, actor, critic, actor_optimizer, critic_optimizer, n_steps, discount_factor):
    
    log_prob_actions = torch.zeros(n_steps, len(envs))
    entropies = torch.zeros(n_steps, len(envs))
    values = torch.zeros(n_steps, len(envs))
    rewards = torch.zeros(n_steps, len(envs))
    masks = torch.zeros(n_steps, len(envs))
    episode_reward = 0
    
    state = envs.get_state()
    
    for step in range(n_steps):

        state = torch.FloatTensor(state) #[n_envs, observation_space]

        action_preds = actor(state) #[n_envs, action_space]
        value_pred = critic(state).squeeze(-1) #[n_envs]
            
        action_probs = F.softmax(action_preds, dim = -1) #[n_envs, action_space]
                            
        dist = distributions.Categorical(action_probs)

        action = dist.sample() #[n_envs]
                
        log_prob_action = dist.log_prob(action) #[n_envs]
                
        entropy = dist.entropy() #[n_envs]
            
        #action now numpy array across all envs
        state, reward, done, _ = envs.step(action.cpu().numpy())
                
        reward = torch.FloatTensor(reward) #[n_envs]

        mask = torch.FloatTensor(1 - done) #[n_envs]
        
        log_prob_actions[step] = log_prob_action
        entropies[step] = entropy
        values[step] = value_pred
        rewards[step] = reward
        masks[step] = mask
    
    next_value = critic(torch.FloatTensor(state)).squeeze(-1)
    
    returns = calculate_returns(rewards, next_value, masks, discount_factor)
    advantages = calculate_advantages(returns, values)
    
    policy_loss, value_loss = update_policy(advantages, log_prob_actions, returns, values, entropies, actor_optimizer, critic_optimizer)

    return policy_loss, value_loss

In [9]:
def evaluate(env, actor, critic):
    
    rewards = []
    done = False
    episode_reward = 0
    
    state = env.reset()
    
    while not done:
        
        state = torch.FloatTensor(state).unsqueeze(0)
        
        action_preds = actor(state)
        
        action_probs = F.softmax(action_preds, dim = -1)
        
        dist = distributions.Categorical(action_probs)

        action = dist.sample() 
        
        state, reward, done, _ = env.step(action.item())
        
        episode_reward += reward
        
    return episode_reward

In [10]:
def calculate_returns(rewards, next_value, masks, discount_factor, normalize = True):
    
    R = next_value
    returns = torch.zeros_like(rewards)
    
    for i, (r, m) in enumerate(zip(reversed(rewards), reversed(masks))):
        R = r + discount_factor * R * m
        returns[i] = R
   
    if normalize:
        returns = (returns - returns.mean()) / returns.std()

    return returns

In [11]:
def calculate_advantages(returns, values, normalize = False):
    
    advantages = returns - values
    
    if normalize:
        advantages = (advantages - advantages.mean()) / advantages.std()
        
    return advantages

In [12]:
def update_policy(advantages, log_prob_actions, returns, values, entropies, actor_optimizer, critic_optimizer):
        
    advantages = advantages.detach()
    returns = returns.detach()
    
    policy_loss = - (advantages * log_prob_actions).mean() - 0.001 * entropies.mean()
    
    value_loss = F.smooth_l1_loss(returns, values).mean()
        
    actor_optimizer.zero_grad()
    critic_optimizer.zero_grad()
    
    policy_loss.backward()
    value_loss.backward()
    
    actor_optimizer.step()
    critic_optimizer.step()
    
    return policy_loss.item(), value_loss.item()

In [13]:
MAX_STEPS = 100_000
N_UPDATE_STEPS =  5
DISCOUNT_FACTOR = 0.99
N_TRIALS = 25
REWARD_THRESHOLD = 475
PRINT_EVERY = 10

episode_rewards = []

_ = envs.reset()

for step in tqdm(range(1, MAX_STEPS+1, N_UPDATE_STEPS)):
        
    policy_loss, value_loss = train(envs, actor, critic, actor_optimizer, critic_optimizer, N_UPDATE_STEPS, DISCOUNT_FACTOR)
    
    episode_reward = evaluate(env, actor, critic)
    
    episode_rewards.append(episode_reward)
    
    mean_trial_rewards = np.mean(episode_rewards[-N_TRIALS:])
    
    if step % (1+(N_UPDATE_STEPS*PRINT_EVERY)) == 0:
            
        print(f'| Steps: {step:6} | Mean Rewards: {mean_trial_rewards:6.2f} |')
    
    if mean_trial_rewards >= REWARD_THRESHOLD:
        
        print(f'Reached reward threshold in {episode} episodes')
        
        break

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

| Steps:     51 | Mean Rewards:  23.82 |
| Steps:    306 | Mean Rewards:  30.24 |
| Steps:    561 | Mean Rewards:  28.04 |
| Steps:    816 | Mean Rewards:  26.64 |
| Steps:   1071 | Mean Rewards:  28.24 |
| Steps:   1326 | Mean Rewards:  27.72 |
| Steps:   1581 | Mean Rewards:  25.96 |
| Steps:   1836 | Mean Rewards:  30.88 |
| Steps:   2091 | Mean Rewards:  34.64 |
| Steps:   2346 | Mean Rewards:  25.52 |
| Steps:   2601 | Mean Rewards:  28.24 |
| Steps:   2856 | Mean Rewards:  26.84 |
| Steps:   3111 | Mean Rewards:  27.24 |
| Steps:   3366 | Mean Rewards:  33.76 |
| Steps:   3621 | Mean Rewards:  28.56 |
| Steps:   3876 | Mean Rewards:  28.12 |
| Steps:   4131 | Mean Rewards:  24.28 |
| Steps:   4386 | Mean Rewards:  28.28 |
| Steps:   4641 | Mean Rewards:  22.28 |
| Steps:   4896 | Mean Rewards:  25.76 |
| Steps:   5151 | Mean Rewards:  27.00 |
| Steps:   5406 | Mean Rewards:  26.04 |
| Steps:   5661 | Mean Rewards:  23.08 |
| Steps:   5916 | Mean Rewards:  18.52 |
| Steps:   6171 

| Steps:  51051 | Mean Rewards: 121.12 |
| Steps:  51306 | Mean Rewards:  94.96 |
| Steps:  51561 | Mean Rewards:  98.24 |
| Steps:  51816 | Mean Rewards: 103.76 |
| Steps:  52071 | Mean Rewards:  85.56 |
| Steps:  52326 | Mean Rewards:  77.28 |
| Steps:  52581 | Mean Rewards:  89.72 |
| Steps:  52836 | Mean Rewards: 118.24 |
| Steps:  53091 | Mean Rewards: 106.20 |
| Steps:  53346 | Mean Rewards: 131.72 |
| Steps:  53601 | Mean Rewards: 121.96 |
| Steps:  53856 | Mean Rewards: 104.48 |
| Steps:  54111 | Mean Rewards: 149.56 |
| Steps:  54366 | Mean Rewards: 138.04 |
| Steps:  54621 | Mean Rewards: 152.72 |
| Steps:  54876 | Mean Rewards: 167.84 |
| Steps:  55131 | Mean Rewards: 198.20 |
| Steps:  55386 | Mean Rewards: 190.56 |
| Steps:  55641 | Mean Rewards: 168.96 |
| Steps:  55896 | Mean Rewards: 172.44 |
| Steps:  56151 | Mean Rewards: 199.80 |
| Steps:  56406 | Mean Rewards: 186.56 |
| Steps:  56661 | Mean Rewards: 185.68 |
| Steps:  56916 | Mean Rewards: 156.64 |
| Steps:  57171 