## Advantage Actor-Critic (A2C) on CartPole

Actor-critic is an algorithm that combines both policy gradient (the actor) and value function (the critic).

![A2C](imgs/Advantage_actor_critic.png)
Credit: Sergey Levine

A2C is a more sophisticated version of the actor-critic that use the advantage, n-step return and a policy is run in multiple (synchronous) environments. 
[A3C](https://arxiv.org/pdf/1602.01783.pdf) is an asynchronous A2C with the environments that are run in parallel. 

The Actor and Critic can share the same neural network or have two separate network design. In this example, I used a shared network.
<img src="imgs/nn_ac.png" alt="drawing" width="600"/>
Credit: Sergey Levine

In [None]:
import numpy as np
import gym
from tensorboardX import SummaryWriter

import datetime
from collections import namedtuple
from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.clip_grad import clip_grad_norm_

In [None]:
class A2C_nn(nn.Module):
    '''
    Advantage actor-critic neural net
    '''

    def __init__(self, input_shape, n_actions):
        super(A2C_nn, self).__init__()

        self.lp = nn.Sequential(
            nn.Linear(input_shape[0], 64),
            nn.ReLU())

        self.policy = nn.Linear(64, n_actions)
        self.value = nn.Linear(64, 1)

    def forward(self, x):
        l = self.lp(x.float())
        # return the actor and the critic
        return self.policy(l), self.value(l)

The total loss contains:
- actor loss $\partial\theta_v\leftarrow\partial\theta_v + \dfrac{\partial(R-V_\theta(s))^2}{\partial\theta_v}$
- policy loss $\partial\theta_\pi\leftarrow\partial\theta_\pi + \alpha\triangledown_\theta log\pi_\theta(a|s)(R-V_\theta(s))$
- entropy loss $\beta\sum_i\pi_\theta(s)log\pi_\theta(s)$

In [None]:
def calculate_loss(memories, nn, writer):
    '''
    Calculate the loss of the memories
    '''

    #batch_mem = np.random.choice(len(memories), size=32)

    rewards = torch.tensor(np.array([m.reward for m in memories], dtype=np.float32))
    log_val = nn(torch.tensor(np.array([m.obs for m in memories], dtype=np.float32)))

    act_log_softmax = F.log_softmax(log_val[0], dim=1)[:,np.array([m.action for m in memories])]
    # Calculate the advantage
    adv = (rewards - log_val[1].detach())

    # actor loss (policy gradient)
    pg_loss = - torch.mean(act_log_softmax * adv)
    # critic loss (value loss)
    vl_loss = F.mse_loss(log_val[1].squeeze(-1), rewards)
    # entropy loss
    entropy_loss = ENTROPY_BETA * torch.mean(torch.sum(F.softmax(log_val[0], dim=1) * F.log_softmax(log_val[0], dim=1), dim=1))

    # total loss
    loss = pg_loss + vl_loss - entropy_loss

    # add scalar to the writer
    writer.add_scalar('loss', float(loss), n_iter)
    writer.add_scalar('pg_loss', float(pg_loss), n_iter)
    writer.add_scalar('vl_loss', float(vl_loss), n_iter)
    writer.add_scalar('entropy_loss', float(entropy_loss), n_iter)
    writer.add_scalar('actions', np.mean([m.action for m in memories]), n_iter)
    writer.add_scalar('adv', float(torch.mean(adv)), n_iter)
    writer.add_scalar('act_lgsoft', float(torch.mean(act_log_softmax)), n_iter)

    return loss

In [None]:
class Env:
    '''
    Environment class. Used to deal with multiple environments
    '''

    game_rew = 0
    last_game_rew = 0

    def __init__(self, env_name, n_steps, gamma):
        super(Env, self).__init__()

        # create the new environment
        self.env = gym.make(env_name)
        self.obs = self.env.reset()

        self.n_steps = n_steps
        self.action_n = self.env.action_space.n
        self.observation_n = self.env.observation_space.shape[0]
        self.gamma = gamma

    def step(self, agent):
        '''
        Execute the agent n_steps in the environment
        '''
        memories = []
        for s in range(self.n_steps):

            # get the agent policy
            pol_val = agent(torch.tensor(self.obs))
            s_act = F.softmax(pol_val[0])

            # get an action following the policy distribution
            action = int(np.random.choice(np.arange(self.action_n), p=s_act.detach().numpy(), size=1))

            # Perform a step in the environment
            new_obs, reward, done, _ = self.env.step(action)

            # update the memory
            memories.append(Memory(obs=self.obs, action=action, new_obs=new_obs, reward=reward, done=done))

            self.game_rew += reward
            self.obs = new_obs

            if done:
                # if done reset the env and the variables
                self.done = True
                # if the game is over, run_add take the 0 value
                self.run_add = 0
                self.obs = self.env.reset()

                self.last_game_rew = self.game_rew
                self.game_rew = 0
                break
            else:
                self.done = False

        if not self.done:
            # if the game isn't over, run_add take the value of the last state
            self.run_add = float(agent(torch.tensor(self.obs))[1])

        # compute the discount reward of the memories and return it
        return self.discounted_rewards(memories)


    def discounted_rewards(self, memories):
        '''
        Compute the discounted reward backward
        '''
        upd_memories = []

        for t in reversed(range(len(memories))):
            if memories[t].done: self.run_add = 0
            self.run_add = self.run_add * self.gamma + memories[t].reward

            # Update the memories with the discounted reward
            upd_memories.append(Memory(obs=memories[t].obs, action=memories[t].action, new_obs=memories[t].new_obs, reward=self.run_add, done=memories[t].done))

        return upd_memories[::-1]


In [None]:
Memory = namedtuple('Memory', ['obs', 'action', 'new_obs', 'reward', 'done'], verbose=False, rename=False)

# Hyperparameters
GAMMA = 0.95
LEARNING_RATE = 0.003
ENTROPY_BETA = 0.01
ENV_NAME = 'CartPole-v0'

MAX_ITER = 100000
# Number of the env
N_ENVS = 40

# Max normalized gradient
CLIP_GRAD = 0.1

device = 'cpu'

now = datetime.datetime.now()
date_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)

In [None]:
# create N_ENVS environments
envs = [Env(ENV_NAME, 1, GAMMA) for _ in range(N_ENVS)]

writer = SummaryWriter(log_dir='content/runs/A2C'+ENV_NAME+'_'+date_time)

# initialize the actor-critic NN
agent_nn = A2C_nn(gym.make(ENV_NAME).observation_space.shape, gym.make(ENV_NAME).action_space.n).to(device)

# Adam optimizer
optimizer = optim.Adam(agent_nn.parameters(), lr=LEARNING_RATE, eps=1e-3)

experience = []
n_iter = 0

while n_iter < MAX_ITER:
    n_iter += 1

    # list containing all the memories
    memories = [mem for env in envs for mem in env.step(agent_nn)]

    # calculate the loss
    losses = calculate_loss(memories, agent_nn, writer)

    # optimizer step
    optimizer.zero_grad()
    losses.backward()
    # clip the gradient
    clip_grad_norm_(agent_nn.parameters(), CLIP_GRAD)
    optimizer.step()


    writer.add_scalar('rew', np.mean([env.last_game_rew for env in envs]), n_iter)
    print(n_iter, np.round(float(losses),2), 'rew:', np.round(np.mean([env.last_game_rew for env in envs]),2))

writer.close()

### ATTENTION! the model is not working, look at the graph below. Why this strange behavior? I tried to tune the hyperparameters but the results are the same.
![Reward plot](imgs/reward_plot_a2c.png)

#### Why is the loss decreasing so fast? 
![Reward plot](imgs/loss_plot_a2c.png)

#### In some cases, the model start preferring always the same action..
![Reward plot](imgs/actions_plot_a2c.png)

Some idea:
 - Use two different neural networks and optimizer