# Implementing actor-critic with deep reinforcement learning

## Introduction

Actor-critic combines actor (policy-based) methods with critic (value-based) methods: it learns a policy and a value function at the same time, and uses each one to improve the other.

- The algorithm will be implemented using the same environment and the same neural network as the DQN implementation.
- The main difference is that the network has two output heads, i.e. two different fully connected (FC) layers on top of the shared layers:
    - A single-output FC layer for the value function (the critic)
    - A softmax of dimension num_actions for the policy (the actor), the same size as the DQN output

In [1]:
import gym
from ale_py import ALEInterface
# Instantiate the Arcade Learning Environment interface; the instance is not
# referenced again in the visible part of this notebook.
ale = ALEInterface()

A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


In [2]:
import stable_baselines3.common.atari_wrappers as atari_wrappers

In [8]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from collections import deque

## Installing the environment

In [4]:
# Base Pong environment (the "NoFrameskip" variant; frame skipping is
# configured in the preprocessing wrapper below)
env = gym.make('PongNoFrameskip-v4')

# Atari preprocessing: 84x84 grayscale frames without an extra channel axis,
# a frame skip of 4, up to 30 no-ops, and unscaled pixel values
env = gym.wrappers.AtariPreprocessing(
    env,
    noop_max=30,
    frame_skip=4,
    screen_size=84,
    terminal_on_life_loss=False,
    grayscale_obs=True,
    grayscale_newaxis=False,
    scale_obs=False,
)

# Stack 4 frames per observation (matches the 4 input channels of the network)
env = gym.wrappers.FrameStack(env, 4)

# Reward clipping wrapper from stable-baselines3's atari_wrappers
env = atari_wrappers.ClipRewardEnv(env)

A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


In [5]:
# List the names of the actions available in this environment, queried on the
# unwrapped (innermost) environment
env.unwrapped.get_action_meanings()

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [6]:
# Smoke test: reset the environment and sample one random action to make sure
# the wrapped environment works
s0 = env.reset()
a0 = env.action_space.sample()

# Execute the action; step() is unpacked as a 4-tuple whose third element is
# used as the end-of-episode flag
s1, r1, is_final_state, info = env.step(a0)

# RL utils

In [9]:
def epsilon_greedy(eps, model, env, state):
    """Choose an action with the epsilon-greedy rule.

    With probability ``eps`` a uniformly random action index in
    ``[0, env.action_space.n)`` is returned (exploration). Otherwise the
    index of the largest value returned by ``model.predict(state)`` is
    returned (exploitation).
    """
    explore = np.random.random() < eps
    if explore:
        return np.random.randint(0, env.action_space.n)

    # greedy choice over the model's predicted values
    return np.argmax(model.predict(state))
    


def memory_initialization(env, MAX_MEM, INIT_MEM):
    """Pre-fill an experience-replay memory with random-policy transitions.

    Whole episodes are played with randomly sampled actions until the memory
    holds at least ``INIT_MEM`` transitions. An episode that has been started
    is always played to its end, so the result may hold more than
    ``INIT_MEM`` entries (never more than ``MAX_MEM``).

    Args:
        env: environment exposing ``reset()``, ``action_space.sample()`` and
            ``step(action)`` returning the 4-tuple
            ``(new_state, reward, is_final_state, info)``.
        MAX_MEM: capacity of the memory (the ``maxlen`` of the deque);
            ``None`` means unbounded.
        INIT_MEM: minimum number of transitions to collect.

    Returns:
        A ``deque`` of ``[state, action, reward, new_state, is_final_state]``
        lists, where both states carry an extra leading axis of size 1.

    Raises:
        ValueError: if ``INIT_MEM`` exceeds ``MAX_MEM``.
    """
    # A bounded deque can never hold more than MAX_MEM items, so the filling
    # loop below would never terminate in that case.
    if MAX_MEM is not None and INIT_MEM > MAX_MEM:
        raise ValueError(
            f"INIT_MEM ({INIT_MEM}) cannot exceed MAX_MEM ({MAX_MEM})"
        )

    memory = deque(maxlen=MAX_MEM)

    while len(memory) < INIT_MEM:
        state = env.reset()
        is_final_state = False

        while not is_final_state:
            # generate a random action
            action = env.action_space.sample()

            # execute the action
            new_state, reward, is_final_state, info = env.step(action)

            # add the transition (5 elements) to memory, states batched
            memory.append([
                np.expand_dims(state, axis=0),
                action,
                reward,
                np.expand_dims(new_state, axis=0),
                is_final_state,
            ])

            # update state
            state = new_state

    return memory

## Implementing the neural network

In [7]:
class ConvNet(nn.Module):
    """Convolutional actor-critic network with a shared trunk and two heads.

    The trunk is three convolution + ReLU stages followed by a 512-unit
    fully connected layer. The first convolution takes 4 input channels and
    the fully connected layer expects ``64 * 7 * 7`` features, which is what
    the convolutions produce for 84x84 inputs.

    Args:
        n_actions: number of outputs of the actor head.
    """

    def __init__(self, n_actions):
        super().__init__()
        # Store the action count; the actor head below is sized from it.
        self.n_actions = n_actions

        self.conv1 = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU()
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )
        self.hidden = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512, bias=True),
            nn.ReLU()
        )

        # Actor head: one output per action (no softmax is applied here)
        self.out_actor = nn.Sequential(
            nn.Linear(512, self.n_actions, bias=True)
        )

        # Critic head: a single output
        self.out_critic = nn.Sequential(
            nn.Linear(512, 1, bias=True)
        )

    def forward(self, x):
        """Run the shared trunk, then both heads.

        Args:
            x: float tensor of shape ``(batch, 4, 84, 84)``.

        Returns:
            Tuple ``(x_actor, x_critic)`` of shapes ``(batch, n_actions)``
            and ``(batch, 1)``.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        # flatten everything but the batch axis for the linear layers
        x = x.view(x.size(0), -1)
        x = self.hidden(x)
        x_actor = self.out_actor(x)
        x_critic = self.out_critic(x)

        return x_actor, x_critic

# Implementing actor critic algorithm

In [10]:
from functools import reduce

In [None]:

# Actor-critic rollout collection (work in progress): gathers short rollouts
# and their discounted return. The network update itself is not implemented.
#
# NOTE: `model` is not created in this cell. It must already exist and expose
# `predict(state)`, because that is what `epsilon_greedy` calls.

# Number of rollouts and maximum number of environment steps per rollout
max_episodes = 100
horizon = 20

# agent_mem holds the transitions of the current rollout;
# train_mem accumulates one training sample per rollout
agent_mem = []
train_mem = []
Tmax = 1000  # not used yet

eps = 1e-2  # exploration probability passed to epsilon_greedy

gamma = 0.5  # discount factor

frames = 0  # total number of environment steps taken so far

for episode in range(max_episodes):
    # Reset once per rollout (not once per step) and start from an empty
    # agent memory so the return below only covers this rollout.
    state = env.reset()
    agent_mem.clear()

    for t in range(horizon):
        env.render()

        frames += 1

        # epsilon greedy choose action
        action = epsilon_greedy(eps, model, env, np.expand_dims(state, axis=0))

        # execute action
        new_state, reward, done, info = env.step(action)
        agent_mem.append((state, reward, action, new_state))

        # stop the rollout when the episode ends, otherwise move on
        if done:
            break
        state = new_state

    # Batch the rollout into tensors
    # think of adding .to(device) later for gpu use
    agents_states = torch.as_tensor(
        np.array([np.asarray(tr[0]) for tr in agent_mem]), dtype=torch.float32
    )
    agents_rewards = torch.as_tensor(
        [tr[1] for tr in agent_mem], dtype=torch.float32
    )
    agents_actions = torch.as_tensor(
        [tr[2] for tr in agent_mem], dtype=torch.int64
    ).view(-1, 1)
    agents_new_states = torch.as_tensor(
        np.array([np.asarray(tr[3]) for tr in agent_mem]), dtype=torch.float32
    )

    # Discounted return rho = sum_k gamma**k * r_k, folded from the last
    # reward backwards so every reward gets the right power of gamma
    rho = reduce(
        lambda acc, r: r + gamma * acc,
        reversed([tr[1] for tr in agent_mem]),
        0.0,
    )

    # NOTE(review): the original appended nothing here; storing the batched
    # rollout with its return is an assumption - confirm the intended layout
    train_mem.append(
        (agents_states, agents_actions, agents_rewards, agents_new_states, rho)
    )
