In [None]:
from keras.models import Sequential
from keras.layers import Dense, Input, Flatten, add, subtract, multiply
from keras.optimizers import Adam, RMSprop
from keras.models import Model

import tensorflow.keras.backend as K
import tensorflow as tf

from random import sample
from collections import deque
from math import log


def a2c_loss(B):
    """
    Builds a Keras-compatible loss callable that closes over the bandit B.

    Keras only passes (y_true, y_pred) to a loss, so the bandit is captured
    by the closure instead. The returned callable ignores y_true, queries the
    shape of y_pred (the result is discarded) and returns B.loss wrapped in a
    backend variable.
    """

    def loss_function(y_true, y_pred):
        # The shape lookup is kept for parity with the original call sequence;
        # its return value is intentionally unused.
        _ = K.get_variable_shape(y_pred)
        wrapped_loss = K.variable(B.loss)
        return wrapped_loss

    return loss_function

def a2c_actor_loss(B):
    """
    Builds a Keras-compatible actor loss callable that closes over the bandit B.

    The returned callable ignores both y_true and y_pred. When B.advantage is
    empty it returns a (1, 1) float32 tensor of ones; otherwise it recomputes
    B.advantage as B.Q - B.values, stores mean(-logP * advantage) in
    B.actor_loss and returns that value as a tensor.
    """

    def loss_function(y_true, y_pred):
        # len() is used (not truthiness) because B.advantage becomes an
        # ndarray after the first non-empty call.
        if len(B.advantage) != 0:
            B.advantage = B.Q - np.asarray(B.values)
            stacked_log_probs = np.stack(B.logP)
            B.actor_loss = np.mean(-stacked_log_probs * B.advantage)
            return tf.convert_to_tensor(B.actor_loss)

        placeholder_loss = np.ones((1, 1), dtype=np.float32)
        return tf.convert_to_tensor(placeholder_loss)

    return loss_function

def a2c_critic_loss(B):
    """
    Builds a Keras-compatible critic loss callable that closes over the bandit B.

    The returned callable ignores y_true and y_pred and returns B.critic_loss
    wrapped in a backend variable.
    """

    def loss_function(y_true, y_pred):
        wrapped_loss = K.variable(B.critic_loss)
        return wrapped_loss

    return loss_function

def init_a2c(B, kwargs):
    """
    Initializes an agent that implements the Advantage Actor-Critic (A2C) method

    Reads 'N_arms', 'discount', 'maxlen' and 'lr' from kwargs, resets the
    module-level `env` to obtain the first context, builds an actor and a
    critic network on a shared trunk, and attaches two backend training
    functions to B:

      B.actor_opt([states, one_hot_actions, advantages])
      B.critic_opt([states, discounted_returns])

    Returns None; all results are stored as attributes on B.
    """
    B.N_arms = kwargs['N_arms']
    B.discount = kwargs['discount']
    B.maxlen = kwargs['maxlen']

    # NOTE: this relies on the module-level `env`, not on an attribute of B.
    first_observation = env.reset()
    B.context = first_observation.reshape(1, first_observation.shape[0])
    B.done = False

    # Per-rollout buffers, filled by the policy and update functions.
    B.Q = []
    B.logP = []
    B.values = []
    B.actions = []
    B.policies = []
    B.advantage = []
    B.history = []

    # Shared trunk feeding both heads.
    inputs = Input([env.observation_space.shape[0]])
    shared = Dense(64, activation='relu')(inputs)
    shared = Dense(128, activation='relu')(shared)
    shared = Model(inputs, shared)

    # Actor head: probability distribution over the arms.
    actor_hidden = Dense(128, activation='relu')(shared.output)
    actor_out = Dense(kwargs['N_arms'], activation='softmax')(actor_hidden)
    B.actor = Model(shared.input, actor_out)

    # Critic head: scalar state value. A softmax over a single unit is
    # constantly 1.0, so the output must be linear for the critic to be able
    # to represent (and learn) arbitrary values.
    critic_hidden = Dense(128, activation='relu')(shared.output)
    critic_out = Dense(1, activation='linear')(critic_hidden)
    B.critic = Model(shared.input, critic_out)

    B.optimizer = RMSprop(lr=kwargs['lr'])

    # Placeholders fed at training time.
    B.action_pl = K.placeholder(shape=(None, B.N_arms))
    B.advantages_pl = K.placeholder(shape=(None,))
    B.discounted_r = K.placeholder(shape=(None,))

    # Actor loss: policy-gradient term plus a small entropy bonus.
    B.weighted_actions = K.sum(B.action_pl * B.actor.output, axis=1)
    B.eligibility = K.log(B.weighted_actions + 1e-10) * K.stop_gradient(B.advantages_pl)
    # sum(p * log p) is the *negative* entropy, so adding it to the loss
    # rewards higher-entropy policies.
    negative_entropy = K.sum(B.actor.output * K.log(B.actor.output + 1e-10), axis=1)
    B.loss = 0.001 * negative_entropy - K.sum(B.eligibility)

    updates = B.optimizer.get_updates(B.actor.trainable_weights, [], B.loss)
    B.actor_opt = K.function([B.actor.input, B.action_pl, B.advantages_pl], B.actor.output, updates=updates)

    # Critic loss: mean squared error against the discounted returns. The
    # critic output has shape (batch, 1) while the target is (batch,);
    # dropping the trailing axis avoids broadcasting to a (batch, batch)
    # matrix of differences.
    critic_values = K.squeeze(B.critic.output, axis=-1)
    B.critic_loss = K.mean(K.square(B.discounted_r - critic_values))
    updates = B.optimizer.get_updates(B.critic.trainable_weights, [], B.critic_loss)
    B.critic_opt = K.function([B.critic.input, B.discounted_r], B.critic.output, updates=updates)

    # No explicit tf.Session()/initializer is created here: a standalone
    # session is not the one the backend functions run in, and it was never
    # closed. The Keras backend initializes variables in its own session.
    # NOTE(review): the models come from `keras` while K is
    # `tensorflow.keras.backend` -- confirm both resolve to the same backend
    # session in the installed versions.

    B.entropy = 0
    return


def a2c(B):
    """
    Samples an arm for the current context B.context.

    The critic's value estimate and the actor's probability vector for the
    context are appended to B.values and B.policies respectively, and the
    returned arm index is drawn from range(B.N_arms) with those probabilities.
    """
    state_value = B.critic.predict(B.context)[0][0]
    arm_probabilities = B.actor.predict(B.context)[0]

    # Record what the networks said for this step so the update can use it.
    B.values.append(state_value)
    B.policies.append(arm_probabilities)

    chosen_arm = np.random.choice(B.N_arms, p=arm_probabilities)
    return chosen_arm


def update_a2c(B, arm, reward, observation):
    """
    Updates the agent based on the most recent action and reward using the Advantage Actor-Critic (A2C) method

    The transition (B.context, arm, reward, observation) is appended to
    B.history. When the episode has ended (B.done) or the rollout has reached
    B.maxlen steps, the discounted returns of the rollout are computed, the
    actor and critic training functions are run once on the whole rollout,
    and the rollout buffers are cleared.

    Parameters:
        B           -- the bandit/agent; must provide history, logP, values,
                       actions, policies, done, maxlen, discount, N_arms,
                       actor_opt and critic_opt
        arm         -- integer index of the arm that was played
        reward      -- reward received for playing `arm`
        observation -- 1-D array with the state observed after the action

    Returns None. B.context is set to the new observation (as a 1 x d array).
    """
    observation = observation.reshape(1, observation.shape[0])
    B.history.append((B.context, arm, reward, observation))

    # Negative log-probability of the chosen arm under the sampling policy.
    B.logP.append(-log(B.policies[-1][arm] + 1e-10))

    if B.done or len(B.history) == B.maxlen:
        n_steps = len(B.history)
        states, arms, step_rewards, _ = zip(*B.history)

        # Discounted return for every step, accumulated from the end.
        # NOTE(review): a rollout cut off by maxlen is treated like a
        # terminal one (no bootstrap from the critic).
        returns = np.empty(n_steps, dtype=np.float32)
        running = 0.0
        for t in range(n_steps - 1, -1, -1):
            running = step_rewards[t] + B.discount * running
            returns[t] = running
        B.R = returns

        # Advantage = observed return minus the critic's estimate recorded
        # when each action was chosen.
        advantages = (returns - np.asarray(B.values)).astype(np.float32)

        # One row per step; reshape (not squeeze) keeps the batch axis even
        # for a single-step rollout.
        state_batch = np.asarray(states, dtype=np.float32).reshape(n_steps, -1)

        # The actor function expects one-hot actions of shape (N, N_arms).
        one_hot_arms = np.zeros((n_steps, B.N_arms), dtype=np.float32)
        one_hot_arms[np.arange(n_steps), np.asarray(arms, dtype=np.int64)] = 1.0

        # The training functions are fed plain arrays through the
        # placeholders they were built with; the symbolic graph is not
        # rebuilt per update.
        B.actor_opt([state_batch, one_hot_arms, advantages])
        B.critic_opt([state_batch, returns])

        B.history = []
        B.logP = []
        B.values = []
        B.actions = []
        B.policies = []

    B.context = observation
    return



The following cell uses the functions above to create an A2C bandit and trains it on the `CartPole-v0` environment.

In [None]:
import gym


# Build the environment before the bandit: init_a2c reads the module-level `env`.
env = gym.make("CartPole-v0")
env.reset()

kwargs = { "N_arms": env.action_space.n, "discount": 0.99, "lr": 0.001, "layer_size": [256], "maxlen": 20000}

# Creates a multi-armed bandit using the defined A2C functions and parameters
B = Bandit(env, a2c, update_a2c, init_a2c, kwargs )



# Initialize variables to track iterations, rewards and weights
counter = 0
best_return = -200
rewards = []
all_weights = []


# Sets the number of iterations and interval to print average reward (never prints if set to None)
N_iterations = 10000
interval = 100

# Toggles whether the animation of the environment is rendered during training
render = False

for i in range(N_iterations):
    done = False
    counter += 1
    if interval is not None and counter%interval == 0:
        # Average over the episodes actually recorded: on the first report
        # only interval-1 episodes have finished.
        recent = rewards[-interval:]
        print( counter, "\t", sum(recent)/float(max(len(recent), 1)) )
    total_return = 0

    # Start the episode from the freshly reset state; otherwise the agent's
    # first action would be chosen from the previous episode's final
    # observation.
    start_observation = env.reset()
    B.context = start_observation.reshape(1, start_observation.shape[0])

    j = 0  # steps taken in the current episode
    while not done:
        if render:
            env.render()
        arm = B.policy(B)
        observation, reward, done, info = env.step(arm)
        # Clip the reward to its sign (-1, 0 or 1).
        reward = 1 if reward > 0 else -1 if reward < 0 else 0
        # The update reads B.done to decide whether to train on the rollout.
        B.done = done
        j += 1
        B.update(B, arm, reward, observation )
        total_return += reward

    # The loop above only exits once the episode is done.
    rewards.append( total_return )
    if total_return > best_return:
        best_return = total_return
env.close()

print( "Finished after ", counter, " episodes with a top score of", best_return )