# Simple Reinforcement Learning with Keras: Part 2 - Policy-based Agents

This notebook is a Keras rewrite of the code from [Simple Reinforcement Learning with Tensorflow: Part 2 - Policy-based Agents](https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724#.zh7rnjs25).

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import gym
# Create the CartPole-v0 environment instance that the cells below share via `env`.
env = gym.make("CartPole-v0")

In [3]:
# Baseline: play `num_games` episodes picking actions uniformly at random
# and print the total reward collected in each one.
num_games = 10
reward_sum = 0
env.reset()
for num_game in range(1, num_games + 1):
    done = False
    while not done:
        # env.render()
        observation, reward, done, _ = env.step(env.action_space.sample())
        reward_sum += reward
    print("Reward for this episode was: {}".format(reward_sum))
    # Start the next episode from a fresh state and a zeroed score
    reward_sum = 0
    env.reset()

Reward for this episode was: 15.0
Reward for this episode was: 13.0
Reward for this episode was: 19.0
Reward for this episode was: 14.0
Reward for this episode was: 13.0
Reward for this episode was: 22.0
Reward for this episode was: 21.0
Reward for this episode was: 15.0
Reward for this episode was: 17.0
Reward for this episode was: 36.0


## Setting up a Neural Network agent
We will use a policy neural network that takes an observation, passes it through a single hidden layer, and outputs the probability of choosing each action (move left or move right).

In [4]:
import keras.layers as layers
from keras.models import Model
from keras.optimizers import Adam
import keras.backend as K
from keras.initializers import glorot_uniform

def get_policy_model(env, hidden_layer_neurons, lr):
    """Build the policy network and return a (training, prediction) model pair.

    Both models share the same layers: a ReLU hidden layer followed by a
    bias-free softmax layer with one unit per discrete action. The training
    model takes an additional "advantages" input that weights the loss of each
    sample; the prediction model takes observations only.

    Args:
        env: Gym environment. Its ``observation_space.shape`` sizes the input
            layer and its ``action_space.n`` sizes the output layer.
        hidden_layer_neurons: Number of units in the hidden layer.
        lr: Learning rate handed to the Adam optimizer.

    Returns:
        Tuple ``(model_train, model_predict)``. ``model_train`` is compiled
        with the advantage-weighted loss and expects
        ``[observations, advantages]`` as inputs; ``model_predict`` maps
        observations to the softmax output and is not compiled.
    """
    # Read the shape from the observation space rather than calling
    # env.reset(): building the network should not reset the environment, and
    # this does not depend on what reset() happens to return.
    dimen = env.observation_space.shape
    num_actions = env.action_space.n

    inp = layers.Input(shape=dimen, name="input_x")
    # One scalar weight per sample, fed in at training time only.
    adv = layers.Input(shape=[1], name="advantages")
    x = layers.Dense(
        hidden_layer_neurons,
        activation="relu",
        name="dense_1",
        kernel_initializer=glorot_uniform(seed=42),
    )(inp)
    out = layers.Dense(
        num_actions,
        activation="softmax",
        name="out",
        kernel_initializer=glorot_uniform(seed=42),
        use_bias=False,
    )(x)

    def custom_loss(y_true, y_pred):
        """Negative mean of advantage-weighted log-likelihood terms."""
        # For y_true in {0, 1} the expression below selects:
        #   y_true == 1 -> 1 - y_pred
        #   y_true == 0 -> y_pred
        lik = y_true * (y_true - y_pred) + (1 - y_true) * (y_true + y_pred)
        # The selected value can be exactly 0 (e.g. y_true == 1, y_pred == 1),
        # and log(0) = -inf would poison the loss; clip it to stay finite.
        log_lik = K.log(K.clip(lik, K.epsilon(), 1.0))
        # adv is (batch, 1) and broadcasts across the action columns.
        return -1 * K.mean(log_lik * adv, keepdims=True)

    model_train = Model(inputs=[inp, adv], outputs=out)
    model_train.compile(loss=custom_loss, optimizer=Adam(lr))
    model_predict = Model(inputs=[inp], outputs=out)
    return model_train, model_predict

Using TensorFlow backend.


In [5]:
def discount_rewards(r, gamma=0.99):
    """Compute the discounted return for every step of one episode.

    The value at step t is the reward at t plus the discounted sum of all
    later rewards::

        out[t] = r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...

    e.g. f([1, 1, 1], 0.99) -> [2.9701, 1.99, 1.0]

    Args:
        r: Rewards in time order, either 1-D ``(T,)`` or a column ``(T, 1)``.
        gamma: Discount factor applied per step.

    Returns:
        Float numpy array with the same shape as ``r``.
    """
    rewards = np.asarray(r, dtype=float)
    discounted = np.zeros_like(rewards)
    running = 0.0
    # Accumulate from the final step backwards so each entry only contains
    # rewards that come at or after it. (Accumulating forwards and reversing
    # the result is only correct when every reward is identical.)
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted

In [6]:
# Hyperparameters and run settings

# Network / optimizer
hidden_layer_neurons = 10    # units in the hidden layer
lr = 0.01                    # learning rate
gamma = 0.99                 # reward discount factor

# Environment
dimen = len(env.reset())     # number of entries in one observation
render = False

# Training schedule
num_episodes = 10000         # upper bound on training episodes
batch_size = 50              # episodes collected per gradient update
print_every = 100            # episodes between progress reports
goal = 190                   # average reward at which training stops

In [7]:
# Build the training/prediction model pair and show the network layout
model_train, model_predict = get_policy_model(
    env=env,
    hidden_layer_neurons=hidden_layer_neurons,
    lr=lr,
)
model_predict.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_x (InputLayer)         (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                50        
_________________________________________________________________
out (Dense)                  (None, 2)                 20        
Total params: 70
Trainable params: 70
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train the policy: play episodes by sampling actions from model_predict, and
# every `batch_size` episodes fit model_train once on all collected steps,
# weighting each step by its standardized discounted reward.

# Reward accumulated since the last progress report
reward_sum = 0

num_actions = env.action_space.n

# Placeholders for our observations, outputs and rewards
# (empty buffers with the right column count so np.vstack can append rows)
states = np.empty(0).reshape(0,dimen)
actions = np.empty(0).reshape(0,1)
rewards = np.empty(0).reshape(0,1)
discounted_rewards = np.empty(0).reshape(0,1)

# Setting up our environment
observation = env.reset()

num_episode = 0

# One entry per training batch (not per episode)
losses = []

while num_episode < num_episodes:
    # Append the observations to our batch
    state = np.reshape(observation, [1, dimen])
    
    # Sample an action index from the network's output distribution
    predict = model_predict.predict([state])[0]
    action = np.random.choice(range(num_actions),p=predict)
    # The sampled index is inverted before being executed and stored:
    # 0 becomes 1 and anything else becomes 0.
    # NOTE(review): presumably paired with the loss in get_policy_model, which
    # uses 1 - y_pred for the entry marked in y_true -- confirm before changing.
    action = 1 if action == 0 else 0
    
    # Append the observations and outputs for learning
    # NOTE(review): np.vstack builds a new array on every step, so buffer
    # growth is quadratic in the number of steps per batch.
    states = np.vstack([states, state])
    actions = np.vstack([actions, action])
    
    # Determine the outcome of our action
    observation, reward, done, _ = env.step(action)
    reward_sum += reward
    rewards = np.vstack([rewards, reward])
    
    if done:
        # Determine standardized rewards
        discounted_rewards_episode = discount_rewards(rewards, gamma)       
        discounted_rewards_episode -= discounted_rewards_episode.mean()
        # NOTE(review): std() is 0 for a single-step episode, which would turn
        # this division into 0/0 (NaN).
        discounted_rewards_episode /= discounted_rewards_episode.std()
        
        discounted_rewards = np.vstack([discounted_rewards, discounted_rewards_episode])
        
        # Per-episode reward buffer starts over for the next episode
        rewards = np.empty(0).reshape(0,1)

        # Every `batch_size` episodes: one gradient step on everything collected
        if (num_episode + 1) % batch_size == 0:
            # Re-center across the whole batch, then flatten (N, 1) -> (N,)
            discounted_rewards -= discounted_rewards.mean()
            discounted_rewards = discounted_rewards.squeeze()
            actions = actions.squeeze().astype(int)
            
           
            # One-hot encode the executed actions as the training target
            actions_train = np.zeros([len(actions), num_actions])
            actions_train[np.arange(len(actions)), actions] = 1
            
            # discounted_rewards feeds the model's second ("advantages") input
            loss = model_train.train_on_batch([states, discounted_rewards], actions_train)
            losses.append(loss)

            # Clear out game variables
            states = np.empty(0).reshape(0,dimen)
            actions = np.empty(0).reshape(0,1)
            discounted_rewards = np.empty(0).reshape(0,1)


        # Print periodically
        if (num_episode + 1) % print_every == 0:
            # Print status
            # The episode number shown is the zero-based index. The loss shown
            # is the mean of up to the last `print_every` entries of `losses`,
            # i.e. of batch losses, which spans more than `print_every` episodes.
            print("Average reward for episode {}: {:0.2f} Loss: {:0.6f} ".format(
                num_episode, reward_sum/print_every, np.mean(losses[-print_every:])))
            
            # Stop once the average reward over the reporting window hits the goal
            if reward_sum / print_every >= goal:
                print("Solved in {} episodes!".format(num_episode))
                break
            reward_sum = 0
        
        num_episode += 1
        observation = env.reset()
            

Average reward for episode 99: 28.35 Loss: -0.007301 
Average reward for episode 199: 31.17 Loss: -0.006219 
Average reward for episode 299: 33.93 Loss: -0.006680 
Average reward for episode 399: 34.06 Loss: -0.006691 
Average reward for episode 499: 41.42 Loss: -0.007206 
Average reward for episode 599: 38.15 Loss: -0.008004 
Average reward for episode 699: 37.66 Loss: -0.008707 
Average reward for episode 799: 50.07 Loss: -0.009151 
Average reward for episode 899: 50.87 Loss: -0.009290 
Average reward for episode 999: 59.21 Loss: -0.009579 
Average reward for episode 1099: 63.03 Loss: -0.009431 
Average reward for episode 1199: 78.96 Loss: -0.009816 
Average reward for episode 1299: 77.03 Loss: -0.009845 
Average reward for episode 1399: 87.46 Loss: -0.010118 
Average reward for episode 1499: 102.37 Loss: -0.010320 
Average reward for episode 1599: 111.84 Loss: -0.010568 
Average reward for episode 1699: 116.17 Loss: -0.010714 
Average reward for episode 1799: 132.99 Loss: -0.010862 

In [10]:
# See our trained bot in action: play a single episode, always taking the
# highest-probability output (inverted the same way as in the training loop),
# printing each executed action followed by the episode's total reward.

observation = env.reset()
reward_sum = 0

try:
    done = False
    while not done:
        # env.render()

        state = np.reshape(observation, [1, dimen])
        predict = model_predict.predict([state])[0]
        # Greedy pick, then the same 0 <-> 1 inversion applied during training
        action = np.argmax(predict)
        action = 1 if action == 0 else 0
        print(action,end=" ")
        observation, reward, done, _ = env.step(action)
        reward_sum += reward
    print("Total score: {}".format(reward_sum))
finally:
    # Close the environment even if prediction or stepping raises
    env.close()

1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 0 1 0 0 1 1 0 1 0 0 1 0 1 0 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 1 0 1 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 1 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 1 0 0 1 0 1 Total score: 200.0
