In [1]:
import gym
import numpy as np
import keras
from keras.models import Model
from keras.layers import Input, Dense, Lambda
from keras.optimizers import Adam
import keras.backend as K
from IPython.display import clear_output

Using TensorFlow backend.


In [2]:
# Create the CartPole environment and record its dimensions for later cells.
env = gym.make("CartPole-v0")
state_space = env.observation_space.shape  # shape tuple of one observation
action_space = env.action_space.n          # number of discrete actions
print("Action space:", action_space)
print("State space:", state_space)

Action space: 2
State space: (4,)


In [3]:
# Training hyperparameters, kept in one place so they are easy to tune.
num_episodes = 2000      # how many episodes the training loop runs
learning_rate = 1e-3     # step size intended for the optimizer
discount_factor = 0.95   # gamma intended for discounting future rewards

In [4]:
# Policy network: observation -> two 16-unit ReLU layers -> softmax over actions.
input_states = Input(shape=state_space)
x = input_states
for _ in range(2):
    # Both hidden layers are identical, so build them in a loop.
    x = Dense(16, activation="relu")(x)
action_prob = Dense(action_space, activation="softmax")(x)
model = Model(input_states, action_prob)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 34        
Total params: 386
Trainable params: 386
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Policy gradient train function
def build_train_fn(model, lr=None):
    """Build a backend function that applies one policy-gradient update.

    The loss is ``mean(-log(pi(a_t | s_t)) * G_t)`` where ``pi(a_t | s_t)`` is
    the probability the model assigned to the action that was taken and
    ``G_t`` is the (discounted) return fed in by the caller.

    Args:
        model: Keras model whose output is a probability distribution over
            actions (last axis = one entry per action).
        lr: Adam learning rate. When ``None`` the notebook-level
            ``learning_rate`` hyperparameter is used.

    Returns:
        A ``K.function`` taking ``[states, one_hot_actions, returns]`` that
        performs a single Adam step on the model's trainable weights and
        returns no outputs.
    """
    if lr is None:
        # Honour the hyperparameter cell instead of Adam's built-in default.
        lr = learning_rate

    action_prob_placeholder = model.output
    # Size the one-hot placeholder from the model itself rather than a global.
    n_actions = K.int_shape(action_prob_placeholder)[-1]
    action_onehot_placeholder = K.placeholder(shape=(None, n_actions))
    discount_reward_placeholder = K.placeholder(shape=(None,))

    # Multiplying by the one-hot mask and summing selects pi(a_t | s_t).
    chosen_action_prob = K.sum(action_prob_placeholder * action_onehot_placeholder, axis=1)
    # Clip away from zero so log() cannot yield -inf / NaN gradients.
    log_action_prob = K.log(K.clip(chosen_action_prob, K.epsilon(), 1.0))
    loss = K.mean(discount_reward_placeholder * -log_action_prob)

    adam = Adam(lr=lr)
    updates = adam.get_updates(params=model.trainable_weights, loss=loss)
    return K.function(inputs=[model.input, action_onehot_placeholder, discount_reward_placeholder],
                      outputs=[],
                      updates=updates)
train_fn = build_train_fn(model)

In [6]:
def discount_rewards(r, gamma=0.99):
    """Compute the discounted return for every step of one episode.

    ``out[t] = r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``

    Args:
        r: 1D sequence of per-step rewards (list or array, int or float).
        gamma: Discount factor applied per step into the future.

    Returns:
        1D float ``numpy`` array with the same length as ``r``. An empty
        input yields an empty array.
    """
    rewards = np.asarray(r)
    # Integer rewards would make zeros_like() allocate an integer buffer and
    # silently truncate every discounted value, so force a float dtype.
    if not np.issubdtype(rewards.dtype, np.floating):
        rewards = rewards.astype(np.float64)

    discounted_r = np.zeros_like(rewards)
    running_add = 0.0
    # Walk backwards so each step reuses the already-discounted future sum.
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

In [8]:
# Train the policy: play one episode with the current policy, then apply a
# single policy-gradient update using that episode's normalized returns.
rewards_per_episode = []
episode_rewards = float("nan")  # nothing played yet; printed as nan on the first pass
for episode in range(num_episodes):
    clear_output(wait=True)
    print("-- Episode {}/{} --".format(episode+1, num_episodes))
    print("Total Reward:", np.sum(episode_rewards))
    # np.mean([]) emits a RuntimeWarning, so report nan explicitly until
    # at least one episode has finished.
    avg_reward = np.mean(rewards_per_episode) if rewards_per_episode else float("nan")
    print("Avg Episode Reward:", avg_reward)

    state = env.reset()
    episode_rewards = []
    states = []
    actions = []
    done = False
    while not done:
        #env.render()

        # Predict action distribution given state pi(a_t|s_t)
        action_prob = np.squeeze(model.predict_on_batch(np.expand_dims(state, axis=0)))
        # Sample from the policy (not argmax) so the agent keeps exploring.
        action = np.random.choice(np.arange(0, action_space), p=action_prob)
        action_one_hot = np.zeros((action_space,))
        action_one_hot[action] = 1.0

        # Apply the sampled action
        new_state, reward, done, info = env.step(action)

        states.append(state)
        actions.append(action_one_hot)
        episode_rewards.append(reward)
        state = new_state

    # Episode finished: calculate discounted rewards with the configured gamma
    discounted_rewards = discount_rewards(episode_rewards, gamma=discount_factor)

    # Normalize; the epsilon keeps a zero-variance episode from producing
    # NaNs that would otherwise poison the network weights.
    mean = np.mean(discounted_rewards)
    std  = np.std(discounted_rewards)
    discounted_rewards = (discounted_rewards - mean) / (std + np.finfo(np.float32).eps)

    # Train on episode
    train_fn([np.stack(states, axis=0), np.stack(actions, axis=0), discounted_rewards])
    rewards_per_episode.append(np.sum(episode_rewards))

-- Episode 2000/2000 --
Total Reward: 200.0
Avg Episode Reward: 158.2121060530265


In [13]:
# Evaluate the trained policy: render 10 episodes and report the mean reward.
# NOTE: this reuses (and therefore overwrites) `rewards_per_episode`.
rewards_per_episode = []
for episode in range(10):
    state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        env.render()

        # Predict action distribution given state pi(a_t|s_t)
        action_prob = np.squeeze(model.predict_on_batch(np.expand_dims(state, axis=0)))
        # Actions are sampled from the policy, not chosen greedily.
        action = np.random.choice(np.arange(0, action_space), p=action_prob)

        # Apply the sampled action
        state, reward, done, _ = env.step(action)
        episode_reward += reward
    rewards_per_episode.append(episode_reward)
# Release the render window once evaluation is over.
env.close()
print("Avg Episode Reward:", np.mean(rewards_per_episode))

Avg Episode Reward: 193.2
