In [1]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

# Gym environment id; it is also interpolated into the saved-weights filename.
ENV_NAME = 'CartPole-v0'
# One seed shared by NumPy's global RNG and the environment, so the two
# cannot drift apart when the value is changed.
SEED = 123


# Create the environment, then seed NumPy and the environment.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Number of discrete actions and length of the observation vector.
nb_actions = env.action_space.n
# NOTE(review): obs_dim is not referenced anywhere in the visible code.
obs_dim = env.observation_space.shape[0]

# Option 1: simple model (a single softmax layer on the flattened observation).
# Uncomment this and comment out Option 2 to use it instead.
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))

# Option 2: deep network -- a stack of ReLU hidden layers followed by a
# softmax over the actions.
HIDDEN_UNITS = (16, 16, 16)  # width of each hidden Dense layer, in order

model = Sequential()
# The leading 1 in the input shape presumably corresponds to the memory's
# window_length=1 -- TODO confirm.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for units in HIDDEN_UNITS:
    model.add(Dense(units))
    model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line after the table.
model.summary()


# Run lengths, named so that the comments below cannot go stale when the
# values are tuned.
NB_TRAIN_STEPS = 100000
NB_TEST_EPISODES = 20

# Configure the agent. Note that compile() is called without an optimizer or
# metrics here.
memory = EpisodeParameterMemory(limit=1000, window_length=1)

cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# Okay, now it's time to learn something! Rendering is disabled during
# training (visualize=False) because it slows training down quite a lot.
# You can always safely abort the training prematurely using Ctrl + C.
cem.fit(env, nb_steps=NB_TRAIN_STEPS, visualize=False, verbose=2)

# After training is done, save the weights.
cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate the agent for NB_TEST_EPISODES episodes with rendering on.
# The trailing ';' keeps the notebook from echoing the returned History object.
cem.test(env, nb_episodes=NB_TEST_EPISODES, visualize=True);

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 10        
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
Total params: 10
Trainable params: 10
Non-trainable params: 0
_________________________________________________________________
None
Training for 1000000 steps ...
     13/1000000: episode: 1, duration: 0.350s, episode steps: 13, steps per second: 37, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: -0.107 [-1.319, 0.772], mean_best_reward: --
     24/1000000: episode: 2, duration: 0.018s, episode steps: 11, steps per second: 597, episode reward: 11.000, mean reward: 1.0

    672/1000000: episode: 36, duration: 0.023s, episode steps: 20, steps per second: 887, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.350 [0.000, 1.000], mean observation: 0.053 [-1.201, 1.977], mean_best_reward: --
    716/1000000: episode: 37, duration: 0.052s, episode steps: 44, steps per second: 841, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.568 [0.000, 1.000], mean observation: -0.055 [-2.395, 1.395], mean_best_reward: --
    728/1000000: episode: 38, duration: 0.015s, episode steps: 12, steps per second: 786, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.107 [-0.774, 1.524], mean_best_reward: --
    742/1000000: episode: 39, duration: 0.017s, episode steps: 14, steps per second: 816, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.786 [0.000, 1.000], mean observation: -0.106 [-2.619, 1.567], mean_best_reward: --
    795/100000

   1577/1000000: episode: 72, duration: 0.010s, episode steps: 9, steps per second: 858, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.222 [0.000, 1.000], mean observation: 0.154 [-1.148, 1.991], mean_best_reward: --
   1591/1000000: episode: 73, duration: 0.017s, episode steps: 14, steps per second: 829, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.214 [0.000, 1.000], mean observation: 0.086 [-1.613, 2.588], mean_best_reward: --
   1622/1000000: episode: 74, duration: 0.052s, episode steps: 31, steps per second: 598, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.160 [-0.540, 1.168], mean_best_reward: --
   1638/1000000: episode: 75, duration: 0.017s, episode steps: 16, steps per second: 937, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.093 [-0.963, 1.647], mean_best_reward: --
   1651/1000000: e

   2206/1000000: episode: 106, duration: 0.018s, episode steps: 17, steps per second: 927, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.588 [0.000, 1.000], mean observation: -0.106 [-1.396, 0.600], mean_best_reward: --
   2228/1000000: episode: 107, duration: 0.026s, episode steps: 22, steps per second: 861, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.591 [0.000, 1.000], mean observation: -0.064 [-1.633, 0.816], mean_best_reward: --
   2244/1000000: episode: 108, duration: 0.017s, episode steps: 16, steps per second: 914, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.625 [0.000, 1.000], mean observation: -0.050 [-1.873, 1.219], mean_best_reward: --
   2252/1000000: episode: 109, duration: 0.010s, episode steps: 8, steps per second: 820, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.125 [0.000, 1.000], mean observation: 0.148 [-1.144, 2.016], mean_best_reward: --
   2267/100

   2901/1000000: episode: 141, duration: 0.024s, episode steps: 19, steps per second: 796, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: -0.078 [-1.497, 0.957], mean_best_reward: --
   2912/1000000: episode: 142, duration: 0.017s, episode steps: 11, steps per second: 649, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.636 [0.000, 1.000], mean observation: -0.124 [-1.473, 0.779], mean_best_reward: --
   2926/1000000: episode: 143, duration: 0.021s, episode steps: 14, steps per second: 662, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.357 [0.000, 1.000], mean observation: 0.136 [-0.943, 1.819], mean_best_reward: --
   2940/1000000: episode: 144, duration: 0.017s, episode steps: 14, steps per second: 818, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.643 [0.000, 1.000], mean observation: -0.094 [-1.441, 0.953], mean_best_reward: --
   2960/1

   3950/1000000: episode: 174, duration: 0.040s, episode steps: 37, steps per second: 918, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.459 [0.000, 1.000], mean observation: 0.010 [-0.841, 1.407], mean_best_reward: --
   3959/1000000: episode: 175, duration: 0.013s, episode steps: 9, steps per second: 705, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.111 [0.000, 1.000], mean observation: 0.132 [-1.350, 2.300], mean_best_reward: --
   4038/1000000: episode: 176, duration: 0.078s, episode steps: 79, steps per second: 1011, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.506 [0.000, 1.000], mean observation: 0.110 [-0.923, 1.271], mean_best_reward: --
   4066/1000000: episode: 177, duration: 0.030s, episode steps: 28, steps per second: 934, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.393 [0.000, 1.000], mean observation: 0.050 [-1.351, 2.126], mean_best_reward: --
   4088/10000

   5160/1000000: episode: 208, duration: 0.013s, episode steps: 12, steps per second: 904, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.109 [-0.984, 1.543], mean_best_reward: --
   5171/1000000: episode: 209, duration: 0.014s, episode steps: 11, steps per second: 792, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.364 [0.000, 1.000], mean observation: 0.132 [-1.128, 1.791], mean_best_reward: --
   5193/1000000: episode: 210, duration: 0.025s, episode steps: 22, steps per second: 894, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.107 [-0.635, 1.100], mean_best_reward: --
   5210/1000000: episode: 211, duration: 0.019s, episode steps: 17, steps per second: 895, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.088 [-1.361, 0.752], mean_best_reward: --
   5227/100

   6525/1000000: episode: 246, duration: 0.041s, episode steps: 34, steps per second: 835, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.559 [0.000, 1.000], mean observation: -0.048 [-1.664, 0.762], mean_best_reward: --
   6556/1000000: episode: 247, duration: 0.035s, episode steps: 31, steps per second: 875, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.039 [-0.761, 1.241], mean_best_reward: --
   6588/1000000: episode: 248, duration: 0.035s, episode steps: 32, steps per second: 907, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: -0.066 [-1.222, 0.736], mean_best_reward: --
   6641/1000000: episode: 249, duration: 0.055s, episode steps: 53, steps per second: 959, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.001 [-1.003, 1.263], mean_best_reward: --
   6687/1

   7549/1000000: episode: 280, duration: 0.020s, episode steps: 15, steps per second: 747, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.104 [-0.779, 1.502], mean_best_reward: --
   7572/1000000: episode: 281, duration: 0.026s, episode steps: 23, steps per second: 888, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.391 [0.000, 1.000], mean observation: 0.047 [-1.128, 1.858], mean_best_reward: --
   7607/1000000: episode: 282, duration: 0.034s, episode steps: 35, steps per second: 1030, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.079 [-0.939, 0.570], mean_best_reward: --
   7624/1000000: episode: 283, duration: 0.018s, episode steps: 17, steps per second: 919, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: -0.074 [-1.558, 0.932], mean_best_reward: --
   7636/1

   8415/1000000: episode: 315, duration: 0.040s, episode steps: 41, steps per second: 1033, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.145 [-0.499, 1.228], mean_best_reward: --
   8438/1000000: episode: 316, duration: 0.025s, episode steps: 23, steps per second: 907, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.565 [0.000, 1.000], mean observation: -0.034 [-1.658, 1.209], mean_best_reward: --
   8469/1000000: episode: 317, duration: 0.032s, episode steps: 31, steps per second: 957, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.452 [0.000, 1.000], mean observation: -0.178 [-1.037, 0.558], mean_best_reward: --
   8491/1000000: episode: 318, duration: 0.021s, episode steps: 22, steps per second: 1041, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.409 [0.000, 1.000], mean observation: 0.089 [-0.798, 1.691], mean_best_reward: --
   8529/

<keras.callbacks.History at 0x15db1d9d198>