In [1]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


Initialize the CartPole environment.

In [2]:
ENV_NAME = 'CartPole-v0'
SEED = 123  # single seed value shared by every RNG seeded in this cell

# Create the environment, then seed NumPy's global RNG and the environment
# with the same value.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Number of discrete actions exposed by the environment's action space.
nb_actions = env.action_space.n

  result = entry_point.load(False)


In [3]:
# Show the environment's action space.
print(env.action_space)

Discrete(2)


Build a neural network model with a single hidden layer.

In [4]:
# Q-network: flatten the (1,) + observation-shaped input, pass it through one
# 16-unit ReLU hidden layer, and emit one linear output per action.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


Configure and compile the agent. Use an epsilon-greedy policy for action selection and a sequential memory to store the results of previous actions.

In [5]:
# Exploration policy: EpsGreedyQPolicy with its default settings.
policy = EpsGreedyQPolicy()

# Memory capped at 50000 entries, using a window of a single observation.
memory = SequentialMemory(limit=50000, window_length=1)

# Assemble the DQN agent from the model, memory and policy defined above.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)

# Compile with the Adam optimizer and report mean absolute error as a metric.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [6]:
# Train the agent for 5000 steps without rendering. Keep the returned History
# object so the training metrics stay available afterwards, instead of letting
# its repr be echoed as the cell's output.
history = dqn.fit(env, nb_steps=5000, visualize=False, verbose=2)


Training for 5000 steps ...




   79/5000: episode: 1, duration: 0.810s, episode steps: 79, steps per second: 98, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.060 [-0.402, 0.722], loss: 0.428581, mean_absolute_error: 0.496816, mean_q: 0.051927
  113/5000: episode: 2, duration: 0.133s, episode steps: 34, steps per second: 256, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.151 [-0.159, 0.753], loss: 0.352513, mean_absolute_error: 0.446764, mean_q: 0.192479
  163/5000: episode: 3, duration: 0.172s, episode steps: 50, steps per second: 290, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.082 [-0.295, 0.778], loss: 0.307210, mean_absolute_error: 0.458905, mean_q: 0.320537
  196/5000: episode: 4, duration: 0.091s, episode steps: 33, steps per second: 363, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean acti

  723/5000: episode: 32, duration: 0.042s, episode steps: 9, steps per second: 214, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.222 [0.000, 1.000], mean observation: 0.125 [-1.415, 2.198], loss: 0.550246, mean_absolute_error: 2.425180, mean_q: 4.629381
  735/5000: episode: 33, duration: 0.048s, episode steps: 12, steps per second: 248, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.108 [-1.154, 1.932], loss: 0.405450, mean_absolute_error: 2.460753, mean_q: 4.707531
  746/5000: episode: 34, duration: 0.034s, episode steps: 11, steps per second: 328, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.130 [-1.125, 1.891], loss: 0.507794, mean_absolute_error: 2.512162, mean_q: 4.787333
  754/5000: episode: 35, duration: 0.025s, episode steps: 8, steps per second: 323, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean act

 1019/5000: episode: 61, duration: 0.053s, episode steps: 9, steps per second: 170, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.222 [0.000, 1.000], mean observation: 0.112 [-1.381, 2.095], loss: 0.851325, mean_absolute_error: 3.599264, mean_q: 6.630264
 1029/5000: episode: 62, duration: 0.077s, episode steps: 10, steps per second: 130, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.115 [-1.364, 2.152], loss: 0.705191, mean_absolute_error: 3.591359, mean_q: 6.794271
 1041/5000: episode: 63, duration: 0.081s, episode steps: 12, steps per second: 147, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.167 [0.000, 1.000], mean observation: 0.108 [-1.523, 2.393], loss: 1.249551, mean_absolute_error: 3.724857, mean_q: 6.841347
 1053/5000: episode: 64, duration: 0.045s, episode steps: 12, steps per second: 266, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean a

 1594/5000: episode: 94, duration: 0.059s, episode steps: 10, steps per second: 170, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.145 [-2.565, 1.546], loss: 0.832654, mean_absolute_error: 5.084817, mean_q: 9.800223
 1607/5000: episode: 95, duration: 0.079s, episode steps: 13, steps per second: 164, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.923 [0.000, 1.000], mean observation: -0.089 [-3.239, 2.182], loss: 1.098931, mean_absolute_error: 5.153320, mean_q: 9.887808
 1616/5000: episode: 96, duration: 0.040s, episode steps: 9, steps per second: 226, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.133 [-2.803, 1.807], loss: 1.213763, mean_absolute_error: 5.239963, mean_q: 9.999411
 1628/5000: episode: 97, duration: 0.070s, episode steps: 12, steps per second: 171, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mea

 2173/5000: episode: 125, duration: 0.130s, episode steps: 20, steps per second: 154, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: -0.087 [-1.121, 0.409], loss: 2.966466, mean_absolute_error: 6.720218, mean_q: 12.614009
 2183/5000: episode: 126, duration: 0.041s, episode steps: 10, steps per second: 245, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.122 [-1.267, 0.811], loss: 2.345402, mean_absolute_error: 6.601795, mean_q: 12.487072
 2194/5000: episode: 127, duration: 0.039s, episode steps: 11, steps per second: 285, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.636 [0.000, 1.000], mean observation: -0.106 [-1.382, 0.823], loss: 1.727130, mean_absolute_error: 6.677570, mean_q: 12.830669
 2212/5000: episode: 128, duration: 0.049s, episode steps: 18, steps per second: 370, episode reward: 18.000, mean reward: 1.000 [1.000, 1.

 3404/5000: episode: 154, duration: 0.205s, episode steps: 61, steps per second: 297, episode reward: 61.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.099 [-0.838, 0.415], loss: 3.463993, mean_absolute_error: 8.686560, mean_q: 16.619293
 3455/5000: episode: 155, duration: 0.144s, episode steps: 51, steps per second: 355, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.087 [-0.639, 0.213], loss: 2.748727, mean_absolute_error: 8.707896, mean_q: 16.757130
 3494/5000: episode: 156, duration: 0.132s, episode steps: 39, steps per second: 295, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.132 [-0.753, 0.190], loss: 3.166207, mean_absolute_error: 8.903388, mean_q: 17.184328
 3593/5000: episode: 157, duration: 0.373s, episode steps: 99, steps per second: 265, episode reward: 99.000, mean reward: 1.000 [1.000, 1.

<keras.callbacks.History at 0x123065d30>

In [None]:
# Evaluate the trained agent for 5 episodes.
# NOTE(review): visualize=True presumably renders the environment while testing
# and therefore needs a display — confirm against the keras-rl documentation.
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 132.000, steps: 132
Episode 2: reward: 51.000, steps: 51
