In [1]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Gym environment id; reused below to create the environment and to name the
# saved weights file.
ENV_NAME = "CartPole-v0"

In [3]:
# One seed value shared by the NumPy global RNG and the environment's own RNG.
SEED = 123

# Create the environment first, then seed both random sources.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Size of the discrete action space; later cells use it for the network's
# output layer and for configuring the agent.
nb_actions = env.action_space.n

[33mWARN: gym.spaces.Box autodetected dtype as <type 'numpy.float32'>. Please provide explicit dtype.[0m


In [4]:
# Q-network: a small fully-connected model mapping one observation to one
# output per action.
model = Sequential()

# Input shape is (1,) + observation shape. The leading 1 is presumably the
# observation window, matching window_length=1 used for the replay memory in
# this notebook -- confirm if window_length is ever changed.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))

# Three identical hidden blocks (Dense -> ReLU), added in the same order and
# with the same layer types as the original hand-unrolled version.
for hidden_units in (16, 16, 16):
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))

# Linear output head: one unbounded value per action.
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# summary() prints the table itself and returns None, so wrapping it in
# print() emitted an extra "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

In [5]:
# Replay buffer and action-selection policy handed to the agent below.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()

# Assemble the DQN agent around the Q-network, one keyword per line.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy
)

# Compile with Adam and report mean absolute error as an extra metric.
optimizer = Adam(lr=1e-3)
dqn.compile(optimizer, metrics=['mae'])

In [6]:
# Train the agent on the environment for 50000 steps.
# The call returns a Keras History object. Binding it to a name keeps the
# recorded training metrics available to later cells and stops the cell from
# ending with a bare "<keras.callbacks.History at 0x...>" repr.
# NOTE(review): visualize=True presumably renders the environment during
# training, which likely needs a display and slows the run -- confirm whether
# it is wanted here.
history = dqn.fit(env, nb_steps=50000, visualize=True, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
   11/10000 [..............................] - ETA: 16:03 - reward: 1.0000



96 episodes - episode_reward: 102.354 [9.000, 200.000] - loss: 3.548 - mean_absolute_error: 20.933 - mean_q: 42.200

Interval 2 (10000 steps performed)
51 episodes - episode_reward: 196.078 [158.000, 200.000] - loss: 8.211 - mean_absolute_error: 42.103 - mean_q: 84.593

Interval 3 (20000 steps performed)
51 episodes - episode_reward: 196.529 [146.000, 200.000] - loss: 10.087 - mean_absolute_error: 46.523 - mean_q: 93.374

Interval 4 (30000 steps performed)
52 episodes - episode_reward: 191.885 [155.000, 200.000] - loss: 8.529 - mean_absolute_error: 46.127 - mean_q: 92.383

Interval 5 (40000 steps performed)
done, took 840.931 seconds


<keras.callbacks.History at 0x7fd378a48f90>

In [7]:
# Save the trained agent's weights. The file name embeds the environment id,
# and an existing file of the same name is overwritten.
weights_path = 'dqn_{}_weights.h5f'.format(ENV_NAME)
dqn.save_weights(weights_path, overwrite=True)

In [9]:
# Evaluate the trained agent for 5 episodes, rendering each one.
# The returned History object is bound to a name so the evaluation results
# stay accessible and the cell does not end with a bare object repr.
test_history = dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200


<keras.callbacks.History at 0x7fd378a48d90>