In [14]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [15]:
ENV_NAME = 'CartPole-v0'

# Get the environment and extract the number of actions available in the Cartpole problem
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

[2017-10-07 22:06:26,973] Making new env: CartPole-v0


In [16]:
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_5 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_6 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this slows down training quite a lot. 
dqn.fit(env, nb_steps=500, visualize=True, verbose=2)


Training for 500 steps ...




  85/500: episode: 1, duration: 1.899s, episode steps: 85, steps per second: 45, episode reward: 85.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.038 [-0.372, 0.596], loss: 0.420946, mean_absolute_error: 0.488172, mean_q: 0.068860
 115/500: episode: 2, duration: 0.502s, episode steps: 30, steps per second: 60, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: 0.147 [-0.348, 0.762], loss: 0.340661, mean_absolute_error: 0.445406, mean_q: 0.222139
 159/500: episode: 3, duration: 0.724s, episode steps: 44, steps per second: 61, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.523 [0.000, 1.000], mean observation: 0.088 [-0.340, 0.779], loss: 0.303838, mean_absolute_error: 0.464542, mean_q: 0.347636
 189/500: episode: 4, duration: 0.504s, episode steps: 30, steps per second: 59, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [

ArgumentError: argument 2: <class 'TypeError'>: wrong type

In [None]:
dqn.test(env, nb_episodes=5, visualize=True)