In [1]:
import numpy as np
import gym
from keras.models import Sequential
from keras.layers import Dense,Activation,Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [3]:
#Setting variable
ENV_NAME='CartPole-v0'
#Get the environment and extract the number of actions
#in the cartpole problem
env=gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions=env.action_space.n

In [4]:
#We will build a simple single hidden layer neural network model
model=Sequential()
model.add(Flatten(input_shape=(1,)+env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
#Now configure and compile our agent.WE will set set our policy as Epsilon Greedy and our memory is 
#Sequential because we want to store the result of actions we performed and the rewards we get for 
#each action
policy=EpsGreedyQPolicy()
memory=SequentialMemory(limit=50000,window_length=1)
dqn=DQNAgent(model=model,nb_actions=nb_actions,memory=memory,nb_steps_warmup=10,policy=policy)
dqn.compile(Adam(lr=1e-3),metrics=['mae'])
#Okay,now its time to learn something! We visualize the training gere for show,
#but this slows down training quite a lot
dqn.fit(env,nb_steps=5000,visualize=True,verbose=2)


Training for 5000 steps ...




Instructions for updating:
Use tf.cast instead.
   83/5000: episode: 1, duration: 7.874s, episode steps: 83, steps per second: 11, episode reward: 83.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.050 [-0.270, 0.792], loss: 0.407578, mean_absolute_error: 0.482795, mean_q: 0.059641
  119/5000: episode: 2, duration: 0.605s, episode steps: 36, steps per second: 60, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.160 [-0.159, 0.653], loss: 0.284334, mean_absolute_error: 0.406116, mean_q: 0.203982
  169/5000: episode: 3, duration: 0.835s, episode steps: 50, steps per second: 60, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.082 [-0.351, 0.843], loss: 0.202572, mean_absolute_error: 0.388582, mean_q: 0.325452
  205/5000: episode: 4, duration: 0.606s, episode steps: 36, steps per second: 59, episode reward: 36.000,

 1633/5000: episode: 30, duration: 0.885s, episode steps: 53, steps per second: 60, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: -0.055 [-1.178, 0.322], loss: 0.000712, mean_absolute_error: 0.445366, mean_q: 0.960351
 1660/5000: episode: 31, duration: 0.457s, episode steps: 27, steps per second: 59, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.117 [-1.158, 0.358], loss: 0.000626, mean_absolute_error: 0.445386, mean_q: 0.961402
 1763/5000: episode: 32, duration: 1.719s, episode steps: 103, steps per second: 60, episode reward: 103.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.102 [-0.279, 0.993], loss: 0.000576, mean_absolute_error: 0.445601, mean_q: 0.960093
 1799/5000: episode: 33, duration: 0.599s, episode steps: 36, steps per second: 60, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean

 3225/5000: episode: 59, duration: 1.217s, episode steps: 73, steps per second: 60, episode reward: 73.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: -0.033 [-0.968, 0.733], loss: 0.000308, mean_absolute_error: 0.443797, mean_q: 0.949199
 3266/5000: episode: 60, duration: 0.683s, episode steps: 41, steps per second: 60, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: -0.102 [-0.787, 0.162], loss: 0.000228, mean_absolute_error: 0.444179, mean_q: 0.947762
 3310/5000: episode: 61, duration: 0.733s, episode steps: 44, steps per second: 60, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.104 [-0.774, 0.379], loss: 0.000211, mean_absolute_error: 0.441969, mean_q: 0.947054
 3345/5000: episode: 62, duration: 0.584s, episode steps: 35, steps per second: 60, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean 

 4574/5000: episode: 88, duration: 0.467s, episode steps: 28, steps per second: 60, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.111 [-0.830, 0.366], loss: 0.000381, mean_absolute_error: 0.449126, mean_q: 0.945426
 4608/5000: episode: 89, duration: 0.567s, episode steps: 34, steps per second: 60, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.097 [-0.955, 0.564], loss: 0.000235, mean_absolute_error: 0.448470, mean_q: 0.948240
 4642/5000: episode: 90, duration: 0.565s, episode steps: 34, steps per second: 60, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.107 [-0.725, 0.248], loss: 0.000210, mean_absolute_error: 0.445038, mean_q: 0.944205
 4671/5000: episode: 91, duration: 0.483s, episode steps: 29, steps per second: 60, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean 

<keras.callbacks.History at 0x20ed86e14e0>