## Requires Keras < 2.0 (this notebook does not work with Keras 2.x)

In [10]:
from __future__ import print_function

import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

In [11]:
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [12]:
# Identifier of the Gym environment this notebook works with; passed to gym.make().
ENV_NAME = 'CartPole-v0'

#### Creating a New CartPole Environment

In [13]:
# Instantiate the Gym environment named by ENV_NAME.
env = gym.make(ENV_NAME)

In [14]:
# Seed NumPy's global RNG and the environment with the same fixed value
# (presumably so that runs are repeatable).
np.random.seed(43)
# This call is the cell's last expression, so its return value is echoed as
# the cell output.
env.seed(43)

[43L]

In [15]:
# Size of the environment's action space (its `n` attribute); used as the width
# of the network's output layer and passed to the agent.
nb_actions = env.action_space.n

#### Building a Feed-Forward Network with a Single Hidden Layer

In [7]:
# Network handed to the DQN agent: flatten the observation, apply one 16-unit
# ReLU hidden layer, then a linear output layer with one unit per action.
model = Sequential()
# NOTE(review): the leading 1 in input_shape presumably corresponds to the
# window_length=1 used for the replay memory -- confirm.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the layer table itself and returns None; wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
flatten_1 (Flatten)              (None, 4)             0           flatten_input_1[0][0]            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 16)            80          flatten_1[0][0]                  
____________________________________________________________________________________________________
activation_1 (Activation)        (None, 16)            0           dense_1[0][0]                    
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 2)             34          activation_1[0][0]               
___________________________________________________________________________________________

In [8]:
# Action-selection policy and replay memory used by the agent.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)

# Wire the network, policy and memory into a DQN agent, then compile it with
# an Adam optimizer and mean-absolute-error as the reported metric.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train for 5000 steps. Rendering is switched on (visualize=True) purely for
# demonstration; it slows training down considerably.
dqn.fit(
    env,
    nb_steps=5000,
    visualize=True,
    verbose=2
)


Training for 5000 steps ...




   13/5000: episode: 1, duration: 8.073s, episode steps: 13, steps per second: 2, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.846 [0.000, 1.000], mean observation: -0.109 [-2.811, 1.747], loss: 0.672139, mean_absolute_error: 0.702761, mean_q: 0.232073
   33/5000: episode: 2, duration: 0.205s, episode steps: 20, steps per second: 98, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.123 [-0.344, 0.964], loss: 0.496271, mean_absolute_error: 0.578393, mean_q: 0.298828
   57/5000: episode: 3, duration: 0.271s, episode steps: 24, steps per second: 89, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.750 [0.000, 1.000], mean observation: -0.008 [-3.157, 2.322], loss: 0.413725, mean_absolute_error: 0.512727, mean_q: 0.346002
   65/5000: episode: 4, duration: 0.087s, episode steps: 8, steps per second: 92, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 

  317/5000: episode: 30, duration: 0.115s, episode steps: 10, steps per second: 87, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.166 [-3.119, 1.953], loss: 0.365766, mean_absolute_error: 1.352933, mean_q: 2.568481
  327/5000: episode: 31, duration: 0.106s, episode steps: 10, steps per second: 95, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.157 [-3.087, 1.952], loss: 0.367804, mean_absolute_error: 1.410998, mean_q: 2.687538
  337/5000: episode: 32, duration: 0.094s, episode steps: 10, steps per second: 106, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.130 [-3.030, 1.996], loss: 0.425276, mean_absolute_error: 1.456456, mean_q: 2.805487
  345/5000: episode: 33, duration: 0.111s, episode steps: 8, steps per second: 72, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean a

  607/5000: episode: 59, duration: 0.098s, episode steps: 9, steps per second: 92, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.778 [0.000, 1.000], mean observation: -0.137 [-1.992, 1.177], loss: 0.372353, mean_absolute_error: 2.160094, mean_q: 4.101202
  618/5000: episode: 60, duration: 0.111s, episode steps: 11, steps per second: 99, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.727 [0.000, 1.000], mean observation: -0.127 [-1.928, 1.190], loss: 0.338922, mean_absolute_error: 2.177845, mean_q: 4.200922
  627/5000: episode: 61, duration: 0.085s, episode steps: 9, steps per second: 105, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.145 [-2.208, 1.336], loss: 0.356563, mean_absolute_error: 2.197706, mean_q: 4.353114
  638/5000: episode: 62, duration: 0.123s, episode steps: 11, steps per second: 90, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean act

  919/5000: episode: 89, duration: 0.133s, episode steps: 11, steps per second: 83, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.727 [0.000, 1.000], mean observation: -0.111 [-1.832, 1.158], loss: 0.260476, mean_absolute_error: 2.855440, mean_q: 5.597764
  927/5000: episode: 90, duration: 0.106s, episode steps: 8, steps per second: 76, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.875 [0.000, 1.000], mean observation: -0.157 [-2.010, 1.184], loss: 0.225982, mean_absolute_error: 2.816130, mean_q: 5.539452
  936/5000: episode: 91, duration: 0.096s, episode steps: 9, steps per second: 93, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.778 [0.000, 1.000], mean observation: -0.114 [-1.877, 1.169], loss: 0.245550, mean_absolute_error: 2.816853, mean_q: 5.519217
  945/5000: episode: 92, duration: 0.141s, episode steps: 9, steps per second: 64, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action

 1277/5000: episode: 119, duration: 0.204s, episode steps: 16, steps per second: 79, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.625 [0.000, 1.000], mean observation: -0.084 [-1.424, 0.755], loss: 0.322163, mean_absolute_error: 3.535089, mean_q: 6.772120
 1290/5000: episode: 120, duration: 0.138s, episode steps: 13, steps per second: 94, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.615 [0.000, 1.000], mean observation: -0.134 [-1.297, 0.565], loss: 0.450723, mean_absolute_error: 3.658382, mean_q: 7.015777
 1303/5000: episode: 121, duration: 0.172s, episode steps: 13, steps per second: 76, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: -0.098 [-1.305, 0.830], loss: 0.383727, mean_absolute_error: 3.674816, mean_q: 7.075556
 1321/5000: episode: 122, duration: 0.224s, episode steps: 18, steps per second: 80, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], m

 1732/5000: episode: 148, duration: 0.272s, episode steps: 22, steps per second: 81, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.096 [-0.953, 0.225], loss: 0.700739, mean_absolute_error: 4.338141, mean_q: 8.219877
 1758/5000: episode: 149, duration: 0.295s, episode steps: 26, steps per second: 88, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: 0.076 [-0.598, 1.332], loss: 0.858818, mean_absolute_error: 4.419960, mean_q: 8.396388
 1781/5000: episode: 150, duration: 0.282s, episode steps: 23, steps per second: 82, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: -0.068 [-0.894, 0.409], loss: 0.631219, mean_absolute_error: 4.408230, mean_q: 8.439411
 1813/5000: episode: 151, duration: 0.367s, episode steps: 32, steps per second: 87, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], me

 2602/5000: episode: 177, duration: 0.444s, episode steps: 48, steps per second: 108, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.479 [0.000, 1.000], mean observation: 0.037 [-0.545, 1.070], loss: 1.257220, mean_absolute_error: 5.834306, mean_q: 11.208236
 2626/5000: episode: 178, duration: 0.247s, episode steps: 24, steps per second: 97, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.105 [-0.257, 0.801], loss: 0.623783, mean_absolute_error: 5.840294, mean_q: 11.315808
 2670/5000: episode: 179, duration: 0.424s, episode steps: 44, steps per second: 104, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: 0.070 [-0.421, 1.289], loss: 1.351152, mean_absolute_error: 6.003668, mean_q: 11.558526
 2712/5000: episode: 180, duration: 0.463s, episode steps: 42, steps per second: 91, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000],

 4293/5000: episode: 206, duration: 1.463s, episode steps: 159, steps per second: 109, episode reward: 159.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.071 [-0.677, 0.524], loss: 2.283167, mean_absolute_error: 8.426129, mean_q: 16.376465
 4399/5000: episode: 207, duration: 1.028s, episode steps: 106, steps per second: 103, episode reward: 106.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.102 [-0.848, 0.537], loss: 2.291833, mean_absolute_error: 8.631959, mean_q: 16.844940
 4543/5000: episode: 208, duration: 1.371s, episode steps: 144, steps per second: 105, episode reward: 144.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.080 [-0.777, 0.842], loss: 2.547509, mean_absolute_error: 8.947972, mean_q: 17.445082
 4601/5000: episode: 209, duration: 0.548s, episode steps: 58, steps per second: 106, episode reward: 58.000, mean reward: 1.000 [1.0

<keras.callbacks.History at 0x7fcaa4454490>

In [9]:
# Run the agent on the environment for 5 test episodes with rendering enabled.
# The call is the cell's last expression, so its return value (a Keras History
# object) is echoed as the cell output.
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 65.000, steps: 65
Episode 2: reward: 131.000, steps: 131
Episode 3: reward: 94.000, steps: 94
Episode 4: reward: 135.000, steps: 135
Episode 5: reward: 180.000, steps: 180


<keras.callbacks.History at 0x7fcaa78172d0>