In [5]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [6]:
ENV_NAME = 'CartPole-v0'

# Create the CartPole environment, then seed NumPy and the environment so that
# repeated runs are intended to start from the same random state.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
# Number of discrete actions; used below as the width of the network's output layer.
nb_actions = env.action_space.n

# Q-network: flatten the observation window, one hidden ReLU layer of 16 units,
# then a linear output with one value per action.
# NOTE: the leading 1 in input_shape pairs with window_length=1 on the replay
# memory below; keep the two in sync if either one is changed.
model = Sequential()
input_shape = (1,) + env.observation_space.shape
print(input_shape)
model.add(Flatten(input_shape=input_shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

# Epsilon-greedy action selection backed by a replay memory capped at 50000 entries.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
(1, 4)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# Okay, now it's time to learn something! We visualize the training here for show, but this slows
# down training quite a lot.
# The History object returned by fit() is bound to a name so that whatever it recorded can be
# inspected in later cells, and so the cell no longer echoes a bare `<keras.callbacks.History ...>`
# repr as its output.
cartpole_train_history = dqn.fit(env, nb_steps=5000, visualize=True, verbose=2)

Training for 5000 steps ...




   79/5000: episode: 1, duration: 2.406s, episode steps: 79, steps per second: 33, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.060 [-0.402, 0.722], loss: 0.427375, mean_absolute_error: 0.495651, mean_q: 0.053356
  113/5000: episode: 2, duration: 0.583s, episode steps: 34, steps per second: 58, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.151 [-0.159, 0.753], loss: 0.350887, mean_absolute_error: 0.444512, mean_q: 0.194190
  163/5000: episode: 3, duration: 0.882s, episode steps: 50, steps per second: 57, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.082 [-0.295, 0.778], loss: 0.314964, mean_absolute_error: 0.465016, mean_q: 0.319203
  197/5000: episode: 4, duration: 0.565s, episode steps: 34, steps per second: 60, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action:

  680/5000: episode: 31, duration: 0.200s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.167 [0.000, 1.000], mean observation: 0.098 [-1.789, 2.681], loss: 0.440390, mean_absolute_error: 2.218537, mean_q: 4.223358
  689/5000: episode: 32, duration: 0.184s, episode steps: 9, steps per second: 49, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.111 [0.000, 1.000], mean observation: 0.135 [-1.611, 2.534], loss: 0.426713, mean_absolute_error: 2.294608, mean_q: 4.376363
  698/5000: episode: 33, duration: 0.147s, episode steps: 9, steps per second: 61, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.145 [-1.737, 2.764], loss: 0.418507, mean_absolute_error: 2.304470, mean_q: 4.448545
  708/5000: episode: 34, duration: 0.166s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action:

  991/5000: episode: 62, duration: 0.167s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.125 [-1.560, 2.498], loss: 0.999470, mean_absolute_error: 3.448232, mean_q: 6.427165
 1003/5000: episode: 63, duration: 0.199s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.083 [0.000, 1.000], mean observation: 0.129 [-1.913, 3.015], loss: 1.046327, mean_absolute_error: 3.477995, mean_q: 6.478175
 1013/5000: episode: 64, duration: 0.166s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.145 [-1.913, 3.061], loss: 1.127366, mean_absolute_error: 3.527216, mean_q: 6.569734
 1023/5000: episode: 65, duration: 0.164s, episode steps: 10, steps per second: 61, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean act

 1653/5000: episode: 92, duration: 0.149s, episode steps: 9, steps per second: 60, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.123 [-2.247, 1.350], loss: 1.117953, mean_absolute_error: 5.065046, mean_q: 9.688518
 1663/5000: episode: 93, duration: 0.166s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: -0.145 [-2.053, 1.179], loss: 1.192308, mean_absolute_error: 5.038626, mean_q: 9.595671
 1672/5000: episode: 94, duration: 0.150s, episode steps: 9, steps per second: 60, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.152 [-2.806, 1.739], loss: 1.440661, mean_absolute_error: 5.179167, mean_q: 9.865060
 1689/5000: episode: 95, duration: 0.283s, episode steps: 17, steps per second: 60, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean acti

 2132/5000: episode: 121, duration: 0.318s, episode steps: 19, steps per second: 60, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: -0.119 [-0.911, 0.367], loss: 2.411001, mean_absolute_error: 6.409554, mean_q: 12.186410
 2207/5000: episode: 122, duration: 1.251s, episode steps: 75, steps per second: 60, episode reward: 75.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: -0.008 [-0.890, 0.389], loss: 2.266498, mean_absolute_error: 6.503018, mean_q: 12.271247
 2231/5000: episode: 123, duration: 0.398s, episode steps: 24, steps per second: 60, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.121 [-0.983, 0.187], loss: 1.990455, mean_absolute_error: 6.531693, mean_q: 12.447601
 2268/5000: episode: 124, duration: 0.618s, episode steps: 37, steps per second: 60, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000]

 3293/5000: episode: 150, duration: 0.770s, episode steps: 46, steps per second: 60, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.119 [-0.906, 0.209], loss: 2.885216, mean_absolute_error: 8.621620, mean_q: 16.548582
 3349/5000: episode: 151, duration: 0.934s, episode steps: 56, steps per second: 60, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.102 [-0.274, 0.859], loss: 3.495818, mean_absolute_error: 8.729193, mean_q: 16.714167
 3400/5000: episode: 152, duration: 0.866s, episode steps: 51, steps per second: 59, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.091 [-0.841, 0.205], loss: 3.923305, mean_absolute_error: 8.826670, mean_q: 16.833826
 3443/5000: episode: 153, duration: 0.731s, episode steps: 43, steps per second: 59, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000],

<keras.callbacks.History at 0x7fe9d57fbc18>

In [8]:
# Run the agent for five rendered evaluation episodes. The returned History object is bound to
# a name so the results stay available afterwards and the cell does not echo the object's repr.
cartpole_test_history = dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 40.000, steps: 40
Episode 2: reward: 54.000, steps: 54
Episode 3: reward: 81.000, steps: 81
Episode 4: reward: 120.000, steps: 120
Episode 5: reward: 41.000, steps: 41


<keras.callbacks.History at 0x7fe9d56cb278>

In [9]:
ENV_NAME = 'MountainCar-v0'

# Get the environment and extract the number of actions available in the MountainCar problem.
# This rebinds `env` and `nb_actions`, so every later cell operates on MountainCar.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [10]:
# Show the action space of the current environment.
print(env.action_space)

Discrete(3)


In [11]:
# Show the observation space of the current environment; its shape feeds the network's input_shape.
print(env.observation_space)

Box(2,)


In [12]:
# Q-network for the current environment: flatten the observation window, one hidden
# ReLU layer of 24 units, then a linear output with one value per action.
# NOTE: the leading 1 in input_shape pairs with window_length=1 on the replay
# memory below; keep the two in sync if either one is changed.
model = Sequential()
input_shape = (1,) + env.observation_space.shape
print(input_shape)
model.add(Flatten(input_shape=input_shape))
model.add(Dense(24))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

# Fresh policy, replay memory and agent; this rebinds `dqn` to a new, untrained agent.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

(1, 2)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 2)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 24)                72        
_________________________________________________________________
activation_3 (Activation)    (None, 24)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 75        
_________________________________________________________________
activation_4 (Activation)    (None, 3)                 0         
Total params: 147
Trainable params: 147
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
# Train the agent on the current environment with on-screen rendering.
# NOTE(review): with 200-step episodes, nb_steps=500 is only about two and a half episodes,
# which is probably far too little for the agent to learn anything; raise nb_steps for a
# meaningful result.
# The History object returned by fit() is bound to a name so it can be inspected later and the
# cell no longer echoes a bare `<keras.callbacks.History ...>` repr as its output.
mountaincar_train_history = dqn.fit(env, nb_steps=500, visualize=True, verbose=2)

Training for 500 steps ...




 200/500: episode: 1, duration: 4.139s, episode steps: 200, steps per second: 48, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.875 [0.000, 2.000], mean observation: -0.258 [-1.058, 0.042], loss: 0.236920, mean_absolute_error: 0.672265, mean_q: -0.594136
 400/500: episode: 2, duration: 3.397s, episode steps: 200, steps per second: 59, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.725 [0.000, 2.000], mean observation: -0.282 [-0.802, 0.024], loss: 0.015733, mean_absolute_error: 1.511167, mean_q: -2.092125
done, took 9.215 seconds


<keras.callbacks.History at 0x7fe9d572f780>

In [14]:
# Run the agent for five rendered evaluation episodes. The returned History object is bound to
# a name so the results stay available afterwards and the cell does not echo the object's repr.
mountaincar_test_history = dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: -200.000, steps: 200
Episode 2: reward: -200.000, steps: 200
Episode 3: reward: -200.000, steps: 200
Episode 4: reward: -200.000, steps: 200
Episode 5: reward: -200.000, steps: 200


<keras.callbacks.History at 0x7fe9d572f710>

In [15]:
# Poke at the environment by hand: a single episode of at most ten sampled actions,
# printing the observation before each action and the full transition after it.
for episode_idx in range(1):
    observation = env.reset()
    for step_idx in range(10):
        env.render()
        print(observation)
        # Sample an action from the action space instead of asking the agent.
        action = env.action_space.sample()
        print('action: {}'.format(action))
        transition = env.step(action)
        observation, reward, done, info = transition
        print('observation: {}, reward: {}, done: {}, info: {}'.format(*transition))
        if not done:
            continue
        # The environment signalled termination; report the episode length and stop.
        print("Episode finished after {} timesteps".format(step_idx + 1))
        break

[-0.46295365  0.        ]
action: 0
observation: [-0.46440598 -0.00145233], reward: -1.0, done: False, info: {}
[-0.46440598 -0.00145233]
action: 1
observation: [-0.46629993 -0.00189395], reward: -1.0, done: False, info: {}
[-0.46629993 -0.00189395]
action: 0
observation: [-0.46962151 -0.00332158], reward: -1.0, done: False, info: {}
[-0.46962151 -0.00332158]
action: 1
observation: [-0.47334615 -0.00372464], reward: -1.0, done: False, info: {}
[-0.47334615 -0.00372464]
action: 1
observation: [-0.47744626 -0.00410011], reward: -1.0, done: False, info: {}
[-0.47744626 -0.00410011]
action: 2
observation: [-0.48089141 -0.00344515], reward: -1.0, done: False, info: {}
[-0.48089141 -0.00344515]
action: 0
observation: [-0.48565598 -0.00476458], reward: -1.0, done: False, info: {}
[-0.48565598 -0.00476458]
action: 2
observation: [-0.48970452 -0.00404853], reward: -1.0, done: False, info: {}
[-0.48970452 -0.00404853]
action: 0
observation: [-0.49500682 -0.0053023 ], reward: -1.0, done: False, i