In [1]:
import numpy as np
import gym
import time

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from gym import Env, spaces
from gym.utils import seeding
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [77]:
class TestEnv(Env):
    """Toy bit-sequence environment for sanity-checking an RL agent.

    The observation is a length-2 integer array holding the two most recent
    bits of a growing bit sequence. At every step the agent picks the next
    bit (action 0 or 1). The reward is 1 when the chosen bit equals the XOR
    of the two observed bits (i.e. action 1 when exactly one of them is set,
    action 0 otherwise) and 0 in every other case. An episode ends after
    ``max_step`` (10) steps.

    Attributes:
        action_space: ``Discrete(2)`` -- the next bit to emit.
        observation_space: ``Box`` over two components, each in ``[0, 1]``.
        np_random: random generator owned by this environment; re-seed it
            through ``_seed`` to obtain reproducible initial states.
        state: current observation, or ``None`` before the first reset.
        state_list: every bit of the current episode (the two initial bits
            followed by all actions), or ``None`` before the first reset.
    """
    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self) -> None:
        # Number of steps after which an episode is reported as done.
        self.__max_step = 10

        self.action_space = spaces.Discrete(2)
        # Both observation components are bits, so the upper bound must be 1
        # (the previous upper bound of 0 excluded every state containing a 1).
        self.observation_space = spaces.Box(np.array([0, 0]), np.array([1, 1]))

        self.np_random = None
        self._seed()
        self.state = None
        self.__step = None
        self.state_list = None

    def _seed(self, seed=None):
        """Create the environment's own random generator.

        Args:
            seed: optional integer seed; ``None`` lets gym pick one.

        Returns:
            A one-element list holding the seed actually used.
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _step(self, action):
        """Append ``action`` to the sequence and score it.

        Must be called after ``_reset``; ``state`` is ``None`` until then.

        Args:
            action: the next bit, 0 or 1.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is the
            new two-bit state, ``reward`` is 0 or 1, ``done`` becomes ``True``
            once the step budget is used up, and ``info`` is an empty dict.
        """
        # The two bits sum to 1 exactly when they differ, i.e. when XOR is 1.
        target = 1 if np.sum(self.state) == 1 else 0
        reward = 1 if action == target else 0

        # Slide the two-bit window: drop the oldest bit, append the new one.
        self.state = np.array([self.state[1], action])
        self.state_list.append(action)

        self.__step += 1
        done = self.__step >= self.__max_step
        return self.state, reward, done, {}

    def _reset(self):
        """Start a new episode from two random bits and return them."""
        self.__step = 0
        # Drawn from the environment's generator so that seeding is honoured.
        self.state = self.np_random.randint(0, 2, 2)
        self.state_list = []
        self.state_list.extend(self.state.tolist())
        return self.state

    def _render(self, mode='ansi', close=False):
        """Print the bit history of the current episode.

        Nothing is printed when ``close`` is true: that call asks the
        environment to tear down its renderer, not to draw a frame.
        """
        if close:
            return
        print(self.state_list)

    def _close(self):
        """Delegate to the base class; this environment holds no resources."""
        super()._close()

In [82]:
# Instantiate the toy environment and start a first episode.
env = TestEnv()
env.reset()
# Number of discrete actions; used below to size the Q-network's output layer.
nb_actions = env.action_space.n

[1, 0]


In [79]:
# Drive the environment with random actions to sanity-check rewards and
# episode termination before training an agent on it.
for i in range(10):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    # render() prints the bit history itself and returns nothing, so wrapping
    # it in print() only added a spurious "None" line after every step.
    env.render()
    print(action, reward, done, info)

[1, 0, 0]
None
0 0 False {}
[1, 0, 0, 1]
None
1 0 False {}
[1, 0, 0, 1, 0]
None
0 0 False {}
[1, 0, 0, 1, 0, 1]
None
1 1 False {}
[1, 0, 0, 1, 0, 1, 1]
None
1 1 False {}
[1, 0, 0, 1, 0, 1, 1, 0]
None
0 1 False {}
[1, 0, 0, 1, 0, 1, 1, 0, 1]
None
1 1 False {}
[1, 0, 0, 1, 0, 1, 1, 0, 1, 1]
None
1 1 False {}
[1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0]
None
0 1 False {}
[1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0]
None
0 0 True {}


In [89]:
# Q-network: flatten the (window_length=1, observation) input, pass it through
# two 16-unit ReLU layers and emit one linear Q-value per action.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_5 (Flatten)          (None, 2)                 0         
_________________________________________________________________
dense_13 (Dense)             (None, 16)                48        
_________________________________________________________________
activation_13 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_14 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 2)                 34        
_________________________________________________________________
activation_15 (Activation)   (None, 2)                 0         
Total para

In [84]:
# Replay buffer of the 500 most recent transitions; window_length=1 matches the
# leading dimension of the model's input_shape.
memory = SequentialMemory(limit=500, window_length=1)

# Exploration policy used while training.
policy = BoltzmannQPolicy()

# Assemble the DQN agent around the Q-network defined above.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)

# Adam optimiser with learning rate 1e-3; mean absolute error is tracked as a metric.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [85]:
# Train the agent for 500 environment steps (50 episodes of 10 steps each),
# logging one line per finished episode and without rendering.
dqn.fit(env, nb_steps=500, visualize=False, verbose=2)

Training for 500 steps ...
  10/500: episode: 1, duration: 0.136s, episode steps: 10, steps per second: 73, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --




  20/500: episode: 2, duration: 1.107s, episode steps: 10, steps per second: 9, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.250 [0.000, 1.000], loss: 0.337917, mean_absolute_error: 0.408877, mean_q: 0.167197
  30/500: episode: 3, duration: 0.101s, episode steps: 10, steps per second: 99, episode reward: 4.000, mean reward: 0.400 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.263050, mean_absolute_error: 0.400221, mean_q: 0.241944
  40/500: episode: 4, duration: 0.099s, episode steps: 10, steps per second: 101, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.225481, mean_absolute_error: 0.415928, mean_q: 0.310252




  50/500: episode: 5, duration: 0.105s, episode steps: 10, steps per second: 95, episode reward: 4.000, mean reward: 0.400 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.199773, mean_absolute_error: 0.457749, mean_q: 0.446613
  60/500: episode: 6, duration: 0.098s, episode steps: 10, steps per second: 102, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.350 [0.000, 1.000], loss: 0.191278, mean_absolute_error: 0.489537, mean_q: 0.541736
  70/500: episode: 7, duration: 0.097s, episode steps: 10, steps per second: 103, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.400 [0.000, 1.000], loss: 0.174735, mean_absolute_error: 0.507318, mean_q: 0.619115


  80/500: episode: 8, duration: 0.107s, episode steps: 10, steps per second: 93, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.250 [0.000, 1.000], loss: 0.175602, mean_absolute_error: 0.525114, mean_q: 0.647795
  90/500: episode: 9, duration: 0.100s, episode steps: 10, steps per second: 100, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.450 [0.000, 1.000], loss: 0.155098, mean_absolute_error: 0.538775, mean_q: 0.745204
 100/500: episode: 10, duration: 0.095s, episode steps: 10, steps per second: 105, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.139489, mean_absolute_error: 0.576457, mean_q: 0.818834


 110/500: episode: 11, duration: 0.105s, episode steps: 10, steps per second: 95, episode reward: 4.000, mean reward: 0.400 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.400 [0.000, 1.000], loss: 0.134069, mean_absolute_error: 0.584335, mean_q: 0.839020
 120/500: episode: 12, duration: 0.099s, episode steps: 10, steps per second: 101, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.124601, mean_absolute_error: 0.614123, mean_q: 0.932788
 130/500: episode: 13, duration: 0.096s, episode steps: 10, steps per second: 105, episode reward: 3.000, mean reward: 0.300 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.121168, mean_absolute_error: 0.650771, mean_q: 0.981786


 140/500: episode: 14, duration: 0.102s, episode steps: 10, steps per second: 98, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.108065, mean_absolute_error: 0.639395, mean_q: 1.031867
 150/500: episode: 15, duration: 0.095s, episode steps: 10, steps per second: 106, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.088837, mean_absolute_error: 0.650921, mean_q: 1.113602
 160/500: episode: 16, duration: 0.095s, episode steps: 10, steps per second: 106, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.086388, mean_absolute_error: 0.672477, mean_q: 1.199000


 170/500: episode: 17, duration: 0.108s, episode steps: 10, steps per second: 92, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.078254, mean_absolute_error: 0.699682, mean_q: 1.314943
 180/500: episode: 18, duration: 0.101s, episode steps: 10, steps per second: 99, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.450 [0.000, 1.000], loss: 0.087805, mean_absolute_error: 0.721596, mean_q: 1.377077
 190/500: episode: 19, duration: 0.098s, episode steps: 10, steps per second: 102, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.080740, mean_absolute_error: 0.720896, mean_q: 1.444887


 200/500: episode: 20, duration: 0.100s, episode steps: 10, steps per second: 100, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.400 [0.000, 1.000], loss: 0.077798, mean_absolute_error: 0.737388, mean_q: 1.521979
 210/500: episode: 21, duration: 0.110s, episode steps: 10, steps per second: 91, episode reward: 4.000, mean reward: 0.400 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.400 [0.000, 1.000], loss: 0.073368, mean_absolute_error: 0.776878, mean_q: 1.615908


 220/500: episode: 22, duration: 0.100s, episode steps: 10, steps per second: 100, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.068221, mean_absolute_error: 0.799127, mean_q: 1.705293
 230/500: episode: 23, duration: 0.096s, episode steps: 10, steps per second: 104, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.088004, mean_absolute_error: 0.840753, mean_q: 1.809823
 240/500: episode: 24, duration: 0.098s, episode steps: 10, steps per second: 102, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.082048, mean_absolute_error: 0.827027, mean_q: 1.846254


 250/500: episode: 25, duration: 0.113s, episode steps: 10, steps per second: 89, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.071997, mean_absolute_error: 0.859212, mean_q: 1.953596
 260/500: episode: 26, duration: 0.102s, episode steps: 10, steps per second: 98, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.300 [0.000, 1.000], loss: 0.087864, mean_absolute_error: 0.908066, mean_q: 2.037089
 270/500: episode: 27, duration: 0.098s, episode steps: 10, steps per second: 102, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.052256, mean_absolute_error: 0.895177, mean_q: 2.133923


 280/500: episode: 28, duration: 0.111s, episode steps: 10, steps per second: 90, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.250 [0.000, 1.000], loss: 0.100134, mean_absolute_error: 0.978426, mean_q: 2.226198
 290/500: episode: 29, duration: 0.099s, episode steps: 10, steps per second: 101, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.104527, mean_absolute_error: 0.988657, mean_q: 2.199039
 300/500: episode: 30, duration: 0.099s, episode steps: 10, steps per second: 101, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.118772, mean_absolute_error: 1.033763, mean_q: 2.327578


 310/500: episode: 31, duration: 0.112s, episode steps: 10, steps per second: 89, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.137312, mean_absolute_error: 1.100361, mean_q: 2.412626
 320/500: episode: 32, duration: 0.088s, episode steps: 10, steps per second: 113, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.150201, mean_absolute_error: 1.138124, mean_q: 2.456976
 330/500: episode: 33, duration: 0.095s, episode steps: 10, steps per second: 105, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.139134, mean_absolute_error: 1.126259, mean_q: 2.500190


 340/500: episode: 34, duration: 0.102s, episode steps: 10, steps per second: 98, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.205035, mean_absolute_error: 1.210470, mean_q: 2.621015
 350/500: episode: 35, duration: 0.091s, episode steps: 10, steps per second: 110, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.750 [0.000, 1.000], loss: 0.203854, mean_absolute_error: 1.217808, mean_q: 2.653921
 360/500: episode: 36, duration: 0.096s, episode steps: 10, steps per second: 104, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.250 [0.000, 1.000], loss: 0.182369, mean_absolute_error: 1.225339, mean_q: 2.756103


 370/500: episode: 37, duration: 0.110s, episode steps: 10, steps per second: 91, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.400 [0.000, 1.000], loss: 0.184554, mean_absolute_error: 1.280071, mean_q: 2.887839
 380/500: episode: 38, duration: 0.087s, episode steps: 10, steps per second: 115, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.166971, mean_absolute_error: 1.310342, mean_q: 2.889965
 390/500: episode: 39, duration: 0.086s, episode steps: 10, steps per second: 116, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.294916, mean_absolute_error: 1.424397, mean_q: 3.014135


 400/500: episode: 40, duration: 0.098s, episode steps: 10, steps per second: 102, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.278308, mean_absolute_error: 1.424289, mean_q: 3.016028
 410/500: episode: 41, duration: 0.090s, episode steps: 10, steps per second: 111, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.750 [0.000, 1.000], loss: 0.233476, mean_absolute_error: 1.408067, mean_q: 3.085822
 420/500: episode: 42, duration: 0.090s, episode steps: 10, steps per second: 112, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.304183, mean_absolute_error: 1.493122, mean_q: 3.215440


 430/500: episode: 43, duration: 0.099s, episode steps: 10, steps per second: 101, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.276547, mean_absolute_error: 1.481874, mean_q: 3.207118
 440/500: episode: 44, duration: 0.091s, episode steps: 10, steps per second: 110, episode reward: 4.000, mean reward: 0.400 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.400 [0.000, 1.000], loss: 0.322279, mean_absolute_error: 1.571972, mean_q: 3.300253
 450/500: episode: 45, duration: 0.090s, episode steps: 10, steps per second: 111, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.294751, mean_absolute_error: 1.568468, mean_q: 3.340798


 460/500: episode: 46, duration: 0.095s, episode steps: 10, steps per second: 105, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: 0.750 [0.000, 1.000], loss: 0.280441, mean_absolute_error: 1.613695, mean_q: 3.461193
 470/500: episode: 47, duration: 0.086s, episode steps: 10, steps per second: 116, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.428859, mean_absolute_error: 1.684250, mean_q: 3.494181
 480/500: episode: 48, duration: 0.083s, episode steps: 10, steps per second: 120, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.750 [0.000, 1.000], loss: 0.331430, mean_absolute_error: 1.686467, mean_q: 3.506333


 490/500: episode: 49, duration: 0.098s, episode steps: 10, steps per second: 103, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.395121, mean_absolute_error: 1.743760, mean_q: 3.640089
 500/500: episode: 50, duration: 0.090s, episode steps: 10, steps per second: 111, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.321413, mean_absolute_error: 1.753417, mean_q: 3.748550
done, took 5.984 seconds


<keras.callbacks.History at 0x25d88e00e10>

In [86]:
# Evaluate the trained agent (the log below reports a single test episode)
# and then show the complete bit sequence that episode produced.
env.reset()
dqn.test(env)
print(env.state_list)

Testing for 1 episodes ...
[1, 0, 1]
[1, 0, 1, 1]
[1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
Episode 1: reward: 10.000, steps: 10
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1]


In [64]:
# Start a fresh episode by hand; the cell displays the initial two-bit observation.
env.reset()

array([1, 1])

In [73]:
# Query the Q-network directly: reshape the current state to the
# (batch=1, window=1, 2) layout the model expects and take the greedy action.
np.argmax(model.predict(env.state.reshape(1, 1, 2)))

1

In [72]:
# Apply action 0 by hand and inspect the (observation, reward, done, info) tuple.
env.step(0)

(array([1, 0]), 1, False, {})

In [53]:
# Inspect the first layer's input tensor to confirm the input shape the model expects.
model.get_layer(index=0).input

<tf.Tensor 'flatten_2_input:0' shape=(?, 1, 2) dtype=float32>

In [87]:
# Run two evaluation episodes (the log below reports "Testing for 2 episodes"),
# rendering the bit history after every step.
dqn.test(env, 2, visualize=True)

Testing for 2 episodes ...
[1, 0, 1]
[1, 0, 1, 1]
[1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
Episode 1: reward: 10.000, steps: 10
[1, 0, 1]
[1, 0, 1, 1]
[1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
Episode 2: reward: 10.000, steps: 10


<keras.callbacks.History at 0x25d88e008d0>