In [1]:
import numpy as np
import gym
import time

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from gym import Env, spaces
from gym.utils import seeding
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [72]:
class TestEnv(Env):
    """Toy two-action environment that rewards alternating actions.

    The state is simply the previously taken action (0 or 1); the initial
    state is drawn at random on reset. A step yields reward 1 when the chosen
    action differs from the current state and 0 otherwise, so the optimal
    policy flips the action on every step. An episode ends after a fixed
    number of steps (10).

    Attributes:
        state: the current state (last action taken), or None before reset.
        state_list: initial state followed by every action of the current
            episode, or None before the first reset.
        np_random: the environment's own random generator, created by
            ``_seed``.
    """
    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self) -> None:
        self.__max_step = 10

        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(np.array([0]), np.array([1]))

        self.np_random = None
        self._seed()
        self.state = None
        self.__step = None
        self.state_list = None

    def _seed(self, seed=None):
        """Create the environment's private random generator.

        Args:
            seed: optional seed; when None a random seed is chosen.

        Returns:
            A one-element list holding the seed actually used.
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _step(self, action):
        """Apply ``action`` and advance the episode by one step.

        Args:
            action: the chosen action (0 or 1).

        Returns:
            Tuple ``(observation, reward, done, info)`` where the observation
            is the new state wrapped in a numpy array, reward is 1 if the
            action differed from the previous state and 0 otherwise, done is
            True once the step limit has been reached, and info is an empty
            dict.

        Raises:
            RuntimeError: if called before the environment has been reset.
        """
        if self.__step is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError('TestEnv.step() called before reset()')

        reward = 1 if self.state != action else 0
        self.state = action
        self.state_list.append(action)
        self.__step += 1
        done = self.__step >= self.__max_step
        return np.array(self.state), reward, done, {}

    def _reset(self):
        """Start a new episode with a random initial state.

        Returns:
            The initial state (0 or 1) wrapped in a numpy array.
        """
        self.__step = 0
        # Draw from the env-local generator so that seeding the environment
        # makes episodes reproducible.
        self.state = self.np_random.randint(2)
        self.state_list = [self.state]
        return np.array(self.state)

    def _render(self, mode='human', close=False):
        """Render the current episode.

        'human' mode is intentionally a no-op. 'ansi' mode returns the
        episode's state history as a space-separated string (empty before the
        first reset).
        """
        if close or mode != 'ansi':
            return None
        return ' '.join(str(s) for s in (self.state_list or []))

    def _close(self):
        """Delegate clean-up to the base class."""
        super()._close()

In [76]:
# Instantiate the toy environment and start its first episode so that it can
# be stepped immediately by the cells below.
env = TestEnv()
env.reset()
# Number of discrete actions; used as the width of the Q-network's output layer.
nb_actions = env.action_space.n

In [74]:
# Smoke-test the environment with uniformly random actions.
# Begin from a fresh episode so this cell does not depend on whatever state
# earlier (possibly out-of-order) cell executions left `env` in.
env.reset()
for _ in range(10):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    # render() has nothing useful to return here, so it is called for its
    # side effect only rather than printed.
    env.render()
    print(action, reward, done, info)
    if done:
        # Never keep stepping a finished episode; start a new one instead.
        env.reset()

None
1 0 False {}
None
0 1 False {}
None
1 1 False {}
None
1 0 False {}
None
1 0 False {}
None
1 0 False {}
None
0 1 False {}
None
1 1 False {}
None
1 0 False {}
None
0 1 True {}


In [89]:
# Q-network: maps the single scalar observation to one Q-value per action
# through two hidden layers of 16 ReLU units each.
model = Sequential([
    Dense(16, input_dim=1),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 16)                32        
_________________________________________________________________
activation_16 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_19 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_17 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 2)                 34        
_________________________________________________________________
activation_18 (Activation)   (None, 2)                 0         
Total params: 338
Trainable params: 338
Non-trainable params: 0
_________________________________________________________________
None


In [90]:
# Replay buffer holding up to 500 entries; each sample uses a window of one
# observation.
memory = SequentialMemory(window_length=1, limit=500)

# Boltzmann exploration policy with its default settings.
policy = BoltzmannQPolicy()

# Wire network, replay memory and policy into the DQN agent.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=nb_actions,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)

# Train with Adam and report mean absolute error alongside the loss.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [91]:
# Train the agent on the toy environment for 500 steps in total (50 episodes
# of 10 steps each). verbose=2 produces the per-episode log lines shown below.
# NOTE(review): visualize=True presumably makes the agent call env.render()
# during training; TestEnv's render does nothing, so it has no visible effect
# here — confirm against the keras-rl documentation.
dqn.fit(env, nb_steps=500, visualize=True, verbose=2)

Training for 500 steps ...


  10/500: episode: 1, duration: 0.260s, episode steps: 10, steps per second: 38, episode reward: 4.000, mean reward: 0.400 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --




  20/500: episode: 2, duration: 0.847s, episode steps: 10, steps per second: 12, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.204305, mean_absolute_error: 0.244222, mean_q: -0.036266
  30/500: episode: 3, duration: 0.072s, episode steps: 10, steps per second: 138, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.208128, mean_absolute_error: 0.276934, mean_q: 0.073283
  40/500: episode: 4, duration: 0.035s, episode steps: 10, steps per second: 282, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.203746, mean_absolute_error: 0.318454, mean_q: 0.172380
  50/500: episode: 5, duration: 0.028s, episode steps: 10, steps per second: 363, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.300 [0.0




  70/500: episode: 7, duration: 0.034s, episode steps: 10, steps per second: 294, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.138510, mean_absolute_error: 0.391101, mean_q: 0.455652
  80/500: episode: 8, duration: 0.030s, episode steps: 10, steps per second: 336, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.300 [0.000, 1.000], loss: 0.126541, mean_absolute_error: 0.413581, mean_q: 0.561518
  90/500: episode: 9, duration: 0.027s, episode steps: 10, steps per second: 369, episode reward: 4.000, mean reward: 0.400 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.300 [0.000, 1.000], loss: 0.119551, mean_absolute_error: 0.442308, mean_q: 0.661144
 100/500: episode: 10, duration: 0.028s, episode steps: 10, steps per second: 363, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.200 [0

 150/500: episode: 15, duration: 0.027s, episode steps: 10, steps per second: 366, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.083586, mean_absolute_error: 0.511489, mean_q: 0.979446
 160/500: episode: 16, duration: 0.030s, episode steps: 10, steps per second: 337, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.400 [0.000, 1.000], loss: 0.074307, mean_absolute_error: 0.510527, mean_q: 1.051722
 170/500: episode: 17, duration: 0.027s, episode steps: 10, steps per second: 369, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.066230, mean_absolute_error: 0.544103, mean_q: 1.154430
 180/500: episode: 18, duration: 0.027s, episode steps: 10, steps per second: 367, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.500 

 230/500: episode: 23, duration: 0.031s, episode steps: 10, steps per second: 318, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.037955, mean_absolute_error: 0.661222, mean_q: 1.720475
 240/500: episode: 24, duration: 0.028s, episode steps: 10, steps per second: 358, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.300 [0.000, 1.000], loss: 0.037892, mean_absolute_error: 0.712880, mean_q: 1.799912
 250/500: episode: 25, duration: 0.027s, episode steps: 10, steps per second: 368, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.048039, mean_absolute_error: 0.721043, mean_q: 1.899179
 260/500: episode: 26, duration: 0.028s, episode steps: 10, steps per second: 361, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.500 

 310/500: episode: 31, duration: 0.030s, episode steps: 10, steps per second: 329, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.400 [0.000, 1.000], loss: 0.083581, mean_absolute_error: 0.963539, mean_q: 2.273774
 320/500: episode: 32, duration: 0.028s, episode steps: 10, steps per second: 357, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.119693, mean_absolute_error: 1.016399, mean_q: 2.365076
 330/500: episode: 33, duration: 0.027s, episode steps: 10, steps per second: 373, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.128177, mean_absolute_error: 1.043213, mean_q: 2.415470
 340/500: episode: 34, duration: 0.027s, episode steps: 10, steps per second: 368, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.700 

 390/500: episode: 39, duration: 0.031s, episode steps: 10, steps per second: 322, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.201779, mean_absolute_error: 1.285506, mean_q: 2.746770
 400/500: episode: 40, duration: 0.029s, episode steps: 10, steps per second: 342, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.215626, mean_absolute_error: 1.325908, mean_q: 2.900595
 410/500: episode: 41, duration: 0.027s, episode steps: 10, steps per second: 369, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.231291, mean_absolute_error: 1.377240, mean_q: 2.906313
 420/500: episode: 42, duration: 0.027s, episode steps: 10, steps per second: 372, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.600 

 470/500: episode: 47, duration: 0.029s, episode steps: 10, steps per second: 347, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.400 [0.000, 1.000], loss: 0.353905, mean_absolute_error: 1.586446, mean_q: 3.241910
 480/500: episode: 48, duration: 0.029s, episode steps: 10, steps per second: 341, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.325093, mean_absolute_error: 1.597956, mean_q: 3.368885
 490/500: episode: 49, duration: 0.028s, episode steps: 10, steps per second: 355, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.300 [0.000, 1.000], loss: 0.300789, mean_absolute_error: 1.648817, mean_q: 3.410449
 500/500: episode: 50, duration: 0.028s, episode steps: 10, steps per second: 358, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.400 

<keras.callbacks.History at 0x263a8112668>

In [100]:
# Evaluate the trained agent and inspect the actions it chose.
# NOTE(review): dqn.test() presumably resets the environment itself at the
# start of each episode, which would make this explicit reset redundant —
# confirm against the keras-rl documentation.
env.reset()
dqn.test(env)
# state_list holds the initial state followed by each action of the last
# episode; a perfectly alternating sequence earns the maximum reward.
print(env.state_list)

Testing for 1 episodes ...
Episode 1: reward: 10.000, steps: 10
[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
