In [1]:
import numpy as np
import gym
import time

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from gym import Env, spaces
from gym.utils import seeding
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [10]:
class TestEnv(Env):
    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self) -> None:
        self.__max_step = 9
        
        # 0-8分别代表此次落子位置
        self.action_space = spaces.Discrete(9)
        # 初始全零，下标0为：1、2分别代表黑白两方（1黑方先行），下标1-9为：0空、1黑方、2白方
        self.observation_space = spaces.Box(
            np.zeros(10, int), np.zeros(10, int) + 2)
        # 代表AI所属方（1或2）
        self.ai = None
        self._seed()
        self.state = None
        self.__step = None

    def _seed(self, seed=None):
        # 随机AI所属方
        self.ai = np.random.choice([1, 2])
    
    def _reset(self):
        self.__step = 0
        self.state = [self.ai]
        self.state = self.state.extend(np.zeros(9, int).tolist())
        return np.array(self.state)

    def _step(self, action):
        grid = self.state[1:]
        # 不可重复落子
        if grid[action] != 0:
            return np.array(self.state), -100, True, {}
        # 黑方先手
        grid = np.array(grid, int)
        dif = np.sum(grid == 1) - np.sum(grid == 2)
        if dif == 0:
            current = 1
        elif dif == 1:
            current = 2
        else:
            return np.array(self.state), -100, True, {}
        # 胜负
        grid[action] = current
        self.state = [self.ai]
        self.state = self.state.extend(grid.tolist())
        is_win = lambda b: b[0:3].all() or b[3:6].all() or b[6:9].all() or \
            b[0::3].all() or b[1::3].all() or b[2::3] or \
            b[0::4].all() or b[2:7:2].all()
        if is_win(grid == current):
            if self.ai == current:
                reward = 10
            else:
                reward = -10
            return np.array(self.state), reward, True, {}

        reward = 0
        self.__step += 1
        if self.__step < self.__max_step:
            done = False
        else:
            done = True
        return np.array(self.state), reward, done, {}

    def _render(self, mode='ansi', close=False):
        print(self.state[0], np.array(self.state[1:], int).reshape(3, 3))

    def _close(self):
        super()._close()

In [11]:
env = TestEnv()
env.reset()
nb_actions = env.action_space.n

In [12]:
for i in range(10):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    print(env.render())
    print(action, reward, done, info)

TypeError: 'NoneType' object is not subscriptable

In [45]:
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 2)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                48        
_________________________________________________________________
activation_4 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_5 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_6 (Activation)    (None, 2)                 0         
Total para

In [46]:
memory = SequentialMemory(limit=500, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [47]:
dqn.fit(env, nb_steps=500, visualize=False, verbose=2)

Training for 500 steps ...
[0, 0, 0]
[0, 0, 0, 1]
[0, 0, 0, 1, 0]
[0, 0, 0, 1, 0, 1]
[0, 0, 0, 1, 0, 1, 0]
[0, 0, 0, 1, 0, 1, 0, 1]
[0, 0, 0, 1, 0, 1, 0, 1, 0]
[0, 0, 0, 1, 0, 1, 0, 1, 0, 0]
[0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0]
[0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0]
  10/500: episode: 1, duration: 0.125s, episode steps: 10, steps per second: 80, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.300 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
[0, 0, 0]
[0, 0, 0, 1]




[0, 0, 0, 1, 1]
[0, 0, 0, 1, 1, 0]
[0, 0, 0, 1, 1, 0, 0]
[0, 0, 0, 1, 1, 0, 0, 1]
[0, 0, 0, 1, 1, 0, 0, 1, 1]
[0, 0, 0, 1, 1, 0, 0, 1, 1, 0]
[0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1]
[0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1]
  20/500: episode: 2, duration: 0.982s, episode steps: 10, steps per second: 10, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.269254, mean_absolute_error: 0.327091, mean_q: 0.081300
[0, 0, 0]
[0, 0, 0, 1]
[0, 0, 0, 1, 1]
[0, 0, 0, 1, 1, 0]
[0, 0, 0, 1, 1, 0, 1]
[0, 0, 0, 1, 1, 0, 1, 1]
[0, 0, 0, 1, 1, 0, 1, 1, 0]
[0, 0, 0, 1, 1, 0, 1, 1, 0, 1]
[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0]
[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0]
  30/500: episode: 3, duration: 0.096s, episode steps: 10, steps per second: 104, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.289700, mean_absolute_error: 0.366818, mean_q: 0.136



[1, 0, 0, 0, 1]
[1, 0, 0, 0, 1, 1]
[1, 0, 0, 0, 1, 1, 1]
[1, 0, 0, 0, 1, 1, 1, 0]
[1, 0, 0, 0, 1, 1, 1, 0, 1]
[1, 0, 0, 0, 1, 1, 1, 0, 1, 1]
[1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0]
[1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0]
  40/500: episode: 4, duration: 0.120s, episode steps: 10, steps per second: 83, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.241171, mean_absolute_error: 0.362811, mean_q: 0.211130
[1, 0, 0]
[1, 0, 0, 0]
[1, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 1]
[1, 0, 0, 0, 0, 0, 1, 0]
[1, 0, 0, 0, 0, 0, 1, 0, 0]
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1]


  50/500: episode: 5, duration: 0.113s, episode steps: 10, steps per second: 89, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.250 [0.000, 1.000], loss: 0.209490, mean_absolute_error: 0.380769, mean_q: 0.291070
[1, 0, 0]
[1, 0, 0, 1]
[1, 0, 0, 1, 1]
[1, 0, 0, 1, 1, 1]
[1, 0, 0, 1, 1, 1, 0]
[1, 0, 0, 1, 1, 1, 0, 0]
[1, 0, 0, 1, 1, 1, 0, 0, 1]
[1, 0, 0, 1, 1, 1, 0, 0, 1, 1]
[1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0]
[1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1]
  60/500: episode: 6, duration: 0.130s, episode steps: 10, steps per second: 77, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.178774, mean_absolute_error: 0.393363, mean_q: 0.355312
[1, 1, 1]
[1, 1, 1, 1]
[1, 1, 1, 1, 0]
[1, 1, 1, 1, 0, 1]
[1, 1, 1, 1, 0, 1, 0]
[1, 1, 1, 1, 0, 1, 0, 1]
[1, 1, 1, 1, 0, 1, 0, 1, 0]


[1, 1, 1, 1, 0, 1, 0, 1, 0, 0]
[1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0]
[1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1]
  70/500: episode: 7, duration: 0.136s, episode steps: 10, steps per second: 74, episode reward: 4.000, mean reward: 0.400 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.148677, mean_absolute_error: 0.415424, mean_q: 0.446061
[1, 1, 0]
[1, 1, 0, 0]
[1, 1, 0, 0, 0]
[1, 1, 0, 0, 0, 0]
[1, 1, 0, 0, 0, 0, 1]
[1, 1, 0, 0, 0, 0, 1, 0]
[1, 1, 0, 0, 0, 0, 1, 0, 1]
[1, 1, 0, 0, 0, 0, 1, 0, 1, 0]
[1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0]
[1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1]
  80/500: episode: 8, duration: 0.122s, episode steps: 10, steps per second: 82, episode reward: 4.000, mean reward: 0.400 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.300 [0.000, 1.000], loss: 0.122876, mean_absolute_error: 0.450900, mean_q: 0.548571
[1, 0, 1]
[1, 0, 1, 1]
[1, 0, 1, 1, 1]
[1, 0, 1, 1, 1, 0]


[1, 0, 1, 1, 1, 0, 0]
[1, 0, 1, 1, 1, 0, 0, 1]
[1, 0, 1, 1, 1, 0, 0, 1, 0]
[1, 0, 1, 1, 1, 0, 0, 1, 0, 1]
[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0]
[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1]
  90/500: episode: 9, duration: 0.113s, episode steps: 10, steps per second: 88, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.095010, mean_absolute_error: 0.468839, mean_q: 0.653137
[0, 1, 0]
[0, 1, 0, 1]
[0, 1, 0, 1, 1]
[0, 1, 0, 1, 1, 1]
[0, 1, 0, 1, 1, 1, 1]
[0, 1, 0, 1, 1, 1, 1, 0]
[0, 1, 0, 1, 1, 1, 1, 0, 1]
[0, 1, 0, 1, 1, 1, 1, 0, 1, 1]
[0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1]
[0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0]
 100/500: episode: 10, duration: 0.093s, episode steps: 10, steps per second: 107, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.750 [0.000, 1.000], loss: 0.084146, mean_absolute_error: 0.483551, mean_q: 0.751638
[0, 0, 0]
[0, 0, 0, 1]
[0, 0,

[0, 0, 0, 1, 1, 0, 1]
[0, 0, 0, 1, 1, 0, 1, 0]
[0, 0, 0, 1, 1, 0, 1, 0, 0]
[0, 0, 0, 1, 1, 0, 1, 0, 0, 0]
[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1]
[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1]
 110/500: episode: 11, duration: 0.105s, episode steps: 10, steps per second: 95, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.450 [0.000, 1.000], loss: 0.086877, mean_absolute_error: 0.530042, mean_q: 0.819648
[0, 1, 1]
[0, 1, 1, 0]
[0, 1, 1, 0, 1]
[0, 1, 1, 0, 1, 1]
[0, 1, 1, 0, 1, 1, 0]
[0, 1, 1, 0, 1, 1, 0, 0]
[0, 1, 1, 0, 1, 1, 0, 0, 0]
[0, 1, 1, 0, 1, 1, 0, 0, 0, 0]
[0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0]
[0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]
 120/500: episode: 12, duration: 0.101s, episode steps: 10, steps per second: 99, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.350 [0.000, 1.000], loss: 0.080334, mean_absolute_error: 0.530983, mean_q: 0.871117
[0, 0, 0]
[0, 0, 0, 0]
[0, 0, 


[0, 0, 0, 0, 0, 1, 0, 1]
[0, 0, 0, 0, 0, 1, 0, 1, 0]
[0, 0, 0, 0, 0, 1, 0, 1, 0, 1]
[0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0]
[0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1]
 130/500: episode: 13, duration: 0.103s, episode steps: 10, steps per second: 97, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.350 [0.000, 1.000], loss: 0.075550, mean_absolute_error: 0.533048, mean_q: 0.902327
[0, 1, 1]
[0, 1, 1, 0]
[0, 1, 1, 0, 1]
[0, 1, 1, 0, 1, 1]
[0, 1, 1, 0, 1, 1, 0]
[0, 1, 1, 0, 1, 1, 0, 1]
[0, 1, 1, 0, 1, 1, 0, 1, 1]
[0, 1, 1, 0, 1, 1, 0, 1, 1, 1]
[0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1]
[0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0]
 140/500: episode: 14, duration: 0.103s, episode steps: 10, steps per second: 97, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.750 [0.000, 1.000], loss: 0.075981, mean_absolute_error: 0.552129, mean_q: 0.980188
[1, 1, 1]
[1, 1, 1, 0]
[1, 1, 1, 0, 0]
[1, 1, 1, 0,


[1, 1, 1, 0, 0, 1, 0, 0, 1]
[1, 1, 1, 0, 0, 1, 0, 0, 1, 1]
[1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0]
[1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1]
 150/500: episode: 15, duration: 0.116s, episode steps: 10, steps per second: 86, episode reward: 4.000, mean reward: 0.400 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.069901, mean_absolute_error: 0.601390, mean_q: 1.079453
[0, 1, 1]
[0, 1, 1, 1]
[0, 1, 1, 1, 0]
[0, 1, 1, 1, 0, 1]
[0, 1, 1, 1, 0, 1, 1]
[0, 1, 1, 1, 0, 1, 1, 0]
[0, 1, 1, 1, 0, 1, 1, 0, 1]
[0, 1, 1, 1, 0, 1, 1, 0, 1, 1]
[0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0]
[0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
 160/500: episode: 16, duration: 0.098s, episode steps: 10, steps per second: 102, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.062792, mean_absolute_error: 0.595768, mean_q: 1.113277
[1, 1, 0]
[1, 1, 0, 0]
[1, 1, 0, 0, 1]
[1, 1, 0, 0, 1, 1]


[1, 1, 0, 0, 1, 1, 1]
[1, 1, 0, 0, 1, 1, 1, 0]
[1, 1, 0, 0, 1, 1, 1, 0, 0]
[1, 1, 0, 0, 1, 1, 1, 0, 0, 0]
[1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0]
[1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1]
 170/500: episode: 17, duration: 0.101s, episode steps: 10, steps per second: 99, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.400 [0.000, 1.000], loss: 0.068018, mean_absolute_error: 0.619285, mean_q: 1.167351
[0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0, 1]
[0, 0, 0, 0, 1, 0]
[0, 0, 0, 0, 1, 0, 1]
[0, 0, 0, 0, 1, 0, 1, 0]
[0, 0, 0, 0, 1, 0, 1, 0, 1]
[0, 0, 0, 0, 1, 0, 1, 0, 1, 1]
[0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1]
[0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0]
 180/500: episode: 18, duration: 0.098s, episode steps: 10, steps per second: 102, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.050341, mean_absolute_error: 0.632588, mean_q: 1.246435
[1, 1, 1]
[1, 1, 1, 0]
[1, 1,

[1, 1, 1, 0, 1, 1, 0, 1]
[1, 1, 1, 0, 1, 1, 0, 1, 1]
[1, 1, 1, 0, 1, 1, 0, 1, 1, 1]
[1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0]
[1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0]
 190/500: episode: 19, duration: 0.102s, episode steps: 10, steps per second: 98, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.059351, mean_absolute_error: 0.641359, mean_q: 1.300895
[1, 1, 0]
[1, 1, 0, 1]
[1, 1, 0, 1, 0]
[1, 1, 0, 1, 0, 1]
[1, 1, 0, 1, 0, 1, 0]
[1, 1, 0, 1, 0, 1, 0, 0]
[1, 1, 0, 1, 0, 1, 0, 0, 0]
[1, 1, 0, 1, 0, 1, 0, 0, 0, 0]
[1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0]
[1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1]
 200/500: episode: 20, duration: 0.098s, episode steps: 10, steps per second: 102, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.300 [0.000, 1.000], loss: 0.054247, mean_absolute_error: 0.656456, mean_q: 1.376240
[1, 1, 1]
[1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1,

[1, 1, 1, 1, 1, 0, 1, 1, 0]
[1, 1, 1, 1, 1, 0, 1, 1, 0, 1]
[1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0]
[1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1]
 210/500: episode: 21, duration: 0.105s, episode steps: 10, steps per second: 95, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.065969, mean_absolute_error: 0.717294, mean_q: 1.497033
[0, 1, 1]
[0, 1, 1, 0]
[0, 1, 1, 0, 1]
[0, 1, 1, 0, 1, 1]
[0, 1, 1, 0, 1, 1, 0]
[0, 1, 1, 0, 1, 1, 0, 1]
[0, 1, 1, 0, 1, 1, 0, 1, 1]
[0, 1, 1, 0, 1, 1, 0, 1, 1, 0]
[0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
[0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0]
 220/500: episode: 22, duration: 0.100s, episode steps: 10, steps per second: 100, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.055879, mean_absolute_error: 0.732470, mean_q: 1.595320
[1, 1, 1]
[1, 1, 1, 1]
[1, 1, 1, 1, 0]
[1, 1, 1, 1, 0, 1]
[1, 1, 1, 1, 0, 1,


[1, 1, 1, 1, 0, 1, 1, 0, 1]
[1, 1, 1, 1, 0, 1, 1, 0, 1, 1]
[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0]
[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0]
 230/500: episode: 23, duration: 0.117s, episode steps: 10, steps per second: 85, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.057372, mean_absolute_error: 0.730190, mean_q: 1.647084
[0, 0, 0]
[0, 0, 0, 1]
[0, 0, 0, 1, 1]
[0, 0, 0, 1, 1, 0]
[0, 0, 0, 1, 1, 0, 0]
[0, 0, 0, 1, 1, 0, 0, 0]
[0, 0, 0, 1, 1, 0, 0, 0, 0]
[0, 0, 0, 1, 1, 0, 0, 0, 0, 1]
[0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1]
[0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0]
 240/500: episode: 24, duration: 0.101s, episode steps: 10, steps per second: 99, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.400 [0.000, 1.000], loss: 0.058817, mean_absolute_error: 0.774644, mean_q: 1.706526
[1, 1, 0]
[1, 1, 0, 1]
[1, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 0]
[1, 1, 0, 1, 1, 0,


[1, 1, 0, 1, 1, 0, 1, 0, 1]
[1, 1, 0, 1, 1, 0, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0]
[1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1]
 250/500: episode: 25, duration: 0.101s, episode steps: 10, steps per second: 99, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.071371, mean_absolute_error: 0.826683, mean_q: 1.826334
[1, 1, 0]
[1, 1, 0, 1]
[1, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 0]
[1, 1, 0, 1, 1, 0, 1]
[1, 1, 0, 1, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 0, 1, 1, 0]
[1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
[1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1]
 260/500: episode: 26, duration: 0.095s, episode steps: 10, steps per second: 105, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.072941, mean_absolute_error: 0.838560, mean_q: 1.872267
[1, 0, 1]
[1, 0, 1, 0]
[1, 0, 1, 0, 1]
[1, 0, 1, 0, 1, 0]
[1, 0, 1, 0, 1, 0

[1, 0, 1, 0, 1, 0, 1, 0, 1, 1]
[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0]
[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1]
 270/500: episode: 27, duration: 0.115s, episode steps: 10, steps per second: 87, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.084253, mean_absolute_error: 0.852800, mean_q: 1.910809
[0, 1, 1]
[0, 1, 1, 0]
[0, 1, 1, 0, 1]
[0, 1, 1, 0, 1, 0]
[0, 1, 1, 0, 1, 0, 0]
[0, 1, 1, 0, 1, 0, 0, 1]
[0, 1, 1, 0, 1, 0, 0, 1, 0]
[0, 1, 1, 0, 1, 0, 0, 1, 0, 1]
[0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1]
[0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0]
 280/500: episode: 28, duration: 0.096s, episode steps: 10, steps per second: 104, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.068525, mean_absolute_error: 0.875100, mean_q: 2.055014
[0, 1, 0]
[0, 1, 0, 1]
[0, 1, 0, 1, 1]
[0, 1, 0, 1, 1, 0]
[0, 1, 0, 1, 1, 0, 1]
[0, 1, 0, 1, 1, 0, 1, 0]


[0, 1, 0, 1, 1, 0, 1, 0, 0, 0]
[0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0]
[0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0]
 290/500: episode: 29, duration: 0.110s, episode steps: 10, steps per second: 91, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.350 [0.000, 1.000], loss: 0.068281, mean_absolute_error: 0.898057, mean_q: 2.152126
[0, 0, 0]
[0, 0, 0, 1]
[0, 0, 0, 1, 1]
[0, 0, 0, 1, 1, 0]
[0, 0, 0, 1, 1, 0, 1]
[0, 0, 0, 1, 1, 0, 1, 1]
[0, 0, 0, 1, 1, 0, 1, 1, 0]
[0, 0, 0, 1, 1, 0, 1, 1, 0, 1]
[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0]
[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1]
 300/500: episode: 30, duration: 0.110s, episode steps: 10, steps per second: 91, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.105475, mean_absolute_error: 0.987345, mean_q: 2.204072
[0, 1, 1]
[0, 1, 1, 1]
[0, 1, 1, 1, 0]
[0, 1, 1, 1, 0, 0]
[0, 1, 1, 1, 0, 0, 1]


[0, 1, 1, 1, 0, 0, 1, 1]
[0, 1, 1, 1, 0, 0, 1, 1, 0]
[0, 1, 1, 1, 0, 0, 1, 1, 0, 1]
[0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1]
[0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0]
 310/500: episode: 31, duration: 0.124s, episode steps: 10, steps per second: 81, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.104838, mean_absolute_error: 0.979407, mean_q: 2.247802
[0, 1, 1]
[0, 1, 1, 0]
[0, 1, 1, 0, 1]
[0, 1, 1, 0, 1, 0]
[0, 1, 1, 0, 1, 0, 1]
[0, 1, 1, 0, 1, 0, 1, 1]
[0, 1, 1, 0, 1, 0, 1, 1, 0]
[0, 1, 1, 0, 1, 0, 1, 1, 0, 1]
[0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1]
[0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1]
 320/500: episode: 32, duration: 0.109s, episode steps: 10, steps per second: 92, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.117579, mean_absolute_error: 1.005916, mean_q: 2.340500
[1, 0, 1]
[1, 0, 1, 1]


[1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0]
 330/500: episode: 33, duration: 0.110s, episode steps: 10, steps per second: 91, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.106108, mean_absolute_error: 1.075372, mean_q: 2.417346
[0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0, 1]
[0, 0, 0, 0, 1, 1]
[0, 0, 0, 0, 1, 1, 0]
[0, 0, 0, 0, 1, 1, 0, 1]
[0, 0, 0, 0, 1, 1, 0, 1, 1]
[0, 0, 0, 0, 1, 1, 0, 1, 1, 0]
[0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1]
[0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1]
 340/500: episode: 34, duration: 0.094s, episode steps: 10, steps per second: 107, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.139572, mean_absolute_error: 1.100892, mean_q: 2.


[1, 1, 0, 1, 1, 0]
[1, 1, 0, 1, 1, 0, 1]
[1, 1, 0, 1, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 0, 1, 1, 0]
[1, 1, 0, 1, 1, 0, 1, 1, 0, 0]
[1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0]
[1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0]
 350/500: episode: 35, duration: 0.111s, episode steps: 10, steps per second: 90, episode reward: 9.000, mean reward: 0.900 [0.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.450 [0.000, 1.000], loss: 0.176558, mean_absolute_error: 1.141138, mean_q: 2.490771
[1, 0, 1]
[1, 0, 1, 1]
[1, 0, 1, 1, 1]
[1, 0, 1, 1, 1, 0]
[1, 0, 1, 1, 1, 0, 1]
[1, 0, 1, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0]
[1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1]
 360/500: episode: 36, duration: 0.093s, episode steps: 10, steps per second: 107, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.107361, mean_absolute_error: 1.107930, mean_q: 2.565652
[1, 1, 0]

[1, 1, 0, 1, 1, 1]
[1, 1, 0, 1, 1, 1, 0]
[1, 1, 0, 1, 1, 1, 0, 1]
[1, 1, 0, 1, 1, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 1, 0, 1, 1, 1]
[1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1]
[1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0]
 370/500: episode: 37, duration: 0.102s, episode steps: 10, steps per second: 98, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.750 [0.000, 1.000], loss: 0.199844, mean_absolute_error: 1.216608, mean_q: 2.771676
[1, 0, 1]
[1, 0, 1, 0]
[1, 0, 1, 0, 1]
[1, 0, 1, 0, 1, 1]
[1, 0, 1, 0, 1, 1, 0]
[1, 0, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 0, 1, 1, 0, 1, 0]
[1, 0, 1, 0, 1, 1, 0, 1, 0, 0]
[1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0]
[1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1]
 380/500: episode: 38, duration: 0.098s, episode steps: 10, steps per second: 102, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.450 [0.000, 1.000], loss: 0.143917, mean_absolute_error: 1.219301, mean_q: 2.705633
[0, 0, 0]


[0, 0, 0, 0, 1, 0, 1]
[0, 0, 0, 0, 1, 0, 1, 1]
[0, 0, 0, 0, 1, 0, 1, 1, 1]
[0, 0, 0, 0, 1, 0, 1, 1, 1, 1]
[0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0]
 390/500: episode: 39, duration: 0.107s, episode steps: 10, steps per second: 93, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.169549, mean_absolute_error: 1.231380, mean_q: 2.880740
[0, 1, 0]
[0, 1, 0, 1]
[0, 1, 0, 1, 1]
[0, 1, 0, 1, 1, 1]
[0, 1, 0, 1, 1, 1, 0]
[0, 1, 0, 1, 1, 1, 0, 1]
[0, 1, 0, 1, 1, 1, 0, 1, 0]
[0, 1, 0, 1, 1, 1, 0, 1, 0, 0]
[0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1]
[0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1]
 400/500: episode: 40, duration: 0.100s, episode steps: 10, steps per second: 100, episode reward: 5.000, mean reward: 0.500 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.600 [0.000, 1.000], loss: 0.234701, mean_absolute_error: 1.321127, mean_q: 2.894804
[1, 0, 1]
[1, 0, 1, 1]
[1, 0,

[1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1]
 410/500: episode: 41, duration: 0.111s, episode steps: 10, steps per second: 90, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: 0.750 [0.000, 1.000], loss: 0.291819, mean_absolute_error: 1.388030, mean_q: 2.919295
[0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0, 1]
[0, 0, 0, 0, 1, 1]
[0, 0, 0, 0, 1, 1, 0]
[0, 0, 0, 0, 1, 1, 0, 1]
[0, 0, 0, 0, 1, 1, 0, 1, 1]
[0, 0, 0, 0, 1, 1, 0, 1, 1, 1]
[0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0]
[0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0]
 420/500: episode: 42, duration: 0.095s, episode steps: 10, steps per second: 105, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.190540, mean_absolute_error: 1.362183, mean_q: 2.955077
[1, 0, 1]
[1, 0, 1, 1]
[1, 0,


[1, 0, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
 430/500: episode: 43, duration: 0.111s, episode steps: 10, steps per second: 90, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.294704, mean_absolute_error: 1.445764, mean_q: 3.144175
[0, 0, 1]
[0, 0, 1, 1]
[0, 0, 1, 1, 0]
[0, 0, 1, 1, 0, 0]
[0, 0, 1, 1, 0, 0, 0]
[0, 0, 1, 1, 0, 0, 0, 1]
[0, 0, 1, 1, 0, 0, 0, 1, 0]
[0, 0, 1, 1, 0, 0, 0, 1, 0, 1]
[0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1]
[0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0]
 440/500: episode: 44, duration: 0.100s, episode steps: 10, steps per second: 100, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.261528, mean_absolute_error: 1.424281, mean_q: 3.087670
[1, 0, 0]
[1, 0, 0, 0]
[1, 0, 0, 0, 1]
[1, 0, 0, 


[1, 0, 0, 0, 1, 1, 0, 1]
[1, 0, 0, 0, 1, 1, 0, 1, 0]
[1, 0, 0, 0, 1, 1, 0, 1, 0, 1]
[1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1]
[1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1]
 450/500: episode: 45, duration: 0.101s, episode steps: 10, steps per second: 99, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.550 [0.000, 1.000], loss: 0.301147, mean_absolute_error: 1.487703, mean_q: 3.216314
[1, 1, 0]
[1, 1, 0, 1]
[1, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 1]
[1, 1, 0, 1, 1, 1, 0]
[1, 1, 0, 1, 1, 1, 0, 1]
[1, 1, 0, 1, 1, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 1, 0, 1, 1, 0]
[1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1]
[1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0]
 460/500: episode: 46, duration: 0.089s, episode steps: 10, steps per second: 113, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.249543, mean_absolute_error: 1.530695, mean_q: 3.274679
[1, 0, 1]
[1, 0, 1, 1]
[1, 0, 1, 1, 0]
[1, 0, 1, 1


[1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0]
 470/500: episode: 47, duration: 0.091s, episode steps: 10, steps per second: 109, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.301767, mean_absolute_error: 1.562073, mean_q: 3.393026
[0, 0, 1]
[0, 0, 1, 1]
[0, 0, 1, 1, 0]
[0, 0, 1, 1, 0, 1]
[0, 0, 1, 1, 0, 1, 1]
[0, 0, 1, 1, 0, 1, 1, 1]
[0, 0, 1, 1, 0, 1, 1, 1, 1]
[0, 0, 1, 1, 0, 1, 1, 1, 1, 0]
[0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1]
[0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0]
 480/500: episode: 48, duration: 0.095s, episode steps: 10, steps per second: 105, episode reward: 6.000, mean reward: 0.600 [0.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: 0.700 [0.000, 1.000], loss: 0.248140, mean_absolute_error: 1.583939, mean_q: 3.446417
[1, 0, 1]
[1, 0, 1, 0]
[1, 0, 1, 0, 1]
[1, 0, 1, 0, 1, 1]
[1, 0, 1, 0, 1, 1, 0]
[1, 0, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 0, 1, 1, 0, 1, 1]
[1, 0, 1, 0, 1, 1, 0, 1, 1, 0]
[1,

 490/500: episode: 49, duration: 0.086s, episode steps: 10, steps per second: 117, episode reward: 8.000, mean reward: 0.800 [0.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.500 [0.000, 1.000], loss: 0.317350, mean_absolute_error: 1.620282, mean_q: 3.551276
[0, 1, 1]
[0, 1, 1, 0]
[0, 1, 1, 0, 1]
[0, 1, 1, 0, 1, 1]
[0, 1, 1, 0, 1, 1, 0]
[0, 1, 1, 0, 1, 1, 0, 0]
[0, 1, 1, 0, 1, 1, 0, 0, 1]
[0, 1, 1, 0, 1, 1, 0, 0, 1, 1]
[0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1]
[0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0]
 500/500: episode: 50, duration: 0.098s, episode steps: 10, steps per second: 103, episode reward: 7.000, mean reward: 0.700 [0.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.650 [0.000, 1.000], loss: 0.360748, mean_absolute_error: 1.667379, mean_q: 3.601879
done, took 6.166 seconds


<keras.callbacks.History at 0x25d1fef5320>

In [29]:
env.reset()
dqn.test(env)
print(env.state_list)

Testing for 1 episodes ...
Episode 1: reward: 10.000, steps: 10
[1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0]


In [64]:
env.reset()

array([1, 1])

In [73]:
np.argmax(model.predict(env.state.reshape(1, 1, 2)))

1

In [72]:
env.step(0)

(array([1, 0]), 1, False, {})

In [53]:
model.get_layer(index=0).input

<tf.Tensor 'flatten_2_input:0' shape=(?, 1, 2) dtype=float32>

In [76]:
dqn.test(env, 2, visualize=True)

Testing for 2 episodes ...
[0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Episode 1: reward: 10.000, steps: 10
[1, 1, 0]
[1, 1, 0, 1]
[1, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 0]
[1, 1, 0, 1, 1, 0, 1]
[1, 1, 0, 1, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 0, 1, 1, 0]
[1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
[1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0]
Episode 2: reward: 10.000, steps: 10


<keras.callbacks.History at 0x25d88f546d8>