In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, Reshape
from keras.optimizers import Adam
from keras.regularizers import l2

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
import cupcake_game

# Number of distinct player actions (moving up, down, left, right).
nb_actions = 4
# Instantiate the game environment the agent will interact with.
env = cupcake_game.Game(image=True, step=True)

# Hyperparameters used by the CNN.
strides = (3, 3)
kernel = (13, 13)
n_filters = 8
hidden_size = 128

# CNN model: maps an observation to one linear output per action.
model = Sequential()
# The network input carries an extra leading axis of length 1 in front of the
# observation shape; reshape it away so the convolutions see the observation
# shape directly.
model.add(Reshape(env.observation_space.shape,
                  input_shape=(1,) + env.observation_space.shape))
# Three identical strided convolution layers.
for layer_index in range(3):
    model.add(Conv2D(n_filters, kernel, strides=strides, activation='relu', padding='same'))
model.add(Flatten())
# Three identical L2-regularized fully connected layers.
for layer_index in range(3):
    model.add(Dense(hidden_size, kernel_initializer='he_normal', activation='relu',
                    kernel_regularizer=l2(0.01)))
# Output layer: one unbounded (linear) value per action.
model.add(Dense(nb_actions, activation='linear'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

# Replay memory: keeps at most 100000 entries, with a window of one observation.
memory = SequentialMemory(window_length=1, limit=100000)
# Epsilon-greedy action selection.
# NOTE(review): eps=0.001 leaves almost no random exploration, and the recorded
# training log shows many episodes repeating a single action — confirm this
# value is intentional.
policy = EpsGreedyQPolicy(eps=0.001)

# DQN agent wrapping the CNN model.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=nb_actions,
    gamma=0.99,
    nb_steps_warmup=100,
    target_model_update=1e-2,
)

# Compile with Adam and report mean absolute error as an extra metric.
dqn.compile(optimizer=Adam(lr=1e-3), metrics=['mae'])

# File in which the network weights are stored.
fname = "cupcake_dqn_weights.bin"
# Best-effort resume: try to load previously saved weights and carry on with
# the current weights if that fails.
try:
    dqn.load_weights(fname)
except Exception as exc:
    # Catch Exception rather than using a bare except, so that
    # KeyboardInterrupt / SystemExit still propagate, and report the cause so
    # that a corrupt or incompatible weight file is not silently mistaken for
    # a missing one.
    print("Weights are NOT loaded.")
    print("  reason: {!r}".format(exc))
else:
    print("Weights are loaded.")

# Run training for 200000 environment steps.
history = dqn.fit(env=env, nb_steps=200000, verbose=2)

# Persist the weights to the weight file, replacing any existing file.
dqn.save_weights(filepath=fname, overwrite=True)

# Run the agent for 50 test episodes.
dqn.test(env=env, nb_episodes=50)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 151, 120, 3)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 51, 40, 8)         4064      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 17, 14, 8)         10824     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 6, 5, 8)           10824     
_________________________________________________________________
flatten_1 (Flatten)          (None, 240)               0         
_________________________________________________________________
dense_1 (Dense)              

   1045/200000: episode: 23, duration: 21.189s, episode steps: 40, steps per second: 2, episode reward: 0.450, mean reward: 0.011 [0.000, 0.450], mean action: 2.450 [2.000, 3.000], mean observation: 168.752 [0.000, 255.000], loss: 0.004182, mae: 0.511463, mean_q: 0.762312
   1085/200000: episode: 24, duration: 21.322s, episode steps: 40, steps per second: 2, episode reward: -10.000, mean reward: -0.250 [-10.000, 0.000], mean action: 3.000 [3.000, 3.000], mean observation: 165.544 [0.000, 255.000], loss: 0.002667, mae: 0.505368, mean_q: 0.755643
   1124/200000: episode: 25, duration: 21.285s, episode steps: 39, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.103 [2.000, 3.000], mean observation: 171.041 [0.000, 255.000], loss: 0.141228, mae: 0.455891, mean_q: 0.740716
   1164/200000: episode: 26, duration: 21.259s, episode steps: 40, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.0

   2244/200000: episode: 53, duration: 21.297s, episode steps: 40, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.675 [0.000, 255.000], loss: 0.087581, mae: 0.342243, mean_q: 0.533888
   2283/200000: episode: 54, duration: 21.059s, episode steps: 39, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.703 [0.000, 255.000], loss: 0.045370, mae: 0.332210, mean_q: 0.536138
   2323/200000: episode: 55, duration: 21.289s, episode steps: 40, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.675 [0.000, 255.000], loss: 0.086948, mae: 0.334436, mean_q: 0.528418
   2363/200000: episode: 56, duration: 21.213s, episode steps: 40, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], 

   3473/200000: episode: 84, duration: 21.291s, episode steps: 40, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.675 [0.000, 255.000], loss: 0.000596, mae: 0.252545, mean_q: 0.396290
   3513/200000: episode: 85, duration: 21.326s, episode steps: 40, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.675 [0.000, 255.000], loss: 0.125869, mae: 0.255997, mean_q: 0.384944
   3552/200000: episode: 86, duration: 20.970s, episode steps: 39, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.703 [0.000, 255.000], loss: 0.001117, mae: 0.235794, mean_q: 0.388051
   3592/200000: episode: 87, duration: 21.394s, episode steps: 40, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], 

   4650/200000: episode: 114, duration: 20.984s, episode steps: 39, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.703 [0.000, 255.000], loss: 0.000614, mae: 0.162559, mean_q: 0.292301
   4689/200000: episode: 115, duration: 20.984s, episode steps: 39, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.703 [0.000, 255.000], loss: 0.083706, mae: 0.168946, mean_q: 0.288024
   4728/200000: episode: 116, duration: 20.956s, episode steps: 39, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.703 [0.000, 255.000], loss: 0.042193, mae: 0.160378, mean_q: 0.285709
   4767/200000: episode: 117, duration: 21.068s, episode steps: 39, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.00

   5818/200000: episode: 144, duration: 21.034s, episode steps: 39, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.703 [0.000, 255.000], loss: 0.000367, mae: 0.128220, mean_q: 0.228303
   5857/200000: episode: 145, duration: 21.088s, episode steps: 39, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.703 [0.000, 255.000], loss: 0.000267, mae: 0.128281, mean_q: 0.227746
   5896/200000: episode: 146, duration: 21.248s, episode steps: 39, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.000], mean observation: 171.703 [0.000, 255.000], loss: 0.041418, mae: 0.130978, mean_q: 0.225909
   5935/200000: episode: 147, duration: 21.397s, episode steps: 39, steps per second: 2, episode reward: 0.150, mean reward: 0.004 [0.000, 0.150], mean action: 2.000 [2.000, 2.00

   6962/200000: episode: 174, duration: 21.534s, episode steps: 32, steps per second: 1, episode reward: 0.050, mean reward: 0.002 [0.000, 0.050], mean action: 0.000 [0.000, 0.000], mean observation: 172.038 [0.000, 255.000], loss: 0.001639, mae: 0.155655, mean_q: 0.267325
   6995/200000: episode: 175, duration: 21.475s, episode steps: 33, steps per second: 2, episode reward: 0.050, mean reward: 0.002 [0.000, 0.050], mean action: 0.000 [0.000, 0.000], mean observation: 171.989 [0.000, 255.000], loss: 0.001007, mae: 0.160608, mean_q: 0.280250
   7028/200000: episode: 176, duration: 21.681s, episode steps: 33, steps per second: 2, episode reward: 0.050, mean reward: 0.002 [0.000, 0.050], mean action: 0.000 [0.000, 0.000], mean observation: 171.989 [0.000, 255.000], loss: 0.000693, mae: 0.162762, mean_q: 0.278309
   7061/200000: episode: 177, duration: 21.587s, episode steps: 33, steps per second: 2, episode reward: 0.050, mean reward: 0.002 [0.000, 0.050], mean action: 0.000 [0.000, 0.00

   8051/200000: episode: 204, duration: 21.400s, episode steps: 39, steps per second: 2, episode reward: 1.150, mean reward: 0.029 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.551 [0.000, 255.000], loss: 0.043640, mae: 0.225033, mean_q: 0.373813
   8090/200000: episode: 205, duration: 21.422s, episode steps: 39, steps per second: 2, episode reward: 1.150, mean reward: 0.029 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.551 [0.000, 255.000], loss: 0.043736, mae: 0.228224, mean_q: 0.382090
   8129/200000: episode: 206, duration: 21.329s, episode steps: 39, steps per second: 2, episode reward: 1.150, mean reward: 0.029 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.551 [0.000, 255.000], loss: 0.002800, mae: 0.228453, mean_q: 0.382761
   8168/200000: episode: 207, duration: 21.211s, episode steps: 39, steps per second: 2, episode reward: 1.250, mean reward: 0.032 [0.000, 1.250], mean action: 0.897 [0.000, 1.00

   9205/200000: episode: 234, duration: 21.333s, episode steps: 39, steps per second: 2, episode reward: 1.150, mean reward: 0.029 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.551 [0.000, 255.000], loss: 0.004980, mae: 0.317498, mean_q: 0.487722
   9243/200000: episode: 235, duration: 21.037s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.004589, mae: 0.330708, mean_q: 0.511061
   9281/200000: episode: 236, duration: 21.069s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.003877, mae: 0.340809, mean_q: 0.522276
   9319/200000: episode: 237, duration: 21.122s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.00

  10347/200000: episode: 264, duration: 21.030s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.003981, mae: 0.401109, mean_q: 0.603379
  10385/200000: episode: 265, duration: 21.337s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.003208, mae: 0.401091, mean_q: 0.600822
  10423/200000: episode: 266, duration: 20.929s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.004573, mae: 0.408664, mean_q: 0.604840
  10461/200000: episode: 267, duration: 21.102s, episode steps: 38, steps per second: 2, episode reward: 1.250, mean reward: 0.033 [0.000, 1.250], mean action: 0.105 [0.000, 1.00

  11484/200000: episode: 294, duration: 21.267s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.002966, mae: 0.467363, mean_q: 0.651188
  11522/200000: episode: 295, duration: 21.238s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.049397, mae: 0.465161, mean_q: 0.651357
  11560/200000: episode: 296, duration: 21.546s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.003506, mae: 0.464673, mean_q: 0.656960
  11596/200000: episode: 297, duration: 21.372s, episode steps: 36, steps per second: 2, episode reward: 1.250, mean reward: 0.035 [0.000, 1.250], mean action: 0.833 [0.000, 1.00

  12687/200000: episode: 324, duration: 21.493s, episode steps: 39, steps per second: 2, episode reward: 1.150, mean reward: 0.029 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.551 [0.000, 255.000], loss: 0.005296, mae: 0.488597, mean_q: 0.704475
  12726/200000: episode: 325, duration: 21.525s, episode steps: 39, steps per second: 2, episode reward: 1.350, mean reward: 0.035 [0.000, 1.350], mean action: 0.846 [0.000, 1.000], mean observation: 164.184 [0.000, 255.000], loss: 0.003408, mae: 0.491113, mean_q: 0.704695
  12764/200000: episode: 326, duration: 21.214s, episode steps: 38, steps per second: 2, episode reward: 1.350, mean reward: 0.036 [0.000, 1.350], mean action: 0.368 [0.000, 1.000], mean observation: 164.117 [0.000, 255.000], loss: 0.003566, mae: 0.492430, mean_q: 0.703824
  12803/200000: episode: 327, duration: 21.408s, episode steps: 39, steps per second: 2, episode reward: 1.150, mean reward: 0.029 [0.000, 1.150], mean action: 1.000 [1.000, 1.00

  13851/200000: episode: 354, duration: 21.441s, episode steps: 39, steps per second: 2, episode reward: 1.250, mean reward: 0.032 [0.000, 1.250], mean action: 0.897 [0.000, 1.000], mean observation: 165.321 [0.000, 255.000], loss: 0.003324, mae: 0.523300, mean_q: 0.737334
  13890/200000: episode: 355, duration: 21.527s, episode steps: 39, steps per second: 2, episode reward: 1.150, mean reward: 0.029 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.551 [0.000, 255.000], loss: 0.003322, mae: 0.525860, mean_q: 0.740747
  13929/200000: episode: 356, duration: 21.439s, episode steps: 39, steps per second: 2, episode reward: 1.250, mean reward: 0.032 [0.000, 1.250], mean action: 0.974 [0.000, 1.000], mean observation: 164.369 [0.000, 255.000], loss: 0.004152, mae: 0.528489, mean_q: 0.737348
  13968/200000: episode: 357, duration: 21.355s, episode steps: 39, steps per second: 2, episode reward: 1.150, mean reward: 0.029 [0.000, 1.150], mean action: 1.000 [1.000, 1.00

  15011/200000: episode: 384, duration: 21.617s, episode steps: 39, steps per second: 2, episode reward: 1.250, mean reward: 0.032 [0.000, 1.250], mean action: 0.949 [0.000, 1.000], mean observation: 164.942 [0.000, 255.000], loss: 0.049317, mae: 0.545336, mean_q: 0.766317
  15049/200000: episode: 385, duration: 20.995s, episode steps: 38, steps per second: 2, episode reward: 1.350, mean reward: 0.036 [0.000, 1.350], mean action: 0.947 [0.000, 1.000], mean observation: 164.230 [0.000, 255.000], loss: 0.003229, mae: 0.544241, mean_q: 0.768914
  15088/200000: episode: 386, duration: 21.417s, episode steps: 39, steps per second: 2, episode reward: 1.150, mean reward: 0.029 [0.000, 1.150], mean action: 0.077 [0.000, 1.000], mean observation: 171.391 [0.000, 255.000], loss: 0.003419, mae: 0.547924, mean_q: 0.773411
  15126/200000: episode: 387, duration: 21.049s, episode steps: 38, steps per second: 2, episode reward: 1.350, mean reward: 0.036 [0.000, 1.350], mean action: 0.763 [0.000, 1.00

  16154/200000: episode: 414, duration: 21.174s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 0.553 [0.000, 1.000], mean observation: 167.920 [0.000, 255.000], loss: 0.003686, mae: 0.536850, mean_q: 0.774122
  16192/200000: episode: 415, duration: 21.210s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.050128, mae: 0.542872, mean_q: 0.783364
  16230/200000: episode: 416, duration: 21.348s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.004484, mae: 0.539549, mean_q: 0.782434
  16268/200000: episode: 417, duration: 21.050s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.00

  17293/200000: episode: 444, duration: 21.233s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.003629, mae: 0.561327, mean_q: 0.790122
  17331/200000: episode: 445, duration: 21.233s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.002910, mae: 0.560357, mean_q: 0.786179
  17369/200000: episode: 446, duration: 21.211s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.003007, mae: 0.561217, mean_q: 0.789516
  17407/200000: episode: 447, duration: 21.152s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.00

  18433/200000: episode: 474, duration: 21.158s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.002745, mae: 0.562857, mean_q: 0.792245
  18471/200000: episode: 475, duration: 21.240s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.002402, mae: 0.566581, mean_q: 0.794994
  18509/200000: episode: 476, duration: 21.371s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.003920, mae: 0.569360, mean_q: 0.798075
  18547/200000: episode: 477, duration: 21.267s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 0.684 [0.000, 1.00

  19570/200000: episode: 504, duration: 21.459s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.004030, mae: 0.577852, mean_q: 0.804205
  19608/200000: episode: 505, duration: 21.464s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.003002, mae: 0.577608, mean_q: 0.803149
  19645/200000: episode: 506, duration: 21.019s, episode steps: 37, steps per second: 2, episode reward: 1.150, mean reward: 0.031 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.631 [0.000, 255.000], loss: 0.002598, mae: 0.581531, mean_q: 0.803848
  19683/200000: episode: 507, duration: 21.499s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 0.395 [0.000, 1.00

  20702/200000: episode: 534, duration: 21.401s, episode steps: 36, steps per second: 2, episode reward: 1.150, mean reward: 0.032 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.675 [0.000, 255.000], loss: 0.003367, mae: 0.575981, mean_q: 0.816420
  20738/200000: episode: 535, duration: 21.596s, episode steps: 36, steps per second: 2, episode reward: 1.350, mean reward: 0.038 [0.000, 1.350], mean action: 0.917 [0.000, 1.000], mean observation: 164.297 [0.000, 255.000], loss: 0.003790, mae: 0.574673, mean_q: 0.808889
  20774/200000: episode: 536, duration: 21.480s, episode steps: 36, steps per second: 2, episode reward: 1.150, mean reward: 0.032 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.675 [0.000, 255.000], loss: 0.002296, mae: 0.578852, mean_q: 0.815942
  20809/200000: episode: 537, duration: 21.099s, episode steps: 35, steps per second: 2, episode reward: 1.150, mean reward: 0.033 [0.000, 1.150], mean action: 0.314 [0.000, 1.00

  21800/200000: episode: 564, duration: 21.503s, episode steps: 38, steps per second: 2, episode reward: 1.150, mean reward: 0.030 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.590 [0.000, 255.000], loss: 0.004041, mae: 0.564364, mean_q: 0.814970
  21837/200000: episode: 565, duration: 21.113s, episode steps: 37, steps per second: 2, episode reward: 1.250, mean reward: 0.034 [0.000, 1.250], mean action: 0.378 [0.000, 1.000], mean observation: 164.335 [0.000, 255.000], loss: 0.003397, mae: 0.572914, mean_q: 0.823206
  21874/200000: episode: 566, duration: 21.165s, episode steps: 37, steps per second: 2, episode reward: 1.250, mean reward: 0.034 [0.000, 1.250], mean action: 0.649 [0.000, 1.000], mean observation: 166.469 [0.000, 255.000], loss: 0.004244, mae: 0.572882, mean_q: 0.821017
  21912/200000: episode: 567, duration: 21.569s, episode steps: 38, steps per second: 2, episode reward: 0.050, mean reward: 0.001 [0.000, 0.050], mean action: 0.000 [0.000, 0.00

  22909/200000: episode: 594, duration: 21.188s, episode steps: 37, steps per second: 2, episode reward: 1.150, mean reward: 0.031 [0.000, 1.150], mean action: 1.000 [1.000, 1.000], mean observation: 164.631 [0.000, 255.000], loss: 0.003677, mae: 0.579745, mean_q: 0.836974
  22946/200000: episode: 595, duration: 21.126s, episode steps: 37, steps per second: 2, episode reward: 1.350, mean reward: 0.036 [0.000, 1.350], mean action: 0.216 [0.000, 1.000], mean observation: 164.118 [0.000, 255.000], loss: 0.051816, mae: 0.582657, mean_q: 0.831830
