In [1]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, Reshape
from keras.optimizers import Adam
from keras.regularizers import l2

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
import cupcake_game

# Instantiate the game environment from the local cupcake_game module.
# NOTE(review): step=True / image=True presumably select step-wise control and
# image (pixel) observations — confirm against cupcake_game.Game.
env = cupcake_game.Game(step=True, image=True)
# Number of distinct player actions (movement: up, down, left, right).
nb_actions = 4

# Hyperparameters for the CNN
hidden_size = 128     # units in each fully connected hidden layer
n_filters = 8         # output channels of each convolution
kernel = (13, 13)     # convolution window size
strides = (3, 3)      # convolution stride along each spatial axis
n_conv_layers = 3     # number of stacked convolution layers
n_dense_layers = 3    # number of stacked hidden dense layers
l2_penalty = 0.01     # L2 regularization factor on the hidden dense kernels

# CNN model: maps one observation to one Q-value per action.
model = Sequential()
# The network input carries a leading axis of length 1 in front of the
# observation shape; Reshape drops it so the convolutions receive the plain
# observation shape.
model.add(Reshape(env.observation_space.shape, input_shape=(1,) + env.observation_space.shape))
# Identical convolution layers stacked back to back. Named loop indices are used
# instead of `_` so IPython's "last output" variable is not shadowed.
for conv_index in range(n_conv_layers):
    model.add(Conv2D(n_filters, kernel, strides=strides, activation='relu', padding='same'))
model.add(Flatten())
# Identical L2-regularized hidden layers; a fresh regularizer is built per layer.
for dense_index in range(n_dense_layers):
    model.add(Dense(hidden_size, kernel_initializer='he_normal', activation='relu',
                    kernel_regularizer=l2(l2_penalty)))
# Linear output head: one unbounded value per action.
model.add(Dense(nb_actions, activation='linear'))
# summary() prints the table itself and returns None; wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()

# Replay memory (limit of 100000 entries, window of a single observation).
memory = SequentialMemory(window_length=1, limit=100000)
# Epsilon-greedy action selection with eps fixed at 0.1.
policy = EpsGreedyQPolicy(eps=0.1)

# DQN agent built around the Q-network defined above.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    gamma=0.99,
    nb_steps_warmup=100,
    target_model_update=1e-2,
)

# Compile with Adam (learning rate 1e-3) and track mean absolute error.
dqn.compile(optimizer=Adam(lr=1e-3), metrics=['mae'])

# File in which the network weights are stored between runs.
fname = "cupcake_dqn_weights.bin"
# Best-effort resume: continue with freshly initialized weights when loading
# fails (e.g. the file does not exist yet on the first run).
try:
    dqn.load_weights(fname)
    print("Weights are loaded.")
except Exception as load_error:
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate. The reason is reported because these weights
    # are later written back to the same file, and a silent failure here would
    # hide that a previously saved file could not be read.
    print("Weights are NOT loaded.")
    print("  reason: {}: {}".format(type(load_error).__name__, load_error))

# Run training; the returned History object is kept for later inspection.
history = dqn.fit(env, nb_steps=20000, verbose=2)

# Persist the trained weights, replacing any existing file of the same name.
dqn.save_weights(fname, overwrite=True)

# Run evaluation episodes. The result is bound to a name so it can be inspected
# afterwards, and so the cell does not end on a bare expression whose raw
# History repr would be echoed as the cell output.
test_history = dqn.test(env, nb_episodes=10)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 151, 120, 3)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 51, 40, 8)         4064      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 17, 14, 8)         10824     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 6, 5, 8)           10824     
_________________________________________________________________
flatten_1 (Flatten)          (None, 240)               0         
_________________________________________________________________
dense_1 (Dense)              

   997/20000: episode: 23, duration: 20.838s, episode steps: 45, steps per second: 2, episode reward: 2.500, mean reward: 0.056 [0.000, 2.500], mean action: 1.889 [0.000, 2.000], mean observation: 158.758 [0.000, 255.000], loss: 0.046139, mae: 2.780423, mean_q: 3.825707
  1041/20000: episode: 24, duration: 21.114s, episode steps: 44, steps per second: 2, episode reward: 4.100, mean reward: 0.093 [0.000, 4.100], mean action: 1.409 [0.000, 3.000], mean observation: 161.493 [0.000, 255.000], loss: 0.039330, mae: 2.739099, mean_q: 3.731833
  1084/20000: episode: 25, duration: 21.122s, episode steps: 43, steps per second: 2, episode reward: 3.000, mean reward: 0.070 [0.000, 3.000], mean action: 0.791 [0.000, 2.000], mean observation: 160.849 [0.000, 255.000], loss: 0.042308, mae: 2.797349, mean_q: 3.809203
  1126/20000: episode: 26, duration: 20.965s, episode steps: 42, steps per second: 2, episode reward: 2.700, mean reward: 0.064 [0.000, 2.700], mean action: 1.000 [0.000, 2.000], mean obs

  2249/20000: episode: 54, duration: 20.995s, episode steps: 39, steps per second: 2, episode reward: 1.250, mean reward: 0.032 [0.000, 1.250], mean action: 0.333 [0.000, 3.000], mean observation: 163.271 [0.000, 255.000], loss: 0.024901, mae: 2.574172, mean_q: 3.480442
  2289/20000: episode: 55, duration: 21.126s, episode steps: 40, steps per second: 2, episode reward: 2.550, mean reward: 0.064 [0.000, 2.550], mean action: 1.550 [0.000, 3.000], mean observation: 163.510 [0.000, 255.000], loss: 0.022722, mae: 2.579547, mean_q: 3.485327
  2329/20000: episode: 56, duration: 21.351s, episode steps: 40, steps per second: 2, episode reward: 1.450, mean reward: 0.036 [0.000, 1.450], mean action: 0.725 [0.000, 3.000], mean observation: 163.582 [0.000, 255.000], loss: 0.087097, mae: 2.573868, mean_q: 3.466062
  2368/20000: episode: 57, duration: 20.986s, episode steps: 39, steps per second: 2, episode reward: 1.250, mean reward: 0.032 [0.000, 1.250], mean action: 0.487 [0.000, 3.000], mean obs

  3255/20000: episode: 85, duration: 21.628s, episode steps: 23, steps per second: 1, episode reward: 4.800, mean reward: 0.209 [0.000, 4.800], mean action: 1.304 [0.000, 2.000], mean observation: 164.255 [0.000, 255.000], loss: 0.119612, mae: 2.169650, mean_q: 2.928690
  3277/20000: episode: 86, duration: 22.750s, episode steps: 22, steps per second: 1, episode reward: 0.200, mean reward: 0.009 [0.000, 0.200], mean action: 2.727 [0.000, 3.000], mean observation: 167.558 [0.000, 255.000], loss: 0.136926, mae: 2.211826, mean_q: 2.990405
  3297/20000: episode: 87, duration: 22.779s, episode steps: 20, steps per second: 1, episode reward: 0.100, mean reward: 0.005 [0.000, 0.100], mean action: 2.750 [0.000, 3.000], mean observation: 167.269 [0.000, 255.000], loss: 0.255952, mae: 2.208281, mean_q: 2.978340
  3320/20000: episode: 88, duration: 22.528s, episode steps: 23, steps per second: 1, episode reward: 2.450, mean reward: 0.107 [0.000, 2.450], mean action: 1.000 [0.000, 3.000], mean obs

  3903/20000: episode: 116, duration: 22.815s, episode steps: 22, steps per second: 1, episode reward: 1.250, mean reward: 0.057 [0.000, 1.250], mean action: 1.773 [0.000, 2.000], mean observation: 164.777 [0.000, 255.000], loss: 0.137144, mae: 2.116305, mean_q: 2.841987
  3925/20000: episode: 117, duration: 22.387s, episode steps: 22, steps per second: 1, episode reward: 1.150, mean reward: 0.052 [0.000, 1.150], mean action: 1.909 [0.000, 3.000], mean observation: 160.372 [0.000, 255.000], loss: 0.027837, mae: 2.101790, mean_q: 2.829130
  3948/20000: episode: 118, duration: 22.251s, episode steps: 23, steps per second: 1, episode reward: 4.950, mean reward: 0.215 [0.000, 4.950], mean action: 1.130 [0.000, 2.000], mean observation: 163.992 [0.000, 255.000], loss: 0.229149, mae: 2.115800, mean_q: 2.837968
  3971/20000: episode: 119, duration: 22.522s, episode steps: 23, steps per second: 1, episode reward: 1.150, mean reward: 0.050 [0.000, 1.150], mean action: 1.913 [0.000, 3.000], mean

  4906/20000: episode: 147, duration: 21.444s, episode steps: 35, steps per second: 2, episode reward: 2.650, mean reward: 0.076 [0.000, 2.650], mean action: 1.457 [0.000, 3.000], mean observation: 161.008 [0.000, 255.000], loss: 0.090628, mae: 1.929389, mean_q: 2.598830
  4939/20000: episode: 148, duration: 21.708s, episode steps: 33, steps per second: 2, episode reward: 2.650, mean reward: 0.080 [0.000, 2.650], mean action: 1.576 [0.000, 3.000], mean observation: 162.466 [0.000, 255.000], loss: 0.219096, mae: 1.930538, mean_q: 2.602988
  4973/20000: episode: 149, duration: 21.374s, episode steps: 34, steps per second: 2, episode reward: 2.700, mean reward: 0.079 [0.000, 2.700], mean action: 1.088 [0.000, 3.000], mean observation: 160.661 [0.000, 255.000], loss: 0.152499, mae: 1.924819, mean_q: 2.594511
  5007/20000: episode: 150, duration: 21.520s, episode steps: 34, steps per second: 2, episode reward: 2.450, mean reward: 0.072 [0.000, 2.450], mean action: 1.971 [0.000, 3.000], mean

  5933/20000: episode: 178, duration: 21.836s, episode steps: 34, steps per second: 2, episode reward: 1.250, mean reward: 0.037 [0.000, 1.250], mean action: 2.471 [0.000, 3.000], mean observation: 163.560 [0.000, 255.000], loss: 0.138983, mae: 1.833264, mean_q: 2.485834
  5966/20000: episode: 179, duration: 21.393s, episode steps: 33, steps per second: 2, episode reward: 2.550, mean reward: 0.077 [0.000, 2.550], mean action: 1.242 [0.000, 3.000], mean observation: 163.891 [0.000, 255.000], loss: 0.203095, mae: 1.838422, mean_q: 2.494488
  5999/20000: episode: 180, duration: 21.358s, episode steps: 33, steps per second: 2, episode reward: 2.550, mean reward: 0.077 [0.000, 2.550], mean action: 2.152 [0.000, 3.000], mean observation: 161.433 [0.000, 255.000], loss: 0.139666, mae: 1.844954, mean_q: 2.500036
  6032/20000: episode: 181, duration: 21.492s, episode steps: 33, steps per second: 2, episode reward: 2.800, mean reward: 0.085 [0.000, 2.800], mean action: 1.182 [0.000, 3.000], mean

  6947/20000: episode: 209, duration: 21.547s, episode steps: 32, steps per second: 1, episode reward: 1.250, mean reward: 0.039 [0.000, 1.250], mean action: 0.688 [0.000, 3.000], mean observation: 164.483 [0.000, 255.000], loss: 0.262026, mae: 1.739233, mean_q: 2.331042
  6980/20000: episode: 210, duration: 21.630s, episode steps: 33, steps per second: 2, episode reward: 4.050, mean reward: 0.123 [0.000, 4.050], mean action: 0.545 [0.000, 2.000], mean observation: 162.634 [0.000, 255.000], loss: 0.133502, mae: 1.758830, mean_q: 2.375882
  7013/20000: episode: 211, duration: 21.555s, episode steps: 33, steps per second: 2, episode reward: 2.450, mean reward: 0.074 [0.000, 2.450], mean action: 0.697 [0.000, 3.000], mean observation: 163.889 [0.000, 255.000], loss: 0.016134, mae: 1.766261, mean_q: 2.401660
  7046/20000: episode: 212, duration: 21.578s, episode steps: 33, steps per second: 2, episode reward: 1.250, mean reward: 0.038 [0.000, 1.250], mean action: 1.000 [0.000, 3.000], mean

  7909/20000: episode: 240, duration: 21.440s, episode steps: 32, steps per second: 1, episode reward: 4.850, mean reward: 0.152 [0.000, 4.850], mean action: 1.031 [0.000, 2.000], mean observation: 164.260 [0.000, 255.000], loss: 0.249252, mae: 1.701394, mean_q: 2.304226
  7941/20000: episode: 241, duration: 21.854s, episode steps: 32, steps per second: 1, episode reward: 1.250, mean reward: 0.039 [0.000, 1.250], mean action: 1.062 [0.000, 2.000], mean observation: 162.577 [0.000, 255.000], loss: 0.080163, mae: 1.715425, mean_q: 2.334441
  7972/20000: episode: 242, duration: 21.567s, episode steps: 31, steps per second: 1, episode reward: 2.350, mean reward: 0.076 [0.000, 2.350], mean action: 1.097 [0.000, 3.000], mean observation: 162.585 [0.000, 255.000], loss: 0.016761, mae: 1.711706, mean_q: 2.339354
  8003/20000: episode: 243, duration: 21.256s, episode steps: 31, steps per second: 1, episode reward: 2.650, mean reward: 0.085 [0.000, 2.650], mean action: 1.161 [0.000, 3.000], mean

  8854/20000: episode: 271, duration: 21.693s, episode steps: 32, steps per second: 1, episode reward: 1.250, mean reward: 0.039 [0.000, 1.250], mean action: 1.531 [0.000, 3.000], mean observation: 164.580 [0.000, 255.000], loss: 0.011210, mae: 1.736886, mean_q: 2.366236
  8886/20000: episode: 272, duration: 21.702s, episode steps: 32, steps per second: 1, episode reward: 1.250, mean reward: 0.039 [0.000, 1.250], mean action: 2.781 [0.000, 3.000], mean observation: 162.633 [0.000, 255.000], loss: 0.201873, mae: 1.750175, mean_q: 2.398872
  8917/20000: episode: 273, duration: 21.712s, episode steps: 31, steps per second: 1, episode reward: 1.150, mean reward: 0.037 [0.000, 1.150], mean action: 0.935 [0.000, 3.000], mean observation: 160.007 [0.000, 255.000], loss: 0.086627, mae: 1.699071, mean_q: 2.314070
  8948/20000: episode: 274, duration: 21.924s, episode steps: 31, steps per second: 1, episode reward: 1.250, mean reward: 0.040 [0.000, 1.250], mean action: 1.581 [0.000, 3.000], mean

  9835/20000: episode: 302, duration: 21.372s, episode steps: 33, steps per second: 2, episode reward: 1.350, mean reward: 0.041 [0.000, 1.350], mean action: 2.515 [0.000, 3.000], mean observation: 163.238 [0.000, 255.000], loss: 0.125741, mae: 1.709072, mean_q: 2.318754
  9868/20000: episode: 303, duration: 21.629s, episode steps: 33, steps per second: 2, episode reward: 1.550, mean reward: 0.047 [0.000, 1.550], mean action: 2.303 [0.000, 3.000], mean observation: 162.245 [0.000, 255.000], loss: 0.021743, mae: 1.741783, mean_q: 2.364228
  9901/20000: episode: 304, duration: 21.321s, episode steps: 33, steps per second: 2, episode reward: 2.550, mean reward: 0.077 [0.000, 2.550], mean action: 1.424 [0.000, 3.000], mean observation: 162.770 [0.000, 255.000], loss: 0.020682, mae: 1.710047, mean_q: 2.310116
  9934/20000: episode: 305, duration: 21.491s, episode steps: 33, steps per second: 2, episode reward: 1.450, mean reward: 0.044 [0.000, 1.450], mean action: 1.303 [0.000, 3.000], mean

 10841/20000: episode: 333, duration: 21.263s, episode steps: 31, steps per second: 1, episode reward: 2.800, mean reward: 0.090 [0.000, 2.800], mean action: 0.839 [0.000, 2.000], mean observation: 161.834 [0.000, 255.000], loss: 0.148145, mae: 1.775882, mean_q: 2.380402
 10872/20000: episode: 334, duration: 21.377s, episode steps: 31, steps per second: 1, episode reward: 2.450, mean reward: 0.079 [0.000, 2.450], mean action: 1.323 [0.000, 3.000], mean observation: 160.801 [0.000, 255.000], loss: 0.138037, mae: 1.737061, mean_q: 2.336534
 10904/20000: episode: 335, duration: 21.619s, episode steps: 32, steps per second: 1, episode reward: 1.450, mean reward: 0.045 [0.000, 1.450], mean action: 0.562 [0.000, 3.000], mean observation: 163.890 [0.000, 255.000], loss: 0.019109, mae: 1.777343, mean_q: 2.395601
 10936/20000: episode: 336, duration: 21.680s, episode steps: 32, steps per second: 1, episode reward: 2.850, mean reward: 0.089 [0.000, 2.850], mean action: 2.062 [0.000, 3.000], mean

 11815/20000: episode: 364, duration: 21.658s, episode steps: 32, steps per second: 1, episode reward: 2.900, mean reward: 0.091 [0.000, 2.900], mean action: 0.438 [0.000, 3.000], mean observation: 163.923 [0.000, 255.000], loss: 0.026889, mae: 1.817892, mean_q: 2.448548
 11847/20000: episode: 365, duration: 21.667s, episode steps: 32, steps per second: 1, episode reward: 2.450, mean reward: 0.077 [0.000, 2.450], mean action: 0.469 [0.000, 3.000], mean observation: 164.243 [0.000, 255.000], loss: 0.016316, mae: 1.826207, mean_q: 2.456075
 11879/20000: episode: 366, duration: 21.711s, episode steps: 32, steps per second: 1, episode reward: 1.250, mean reward: 0.039 [0.000, 1.250], mean action: 2.375 [0.000, 3.000], mean observation: 164.435 [0.000, 255.000], loss: 0.205346, mae: 1.847110, mean_q: 2.479177
 11910/20000: episode: 367, duration: 21.375s, episode steps: 31, steps per second: 1, episode reward: 0.100, mean reward: 0.003 [0.000, 0.100], mean action: 2.032 [0.000, 3.000], mean

 12777/20000: episode: 395, duration: 21.842s, episode steps: 32, steps per second: 1, episode reward: 1.150, mean reward: 0.036 [0.000, 1.150], mean action: 2.219 [1.000, 3.000], mean observation: 160.601 [0.000, 255.000], loss: 0.286707, mae: 1.884131, mean_q: 2.515280
 12808/20000: episode: 396, duration: 21.452s, episode steps: 31, steps per second: 1, episode reward: 1.500, mean reward: 0.048 [0.000, 1.500], mean action: 1.871 [0.000, 3.000], mean observation: 163.495 [0.000, 255.000], loss: 0.026886, mae: 1.916822, mean_q: 2.581319
 12840/20000: episode: 397, duration: 21.909s, episode steps: 32, steps per second: 1, episode reward: 1.150, mean reward: 0.036 [0.000, 1.150], mean action: 2.250 [1.000, 3.000], mean observation: 163.452 [0.000, 255.000], loss: 0.024564, mae: 1.904473, mean_q: 2.564656
 12871/20000: episode: 398, duration: 21.497s, episode steps: 31, steps per second: 1, episode reward: 1.550, mean reward: 0.050 [0.000, 1.550], mean action: 1.452 [0.000, 3.000], mean

 13737/20000: episode: 426, duration: 21.873s, episode steps: 26, steps per second: 1, episode reward: 1.450, mean reward: 0.056 [0.000, 1.450], mean action: 0.654 [0.000, 3.000], mean observation: 162.539 [0.000, 255.000], loss: 0.103379, mae: 1.750997, mean_q: 2.357804
 13765/20000: episode: 427, duration: 21.977s, episode steps: 28, steps per second: 1, episode reward: 2.900, mean reward: 0.104 [0.000, 2.900], mean action: 1.679 [0.000, 3.000], mean observation: 161.291 [0.000, 255.000], loss: 0.087892, mae: 1.797143, mean_q: 2.428109
 13793/20000: episode: 428, duration: 21.463s, episode steps: 28, steps per second: 1, episode reward: 1.250, mean reward: 0.045 [0.000, 1.250], mean action: 2.429 [0.000, 3.000], mean observation: 164.210 [0.000, 255.000], loss: 0.018744, mae: 1.813861, mean_q: 2.440982
 13820/20000: episode: 429, duration: 21.487s, episode steps: 27, steps per second: 1, episode reward: 2.750, mean reward: 0.102 [0.000, 2.750], mean action: 0.741 [0.000, 3.000], mean

 14665/20000: episode: 457, duration: 21.310s, episode steps: 31, steps per second: 1, episode reward: 2.650, mean reward: 0.085 [0.000, 2.650], mean action: 2.355 [0.000, 3.000], mean observation: 161.829 [0.000, 255.000], loss: 0.071598, mae: 1.719283, mean_q: 2.301552
 14696/20000: episode: 458, duration: 21.750s, episode steps: 31, steps per second: 1, episode reward: 1.250, mean reward: 0.040 [0.000, 1.250], mean action: 1.387 [0.000, 3.000], mean observation: 164.439 [0.000, 255.000], loss: 0.016863, mae: 1.722586, mean_q: 2.312104
 14726/20000: episode: 459, duration: 21.571s, episode steps: 30, steps per second: 1, episode reward: 3.800, mean reward: 0.127 [0.000, 3.800], mean action: 0.467 [0.000, 3.000], mean observation: 164.547 [0.000, 255.000], loss: 0.081333, mae: 1.705655, mean_q: 2.293924
 14757/20000: episode: 460, duration: 21.711s, episode steps: 31, steps per second: 1, episode reward: 4.950, mean reward: 0.160 [0.000, 4.950], mean action: 1.161 [0.000, 3.000], mean

 15677/20000: episode: 488, duration: 21.171s, episode steps: 37, steps per second: 2, episode reward: 1.250, mean reward: 0.034 [0.000, 1.250], mean action: 0.108 [0.000, 3.000], mean observation: 164.262 [0.000, 255.000], loss: 0.014430, mae: 1.660562, mean_q: 2.239698
 15714/20000: episode: 489, duration: 21.440s, episode steps: 37, steps per second: 2, episode reward: 3.800, mean reward: 0.103 [0.000, 3.800], mean action: 1.541 [0.000, 3.000], mean observation: 160.453 [0.000, 255.000], loss: 0.176260, mae: 1.679229, mean_q: 2.263527
 15751/20000: episode: 490, duration: 21.395s, episode steps: 37, steps per second: 2, episode reward: 1.150, mean reward: 0.031 [0.000, 1.150], mean action: 0.189 [0.000, 2.000], mean observation: 159.024 [0.000, 255.000], loss: 0.073692, mae: 1.665263, mean_q: 2.249958
 15788/20000: episode: 491, duration: 21.258s, episode steps: 37, steps per second: 2, episode reward: 2.700, mean reward: 0.073 [0.000, 2.700], mean action: 1.486 [0.000, 3.000], mean

 16831/20000: episode: 519, duration: 21.288s, episode steps: 38, steps per second: 2, episode reward: 1.250, mean reward: 0.033 [0.000, 1.250], mean action: 2.553 [0.000, 3.000], mean observation: 162.872 [0.000, 255.000], loss: 0.065393, mae: 1.654434, mean_q: 2.220923
 16869/20000: episode: 520, duration: 21.340s, episode steps: 38, steps per second: 2, episode reward: 3.250, mean reward: 0.086 [0.000, 3.250], mean action: 2.342 [0.000, 3.000], mean observation: 162.031 [0.000, 255.000], loss: 0.019289, mae: 1.654997, mean_q: 2.220751
 16906/20000: episode: 521, duration: 21.092s, episode steps: 37, steps per second: 2, episode reward: 3.700, mean reward: 0.100 [0.000, 3.700], mean action: 0.757 [0.000, 3.000], mean observation: 162.644 [0.000, 255.000], loss: 0.068669, mae: 1.673127, mean_q: 2.246217
 16944/20000: episode: 522, duration: 21.588s, episode steps: 38, steps per second: 2, episode reward: 1.350, mean reward: 0.036 [0.000, 1.350], mean action: 1.842 [0.000, 3.000], mean

 18011/20000: episode: 550, duration: 21.355s, episode steps: 38, steps per second: 2, episode reward: 2.800, mean reward: 0.074 [0.000, 2.800], mean action: 1.763 [0.000, 3.000], mean observation: 159.752 [0.000, 255.000], loss: 0.070645, mae: 1.735839, mean_q: 2.342985
 18049/20000: episode: 551, duration: 21.405s, episode steps: 38, steps per second: 2, episode reward: 2.700, mean reward: 0.071 [0.000, 2.700], mean action: 0.868 [0.000, 2.000], mean observation: 160.115 [0.000, 255.000], loss: 0.131886, mae: 1.755275, mean_q: 2.375019
 18086/20000: episode: 552, duration: 21.250s, episode steps: 37, steps per second: 2, episode reward: 2.650, mean reward: 0.072 [0.000, 2.650], mean action: 0.703 [0.000, 3.000], mean observation: 161.931 [0.000, 255.000], loss: 0.128153, mae: 1.734495, mean_q: 2.350236
 18124/20000: episode: 553, duration: 21.315s, episode steps: 38, steps per second: 2, episode reward: 2.650, mean reward: 0.070 [0.000, 2.650], mean action: 1.684 [0.000, 3.000], mean

 19189/20000: episode: 581, duration: 21.021s, episode steps: 37, steps per second: 2, episode reward: 1.550, mean reward: 0.042 [0.000, 1.550], mean action: 1.676 [0.000, 3.000], mean observation: 162.285 [0.000, 255.000], loss: 0.014914, mae: 1.726305, mean_q: 2.317613
 19227/20000: episode: 582, duration: 21.010s, episode steps: 38, steps per second: 2, episode reward: 2.900, mean reward: 0.076 [0.000, 2.900], mean action: 1.579 [0.000, 3.000], mean observation: 161.733 [0.000, 255.000], loss: 0.016296, mae: 1.770333, mean_q: 2.373301
 19265/20000: episode: 583, duration: 21.101s, episode steps: 38, steps per second: 2, episode reward: 2.700, mean reward: 0.071 [0.000, 2.700], mean action: 1.132 [0.000, 3.000], mean observation: 163.425 [0.000, 255.000], loss: 0.015525, mae: 1.742630, mean_q: 2.336395
 19303/20000: episode: 584, duration: 21.219s, episode steps: 38, steps per second: 2, episode reward: 1.250, mean reward: 0.033 [0.000, 1.250], mean action: 2.158 [0.000, 3.000], mean

<keras.callbacks.callbacks.History at 0x27ea8c0abc8>