In [1]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, Reshape
from keras.optimizers import Adam
from keras.regularizers import l2

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
import cupcake_game

# Load the game environment (constructed with step=True and image=True).
env = cupcake_game.Game(image=True, step=True)

# Number of distinct player actions: move up, down, left or right.
nb_actions = 4

# Hyperparameters used by the CNN.
n_filters, kernel, strides = 8, (13, 13), (3, 3)
hidden_size = 128

# CNN model: takes one observation and outputs one linear value per action.
#
# Layout (same layers, same order as before):
#   Reshape   - the declared input shape carries an extra leading axis of
#               length 1 in front of the observation shape; Reshape drops it so
#               the convolutions receive the plain observation shape.
#   3 x Conv2D - identical strided, 'same'-padded ReLU convolutions.
#   Flatten
#   3 x Dense  - identical He-initialised, L2-regularised ReLU layers.
#   Dense      - linear output with nb_actions units.
#
# The repeated layers are generated with comprehensions instead of copy-pasted
# lines; each iteration builds a fresh layer (and a fresh l2 regulariser), so
# no weights are shared between them.
model = Sequential([
    Reshape(env.observation_space.shape,
            input_shape=(1,) + env.observation_space.shape),
    *[Conv2D(n_filters, kernel, strides=strides, activation='relu', padding='same')
      for _ in range(3)],
    Flatten(),
    *[Dense(hidden_size, kernel_initializer='he_normal', activation='relu',
            kernel_regularizer=l2(0.01))
      for _ in range(3)],
    Dense(nb_actions, activation='linear'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

# Replay memory holding up to 100000 entries, with a window length of 1.
memory = SequentialMemory(window_length=1, limit=100000)
# Epsilon-greedy action selection with eps=0.1.
policy = EpsGreedyQPolicy(eps=0.1)

# DQN agent wrapping the CNN model defined above.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    gamma=0.99,
    nb_steps_warmup=100,
    target_model_update=1e-2,
)

# Compile with Adam (learning rate 1e-3) and track mean absolute error.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# File in which the network weights are stored.
fname = "cupcake_dqn_weights.bin"

# Loading is best-effort: if it fails (presumably because no weights have been
# saved yet), training simply continues from the freshly initialised network.
# Catch Exception instead of using a bare `except:` so that KeyboardInterrupt
# and SystemExit still propagate, and report the cause so that a corrupt or
# incompatible weight file is not silently ignored.
try:
    dqn.load_weights(fname)
except Exception as load_error:
    print("Weights are NOT loaded.")
    print("Reason: {!r}".format(load_error))
else:
    print("Weights are loaded.")

# Run training for 10000 steps; verbose=2 produces the per-episode log lines.
history = dqn.fit(env, nb_steps=10000, verbose=2)

# Save the trained weights, overwriting any existing file.
dqn.save_weights(fname, overwrite=True)

# Run 10 test episodes. The result is kept in a variable (parallel to
# `history` above) so it can be inspected later; leaving the call as the
# cell's last bare expression only displayed an opaque
# `<keras.callbacks...History at 0x...>` repr in the notebook output.
test_history = dqn.test(env, nb_episodes=10)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 151, 120, 3)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 51, 40, 8)         4064      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 17, 14, 8)         10824     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 6, 5, 8)           10824     
_________________________________________________________________
flatten_1 (Flatten)          (None, 240)               0         
_________________________________________________________________
dense_1 (Dense)              

  896/10000: episode: 24, duration: 21.127s, episode steps: 45, steps per second: 2, episode reward: 4.850, mean reward: 0.108 [0.000, 4.850], mean action: 0.289 [0.000, 2.000], mean observation: 163.481 [0.000, 255.000], loss: 0.010994, mae: 1.481926, mean_q: 2.006811
  940/10000: episode: 25, duration: 21.029s, episode steps: 44, steps per second: 2, episode reward: 1.750, mean reward: 0.040 [0.000, 1.750], mean action: 0.227 [0.000, 3.000], mean observation: 160.732 [0.000, 255.000], loss: 0.008656, mae: 1.495731, mean_q: 2.023999
  984/10000: episode: 26, duration: 21.029s, episode steps: 44, steps per second: 2, episode reward: 3.300, mean reward: 0.075 [0.000, 3.300], mean action: 0.977 [0.000, 3.000], mean observation: 159.391 [0.000, 255.000], loss: 0.008236, mae: 1.514951, mean_q: 2.037872
 1028/10000: episode: 27, duration: 21.239s, episode steps: 44, steps per second: 2, episode reward: 5.750, mean reward: 0.131 [0.000, 5.750], mean action: 0.545 [0.000, 3.000], mean observa

 2120/10000: episode: 55, duration: 21.242s, episode steps: 38, steps per second: 2, episode reward: 3.300, mean reward: 0.087 [0.000, 3.300], mean action: 1.553 [0.000, 3.000], mean observation: 160.362 [0.000, 255.000], loss: 0.010476, mae: 1.881622, mean_q: 2.528676
 2158/10000: episode: 56, duration: 21.496s, episode steps: 38, steps per second: 2, episode reward: 3.100, mean reward: 0.082 [0.000, 3.100], mean action: 0.737 [0.000, 3.000], mean observation: 161.221 [0.000, 255.000], loss: 0.014021, mae: 1.873170, mean_q: 2.522852
 2196/10000: episode: 57, duration: 21.508s, episode steps: 38, steps per second: 2, episode reward: 3.400, mean reward: 0.089 [0.000, 3.400], mean action: 1.658 [0.000, 3.000], mean observation: 161.738 [0.000, 255.000], loss: 0.017027, mae: 1.905677, mean_q: 2.554651
 2234/10000: episode: 58, duration: 21.482s, episode steps: 38, steps per second: 2, episode reward: 5.300, mean reward: 0.139 [0.000, 5.300], mean action: 1.289 [0.000, 3.000], mean observa

 3296/10000: episode: 86, duration: 21.476s, episode steps: 38, steps per second: 2, episode reward: 4.250, mean reward: 0.112 [0.000, 4.250], mean action: 0.632 [0.000, 2.000], mean observation: 161.238 [0.000, 255.000], loss: 0.012382, mae: 1.985019, mean_q: 2.661397
 3334/10000: episode: 87, duration: 21.597s, episode steps: 38, steps per second: 2, episode reward: 6.550, mean reward: 0.172 [0.000, 6.550], mean action: 0.974 [0.000, 3.000], mean observation: 163.360 [0.000, 255.000], loss: 0.015500, mae: 2.010676, mean_q: 2.684936
 3371/10000: episode: 88, duration: 21.342s, episode steps: 37, steps per second: 2, episode reward: 3.200, mean reward: 0.086 [0.000, 3.200], mean action: 1.514 [0.000, 3.000], mean observation: 159.591 [0.000, 255.000], loss: 0.013602, mae: 2.000612, mean_q: 2.681289
 3409/10000: episode: 89, duration: 21.468s, episode steps: 38, steps per second: 2, episode reward: 3.050, mean reward: 0.080 [0.000, 3.050], mean action: 1.737 [0.000, 3.000], mean observa

 4465/10000: episode: 117, duration: 21.092s, episode steps: 37, steps per second: 2, episode reward: 3.950, mean reward: 0.107 [0.000, 3.950], mean action: 0.730 [0.000, 3.000], mean observation: 162.098 [0.000, 255.000], loss: 0.015150, mae: 2.119435, mean_q: 2.837511
 4502/10000: episode: 118, duration: 21.005s, episode steps: 37, steps per second: 2, episode reward: 2.850, mean reward: 0.077 [0.000, 2.850], mean action: 2.081 [0.000, 3.000], mean observation: 160.487 [0.000, 255.000], loss: 0.012390, mae: 2.103544, mean_q: 2.813265
 4540/10000: episode: 119, duration: 21.455s, episode steps: 38, steps per second: 2, episode reward: 2.900, mean reward: 0.076 [0.000, 2.900], mean action: 1.632 [0.000, 2.000], mean observation: 159.201 [0.000, 255.000], loss: 0.017485, mae: 2.131695, mean_q: 2.860442
 4578/10000: episode: 120, duration: 21.382s, episode steps: 38, steps per second: 2, episode reward: 4.000, mean reward: 0.105 [0.000, 4.000], mean action: 1.868 [0.000, 3.000], mean obs

 5624/10000: episode: 148, duration: 21.019s, episode steps: 37, steps per second: 2, episode reward: 3.400, mean reward: 0.092 [0.000, 3.400], mean action: 0.757 [0.000, 3.000], mean observation: 163.263 [0.000, 255.000], loss: 0.017732, mae: 2.215423, mean_q: 2.947275
 5661/10000: episode: 149, duration: 21.315s, episode steps: 37, steps per second: 2, episode reward: 1.950, mean reward: 0.053 [0.000, 1.950], mean action: 1.162 [0.000, 3.000], mean observation: 160.034 [0.000, 255.000], loss: 0.013956, mae: 2.202012, mean_q: 2.945162
 5698/10000: episode: 150, duration: 21.080s, episode steps: 37, steps per second: 2, episode reward: 3.200, mean reward: 0.086 [0.000, 3.200], mean action: 1.622 [0.000, 3.000], mean observation: 163.148 [0.000, 255.000], loss: 0.015704, mae: 2.205628, mean_q: 2.945117
 5736/10000: episode: 151, duration: 21.514s, episode steps: 38, steps per second: 2, episode reward: 2.800, mean reward: 0.074 [0.000, 2.800], mean action: 1.184 [0.000, 3.000], mean obs

 6781/10000: episode: 179, duration: 21.192s, episode steps: 37, steps per second: 2, episode reward: 4.650, mean reward: 0.126 [0.000, 4.650], mean action: 2.000 [0.000, 3.000], mean observation: 163.496 [0.000, 255.000], loss: 0.012867, mae: 2.207438, mean_q: 2.953501
 6818/10000: episode: 180, duration: 21.537s, episode steps: 37, steps per second: 2, episode reward: 3.400, mean reward: 0.092 [0.000, 3.400], mean action: 1.324 [0.000, 3.000], mean observation: 167.579 [0.000, 255.000], loss: 0.012851, mae: 2.228673, mean_q: 2.984241
 6856/10000: episode: 181, duration: 21.538s, episode steps: 38, steps per second: 2, episode reward: 4.350, mean reward: 0.114 [0.000, 4.350], mean action: 1.895 [0.000, 3.000], mean observation: 160.665 [0.000, 255.000], loss: 0.015746, mae: 2.203345, mean_q: 2.954720
 6893/10000: episode: 182, duration: 21.227s, episode steps: 37, steps per second: 2, episode reward: 3.350, mean reward: 0.091 [0.000, 3.350], mean action: 1.595 [0.000, 3.000], mean obs

 7935/10000: episode: 210, duration: 21.249s, episode steps: 37, steps per second: 2, episode reward: 3.300, mean reward: 0.089 [0.000, 3.300], mean action: 1.216 [0.000, 3.000], mean observation: 161.038 [0.000, 255.000], loss: 0.018489, mae: 2.277959, mean_q: 3.056059
 7972/10000: episode: 211, duration: 21.244s, episode steps: 37, steps per second: 2, episode reward: 3.400, mean reward: 0.092 [0.000, 3.400], mean action: 1.892 [0.000, 3.000], mean observation: 160.995 [0.000, 255.000], loss: 0.012422, mae: 2.294136, mean_q: 3.068127
 8009/10000: episode: 212, duration: 21.060s, episode steps: 37, steps per second: 2, episode reward: 5.450, mean reward: 0.147 [0.000, 5.450], mean action: 1.270 [0.000, 3.000], mean observation: 164.403 [0.000, 255.000], loss: 0.009508, mae: 2.277064, mean_q: 3.049501
 8046/10000: episode: 213, duration: 21.138s, episode steps: 37, steps per second: 2, episode reward: 1.700, mean reward: 0.046 [0.000, 1.700], mean action: 1.378 [0.000, 3.000], mean obs

 9079/10000: episode: 241, duration: 21.246s, episode steps: 37, steps per second: 2, episode reward: 4.400, mean reward: 0.119 [0.000, 4.400], mean action: 0.919 [0.000, 3.000], mean observation: 165.735 [0.000, 255.000], loss: 0.011437, mae: 2.373951, mean_q: 3.184133
 9116/10000: episode: 242, duration: 21.357s, episode steps: 37, steps per second: 2, episode reward: 4.450, mean reward: 0.120 [0.000, 4.450], mean action: 1.054 [0.000, 3.000], mean observation: 162.449 [0.000, 255.000], loss: 0.013160, mae: 2.387528, mean_q: 3.205965
 9153/10000: episode: 243, duration: 21.367s, episode steps: 37, steps per second: 2, episode reward: 4.850, mean reward: 0.131 [0.000, 4.850], mean action: 0.919 [0.000, 3.000], mean observation: 163.762 [0.000, 255.000], loss: 0.015139, mae: 2.424606, mean_q: 3.250108
 9190/10000: episode: 244, duration: 21.444s, episode steps: 37, steps per second: 2, episode reward: 4.300, mean reward: 0.116 [0.000, 4.300], mean action: 0.622 [0.000, 3.000], mean obs

<keras.callbacks.callbacks.History at 0x2c81c181f08>