In [1]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, Reshape
from keras.optimizers import Adam
from keras.regularizers import l2

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
import cupcake_game

# Load the game class (the environment the agent plays in).
# NOTE(review): the meaning of step/image is defined by cupcake_game.Game;
# presumably step-driven play with image observations -- confirm.
env = cupcake_game.Game(step=True, image=True)
# Number of distinct player actions (move up / down / left / right).
nb_actions = 4

# Parameters used by the CNN.
hidden_size = 128      # units in each fully connected hidden layer
n_filters = 8          # filters in each convolution layer
kernel = (13, 13)      # convolution kernel size
strides = (3, 3)       # convolution strides
n_conv_layers = 3      # number of stacked Conv2D layers
n_hidden_layers = 3    # number of stacked Dense hidden layers

# CNN model: maps an observation to one linear output per action.
model = Sequential()
# The declared input has a leading axis of size 1; the Reshape drops it so the
# convolution layers receive the plain observation shape.
model.add(Reshape(env.observation_space.shape,
                  input_shape=(1,) + env.observation_space.shape))
# Identical convolution layers, added in sequence.
for _ in range(n_conv_layers):
    model.add(Conv2D(n_filters, kernel, strides=strides, activation='relu', padding='same'))
model.add(Flatten())
# Identical L2-regularised hidden layers, added in sequence.
for _ in range(n_hidden_layers):
    model.add(Dense(hidden_size, kernel_initializer='he_normal', activation='relu',
                    kernel_regularizer=l2(0.01)))
model.add(Dense(nb_actions, activation='linear'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
model.summary()

# Experience memory handed to the agent (window_length=1).
memory = SequentialMemory(limit=100000, window_length=1)
# Epsilon-greedy action selection policy.
# NOTE(review): eps=0.001 leaves almost no room for random actions, and the
# logged training episodes report "mean action: 1.000 [1.000, 1.000]" --
# confirm that such a small exploration rate is intended.
policy = EpsGreedyQPolicy(eps=0.001)

# DQN agent built on top of the CNN model.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    gamma=0.99,
    nb_steps_warmup=100,
    target_model_update=1e-2,
)

# Compile the agent with an Adam optimizer and track mean absolute error.
optimizer = Adam(lr=1e-3)
dqn.compile(optimizer, metrics=['mae'])

# File in which the network weights are stored between runs.
fname = "cupcake_dqn_weights.bin"
try:
    dqn.load_weights(fname)
    print("Weights are loaded.")
except Exception as err:
    # Best effort: a missing or incompatible weights file must not stop the
    # run, so training simply continues from the freshly initialised model.
    # Catching Exception (instead of a bare except) lets KeyboardInterrupt and
    # SystemExit propagate, and the reason is reported rather than hidden.
    print("Weights are NOT loaded.")
    print("Reason: {}: {}".format(type(err).__name__, err))

# Run training.
nb_train_steps = 1000
history = dqn.fit(env, nb_steps=nb_train_steps, verbose=2)

# Write the weights back to the same file, replacing any existing one.
dqn.save_weights(fname, overwrite=True)

# Run the test episodes.
nb_test_episodes = 10
dqn.test(env, nb_episodes=nb_test_episodes)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 151, 120, 3)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 51, 40, 8)         4064      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 17, 14, 8)         10824     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 6, 5, 8)           10824     
_________________________________________________________________
flatten_1 (Flatten)          (None, 240)               0         
_________________________________________________________________
dense_1 (Dense)              

 840/1000: episode: 23, duration: 21.237s, episode steps: 33, steps per second: 2, episode reward: 1.000, mean reward: 0.030 [0.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: 164.794 [0.000, 255.000], loss: 6.250738, mae: 2.243757, mean_q: 4.410833
 874/1000: episode: 24, duration: 21.723s, episode steps: 34, steps per second: 2, episode reward: 1.000, mean reward: 0.029 [0.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: 164.742 [0.000, 255.000], loss: 6.958624, mae: 2.145422, mean_q: 4.156997
 907/1000: episode: 25, duration: 21.170s, episode steps: 33, steps per second: 2, episode reward: 1.000, mean reward: 0.030 [0.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: 164.794 [0.000, 255.000], loss: 6.199006, mae: 2.071837, mean_q: 4.233021
 941/1000: episode: 26, duration: 21.717s, episode steps: 34, steps per second: 2, episode reward: 1.000, mean reward: 0.029 [0.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: 16

<keras.callbacks.callbacks.History at 0x2303f95b788>