In [1]:
%matplotlib inline

import os
import io
import base64

from IPython.display import HTML
import matplotlib.pyplot as plt
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import Callback

Using TensorFlow backend.


In [2]:
# Path of the HDF5 file the agent's weights are loaded from (if present)
# and saved to after training.
weight_path = 'models/cartpole/keras_weights.h5'

In [3]:
# Instantiate the CartPole-v0 environment from the gym registry.
env = gym.make('CartPole-v0')

In [4]:
def _record_every_tenth_episode(episode_id):
    """Return True for episode ids 0, 10, 20, ... so only those are recorded."""
    return episode_id % 10 == 0


# Wrap the environment so episode statistics and selected videos are written
# under ./gym-results/cartpole.
# NOTE(review): force=True presumably lets Monitor overwrite results left by a
# previous run -- confirm against the installed gym version.
env = gym.wrappers.Monitor(
    env,
    "./gym-results/cartpole",
    force=True,
    video_callable=_record_every_tenth_episode,
)

In [5]:
# Number of discrete actions; used as the width of the network's output layer
# and as the agent's action count.
output = env.action_space.n

# Network input shape: a leading axis of length 1 followed by the shape of a
# single observation.
# NOTE(review): the leading 1 presumably has to match the replay memory's
# window_length -- confirm if either value is changed.
observation_shape = env.observation_space.shape
input_shape = (1,) + observation_shape

In [6]:
# Q-network: flatten the (1, observation) input, pass it through three
# 16-unit ReLU hidden layers, and emit one linear output per action.
model = Sequential([
    Flatten(input_shape=input_shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(output),
    Activation('linear'),
])

In [7]:
# Print the layer-by-layer architecture with output shapes and parameter
# counts as a quick sanity check of the network definition.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)               

In [8]:
# Experience replay: actions, rewards, observations, etc. are stored in memory
# as "experience" and later sampled at random ("replayed") for learning.
replay_memory = SequentialMemory(
    # Maximum number of entries kept in the buffer.
    limit=5000,
    # How many consecutive observations are concatenated into one state, e.g.
    # to treat a short time series of observations as a single state.
    window_length=1,
)

# Criterion used to choose an action in the environment.
# BoltzmannQPolicy: softmax-style selection based on a Boltzmann distribution
# over the Q-values.
# NOTE(review): in keras-rl, omitting `policy` appears to fall back to an
# epsilon-greedy policy (GreedyQPolicy being the test-time default) -- confirm
# against the installed version.
exploration_policy = BoltzmannQPolicy()

dqn = DQNAgent(
    model=model,
    # Number of actions (= number of network outputs).
    nb_actions=output,
    memory=replay_memory,
    policy=exploration_policy,
    # Discount factor.
    # https://github.com/keras-rl/keras-rl/blob/master/rl/agents/dqn.py#L307
    gamma=0.99,
    # Warm-up step count.
    # NOTE(review): in keras-rl this appears to be the number of initial steps
    # during which experience is only collected and no gradient update is
    # performed (not a learning-rate warm-up) -- confirm.
    nb_steps_warmup=10,
    # Target-network update used for the Bellman target.
    # A value below 1 selects a soft update with that interpolation factor.
    # NOTE(review): a value >= 1 appears to mean a hard update (full weight
    # copy) every that many steps -- confirm.
    target_model_update=1e-2,
)

In [9]:
# Compile the agent with an Adam optimizer (learning rate 1e-3) and track the
# mean absolute error as an additional metric.
optimizer = Adam(lr=1e-3)
dqn.compile(optimizer, metrics=['mae'])

In [10]:
# Resume from previously saved weights when a checkpoint is already on disk;
# otherwise the agent keeps its freshly initialised weights.
has_saved_weights = os.path.exists(weight_path)
if has_saved_weights:
    dqn.load_weights(weight_path)

In [11]:
# Make sure the checkpoint directory exists *before* training starts.
# Otherwise save_weights() in the `finally` clause fails on a fresh checkout
# (no models/cartpole/ directory) and the whole training run is lost.
weights_dir = os.path.dirname(weight_path)
if weights_dir and not os.path.isdir(weights_dir):
    os.makedirs(weights_dir)

try:
    dqn.fit(
        env,
        nb_steps=10000,  # roughly 1 minute of training (author's note)
        visualize=False,
        log_interval=1000,
    )
except KeyboardInterrupt:
    # Interrupting the kernel is the supported way to stop training early;
    # swallow it so the weights learned so far are still saved below.
    pass
finally:
    # Always persist the current weights, whether training completed, was
    # interrupted, or raised.
    dqn.save_weights(weight_path, overwrite=True)

Training for 10000 steps ...
Interval 1 (0 steps performed)

   8/1000 [..............................] - ETA: 16s - reward: 1.0000 



38 episodes - episode_reward: 25.711 [10.000, 74.000] - loss: 0.167 - mae: 2.140 - mean_q: 4.072

Interval 2 (1000 steps performed)
10 episodes - episode_reward: 94.800 [13.000, 153.000] - loss: 0.691 - mae: 6.397 - mean_q: 12.811

Interval 3 (2000 steps performed)
5 episodes - episode_reward: 182.800 [148.000, 200.000] - loss: 1.431 - mae: 11.060 - mean_q: 22.382

Interval 4 (3000 steps performed)
5 episodes - episode_reward: 200.000 [200.000, 200.000] - loss: 2.116 - mae: 15.522 - mean_q: 31.453

Interval 5 (4000 steps performed)
5 episodes - episode_reward: 200.000 [200.000, 200.000] - loss: 3.062 - mae: 19.821 - mean_q: 40.109

Interval 6 (5000 steps performed)
5 episodes - episode_reward: 200.000 [200.000, 200.000] - loss: 4.575 - mae: 23.607 - mean_q: 47.616

Interval 7 (6000 steps performed)
5 episodes - episode_reward: 200.000 [200.000, 200.000] - loss: 5.147 - mae: 26.375 - mean_q: 52.996

Interval 8 (7000 steps performed)
5 episodes - episode_reward: 200.000 [200.000, 200.000

In [17]:
# Embed the recording of episode 10 in the notebook as a base64 data URI.
video_path = './gym-results/cartpole/openaigym.video.%s.video000010.mp4' % env.file_infix
# Read-only binary mode is sufficient, and the context manager closes the
# handle instead of leaking it.
with open(video_path, 'rb') as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)
# Keep HTML(...) as the last expression so the cell renders the player.
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))

In [18]:
# Embed the recording of episode 80 in the notebook as a base64 data URI.
video_path = './gym-results/cartpole/openaigym.video.%s.video000080.mp4' % env.file_infix
# Read-only binary mode is sufficient, and the context manager closes the
# handle instead of leaking it.
with open(video_path, 'rb') as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)
# Keep HTML(...) as the last expression so the cell renders the player.
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))