In [1]:
# basic
import logging
import os, sys
import numpy as np
from typing import NamedTuple
from PIL import Image
from collections import deque
import random

# ml
import tensorflow as tf

# gym
import gym
from gym.wrappers import Monitor
import gym_ple

# disply
from IPython.display import HTML
import matplotlib.pyplot as plt
import matplotlib.animation as animation
%matplotlib inline

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [2]:
# select gpu
gpu_index = 0
from tensorflow.keras.backend import set_session
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(
        visible_device_list=str(gpu_index),
        allow_growth=True
    )
)
set_session(tf.Session(config=config))

In [3]:
class Experience(NamedTuple):
    state: gym.spaces.Box
    action: int
    reward: float
    next_state: gym.spaces.Box
    done: bool

In [4]:
class Logger():

    def __init__(self, log_dir='./logs/', dir_name=""):
        self.log_dir = log_dir

        if dir_name:
            self.log_dir = os.path.join(self.log_dir, dir_name)
            if not os.path.exists(self.log_dir):
                os.mkdir(self.log_dir)

        self._callback = tf.keras.callbacks.TensorBoard(self.log_dir)

    @property
    def writer(self):
        return self._callback.writer
    
    @property
    def ckpt_path(self):
        return os.path.join(self.log_dir, 'model.ckpt')

    def set_model(self, model):
        self._callback.set_model(model)

    def describe(self, name, values, episode=-1, step=-1):
        mean = np.round(np.mean(values), 3)
        std = np.round(np.std(values), 3)
        desc = "{} is {} (+/-{})".format(name, mean, std)
        if episode > 0:
            print("At episode {}, {}".format(episode, desc))
        elif step > 0:
            print("At step {}, {}".format(step, desc))

    def plot(self, name, values, interval=10):
        indices = list(range(0, len(values), interval))
        means = []
        stds = []
        for i in indices:
            _values = values[i:(i + interval)]
            means.append(np.mean(_values))
            stds.append(np.std(_values))
        means = np.array(means)
        stds = np.array(stds)
        plt.figure()
        plt.title("{} History".format(name))
        plt.grid()
        plt.fill_between(indices, means - stds, means + stds,
                         alpha=0.1, color="g")
        plt.plot(indices, means, "o-", color="g",
                 label="{} per {} episode".format(name.lower(), interval))
        plt.legend(loc="best")
        plt.show()

    def write(self, index, name, value):
        summary = tf.Summary()
        summary_value = summary.value.add()
        summary_value.tag = name
        summary_value.simple_value = value
        self.writer.add_summary(summary, index)
        self.writer.flush()

In [5]:
class AgentModel(tf.keras.Model):
    
    def __init__(self, is_training: bool, num_actions: int):
        super(AgentModel, self).__init__()
        self.is_training = is_training
        k_init = tf.keras.initializers.glorot_normal()
        relu = tf.nn.relu
        self.conv_01 = tf.keras.layers.Conv2D(16, kernel_size=8, strides=4, padding='same', kernel_initializer=k_init, activation=relu) 
        self.conv_02 = tf.keras.layers.Conv2D(32, kernel_size=4, strides=4, padding='same', kernel_initializer=k_init, activation=relu)
        self.conv_03 = tf.keras.layers.Conv2D(64, kernel_size=3, strides=2, padding='same', kernel_initializer=k_init, activation=relu)
        self.flatten = tf.keras.layers.Flatten()
        self.dense_v = tf.keras.layers.Dense(128, kernel_initializer=k_init, activation=relu)
        self.dense_ad = tf.keras.layers.Dense(128, kernel_initializer=k_init, activation=relu)
        self.output_v_layer = tf.keras.layers.Dense(1, kernel_initializer=k_init)
        self.output_ad_layer = tf.keras.layers.Dense(num_actions, kernel_initializer=k_init)
        
    def call(self, inputs):
        outputs = inputs
        outputs = self.conv_01(outputs)
        outputs = self.conv_02(outputs)
        outputs = self.conv_03(outputs)
        outputs = self.flatten(outputs)
        output_v = self.dense_v(outputs)
        outputs_ad = self.dense_ad(outputs)
        output_v = self.output_v_layer(output_v)
        outputs_ad = self.output_ad_layer(outputs_ad)
        q = output_v + (outputs_ad - tf.reduce_mean(outputs_ad))
        return q

In [6]:
class Agent:
    
    def __init__(self, actions, epsilon, input_shape, learning_rate=0.0001):
        self.actions = actions
        self.epsilon = epsilon
        self.input_shape = input_shape
        self.learning_rate = learning_rate
        self.model = None
        self._teacher_model = None
        self.initialize()
        
    def initialize(self):
        self.build()
        optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
        self.model.compile(optimizer, loss='mse')
        
    def save(self, model_path):
        self.model.save_weights(model_path, overwrite=True)

    @classmethod
    def load(cls, env, model_path, epsilon=0.0001):
        actions = list(range(env.action_space.n))
        input_shape = (env.width, env.height, env.frame_count)
        agent = cls(actions, epsilon, input_shape)
        agent.initialize()
        agent.model.load_weights(model_path)
        return agent
        
    def build(self):
        inputs = tf.keras.Input(shape=self.input_shape)
        model = AgentModel(is_training=True, num_actions=len(self.actions))
        outputs = model(inputs)
        self.model = tf.keras.Model(inputs=inputs, outputs=outputs)

        # teacher_model を更新するため、両方のモデルで一度計算し重みを取得する
        self._teacher_model = AgentModel(is_training=True, num_actions=len(self.actions))
        dummy = np.random.randn(1, *self.input_shape).astype(np.float32)
        dummy = tf.convert_to_tensor(dummy)
        _ = self.model.call(dummy)
        _ = self._teacher_model.call(dummy)
        self.update_teacher()
    
    def policy(self, state) -> int:
        '''
        epsilon greedy で受け取った state をもとに行動を決定する
        '''
        # epsilon greedy
        if np.random.random() < self.epsilon:
            return np.random.randint(len(self.actions))
        else:
            estimates = self.estimate(state)
            return np.argmax(estimates)
    
    def estimate(self, state):
        '''
        ある state の状態価値を推定する
        '''
        state_as_batch = np.array([state])
        return self.model.predict(state_as_batch)[0]
    
    def update(self, experiences, gamma):
        '''
        与えられた experiences をもとに学習
        '''
        states = np.array([e.state for e in experiences])
        next_states = np.array([e.next_state for e in experiences])

        estimated_values = self.model.predict(states)
        next_state_values = self._teacher_model.predict(next_states)
        
        # train
        for i, e in enumerate(experiences):
            reward = e.reward
            if not e.done:
                reward += gamma * np.max(next_state_values[i])
            estimated_values[i][e.action] = reward
        loss = self.model.train_on_batch(states, estimated_values)
        return loss
    
    def update_teacher(self):
        self._teacher_model.set_weights(self.model.get_weights())
    
    def play(self, env, episode_count: int = 2, render: bool = True):
        total_rewards = []
        for e in range(episode_count):
            state = env.reset()
            done = False
            episode_reward = 0
            
            while not done:
                if render:
                    env.render()
                action = self.policy(state)
                step_reward = 0
                for _ in range(4):
                    next_state, reward, done = env.step_with_raw_reward(action)
                    if done:
                        break
                    step_reward += reward
                episode_reward += step_reward
                state = next_state
            print('episode {}, total reward: {:.4f}'.format(e, episode_reward))
            total_rewards.append(episode_reward)
                
        env.reset()
        print('reward by {}, mean: {:.4f}, std: {:.4f}'.format(episode_count, np.mean(total_rewards), np.std(total_rewards)))

In [7]:
class Observer:
    
    def __init__(self, env, frame_count, width, height, render=False, outdir='./playlogs/'):
        self._env = env
        if render:
            outdir = outdir
            env = Monitor(env, directory=outdir, video_callable=(lambda x: x % 5 == 0), force=True)
            self._env = env
        self.frame_count = frame_count
        self.width = width
        self.height = height
        self._frames = deque(maxlen=frame_count)

    def reset(self):
        return self.transform(self._env.reset())
        
    def render(self):
        self._env.render(mode = 'rgb_array')
        
    def step(self, action):
        next_state, reward, done, _ = self._env.step(action)
        return self.transform(next_state), self.reward_shaping(reward), done
    
    def step_with_raw_reward(self, action):
        next_state, reward, done, _ = self._env.step(action)
        return self.transform(next_state), reward, done
        
    def transform(self, state):
        state = state[:400, :, :]
        grayed = Image.fromarray(state).convert('L')  # h, w, c -> h, w
        
        resized = grayed.resize((self.width, self.height))
        resized = np.array(resized).astype(np.float32)
        resized = np.transpose(resized, (1, 0)) # h, w -> w, h
        normalized = resized / 255.0
        if len(self._frames) == 0:
            for i in range(self.frame_count):
                self._frames.append(normalized)
        else:
            self._frames.append(normalized)
        feature = np.array(self._frames)
        feature = np.transpose(feature, (1, 2, 0))  # [f, w, h] -> [w, h, f]
        
        return feature

    def reward_shaping(self, reward):
        if 0.01 > reward > -0.01:
            return 0.01
        elif reward <= -0.1:
            return -1.0
        else:
            return 1.0

    @property
    def action_space(self):
        return self._env.action_space
    
    @property
    def observation_space(self):
        return self._env.observation_space

In [8]:
class Trainer:
    
    def __init__(self, filename, buffer_size=50000, batch_size=32, gamma=0.98, report_interval=20):
        self.file_name = filename
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.report_interval = report_interval
        self.experiences = deque(maxlen=buffer_size)
        self.training = False
        self.training_count = 0
        self.reward_log = []
        self.logger = Logger(dir_name=filename)
        self._max_reward = 0
        self.teacher_update_freq = 10000
        
    def train(self, env, episode_count=10000, initial_count=200, model_path = None):
        actions = list(range(env.action_space.n))
        if model_path:
            agent = Agent.load(env, model_path, epsilon=0.1)
        else:
            agent = Agent(actions, 0.1, input_shape=(env.width, env.height, env.frame_count))
        self.train_loop(env, agent, episode_count, initial_count)
        
    def train_loop(self, env, agent, episode_count, initial_count):
        
        for episode in range(episode_count):
            state = env.reset()
            done = False
            step_count = 0
            episode_reward = 0
            while not done:
                action = agent.policy(state)
                step_reward = 0
                for _ in range(4):
                    next_state, reward, done = env.step(action)
                    if done:
                        break
                    step_reward += reward
                e = Experience(state, action, step_reward, next_state, done)
                self.experiences.append(e)
                episode_reward += step_reward
                loss = self.step(episode, agent)
                state = next_state
                
                if not self.training and (len(self.experiences) >= self.buffer_size or episode >= initial_count):
                    self.begin_training(agent)
                    self.training = True
            
            self.end_episode(episode, episode_reward, loss, agent)
    
    def step(self, step, agent):
        if self.training:
            batch = random.sample(self.experiences, self.batch_size)
            loss = agent.update(batch, self.gamma)
            self.training_count += 1
            if self.is_event(self.training_count, self.teacher_update_freq):
                agent.update_teacher()
            return loss
            
    def begin_training(self, agent):
        print('start training!')
        self.logger.set_model(agent.model)
        
    def end_episode(self, episode, reward, loss, agent):
        self.reward_log.append(reward)
        if self.training:
            self.logger.write(self.training_count, "loss", loss)
            self.logger.write(self.training_count, "reward", reward)
            self.logger.write(self.training_count, "epsilon", agent.epsilon)
            if reward > self._max_reward:
                agent.save(self.logger.ckpt_path)
                self._max_reward = reward

        if self.is_event(episode, self.report_interval):
            recent_rewards = self.reward_log[-self.report_interval:]
            self.logger.describe("reward", recent_rewards, episode=episode)
        
    def is_event(self, count, interval):
        return True if count != 0 and count % interval == 0 else False

In [22]:
model_name = '03_dueling_05'

In [23]:
env = gym.make('FlappyBird-v0')
obs = Observer(env, 4, 128, 128)
trainer = Trainer(filename=model_name)

In [None]:
trainer.train(obs)

At episode 20, reward is 0.203 (+/-0.045)
At episode 40, reward is 0.197 (+/-0.044)
At episode 60, reward is 0.198 (+/-0.044)
At episode 80, reward is 0.203 (+/-0.049)
At episode 100, reward is 0.221 (+/-0.051)
At episode 120, reward is 0.195 (+/-0.046)
At episode 140, reward is 0.218 (+/-0.055)
At episode 160, reward is 0.214 (+/-0.071)
At episode 180, reward is 0.195 (+/-0.036)
start training!
At episode 200, reward is 0.221 (+/-0.055)
At episode 220, reward is 0.319 (+/-0.331)
At episode 240, reward is 1.006 (+/-0.664)
At episode 260, reward is 1.087 (+/-0.726)
At episode 280, reward is 0.855 (+/-0.627)
At episode 300, reward is 0.916 (+/-0.7)
At episode 320, reward is 0.698 (+/-0.388)
At episode 340, reward is 1.002 (+/-0.735)
At episode 360, reward is 1.042 (+/-0.729)
At episode 380, reward is 0.972 (+/-0.719)
At episode 400, reward is 1.178 (+/-0.851)
At episode 420, reward is 0.944 (+/-0.728)
At episode 440, reward is 0.88 (+/-0.81)
At episode 460, reward is 0.79 (+/-0.579)
At e

# play

In [27]:
env = gym.make('FlappyBird-v0')
obs = Observer(env, 4, 128, 128, render=True)

In [28]:
# 1
model_name = '03_dueling_01'
agent = Agent.load(obs, 'logs/{}/model.ckpt'.format(model_name))
agent.play(obs, episode_count=50)

episode 0, total reward: 9.0000
episode 1, total reward: 40.0000
episode 2, total reward: 62.0000
episode 3, total reward: 25.0000
episode 4, total reward: 161.0000
episode 5, total reward: 28.0000
episode 6, total reward: 15.0000
episode 7, total reward: 124.0000
episode 8, total reward: 79.0000
episode 9, total reward: 2.0000
episode 10, total reward: 4.0000
episode 11, total reward: 224.0000
episode 12, total reward: 14.0000
episode 13, total reward: 0.0000
episode 14, total reward: 53.0000
episode 15, total reward: 46.0000
episode 16, total reward: 23.0000
episode 17, total reward: 19.0000
episode 18, total reward: 39.0000
episode 19, total reward: 13.0000
episode 20, total reward: 1.0000
episode 21, total reward: 53.0000
episode 22, total reward: 3.0000
episode 23, total reward: 8.0000
episode 24, total reward: 34.0000
episode 25, total reward: 17.0000
episode 26, total reward: 2.0000
episode 27, total reward: 98.0000
episode 28, total reward: 6.0000
episode 29, total reward: 140.

In [29]:
# 2
model_name = '03_dueling_02'
agent = Agent.load(obs, 'logs/{}/model.ckpt'.format(model_name))
agent.play(obs, episode_count=50)

episode 0, total reward: 35.0000
episode 1, total reward: 19.0000
episode 2, total reward: 37.0000
episode 3, total reward: 34.0000
episode 4, total reward: 49.0000
episode 5, total reward: 2.0000
episode 6, total reward: 1.0000
episode 7, total reward: 61.0000
episode 8, total reward: 5.0000
episode 9, total reward: 12.0000
episode 10, total reward: 58.0000
episode 11, total reward: 77.0000
episode 12, total reward: 3.0000
episode 13, total reward: 5.0000
episode 14, total reward: 30.0000
episode 15, total reward: 50.0000
episode 16, total reward: 19.0000
episode 17, total reward: 16.0000
episode 18, total reward: 25.0000
episode 19, total reward: 46.0000
episode 20, total reward: 50.0000
episode 21, total reward: 14.0000
episode 22, total reward: 166.0000
episode 23, total reward: 68.0000
episode 24, total reward: 47.0000
episode 25, total reward: 3.0000
episode 26, total reward: 7.0000
episode 27, total reward: 8.0000
episode 28, total reward: 26.0000
episode 29, total reward: 39.00

In [30]:
# 3
model_name = '03_dueling_03'
agent = Agent.load(obs, 'logs/{}/model.ckpt'.format(model_name))
agent.play(obs, episode_count=50)

episode 0, total reward: 20.0000
episode 1, total reward: 46.0000
episode 2, total reward: 23.0000
episode 3, total reward: 8.0000
episode 4, total reward: 13.0000
episode 5, total reward: 4.0000
episode 6, total reward: 22.0000
episode 7, total reward: 74.0000
episode 8, total reward: 15.0000
episode 9, total reward: 6.0000
episode 10, total reward: 27.0000
episode 11, total reward: 55.0000
episode 12, total reward: 1.0000
episode 13, total reward: 19.0000
episode 14, total reward: 2.0000
episode 15, total reward: 1.0000
episode 16, total reward: 31.0000
episode 17, total reward: 10.0000
episode 18, total reward: 19.0000
episode 19, total reward: 26.0000
episode 20, total reward: 27.0000
episode 21, total reward: 85.0000
episode 22, total reward: 12.0000
episode 23, total reward: 1.0000
episode 24, total reward: 6.0000
episode 25, total reward: 1.0000
episode 26, total reward: 5.0000
episode 27, total reward: 12.0000
episode 28, total reward: 6.0000
episode 29, total reward: 3.0000
ep

In [31]:
# 4
model_name = '03_dueling_04'
agent = Agent.load(obs, 'logs/{}/model.ckpt'.format(model_name))
agent.play(obs, episode_count=50)

episode 0, total reward: 21.0000
episode 1, total reward: 6.0000
episode 2, total reward: 4.0000
episode 3, total reward: 58.0000
episode 4, total reward: 104.0000
episode 5, total reward: 52.0000
episode 6, total reward: 86.0000
episode 7, total reward: 4.0000
episode 8, total reward: 4.0000
episode 9, total reward: 7.0000
episode 10, total reward: 59.0000
episode 11, total reward: 5.0000
episode 12, total reward: 35.0000
episode 13, total reward: 37.0000
episode 14, total reward: 9.0000
episode 15, total reward: 135.0000
episode 16, total reward: 1.0000
episode 17, total reward: 52.0000
episode 18, total reward: 80.0000
episode 19, total reward: 50.0000
episode 20, total reward: 6.0000
episode 21, total reward: 33.0000
episode 22, total reward: 4.0000
episode 23, total reward: 32.0000
episode 24, total reward: 10.0000
episode 25, total reward: 13.0000
episode 26, total reward: 264.0000
episode 27, total reward: 26.0000
episode 28, total reward: 69.0000
episode 29, total reward: 14.00

In [32]:
# 5
model_name = '03_dueling_05'
agent = Agent.load(obs, 'logs/{}/model.ckpt'.format(model_name))
agent.play(obs, episode_count=50)

episode 0, total reward: 18.0000
episode 1, total reward: 23.0000
episode 2, total reward: 59.0000
episode 3, total reward: 53.0000
episode 4, total reward: 2.0000
episode 5, total reward: 95.0000
episode 6, total reward: 95.0000
episode 7, total reward: 116.0000
episode 8, total reward: 74.0000
episode 9, total reward: 37.0000
episode 10, total reward: 2.0000
episode 11, total reward: 35.0000
episode 12, total reward: 2.0000
episode 13, total reward: 47.0000
episode 14, total reward: 264.0000
episode 15, total reward: 2.0000
episode 16, total reward: 88.0000
episode 17, total reward: 235.0000
episode 18, total reward: 14.0000
episode 19, total reward: 93.0000
episode 20, total reward: 5.0000
episode 21, total reward: 27.0000
episode 22, total reward: 264.0000
episode 23, total reward: 264.0000
episode 24, total reward: 64.0000
episode 25, total reward: 24.0000
episode 26, total reward: 124.0000
episode 27, total reward: 67.0000
episode 28, total reward: 8.0000
episode 29, total reward

In [33]:
res = np.array([44.5400, 52.6800, 21.1400, 43.2800, 75.6000])
print('mean: {:.3f}, median: {:.3f}'.format(res.mean(), np.median(res)))

mean: 47.448, median: 44.540
