In [1]:
# basic
import logging
import os, sys
import numpy as np
from typing import NamedTuple
from PIL import Image
from collections import deque
import random

# ml
import tensorflow as tf

# gym
import gym
from gym.wrappers import Monitor
import gym_ple

# disply
from IPython.display import HTML
import matplotlib.pyplot as plt
import matplotlib.animation as animation
%matplotlib inline

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [2]:
# select gpu
gpu_index = 1
from tensorflow.keras.backend import set_session
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(
        visible_device_list=str(gpu_index),
        allow_growth=True
    )
)
set_session(tf.Session(config=config))

In [3]:
class Experience(NamedTuple):
    state: gym.spaces.Box
    action: int
    reward: float
    next_state: gym.spaces.Box
    done: bool

In [4]:
class Logger():

    def __init__(self, log_dir='./logs/', dir_name=""):
        self.log_dir = log_dir

        if dir_name:
            self.log_dir = os.path.join(self.log_dir, dir_name)
            if not os.path.exists(self.log_dir):
                os.mkdir(self.log_dir)

        self._callback = tf.keras.callbacks.TensorBoard(self.log_dir)

    @property
    def writer(self):
        return self._callback.writer
    
    @property
    def ckpt_path(self):
        return os.path.join(self.log_dir, 'model.ckpt')

    def set_model(self, model):
        self._callback.set_model(model)

    def describe(self, name, values, episode=-1, step=-1):
        mean = np.round(np.mean(values), 3)
        std = np.round(np.std(values), 3)
        desc = "{} is {} (+/-{})".format(name, mean, std)
        if episode > 0:
            print("At episode {}, {}".format(episode, desc))
        elif step > 0:
            print("At step {}, {}".format(step, desc))

    def plot(self, name, values, interval=10):
        indices = list(range(0, len(values), interval))
        means = []
        stds = []
        for i in indices:
            _values = values[i:(i + interval)]
            means.append(np.mean(_values))
            stds.append(np.std(_values))
        means = np.array(means)
        stds = np.array(stds)
        plt.figure()
        plt.title("{} History".format(name))
        plt.grid()
        plt.fill_between(indices, means - stds, means + stds,
                         alpha=0.1, color="g")
        plt.plot(indices, means, "o-", color="g",
                 label="{} per {} episode".format(name.lower(), interval))
        plt.legend(loc="best")
        plt.show()

    def write(self, index, name, value):
        summary = tf.Summary()
        summary_value = summary.value.add()
        summary_value.tag = name
        summary_value.simple_value = value
        self.writer.add_summary(summary, index)
        self.writer.flush()

In [5]:
class AgentNoisyLayer(tf.keras.layers.Layer):
    
    def __init__(self, num_outputs: int):
        super(AgentNoisyLayer, self).__init__(trainable=True, name='noisynet', dtype=tf.float32)
        self.num_outputs = num_outputs
        # dense layer のみ noisy network に変更する
        self.relu = relu = tf.nn.relu
        k_init = tf.keras.initializers.glorot_normal()
        self.v_dense_w_mu = self.add_weight("v_dense_w_mu", [1024, 128], initializer=k_init)
        self.ad_dense_w_mu = self.add_weight("ad_dense_w_mu", [1024, 128], initializer=k_init)
        
        self.v_output_w_mu = self.add_weight("v_output_w_mu", [128, 1], initializer=k_init)
        self.ad_output_w_mu = self.add_weight("ad_output_w_mu", [128, num_outputs], initializer=k_init)
        
        self.v_dense_b_mu = self.add_weight("v_dense_b_mu", [128], initializer='zero')
        self.ad_dense_b_mu = self.add_weight("ad_dense_b_mu", [128], initializer='zero')

        self.v_output_b_mu = self.add_weight("v_output_b_mu", (), initializer='zero')
        self.ad_output_b_mu = self.add_weight("ad_output_b_mu", [num_outputs], initializer='zero')

        self.v_dense_w_sigma = self.add_weight("v_dense_w_sigma", [1024, 128])
        self.ad_dense_w_sigma = self.add_weight("ad_dense_w_sigma", [1024, 128])
        
        self.v_output_w_sigma = self.add_weight("v_output_w_sigma", [128, 1])
        self.ad_output_w_sigma = self.add_weight("ad_output_w_sigma", [128, num_outputs])
        
        self.v_dense_b_sigma = self.add_weight("v_dense_b_sigma", [128])
        self.ad_dense_b_sigma = self.add_weight("ad_dense_b_sigma", [128])

        self.v_output_b_sigma = self.add_weight("v_output_b_sigma", ())
        self.ad_output_b_sigma = self.add_weight("ad_output_b_sigma", [num_outputs])
        
    def call(self, x):
        # noise
        dense_in_noise = tf.random.normal([1024])
        v_dense_out_noise = tf.random.normal([128])
        ad_dense_out_noise = tf.random.normal([128])
        d_in = self._noise_w(dense_in_noise)
        v_d_out = self._noise_w(v_dense_out_noise)
        ad_d_out = self._noise_w(ad_dense_out_noise)
        
        v_noise_dense_w = tf.matmul(tf.expand_dims(d_in, 1), tf.expand_dims(v_d_out, 0))
        ad_noise_dense_w = tf.matmul(tf.expand_dims(d_in, 1), tf.expand_dims(ad_d_out, 0))
        v_dense_w = self.v_dense_w_mu + tf.multiply(self.v_dense_w_sigma, v_noise_dense_w)
        ad_dense_w = self.ad_dense_w_mu + tf.multiply(self.ad_dense_w_sigma, ad_noise_dense_w)
        v_dense_b = self.v_dense_b_mu + tf.multiply(self.v_dense_b_sigma, self._noise_b(v_dense_out_noise))
        ad_dense_b = self.ad_dense_b_mu + tf.multiply(self.ad_dense_b_sigma, self._noise_b(ad_dense_out_noise))
        
        v_output_in_noise = tf.random.normal([128])
        v_output_out_noise = tf.random.normal([1])
        ad_output_in_noise = tf.random.normal([128])
        ad_output_out_noise = tf.random.normal([self.num_outputs])

        v_o_in = self._noise_w(v_output_in_noise)
        ad_o_in = self._noise_w(ad_output_in_noise)
        v_o_out = self._noise_w(v_output_out_noise)
        ad_o_out = self._noise_w(ad_output_out_noise)
        v_noise_output_w = tf.matmul(tf.expand_dims(v_o_in, 1), tf.expand_dims(v_o_out, 0))
        ad_noise_output_w = tf.matmul(tf.expand_dims(ad_o_in, 1), tf.expand_dims(ad_o_out, 0))
        v_output_w = self.v_output_w_mu + tf.multiply(self.v_output_w_sigma, v_noise_output_w)
        ad_output_w = self.ad_output_w_mu + tf.multiply(self.ad_output_w_sigma, ad_noise_output_w)
        v_output_b = self.v_output_b_mu + tf.multiply(self.v_output_b_sigma, self._noise_b(v_output_out_noise))
        ad_output_b = self.ad_output_b_mu + tf.multiply(self.ad_output_b_sigma, self._noise_b(ad_output_out_noise))
        
        # forward
        v_outputs = tf.matmul(tf.to_float(x), v_dense_w) + v_dense_b
        ad_outputs = tf.matmul(tf.to_float(x), ad_dense_w) + ad_dense_b
        v_outputs = self.relu(v_outputs)
        ad_outputs = self.relu(ad_outputs)
        v_outputs = tf.matmul(v_outputs, v_output_w) + v_output_b
        ad_outputs = tf.matmul(ad_outputs, ad_output_w) + ad_output_b
        outputs = v_outputs + (ad_outputs - tf.reduce_mean(ad_outputs))
        return outputs
    
    def _noise_w(self, x):
        return tf.multiply(tf.sign(x), tf.pow(tf.abs(x), 0.5))
    
    def _noise_b(self, x):
        return x

In [6]:
class AgentModel(tf.keras.Model):
    
    def __init__(self, is_training: bool, num_outputs: int):
        super(AgentModel, self).__init__()
        self.is_training = is_training
        self.num_outputs = num_outputs
        k_init = tf.keras.initializers.glorot_normal()
        relu = tf.nn.relu
        self.conv_01 = tf.keras.layers.Conv2D(16, kernel_size=8, strides=4, padding='same', kernel_initializer=k_init, activation=relu) 
        self.conv_02 = tf.keras.layers.Conv2D(32, kernel_size=4, strides=4, padding='same', kernel_initializer=k_init, activation=relu)
        self.conv_03 = tf.keras.layers.Conv2D(64, kernel_size=3, strides=2, padding='same', kernel_initializer=k_init, activation=relu)
        self.flatten = tf.keras.layers.Flatten()
        self.dense = tf.keras.layers.Dense(256, kernel_initializer=k_init, activation=relu) # 1024 x 256
        self.output_layer = tf.keras.layers.Dense(num_outputs, kernel_initializer=k_init) # 256 x 2
        
        # dense layer のみ noisy network に変更する
        self.noisy_layer = AgentNoisyLayer(num_outputs)
        
    def call(self, inputs):
        outputs = inputs
        outputs = self.conv_01(outputs)
        outputs = self.conv_02(outputs)
        outputs = self.conv_03(outputs)
        outputs = self.flatten(outputs)
        outputs = self.noisy_layer(outputs)
        return outputs
    
    def compute_output_shape(self, input_shape):
        shape = tf.TensorShape(input_shape).as_list()
        return [shape[0], self.num_outputs]
    
    def get_noise_mean(self):
        all_weights = self.noisy_layer.get_weights()
        noise_weights = np.mean([np.mean(np.abs(w)) for w in all_weights[8:12]])
        noise_biases = np.mean([np.mean(np.abs(w)) for w in all_weights[12:16]])
        return noise_weights, noise_biases

In [7]:
class Agent:
    
    def __init__(self, actions, epsilon, input_shape, learning_rate=0.0001):
        self.actions = actions
        self.epsilon = epsilon
        self.input_shape = input_shape
        self.learning_rate = learning_rate
        self.model = None
        self._teacher_model = None
        self.initialize()
        
    def initialize(self):
        self.build()
        optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
        self.model.compile(optimizer, loss='mse')
        
    def save(self, model_path):
        self.model.save_weights(model_path, overwrite=True)

    @classmethod
    def load(cls, env, model_path, epsilon=0.0001):
        actions = list(range(env.action_space.n))
        input_shape = (env.width, env.height, env.frame_count)
        agent = cls(actions, epsilon, input_shape)
        agent.initialize()
        agent.model.load_weights(model_path)
        return agent
        
    def build(self):
        self.model = AgentModel(is_training=True, num_outputs=len(self.actions))

        # teacher_model を更新するため、両方のモデルで一度計算し重みを取得する
        self._teacher_model = AgentModel(is_training=True, num_outputs=len(self.actions))
        dummy = np.random.randn(1, *self.input_shape).astype(np.float32)
        dummy = tf.convert_to_tensor(dummy)
        _ = self.model.call(dummy)
        _ = self._teacher_model.call(dummy)
        self.update_teacher()
    
    def policy(self, state) -> int:
        '''
        epsilon greedy で受け取った state をもとに行動を決定する
        '''
        # no longer use epsilon greedy
        estimates = self.estimate(state)
        return np.argmax(estimates)
    
    def estimate(self, state):
        '''
        ある state の状態価値を推定する
        '''
        state_as_batch = np.array([state])
        return self.model.predict(state_as_batch)[0]
    
    def update(self, experiences, gamma):
        '''
        与えられた experiences をもとに学習
        '''
        states = np.array([e.state for e in experiences])
        next_states = np.array([e.next_state for e in experiences])

        estimated_values = self.model.predict(states)
        next_state_values = self._teacher_model.predict(next_states)
        
        # train
        for i, e in enumerate(experiences):
            reward = e.reward
            if not e.done:
                reward += gamma * np.max(next_state_values[i])
            estimated_values[i][e.action] = reward
        loss = self.model.train_on_batch(states, estimated_values)
        return loss
    
    def update_teacher(self):
        self._teacher_model.set_weights(self.model.get_weights())
    
    def play(self, env, episode_count: int = 2, render: bool = True):
        total_rewards = []
        for e in range(episode_count):
            state = env.reset()
            done = False
            episode_reward = 0
            
            while not done:
                if render:
                    env.render()
                action = self.policy(state)
                step_reward = 0
                for _ in range(4):
                    next_state, reward, done = env.step_with_raw_reward(action)
                    if done:
                        break
                    step_reward += reward
                episode_reward += step_reward
                state = next_state
            print('episode {}, total reward: {:.4f}'.format(e, episode_reward))
            total_rewards.append(episode_reward)
                
        env.reset()
        print('reward by {}, mean: {:.4f}, std: {:.4f}'.format(episode_count, np.mean(total_rewards), np.std(total_rewards)))

In [8]:
class Observer:
    
    def __init__(self, env, frame_count, width, height, render=False, outdir='./playlogs/'):
        self._env = env
        if render:
            outdir = outdir
            env = Monitor(env, directory=outdir, video_callable=(lambda x: x % 5 == 0), force=True)
            self._env = env
        self.frame_count = frame_count
        self.width = width
        self.height = height
        self._frames = deque(maxlen=frame_count)

    def reset(self):
        return self.transform(self._env.reset())
        
    def render(self):
        self._env.render(mode = 'rgb_array')
        
    def step(self, action):
        next_state, reward, done, _ = self._env.step(action)
        return self.transform(next_state), self.reward_shaping(reward), done
    
    def step_with_raw_reward(self, action):
        next_state, reward, done, _ = self._env.step(action)
        return self.transform(next_state), reward, done
        
    def transform(self, state):
        state = state[:400, :, :]
        grayed = Image.fromarray(state).convert('L')  # h, w, c -> h, w
        
        resized = grayed.resize((self.width, self.height))
        resized = np.array(resized).astype(np.float32)
        resized = np.transpose(resized, (1, 0)) # h, w -> w, h
        normalized = resized / 255.0
        if len(self._frames) == 0:
            for i in range(self.frame_count):
                self._frames.append(normalized)
        else:
            self._frames.append(normalized)
        feature = np.array(self._frames)
        feature = np.transpose(feature, (1, 2, 0))  # [f, w, h] -> [w, h, f]
        
        return feature

    def reward_shaping(self, reward):
        if 0.01 > reward > -0.01:
            return 0.01
        elif reward <= -0.1:
            return -1.0
        else:
            return 1.0

    @property
    def action_space(self):
        return self._env.action_space
    
    @property
    def observation_space(self):
        return self._env.observation_space

In [9]:
class Trainer:
    
    def __init__(self, filename, buffer_size=50000, batch_size=32, gamma=0.98, report_interval=20):
        self.file_name = filename
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.report_interval = report_interval
        self.experiences = deque(maxlen=buffer_size)
        self.training = False
        self.training_count = 0
        self.reward_log = []
        self.logger = Logger(dir_name=filename)
        self._max_reward = 0
        self.teacher_update_freq = 10000
        
    def train(self, env, episode_count=10000, initial_count=200, model_path = None):
        actions = list(range(env.action_space.n))
        if model_path:
            agent = Agent.load(env, model_path, epsilon=0.1)
        else:
            agent = Agent(actions, 0.1, input_shape=(env.width, env.height, env.frame_count))
        self.train_loop(env, agent, episode_count, initial_count)
        
    def train_loop(self, env, agent, episode_count, initial_count):
        
        for episode in range(episode_count):
            state = env.reset()
            done = False
            step_count = 0
            episode_reward = 0
            while not done:
                action = agent.policy(state)
                step_reward = 0
                for _ in range(4):
                    next_state, reward, done = env.step(action)
                    if done:
                        break
                    step_reward += reward
                e = Experience(state, action, step_reward, next_state, done)
                self.experiences.append(e)
                episode_reward += step_reward
                loss = self.step(episode, agent)
                state = next_state
                
                if not self.training and (len(self.experiences) >= self.buffer_size or episode >= initial_count):
                    self.begin_training(agent)
                    self.training = True
            
            self.end_episode(episode, episode_reward, loss, agent)
    
    def step(self, step, agent):
        if self.training:
            batch = random.sample(self.experiences, self.batch_size)
            loss = agent.update(batch, self.gamma)
            self.training_count += 1
            if self.is_event(self.training_count, self.teacher_update_freq):
                agent.update_teacher()
            return loss
            
    def begin_training(self, agent):
        print('start training!')
        self.logger.set_model(agent.model)
        
    def end_episode(self, episode, reward, loss, agent):
        self.reward_log.append(reward)
        if self.training:
            self.logger.write(self.training_count, "loss", loss)
            self.logger.write(self.training_count, "reward", reward)
            self.logger.write(self.training_count, "epsilon", agent.epsilon)
            noise_w, noise_b = agent.model.get_noise_mean()
            self.logger.write(self.training_count, "noise_weights", noise_w)
            self.logger.write(self.training_count, "noise_biases", noise_b)
            if reward > self._max_reward:
                agent.save(self.logger.ckpt_path)
                self._max_reward = reward

        if self.is_event(episode, self.report_interval):
            recent_rewards = self.reward_log[-self.report_interval:]
            self.logger.describe("reward", recent_rewards, episode=episode)
        
    def is_event(self, count, interval):
        return True if count != 0 and count % interval == 0 else False

In [10]:
model_name = '04_noisynetdueling_01'

In [11]:
env = gym.make('FlappyBird-v0')
obs = Observer(env, 4, 128, 128)
trainer = Trainer(filename=model_name)



In [None]:
trainer.train(obs)

At episode 20, reward is 0.914 (+/-0.542)
At episode 40, reward is 0.952 (+/-1.043)
At episode 60, reward is 1.022 (+/-0.722)
At episode 80, reward is 1.072 (+/-0.957)
At episode 100, reward is 1.092 (+/-0.753)
At episode 120, reward is 0.833 (+/-0.455)
At episode 140, reward is 1.076 (+/-0.859)
At episode 160, reward is 0.828 (+/-0.479)
At episode 180, reward is 1.184 (+/-0.791)
start training!
At episode 200, reward is 0.828 (+/-0.504)
At episode 220, reward is 0.946 (+/-0.672)
At episode 240, reward is 0.649 (+/-0.25)
At episode 260, reward is 0.679 (+/-0.523)
At episode 280, reward is 0.957 (+/-0.712)
At episode 300, reward is 0.859 (+/-0.524)
At episode 320, reward is 0.861 (+/-0.503)
At episode 340, reward is 0.546 (+/-0.129)
At episode 360, reward is 0.921 (+/-0.967)
At episode 380, reward is 1.043 (+/-1.242)
At episode 400, reward is 0.839 (+/-0.715)
At episode 420, reward is 0.622 (+/-0.277)
At episode 440, reward is 0.765 (+/-0.513)
At episode 460, reward is 0.885 (+/-0.699)


# play

In [None]:
env = gym.make('FlappyBird-v0')
obs = Observer(env, 4, 128, 128, render=True)

In [None]:
# 1
model_name = '04_noisynetdueling_01'
agent = Agent.load(obs, 'logs/{}/model.ckpt'.format(model_name))
agent.play(obs, episode_count=50)

In [None]:
# 2
model_name = '04_noisynetdueling_02'
agent = Agent.load(obs, 'logs/{}/model.ckpt'.format(model_name))
agent.play(obs, episode_count=50)

In [None]:
# 3
model_name = '04_noisynetdueling_03'
agent = Agent.load(obs, 'logs/{}/model.ckpt'.format(model_name))
agent.play(obs, episode_count=50)

In [None]:
# 4
model_name = '04_noisynetdueling_04'
agent = Agent.load(obs, 'logs/{}/model.ckpt'.format(model_name))
agent.play(obs, episode_count=50)

In [None]:
# 5
model_name = '04_noisynetdueling_05'
agent = Agent.load(obs, 'logs/{}/model.ckpt'.format(model_name))
agent.play(obs, episode_count=50)

In [None]:
res = np.array([])
print('mean: {:.3f}, median: {:.3f}'.format(res.mean(), np.median(res)))