# 实验：对决网络勇闯死亡回廊

## 1. 导入库

In [2]:
import tensorflow as tf
import numpy as np
import random
import time
import matplotlib.pyplot as plt
import warnings
from vizdoom import *
from skimage import transform
from collections import deque

warnings.filterwarnings('ignore') 

## 2. 创建游戏环境
* 死亡回廊游戏环境包含 7 个动作

In [3]:
def create_environment():
    game = DoomGame()
    game.load_config("deadly_corridor.cfg")
    game.set_doom_scenario_path("deadly_corridor.wad")
    possible_actions = np.identity(7,dtype=int).tolist()
    
    return game, possible_actions

game, possible_actions = create_environment()

## 3. 图像处理过程
* 图像处理包含两个过程：处理环境给予的多个图像帧与处理每一帧

### 如何处理每一帧
* TODO:完成下述函数

In [4]:
## 参数：每一帧
## 过程1：去除无用图像部分
## 过程2：画面大小缩放到[100,120]或适合的大小（请你完成）
"""
        __________________
        |                 |
        |                 |
        |                 |
        |                 |
        |_________________|
        
        to
        _____________
        |            |
        |            |
        |            |
        |____________|
    
"""
## 过程3：归一化（请你完成）

def preprocess_frame(frame):
    cropped_frame = frame[15:-5,20:-20]
    
    # 画面大小重新剪裁
    preprocessed_frame = transform.resize(cropped_frame, [100,120])
    
    # 归一化
    normalized_frame = preprocessed_frame/255.0
    
    return normalized_frame

### 如何处理环境给予的多个图像
* 环境每次依然给与4帧的图像
* 完成图像打包
* TODO：尝试优化

In [5]:
stack_size = 4

# 初始化队列
stacked_frames  =  deque([np.zeros((100,120), dtype=np.int) for i in range(stack_size)], maxlen=4) 

def stack_frames(stacked_frames, state, is_new_episode):
    frame = preprocess_frame(state)   
    if is_new_episode:
        stacked_frames = deque([np.zeros((100,120), dtype=np.int) for i in range(stack_size)], maxlen=4)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis=2)

    else:
        stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis=2) 
    
    return stacked_state, stacked_frames

## 4. 设定超参数
* 设定强化学习超参数
* 设定深度学习超参数

In [6]:
# 模型参数
state_size = [100,120,4] #注意这里要配合前面的图像处理大小
action_size = game.get_available_buttons_size()
learning_rate =  0.00025

# 训练参数
total_episodes = 5000
max_steps = 5000
batch_size = 32

# 强化学习算法参数 
max_tau = 10000
gamma = 0.95
explore_start = 1.0
explore_stop = 0.01
decay_rate = 0.00005

# 记忆力参数
## 如果 pretrain_length 与 memory_size 相差太小一开始 loss 会变 0
pretrain_length = 10000 #建议尝试100000，如果内存卡死就可以换10000-50000之间
memory_size = 10000 #建议尝试1000000，如果内存卡死就可以换100000-500000之间

# 环境参数
training = True
#episode_render = False

## 5. 创建对决网络模型
* 首先模型接收 4 帧图像作为数据
* 然后经过 3 层卷积
* 铺平
* 接下来会分离两条计算路径
    - 一条路计算 V(s)
    - 另一条路计算 A(s,a)
* 最后做汇聚
* 模型返回 Q 值

In [7]:
class DuelNet:
    def __init__(self, state_size, action_size, learning_rate, name):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.name = name
        
        with tf.variable_scope(self.name):
            # TODO: 建立输入、动作、target_Q这三个 placeholder
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs")
            self.actions_ = tf.placeholder(tf.float32, [None, action_size], name="actions_")
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")
            
            #
            self.ISWeights_ = tf.placeholder(tf.float32, [None,1], name='IS_weights')
            
            
            # TODO：建立第一层卷积，COV+ELU，建议初始化参数使用 tf.contrib.layers.xavier_initializer_conv2d()
            self.conv1 = tf.layers.conv2d(inputs = self.inputs_,
                                          filters = 32,
                                          kernel_size = [8,8],
                                          strides = [4,4],
                                          padding = "VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv1")
            
            self.conv1_out = tf.nn.elu(self.conv1, name="conv1_out")
            
            
            # TODO：建立第二层卷积，同上
            self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                          filters = 64,
                                          kernel_size = [4,4],
                                          strides = [2,2],
                                          padding = "VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv2")

            self.conv2_out = tf.nn.elu(self.conv2, name="conv2_out")
            
            
            # TODO：建立第三层卷积，同上
            self.conv3 = tf.layers.conv2d(inputs = self.conv2_out,
                                          filters = 128,
                                          kernel_size = [4,4],
                                          strides = [2,2],
                                          padding = "VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv3")

            self.conv3_out = tf.nn.elu(self.conv3, name="conv3_out")
            
            
            # TODO：建立铺平层
            self.flatten = tf.layers.flatten(self.conv3_out)
            
            
            # TODO：使用建立两条全连接计算流
            ## V(s)计算流第一个隐含层包含 512 个神经元，第二层是大小为 1 的输出
            self.value_fc = tf.layers.dense(inputs = self.flatten,
                                            units = 512,
                                            activation = tf.nn.elu,
                                            kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                            name="value_fc")
            
            self.value = tf.layers.dense(inputs = self.value_fc,
                                         units = 1,
                                         activation = None,
                                         kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                         name="value")
            
            # A(s,a)计算流第一个隐含层包含 512 个神经元，第二层是大小为 动作范围 的输出
            self.advantage_fc = tf.layers.dense(inputs = self.flatten,
                                                units = 512,
                                                activation = tf.nn.elu,
                                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                name="advantage_fc")
            
            self.advantage = tf.layers.dense(inputs = self.advantage_fc,
                                             units = self.action_size,
                                             activation = None,
                                             kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                             name="advantages")
            
            # TODO：按以下公式进行汇聚
            ## Q(s,a) = V(s) + (A(s,a) - 1/|A| * sum A(s,a'))
            self.output = self.value + tf.subtract(self.advantage, tf.reduce_mean(self.advantage, axis=1, keepdims=True))
              
            # 让上面输出与动作相乘，得到唯一的优选动作 Q
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)
            
            # 以下内容请勿修改
            self.absolute_errors = tf.abs(self.target_Q - self.Q)# 更新求和树所需           
            self.loss = tf.reduce_mean(self.ISWeights_ * tf.squared_difference(self.target_Q, self.Q))           
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

In [8]:
# 上述模型完成后就可以进行实例化了
tf.reset_default_graph()
DQNetwork = DuelNet(state_size, action_size, learning_rate, name="DQNetwork")
TargetNetwork = DuelNet(state_size, action_size, learning_rate, name="TargetNetwork")

## 6. 记忆优化
* 按照论文，我们使用了求和树作为存储记忆优先级的基本数据结构
![sum_tree](assets\SUM_TREE.png)

* 求和树：
    - **def __init__**: 初始化求和树，建立求和树结构与记忆结构
    - **def add**: 把记忆与对应的优先级评分添加进来
    - **def update**: 当记忆的优先级发生改变时可以进行对应的更新
    - **def get_leaf**: 优先级查询
    - **def total_priority**: 计算优先级总和
    
* 记忆过程
    - **def __init__**: 实例化求和树
    - **def store**: 存储经验
    - **def sample**: 记忆采样
    - **def update_batch**: 更新树的优先级

In [9]:
class SumTree():
    data_pointer = 0
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)

    def add(self, priority, data):
        tree_index = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        self.update(tree_index, priority)
        self.data_pointer += 1       
        if self.data_pointer >= self.capacity: 
            self.data_pointer = 0

    def update(self, tree_index, priority):
        change = priority - self.tree[tree_index]
        self.tree[tree_index] = priority
        while tree_index != 0:
            tree_index = (tree_index - 1) // 2
            self.tree[tree_index] += change

    def get_leaf(self, v):
        parent_index = 0
        
        while True:
            left_child_index = 2 * parent_index + 1
            right_child_index = left_child_index + 1
            if left_child_index >= len(self.tree):
                leaf_index = parent_index
                break
            
            else:      
                if v <= self.tree[left_child_index]:
                    parent_index = left_child_index          
                else:
                    v -= self.tree[left_child_index]
                    parent_index = right_child_index    
        data_index = leaf_index - self.capacity + 1

        return leaf_index, self.tree[leaf_index], self.data[data_index]
    
    @property
    def total_priority(self):
        return self.tree[0]
    

class Memory():
    PER_e = 0.01 # 防止 0 采样
    PER_a = 0.6  # 高优先级采样概率
    PER_b = 0.4  # 重要性采样
    PER_b_increment_per_sampling = 0.001

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, experience):
        # 新的记忆拥有最高优先级
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])
        if max_priority == 0:
            max_priority = 1.
        
        self.tree.add(max_priority, experience)   # set the max p for new p

    def sample(self, n):
        memory_b = []
        b_idx, b_ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, 1), dtype=np.float32)
        # 将优先级进行分段
        priority_segment = self.tree.total_priority / n
        self.PER_b = np.min([1., self.PER_b + self.PER_b_increment_per_sampling])        
        # 计算最大权重
        p_min = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_priority
        max_weight = (p_min * n) ** (-self.PER_b)
        
        for i in range(n):
            a, b = priority_segment * i, priority_segment * (i + 1)
            value = np.random.uniform(a, b)
            index, priority, data = self.tree.get_leaf(value)
            sampling_probabilities = priority / self.tree.total_priority
            
            #  IS = (1/N * 1/P(i))**b /max_weight == (N*P(i))**-b  /max_weight
            b_ISWeights[i, 0] = np.power(n * sampling_probabilities, -self.PER_b)/ max_weight                                  
            b_idx[i]= index           
            experience = [data]          
            memory_b.append(experience)
        
        return b_idx, memory_b, b_ISWeights
    
    def batch_update(self, tree_idx, abs_errors):
        abs_errors += self.PER_e
        clipped_errors = np.minimum(abs_errors, 1.)
        ps = np.power(clipped_errors, self.PER_a)

        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)

## 小插曲：初始记忆填充
这里只做了随机动作填充

In [10]:
game.init()

memory = Memory(memory_size)
game.new_episode()

# pretrain_length 就是填充长度
for i in range(pretrain_length):
    if i == 0:
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
    action = random.choice(possible_actions)
    reward = game.make_action(action)
    done = game.is_episode_finished()

    if done:
        next_state = np.zeros(state.shape)       
        experience = state, action, reward, next_state, done
        memory.store(experience)
        game.new_episode()
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
    else:
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        experience = state, action, reward, next_state, done
        memory.store(experience)
        state = next_state
        
game.close()

## 7. 建立日志

In [11]:
writer = tf.summary.FileWriter("tensorboard/dddqn/1")
tf.summary.scalar("Loss", DQNetwork.loss)
write_op = tf.summary.merge_all()

## 8. 准备训练
- **predict_action**：贪心算法输出动作
- **update_target_graph**：完成从行为网络到目标网络的参数拷贝工作

In [10]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    exp_exp_tradeoff = np.random.rand()
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    
    if (explore_probability > exp_exp_tradeoff):
        action = random.choice(possible_actions)
        
    else:
        Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
        choice = np.argmax(Qs)
        action = possible_actions[int(choice)]
                
    return action, explore_probability

In [11]:
def update_target_graph():
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "DQNetwork")
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TargetNetwork")
    op_holder = []

    for from_var,to_var in zip(from_vars,to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder

## 9. 开始训练

In [14]:
saver = tf.train.Saver()

if training == True:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        decay_step = 0
        tau = 0
        game.init()
        update_target = update_target_graph()
        sess.run(update_target)
        
        for episode in range(500):
            step = 0
            episode_rewards = []
            game.new_episode()
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)
        
            while step < max_steps:
                step += 1
                tau += 1
                decay_step +=1
                action, explore_probability = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)
                reward = game.make_action(action)
                done = game.is_episode_finished()
                episode_rewards.append(reward)

                if done:
                    next_state = np.zeros((120,140), dtype=np.int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    step = max_steps
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(total_reward),
                          'Training loss: {:.4f}'.format(loss),
                          'Explore P: {:.4f}'.format(explore_probability))

                    # 记忆存储
                    experience = state, action, reward, next_state, done
                    memory.store(experience)

                else:
                    next_state = game.get_state().screen_buffer
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    
                    # 记忆存储
                    experience = state, action, reward, next_state, done
                    memory.store(experience)
                    
                    state = next_state


                # 训练部分     
                ## 获取记忆的mini_batch
                tree_idx, batch, ISWeights_mb = memory.sample(batch_size)
                
                states_mb = np.array([each[0][0] for each in batch], ndmin=3)
                actions_mb = np.array([each[0][1] for each in batch])
                rewards_mb = np.array([each[0][2] for each in batch]) 
                next_states_mb = np.array([each[0][3] for each in batch], ndmin=3)
                dones_mb = np.array([each[0][4] for each in batch])

                target_Qs_batch = []

                
                ### 行为网络计算 a'
                ### 目标网络计算Q(s',a')
                
                ## 从行为网络获得 Q(s',a') 值
                q_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})
                
                ## 从目标网络获得 Q(s',a') 值
                q_target_next_state = sess.run(TargetNetwork.output, feed_dict = {TargetNetwork.inputs_: next_states_mb})
                
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]
                    
                    ## 获取a'
                    action = np.argmax(q_next_state[i])

                    ## 如果是最终状态那么 Q_target = r
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                    
                    ## Q_target = r + gamma * Qtarget(s',a')
                    else:
                        ## 这里是 Double DQN 的 target 算法，使用来自行为网络的 action 下标，在目标网络里面进行再次的 Q 运算
                        ## 实现了两套参数嵌套
                        target = rewards_mb[i] + gamma * q_target_next_state[i][action]
                        target_Qs_batch.append(target)
                        

                targets_mb = np.array([each for each in target_Qs_batch])

                
                _, loss, absolute_errors = sess.run([DQNetwork.optimizer, DQNetwork.loss, DQNetwork.absolute_errors],
                                                    feed_dict={DQNetwork.inputs_: states_mb,
                                                               DQNetwork.target_Q: targets_mb,
                                                               DQNetwork.actions_: actions_mb,
                                                               DQNetwork.ISWeights_: ISWeights_mb})
              
                memory.batch_update(tree_idx, absolute_errors)
                
                
                summary = sess.run(write_op, feed_dict={DQNetwork.inputs_: states_mb,
                                                        DQNetwork.target_Q: targets_mb,
                                                        DQNetwork.actions_: actions_mb,
                                                        DQNetwork.ISWeights_: ISWeights_mb})
                writer.add_summary(summary, episode)
                writer.flush()
                
                if tau > max_tau:
                    update_target = update_target_graph()
                    sess.run(update_target)
                    tau = 0
                    print("Model updated")

            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")
                
game.close()

Episode: 0 Total reward: -107.46109008789062 Training loss: 83.2708 Explore P: 0.9959
Model Saved
Episode: 1 Total reward: -99.21345520019531 Training loss: 0.5869 Explore P: 0.9904
Episode: 2 Total reward: -92.57376098632812 Training loss: 0.5022 Explore P: 0.9864
Episode: 3 Total reward: -106.30680847167969 Training loss: 0.2053 Explore P: 0.9829
Episode: 4 Total reward: -107.10749816894531 Training loss: 38.8781 Explore P: 0.9778
Episode: 5 Total reward: -93.06199645996094 Training loss: 0.1752 Explore P: 0.9739
Model Saved
Episode: 6 Total reward: -89.39212036132812 Training loss: 0.2813 Explore P: 0.9698
Episode: 7 Total reward: -102.98558044433594 Training loss: 0.2687 Explore P: 0.9659
Episode: 8 Total reward: -92.00413513183594 Training loss: 0.2565 Explore P: 0.9574
Episode: 9 Total reward: -71.75691223144531 Training loss: 0.2212 Explore P: 0.9536
Episode: 10 Total reward: -60.135284423828125 Training loss: 22.9234 Explore P: 0.9497
Model Saved
Episode: 11 Total reward: -52.8

Episode: 94 Total reward: -29.503646850585938 Training loss: 30.8521 Explore P: 0.6477
Episode: 95 Total reward: -64.31242370605469 Training loss: 20.3328 Explore P: 0.6461
Model Saved
Episode: 96 Total reward: -115.9249267578125 Training loss: 0.6859 Explore P: 0.6435
Episode: 97 Total reward: -79.70756530761719 Training loss: 0.8915 Explore P: 0.6408
Episode: 98 Total reward: -55.674835205078125 Training loss: 0.4233 Explore P: 0.6382
Episode: 99 Total reward: -18.534927368164062 Training loss: 21.2507 Explore P: 0.6358
Episode: 100 Total reward: 10.925979614257812 Training loss: 0.6918 Explore P: 0.6333
Model Saved
Episode: 101 Total reward: -19.180633544921875 Training loss: 0.4938 Explore P: 0.6307
Episode: 102 Total reward: -48.38787841796875 Training loss: 28.7360 Explore P: 0.6282
Episode: 103 Total reward: -83.50558471679688 Training loss: 0.4542 Explore P: 0.6268
Episode: 104 Total reward: -52.70941162109375 Training loss: 0.7229 Explore P: 0.6254
Episode: 105 Total reward: -

Episode: 187 Total reward: -12.4166259765625 Training loss: 7.6572 Explore P: 0.4411
Episode: 188 Total reward: -57.97666931152344 Training loss: 2.3688 Explore P: 0.4400
Episode: 189 Total reward: -73.35598754882812 Training loss: 1.3567 Explore P: 0.4390
Episode: 190 Total reward: -72.44622802734375 Training loss: 1.0615 Explore P: 0.4380
Model Saved
Episode: 191 Total reward: -53.833221435546875 Training loss: 1.4156 Explore P: 0.4363
Episode: 192 Total reward: -22.984634399414062 Training loss: 0.9412 Explore P: 0.4346
Episode: 193 Total reward: -14.407135009765625 Training loss: 0.8770 Explore P: 0.4329
Episode: 194 Total reward: -71.23860168457031 Training loss: 2.5495 Explore P: 0.4319
Episode: 195 Total reward: 17.364242553710938 Training loss: 1.3053 Explore P: 0.4289
Model Saved
Episode: 196 Total reward: -30.22979736328125 Training loss: 1.2999 Explore P: 0.4272
Episode: 197 Total reward: -88.61210632324219 Training loss: 1.7826 Explore P: 0.4254
Episode: 198 Total reward: -

Episode: 280 Total reward: -101.73876953125 Training loss: 1.2468 Explore P: 0.3031
Model Saved
Episode: 281 Total reward: -54.58653259277344 Training loss: 9.0259 Explore P: 0.3019
Episode: 282 Total reward: -110.93882751464844 Training loss: 1.8144 Explore P: 0.2989
Episode: 283 Total reward: -115.90835571289062 Training loss: 1.3201 Explore P: 0.2972
Episode: 284 Total reward: -49.26557922363281 Training loss: 2.2046 Explore P: 0.2960
Episode: 285 Total reward: -97.17985534667969 Training loss: 1.7569 Explore P: 0.2953
Model Saved
Episode: 286 Total reward: -33.01533508300781 Training loss: 1.3273 Explore P: 0.2941
Episode: 287 Total reward: -74.89543151855469 Training loss: 21.1552 Explore P: 0.2930
Episode: 288 Total reward: -79.28390502929688 Training loss: 1.5119 Explore P: 0.2918
Episode: 289 Total reward: -84.67340087890625 Training loss: 3.0751 Explore P: 0.2898
Episode: 290 Total reward: -74.93247985839844 Training loss: 22.4415 Explore P: 0.2886
Model Saved
Episode: 291 Tot

Episode: 373 Total reward: -31.759246826171875 Training loss: 1.4598 Explore P: 0.2064
Episode: 374 Total reward: -66.35745239257812 Training loss: 26.8616 Explore P: 0.2042
Episode: 375 Total reward: -40.05412292480469 Training loss: 10.3995 Explore P: 0.2034
Model Saved
Episode: 376 Total reward: -46.139251708984375 Training loss: 1.9203 Explore P: 0.2026
Episode: 377 Total reward: -82.57797241210938 Training loss: 2.0636 Explore P: 0.2020
Episode: 378 Total reward: -38.8330078125 Training loss: 1.2759 Explore P: 0.2009
Episode: 379 Total reward: -11.68304443359375 Training loss: 1.7240 Explore P: 0.1854
Episode: 380 Total reward: -54.8214111328125 Training loss: 1.9578 Explore P: 0.1847
Model Saved
Episode: 381 Total reward: -84.83267211914062 Training loss: 1.2623 Explore P: 0.1840
Episode: 382 Total reward: 32.855865478515625 Training loss: 1.4235 Explore P: 0.1833
Episode: 383 Total reward: -0.874420166015625 Training loss: 1.2055 Explore P: 0.1826
Episode: 384 Total reward: -30.

Model Saved
Episode: 466 Total reward: 28.893386840820312 Training loss: 2.1610 Explore P: 0.1325
Episode: 467 Total reward: -47.401611328125 Training loss: 1.0198 Explore P: 0.1320
Episode: 468 Total reward: 27.089950561523438 Training loss: 6.4417 Explore P: 0.1315
Episode: 469 Total reward: 21.407760620117188 Training loss: 21.6828 Explore P: 0.1310
Episode: 470 Total reward: 22.998855590820312 Training loss: 6.8204 Explore P: 0.1305
Model Saved
Episode: 471 Total reward: 11.813369750976562 Training loss: 13.5251 Explore P: 0.1300
Episode: 472 Total reward: -51.877685546875 Training loss: 1.8424 Explore P: 0.1297
Episode: 473 Total reward: 37.22886657714844 Training loss: 3.2896 Explore P: 0.1292
Episode: 474 Total reward: -31.59661865234375 Training loss: 2.7693 Explore P: 0.1288
Episode: 475 Total reward: -107.43583679199219 Training loss: 3.8901 Explore P: 0.1285
Model Saved
Episode: 476 Total reward: -12.372451782226562 Training loss: 2.5129 Explore P: 0.1282
Episode: 477 Total 

## Step 10: 观看训练结果

In [18]:
saver = tf.train.Saver()

with tf.Session() as sess:
    
    game = DoomGame()
    game.load_config("deadly_corridor_testing.cfg")
    game.set_doom_scenario_path("deadly_corridor.wad")
    saver.restore(sess, "models/model.ckpt")
    game.init()
    
    for i in range(10):
        
        game.new_episode()
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
        while not game.is_episode_finished():
            exp_exp_tradeoff = np.random.rand()
            explore_probability = 0.01  
            if (explore_probability > exp_exp_tradeoff):
                action = random.choice(possible_actions)
        
            else:
                Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
                choice = np.argmax(Qs)
                action = possible_actions[int(choice)]
            
            game.make_action(action)
            done = game.is_episode_finished()
        
            if done:
                break  
                
            else:
                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                state = next_state
        
        score = game.get_total_reward()
        print("Score: ", score)
    
    game.close()

INFO:tensorflow:Restoring parameters from models/model.ckpt
Score:  -91.59956359863281
Score:  -91.59956359863281
Score:  -78.81681823730469
Score:  -108.4110107421875
Score:  -85.94677734375
Score:  -91.59956359863281
Score:  -91.59956359863281
Score:  -91.59956359863281
Score:  -91.59956359863281
Score:  -91.51455688476562
