In [11]:
import collections
import random
import gym,os
import  numpy as np
import  tensorflow as tf
from    tensorflow import keras
from    tensorflow.keras import layers,optimizers,losses

In [2]:
env = gym.make("CartPole-v1") # Create the CartPole-v1 game environment

In [None]:
observation = env.reset() # Initialise the game and get the first observation
action = env.action_space.sample() # Draw one random action from the action space
# Interact with the environment; it returns the new state, the reward,
# the episode-finished flag, and extra info
observation, reward, done, info = env.step(action)
if done:  # The episode ended, so reset the environment state
    observation = env.reset()
env.close()

In [12]:
class Qnet(keras.Model):
    """Fully connected Q-network: maps a state vector to one Q-value per action.

    Two hidden Dense layers of 256 units with ReLU, followed by a linear
    output layer with ``n_actions`` units.

    Args:
        n_actions: Number of discrete actions, i.e. the width of the output
            layer. Defaults to 2, the value that used to be hard-coded.
    """

    def __init__(self, n_actions=2):
        super(Qnet, self).__init__()
        # Kept so that sample_action draws random actions from the same range
        # the output layer covers.
        self.n_actions = n_actions
        self.fc1 = layers.Dense(256, kernel_initializer='he_normal')
        self.fc2 = layers.Dense(256, kernel_initializer='he_normal')
        self.fc3 = layers.Dense(n_actions, kernel_initializer='he_normal')

    def call(self, x, training=None):
        """Return the Q-values for a batch of states.

        Args:
            x: Batch of state vectors (last dimension is the state size).
            training: Accepted for Keras API compatibility; not used.

        Returns:
            Tensor whose last dimension has ``n_actions`` entries, one
            Q-value per action. No activation is applied to the output.
        """
        x = tf.nn.relu(self.fc1(x))
        x = tf.nn.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def sample_action(self, s, epsilon):
        """Pick an action for a single state with an epsilon-greedy policy.

        Args:
            s: A single (un-batched) state vector.
            epsilon: Probability of taking a uniformly random action.

        Returns:
            The chosen action index as a Python ``int``.
        """
        # Decide first whether to explore: a random action does not need the
        # network output, so the forward pass is skipped in that case.
        if random.random() < epsilon:
            return random.randint(0, self.n_actions - 1)
        # Exploit: add a batch axis, run the network, take the best action.
        s = tf.constant(s, dtype=tf.float32)
        s = tf.expand_dims(s, axis=0)
        out = self(s)[0]
        return int(tf.argmax(out))

In [13]:
class ReplayBuffer():
    """Fixed-capacity experience replay pool.

    Stores transitions ``(s, a, r, s_prime, done_mask)`` in a bounded deque.

    Args:
        maxlen: Capacity of the pool. When omitted, the module-level
            ``buffer_limit`` is used, which keeps ``ReplayBuffer()`` working
            exactly as before.
    """

    def __init__(self, maxlen=None):
        if maxlen is None:
            # Fall back to the notebook-level hyperparameter.
            maxlen = buffer_limit
        # A deque with maxlen drops the oldest entry once it is full.
        self.buffer = collections.deque(maxlen=maxlen)

    def put(self, transition):
        """Append one ``(s, a, r, s_prime, done_mask)`` transition."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions and return them as five tensors.

        Args:
            n: Number of transitions to sample. ``random.sample`` raises
                ``ValueError`` if ``n`` exceeds the current buffer size.

        Returns:
            Tuple ``(s, a, r, s_prime, done_mask)``. ``a`` is int32, the
            others are float32. ``a``, ``r`` and ``done_mask`` each carry a
            trailing axis of length 1.
        """
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        # Regroup the sampled transitions by field.
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])
        # Stack with NumPy first: building a tensor straight from a Python
        # list of arrays converts element by element and is much slower.
        return tf.constant(np.asarray(s_lst, dtype=np.float32), dtype=tf.float32),\
                      tf.constant(np.asarray(a_lst, dtype=np.int32), dtype=tf.int32), \
                      tf.constant(np.asarray(r_lst, dtype=np.float32), dtype=tf.float32), \
                      tf.constant(np.asarray(s_prime_lst, dtype=np.float32), dtype=tf.float32), \
                      tf.constant(np.asarray(done_mask_lst, dtype=np.float32), dtype=tf.float32)

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


In [14]:
def train(q, q_target, memory, optimizer, n_updates=10):
    """Run several DQN gradient steps on mini-batches from the replay pool.

    The Bellman error is built from the online network ``q`` and the target
    network ``q_target``. Only ``q`` is updated here; ``q_target`` is left
    untouched so that it lags behind ``q``.

    Args:
        q: Online Q-network whose trainable variables are optimised.
        q_target: Target Q-network used to evaluate the next state.
        memory: Replay pool; ``memory.sample(batch_size)`` must return
            ``(s, a, r, s_prime, done_mask)`` tensors, with ``a`` being an
            integer tensor of shape ``(batch, 1)``.
        optimizer: Keras optimizer applied to ``q.trainable_variables``.
        n_updates: Number of mini-batch updates to perform. Defaults to 10,
            the value that used to be hard-coded.

    Reads the module-level ``batch_size`` and ``gamma``.
    """
    huber = losses.Huber()
    for _ in range(n_updates):
        # Sample a mini-batch from the replay pool.
        s, a, r, s_prime, done_mask = memory.sample(batch_size)
        # max_a' Q_target(s', a') comes from the target network. It does not
        # depend on q's variables, so it is computed outside the tape.
        max_q_prime = tf.reduce_max(q_target(s_prime), axis=1, keepdims=True)
        # Bellman target; done_mask zeroes the bootstrap term where it is 0.
        target = r + gamma * max_q_prime * done_mask
        with tf.GradientTape() as tape:
            q_out = q(s)  # Q-values of every action for each sampled state
            # Build (row, action) index pairs to pick Q(s, a_t) for each row.
            indices = tf.expand_dims(tf.range(a.shape[0]), axis=1)
            indices = tf.concat([indices, a], axis=1)
            q_a = tf.gather_nd(q_out, indices)  # Q-value of the action taken
            q_a = tf.expand_dims(q_a, axis=1)
            # Keras losses take (y_true, y_pred): target first, estimate second.
            loss = huber(target, q_a)
        # Update q so that Q(s, a_t) moves towards the Bellman target.
        grads = tape.gradient(loss, q.trainable_variables)
        optimizer.apply_gradients(zip(grads, q.trainable_variables))

In [15]:
# Training hyperparameters, defined as notebook-level globals.
gamma = 0.99            # discount factor
learning_rate = 2e-4    # optimizer step size
batch_size = 32         # transitions per sampled mini-batch
buffer_limit = 50 * 1000  # replay buffer capacity

In [16]:
def main(n_episodes=10000, max_steps=600, min_buffer_size=2000, print_interval=20):
    """Train a DQN agent on CartPole-v1 and print progress periodically.

    Args:
        n_episodes: Number of episodes to play.
        max_steps: Upper bound on the number of steps in one episode.
        min_buffer_size: Training starts only once the replay pool holds
            more than this many transitions.
        print_interval: Every this many episodes the target network is
            synchronised with the online network and a progress line is
            printed.

    All defaults equal the values that used to be hard-coded. Reads the
    module-level ``learning_rate``.
    """
    env = gym.make('CartPole-v1')  # create the environment
    try:
        q = Qnet()  # online Q-network
        q_target = Qnet()  # target network
        q.build(input_shape=(2, 4))
        q_target.build(input_shape=(2, 4))

        def sync_target():
            # Copy every variable of the online network into the target network.
            for src, dest in zip(q.variables, q_target.variables):
                dest.assign(src)

        sync_target()
        memory = ReplayBuffer()  # replay pool
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer = optimizers.Adam(learning_rate=learning_rate)

        score = 0.0
        episodes_in_window = 0  # episodes accumulated into `score` since last report

        for n_epi in range(n_episodes):
            # epsilon decays linearly from 8% down to a floor of 1%, so later
            # episodes rely more and more on the greedy action.
            epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200))
            s = env.reset()  # reset the environment
            for _ in range(max_steps):
                # Epsilon-greedy action from the current online network.
                a = q.sample_action(s, epsilon)
                s_prime, r, done, info = env.step(a)
                done_mask = 0.0 if done else 1.0  # 0 disables bootstrapping
                memory.put((s, a, r, s_prime, done_mask))
                s = s_prime
                score += r
                if done:
                    break
            episodes_in_window += 1

            if memory.size() > min_buffer_size:
                train(q, q_target, memory, optimizer)

            if n_epi % print_interval == 0 and n_epi != 0:
                sync_target()  # refresh the lagging target network
                # Divide by the real number of episodes in the window: the first
                # window spans episodes 0..print_interval, one more than later ones.
                print("# of episode :{}, avg score : {:.1f}, buffer size : {}, " \
                      "epsilon : {:.1f}%" \
                      .format(n_epi, score / episodes_in_window, memory.size(), epsilon * 100))
                score = 0.0
                episodes_in_window = 0
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

In [17]:
# Run the full DQN training loop; main() prints a progress line periodically.
main()

# of episode :20, avg score : 17.5, buffer size : 350, epsilon : 7.9%
# of episode :40, avg score : 14.9, buffer size : 648, epsilon : 7.8%
# of episode :60, avg score : 15.3, buffer size : 954, epsilon : 7.7%
# of episode :80, avg score : 10.7, buffer size : 1167, epsilon : 7.6%
# of episode :100, avg score : 13.6, buffer size : 1439, epsilon : 7.5%
# of episode :120, avg score : 15.0, buffer size : 1739, epsilon : 7.4%
# of episode :140, avg score : 10.9, buffer size : 1957, epsilon : 7.3%
# of episode :160, avg score : 10.9, buffer size : 2175, epsilon : 7.2%
# of episode :180, avg score : 9.9, buffer size : 2374, epsilon : 7.1%
# of episode :200, avg score : 9.3, buffer size : 2561, epsilon : 7.0%
# of episode :220, avg score : 9.6, buffer size : 2753, epsilon : 6.9%
# of episode :240, avg score : 9.6, buffer size : 2944, epsilon : 6.8%
# of episode :260, avg score : 16.2, buffer size : 3268, epsilon : 6.7%
# of episode :280, avg score : 20.1, buffer size : 3669, epsilon : 6.6%
# o

# of episode :2260, avg score : 96.2, buffer size : 50000, epsilon : 1.0%
# of episode :2280, avg score : 93.3, buffer size : 50000, epsilon : 1.0%
# of episode :2300, avg score : 85.2, buffer size : 50000, epsilon : 1.0%
# of episode :2320, avg score : 99.2, buffer size : 50000, epsilon : 1.0%
# of episode :2340, avg score : 99.1, buffer size : 50000, epsilon : 1.0%
# of episode :2360, avg score : 101.4, buffer size : 50000, epsilon : 1.0%
# of episode :2380, avg score : 88.8, buffer size : 50000, epsilon : 1.0%
# of episode :2400, avg score : 94.7, buffer size : 50000, epsilon : 1.0%
# of episode :2420, avg score : 87.0, buffer size : 50000, epsilon : 1.0%
# of episode :2440, avg score : 65.1, buffer size : 50000, epsilon : 1.0%
# of episode :2460, avg score : 89.0, buffer size : 50000, epsilon : 1.0%
# of episode :2480, avg score : 81.8, buffer size : 50000, epsilon : 1.0%
# of episode :2500, avg score : 49.5, buffer size : 50000, epsilon : 1.0%
# of episode :2520, avg score : 82.8,

# of episode :4480, avg score : 103.7, buffer size : 50000, epsilon : 1.0%
# of episode :4500, avg score : 113.2, buffer size : 50000, epsilon : 1.0%
# of episode :4520, avg score : 107.2, buffer size : 50000, epsilon : 1.0%
# of episode :4540, avg score : 221.1, buffer size : 50000, epsilon : 1.0%
# of episode :4560, avg score : 111.5, buffer size : 50000, epsilon : 1.0%
# of episode :4580, avg score : 152.8, buffer size : 50000, epsilon : 1.0%
# of episode :4600, avg score : 124.3, buffer size : 50000, epsilon : 1.0%
# of episode :4620, avg score : 240.6, buffer size : 50000, epsilon : 1.0%
# of episode :4640, avg score : 181.0, buffer size : 50000, epsilon : 1.0%
# of episode :4660, avg score : 121.6, buffer size : 50000, epsilon : 1.0%
# of episode :4680, avg score : 140.4, buffer size : 50000, epsilon : 1.0%
# of episode :4700, avg score : 98.7, buffer size : 50000, epsilon : 1.0%
# of episode :4720, avg score : 101.5, buffer size : 50000, epsilon : 1.0%
# of episode :4740, avg sc

# of episode :6700, avg score : 87.8, buffer size : 50000, epsilon : 1.0%
# of episode :6720, avg score : 63.5, buffer size : 50000, epsilon : 1.0%
# of episode :6740, avg score : 189.3, buffer size : 50000, epsilon : 1.0%
# of episode :6760, avg score : 117.9, buffer size : 50000, epsilon : 1.0%
# of episode :6780, avg score : 122.8, buffer size : 50000, epsilon : 1.0%
# of episode :6800, avg score : 103.7, buffer size : 50000, epsilon : 1.0%
# of episode :6820, avg score : 79.4, buffer size : 50000, epsilon : 1.0%
# of episode :6840, avg score : 69.8, buffer size : 50000, epsilon : 1.0%
# of episode :6860, avg score : 68.8, buffer size : 50000, epsilon : 1.0%
# of episode :6880, avg score : 70.9, buffer size : 50000, epsilon : 1.0%
# of episode :6900, avg score : 71.5, buffer size : 50000, epsilon : 1.0%
# of episode :6920, avg score : 41.5, buffer size : 50000, epsilon : 1.0%
# of episode :6940, avg score : 89.5, buffer size : 50000, epsilon : 1.0%
# of episode :6960, avg score : 75

# of episode :8900, avg score : 500.0, buffer size : 50000, epsilon : 1.0%
# of episode :8920, avg score : 489.2, buffer size : 50000, epsilon : 1.0%
# of episode :8940, avg score : 500.0, buffer size : 50000, epsilon : 1.0%
# of episode :8960, avg score : 492.4, buffer size : 50000, epsilon : 1.0%
# of episode :8980, avg score : 500.0, buffer size : 50000, epsilon : 1.0%
# of episode :9000, avg score : 500.0, buffer size : 50000, epsilon : 1.0%
# of episode :9020, avg score : 500.0, buffer size : 50000, epsilon : 1.0%
# of episode :9040, avg score : 500.0, buffer size : 50000, epsilon : 1.0%
# of episode :9060, avg score : 500.0, buffer size : 50000, epsilon : 1.0%
# of episode :9080, avg score : 500.0, buffer size : 50000, epsilon : 1.0%
# of episode :9100, avg score : 490.4, buffer size : 50000, epsilon : 1.0%
# of episode :9120, avg score : 468.4, buffer size : 50000, epsilon : 1.0%
# of episode :9140, avg score : 500.0, buffer size : 50000, epsilon : 1.0%
# of episode :9160, avg s