In [None]:
# TensorFlow is imported under its default API first, only to report the installed version.
import tensorflow as tf
print(tf.__version__)
# Suppress every Python warning for the remainder of the notebook.
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import gym
# Rebind `tf` to the TF1 compatibility API and switch off TF2 behavior: the cells below
# use the graph/session style (tf.placeholder, tf.Session, tf.layers, tf.distributions).
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [None]:
# Clipping range applied to the probability ratio in the PPO surrogate objective.
epsilon = 0.2

# Base Pendulum environment, taken from underneath gym's wrappers via `.unwrapped`.
env = gym.make('Pendulum-v1').unwrapped

# Length of the first axis of the observation and action spaces; these size the
# network inputs and outputs.
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.shape[0]

# [low, high] limits of the action space, used to clip sampled actions.
action_bound = [env.action_space.low, env.action_space.high]

In [None]:
class TRPO(object):
    """Actor-critic agent trained on a KL-penalised surrogate objective.

    A value network estimates state values, and a Gaussian policy network
    (scope ``pi2``) is optimised against a frozen snapshot of itself (scope
    ``oldpi2``). The trust-region idea is expressed here as a fixed-weight KL
    penalty (coefficient 0.01) subtracted from the surrogate objective and
    minimised with Adam, not as a hard constraint.

    Network sizes come from the module-level ``state_shape`` and
    ``action_shape``; sampled actions are clipped to the module-level
    ``action_bound``.
    """

    def __init__(self):
        """Build the computation graph, open a session and initialise variables."""
        # Every agent owns a private graph. Sharing the default graph would let
        # collection lookups and the global initializer pick up variables that
        # belong to other agents (or to an earlier instantiation of this class
        # after re-running the notebook cell).
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.state_ph = tf.placeholder(tf.float32, [None, state_shape], 'state2')

            # Value network: state -> scalar value, trained on squared advantage.
            with tf.variable_scope('value2'):
                layer1_v = tf.layers.dense(self.state_ph, 100, tf.nn.relu)
                self.v = tf.layers.dense(layer1_v, 1)
                self.Q = tf.placeholder(tf.float32, [None, 1], 'discounted_r2')
                self.advantage = self.Q - self.v
                self.value_loss = tf.reduce_mean(tf.square(self.advantage))
                self.train_value_nw = tf.train.AdamOptimizer(0.002).minimize(self.value_loss)

            # Current (trainable) policy and its frozen copy.
            pi, pi_params = self.build_policy_network('pi2', trainable=True)
            oldpi, oldpi_params = self.build_policy_network('oldpi2', trainable=False)

            with tf.variable_scope('sample_action2'):
                # sample(1) adds a leading sample axis; drop it again.
                self.sample_op = tf.squeeze(pi.sample(1), axis=0)

            with tf.variable_scope('update_oldpi2'):
                # Copy the current policy weights into the frozen policy.
                self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

            self.action_ph = tf.placeholder(tf.float32, [None, action_shape], 'action2')
            self.advantage_ph = tf.placeholder(tf.float32, [None, 1], 'advantage2')

            with tf.variable_scope('loss2'):
                with tf.variable_scope('surrogate2'):
                    # Probability ratio computed in log space: dividing raw
                    # densities yields inf/NaN when the old density underflows.
                    log_ratio = pi.log_prob(self.action_ph) - oldpi.log_prob(self.action_ph)
                    ratio = tf.exp(log_ratio)
                    objective = ratio * self.advantage_ph

                    # KL(old || new) penalty standing in for the trust-region constraint.
                    kl_divergence = tf.distributions.kl_divergence(oldpi, pi)
                    trpo_loss = -tf.reduce_mean(objective - 0.01 * kl_divergence)

                self.policy_loss = trpo_loss

            with tf.variable_scope('train_policy2'):
                self.global_step = tf.train.get_or_create_global_step()
                # Step-wise decay: multiply the rate by 0.96 every 10000 policy updates.
                self.learning_rate = tf.train.exponential_decay(
                    0.001, self.global_step, 10000, 0.96, staircase=True)
                self.train_policy_nw = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate).minimize(
                        self.policy_loss, global_step=self.global_step)

            init_op = tf.global_variables_initializer()

        self.sess = tf.Session(graph=self.graph)
        self.sess.run(init_op)

    def train(self, state, action, reward):
        """Run one update round on a batch of transitions.

        Args:
            state: batch of states, fed to the ``[None, state_shape]`` placeholder.
            action: batch of actions, fed to the ``[None, action_shape]`` placeholder.
            reward: batch of discounted returns, fed to the ``[None, 1]`` placeholder.
        """
        # Freeze the current policy as the reference for this round.
        self.sess.run(self.update_oldpi_op)
        # Advantages are evaluated once and then fed back as constants.
        adv = self.sess.run(self.advantage, {self.state_ph: state, self.Q: reward})
        policy_feed = {self.state_ph: state, self.action_ph: action, self.advantage_ph: adv}
        value_feed = {self.state_ph: state, self.Q: reward}
        for _ in range(10):
            self.sess.run(self.train_policy_nw, policy_feed)
        for _ in range(10):
            self.sess.run(self.train_value_nw, value_feed)

    def build_policy_network(self, name, trainable):
        """Create a Gaussian policy head under variable scope ``name``.

        Args:
            name: variable scope that will hold the network's variables.
            trainable: whether the dense layers' variables are trainable.

        Returns:
            A ``(distribution, params)`` pair: the Normal action distribution
            and the list of global variables created under ``name``.
        """
        with self.graph.as_default():
            with tf.variable_scope(name) as scope:
                layer1_pi = tf.layers.dense(self.state_ph, 100, tf.nn.relu, trainable=trainable)
                # Scaled tanh keeps the mean inside [-2, 2].
                mu = 2 * tf.layers.dense(layer1_pi, action_shape, tf.nn.tanh, trainable=trainable)
                # Softplus keeps the standard deviation positive.
                sigma = tf.layers.dense(layer1_pi, action_shape, tf.nn.softplus, trainable=trainable)
                norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)

            # get_collection filters names with re.match (a prefix match); the
            # trailing '/' restricts the result to exactly this scope.
            params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope.name + '/')
        return norm_dist, params

    def select_action(self, state):
        """Sample an action for a single state and clip it to ``action_bound``."""
        state = state[np.newaxis, :]
        action = self.sess.run(self.sample_op, {self.state_ph: state})[0]
        action = np.clip(action, action_bound[0], action_bound[1])
        return action

    def get_state_value(self, state):
        """Return the value estimate of ``state`` (first row if batched) as a scalar."""
        if state.ndim < 2:
            state = state[np.newaxis, :]
        return self.sess.run(self.v, {self.state_ph: state})[0, 0]


In [None]:
# Define the PPO class
class PPO(object):
    """Actor-critic agent trained with the clipped PPO surrogate objective.

    A value network estimates state values, and a Gaussian policy network
    (scope ``pi``) is optimised against a frozen snapshot of itself (scope
    ``oldpi``). The probability ratio is clipped to
    ``[1 - epsilon, 1 + epsilon]`` using the module-level ``epsilon``.

    Network sizes come from the module-level ``state_shape`` and
    ``action_shape``; sampled actions are clipped to the module-level
    ``action_bound``.
    """

    def __init__(self):
        """Build the computation graph, open a session and initialise variables."""
        # Every agent owns a private graph. In a shared default graph the
        # prefix-matching collection lookup for scope 'pi' would also return
        # variables of any other scope starting with 'pi' (e.g. 'pi2/...').
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Placeholder for the batch of states.
            self.state_ph = tf.placeholder(tf.float32, [None, state_shape], 'state')

            # Value network: maps a state to its scalar value.
            with tf.variable_scope('value'):
                layer1 = tf.layers.dense(self.state_ph, 100, tf.nn.relu)
                self.v = tf.layers.dense(layer1, 1)
                self.Q = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
                self.advantage = self.Q - self.v
                self.value_loss = tf.reduce_mean(tf.square(self.advantage))
                # Minimise the value loss with Adam.
                self.train_value_nw = tf.train.AdamOptimizer(0.002).minimize(self.value_loss)

            # Current (trainable) policy and its frozen copy, with their parameters.
            pi, pi_params = self.build_policy_network('pi', trainable=True)
            oldpi, oldpi_params = self.build_policy_network('oldpi', trainable=False)

            # Sample one action from the current policy.
            with tf.variable_scope('sample_action'):
                # sample(1) adds a leading sample axis; drop it again.
                self.sample_op = tf.squeeze(pi.sample(1), axis=0)

            # Copy the current policy weights into the frozen policy.
            with tf.variable_scope('update_oldpi'):
                self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

            # Placeholders for the batch of actions and advantages.
            self.action_ph = tf.placeholder(tf.float32, [None, action_shape], 'action')
            self.advantage_ph = tf.placeholder(tf.float32, [None, 1], 'advantage')

            # Surrogate objective of the policy network.
            with tf.variable_scope('loss'):
                with tf.variable_scope('surrogate'):
                    # Probability ratio computed in log space: dividing raw
                    # densities yields inf/NaN when the old density underflows.
                    log_ratio = pi.log_prob(self.action_ph) - oldpi.log_prob(self.action_ph)
                    ratio = tf.exp(log_ratio)
                    # Unclipped objective: ratio times advantage.
                    objective = ratio * self.advantage_ph
                    # Clipped objective: elementwise minimum of the unclipped
                    # term and the term built from the clipped ratio.
                    clipped_objective = tf.clip_by_value(
                        ratio, 1. - epsilon, 1. + epsilon) * self.advantage_ph
                    L = tf.reduce_mean(tf.minimum(objective, clipped_objective))

                # The objective is to be maximised; negate it so that a
                # minimising optimizer can be used.
                self.policy_loss = -L

            # Minimise the policy loss with Adam.
            with tf.variable_scope('train_policy'):
                self.train_policy_nw = tf.train.AdamOptimizer(0.001).minimize(self.policy_loss)

            init_op = tf.global_variables_initializer()

        # Open the session on this agent's graph and initialise its variables.
        self.sess = tf.Session(graph=self.graph)
        self.sess.run(init_op)

    def train(self, state, action, reward):
        """Run one update round on a batch of transitions.

        Args:
            state: batch of states, fed to the ``[None, state_shape]`` placeholder.
            action: batch of actions, fed to the ``[None, action_shape]`` placeholder.
            reward: batch of discounted returns, fed to the ``[None, 1]`` placeholder.
        """
        # Freeze the current policy as the reference for this round.
        self.sess.run(self.update_oldpi_op)
        # Advantages are evaluated once and then fed back as constants.
        adv = self.sess.run(self.advantage, {self.state_ph: state, self.Q: reward})
        policy_feed = {self.state_ph: state, self.action_ph: action, self.advantage_ph: adv}
        value_feed = {self.state_ph: state, self.Q: reward}
        # Train the policy network.
        for _ in range(10):
            self.sess.run(self.train_policy_nw, policy_feed)
        # Train the value network.
        for _ in range(10):
            self.sess.run(self.train_value_nw, value_feed)

    def build_policy_network(self, name, trainable):
        """Create a Gaussian policy head under variable scope ``name``.

        Args:
            name: variable scope that will hold the network's variables.
            trainable: whether the dense layers' variables are trainable.

        Returns:
            A ``(distribution, params)`` pair: the Normal action distribution
            and the list of global variables created under ``name``.
        """
        with self.graph.as_default():
            with tf.variable_scope(name) as scope:
                # Shared hidden layer.
                layer1 = tf.layers.dense(self.state_ph, 100, tf.nn.relu, trainable=trainable)
                # Mean: scaled tanh keeps it inside [-2, 2].
                mu = 2 * tf.layers.dense(layer1, action_shape, tf.nn.tanh, trainable=trainable)
                # Standard deviation: softplus keeps it positive.
                sigma = tf.layers.dense(layer1, action_shape, tf.nn.softplus, trainable=trainable)
                # Normal distribution over actions.
                norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)

            # get_collection filters names with re.match (a prefix match); the
            # trailing '/' restricts the result to exactly this scope.
            params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope.name + '/')
        return norm_dist, params

    def select_action(self, state):
        """Sample an action for a single state and clip it to ``action_bound``."""
        state = state[np.newaxis, :]
        # Draw an action from the Normal distribution produced by the policy network.
        action = self.sess.run(self.sample_op, {self.state_ph: state})[0]
        # Keep the action inside the action bounds.
        action = np.clip(action, action_bound[0], action_bound[1])
        return action

    def get_state_value(self, state):
        """Return the value estimate of ``state`` (first row if batched) as a scalar."""
        if state.ndim < 2:
            state = state[np.newaxis, :]
        return self.sess.run(self.v, {self.state_ph: state})[0, 0]


In [None]:
# Rollout and training hyperparameters; trainer() reads these as module globals.
num_episodes, num_timesteps = 20, 200
gamma = 0.9        # discount factor for the bootstrapped returns
batch_size = 32    # number of environment steps between network updates

# The two agents to compare (TRPO is constructed first, then PPO).
trpo = TRPO()
ppo = PPO()
def trainer(model):
    """Train ``model`` on the module-level ``env`` and collect episode returns.

    Runs ``num_episodes`` episodes of ``num_timesteps`` steps each. Every
    ``batch_size`` steps, and on the final step of an episode, the collected
    transitions are turned into bootstrapped discounted returns (discount
    ``gamma``) and passed to ``model.train``.

    Args:
        model: agent exposing ``select_action(state)``,
            ``get_state_value(state)`` and ``train(states, actions, returns)``.

    Returns:
        List with one entry per episode: the sum of the raw (unscaled)
        rewards received in that episode.
    """
    returns_list = []
    # for each episode
    for i in range(num_episodes):
        # env.step() below is unpacked into five values, i.e. the gym>=0.26 API,
        # in which reset() returns (observation, info). Unpack the observation so
        # that select_action() receives an array, while still accepting a bare
        # observation from an older API.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        episode_states, episode_actions, episode_rewards = [], [], []
        episode_return = 0
        # for every step
        for t in range(num_timesteps):
            # render the environment
            env.render()
            # select the action
            action = model.select_action(state)
            # perform the selected action
            next_state, reward, _terminated, _truncated, _info = env.step(action)
            # store the state, action, and rescaled reward
            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append((reward + 8) / 8)
            # update the state to the next state
            state = next_state
            # the reported return accumulates the raw, unscaled reward
            episode_return += reward
            # if we reached the batch size or the final step of the episode
            if (t + 1) % batch_size == 0 or t == num_timesteps - 1:
                # bootstrap from the value estimate of the next state
                running_return = model.get_state_value(next_state)
                # walk the stored rewards backwards, accumulating discounted returns
                discounted_r = []
                for stored_reward in episode_rewards[::-1]:
                    running_return = stored_reward + gamma * running_return
                    discounted_r.append(running_return)
                discounted_r.reverse()
                # stack the batch into 2-D arrays
                es = np.vstack(episode_states)
                ea = np.vstack(episode_actions)
                er = np.array(discounted_r)[:, np.newaxis]
                # empty the buffers
                episode_states, episode_actions, episode_rewards = [], [], []
                # train the networks
                model.train(es, ea, er)
        returns_list.append(episode_return)
        # print the return every 10 episodes
        if i % 10 == 0:
            print("Episode:{}, Return: {}".format(i, episode_return))
    return returns_list

# Train both agents one after the other on the shared environment.
returns_trpo = trainer(trpo)
returns_ppo = trainer(ppo)

# Plot the per-episode returns of both agents on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(returns_trpo, label='TRPO')
ax.plot(returns_ppo, label='PPO')
ax.set_title('Comparison of TRPO and PPO')
ax.set_xlabel('Episodes')
ax.set_ylabel('Return')
ax.legend()
plt.show()