In [13]:
import gymnasium as gym
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import rl_utils

In [14]:
class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=1)

In [16]:
class REINFORCE:
    def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma,
                 device):
        # \pi(\theta)，输入状态，输出策略概率分布
        self.policy_net = PolicyNet(state_dim, hidden_dim,
                                    action_dim).to(device)
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(),
                                          lr=learning_rate)  # 使用Adam优化器
        self.gamma = gamma  # 折扣因子
        self.device = device

    def take_action(self, state):  # 根据动作概率分布随机采样
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        probs = self.policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item()

    def update(self, transition_dict):
        reward_list = transition_dict['rewards']
        state_list = transition_dict['states']
        action_list = transition_dict['actions']

        G = 0
        self.optimizer.zero_grad()
        for i in reversed(range(len(reward_list))):  # 从最后一步算起
            reward = reward_list[i]
            state = torch.tensor([state_list[i]],
                                 dtype=torch.float).to(self.device)
            action = torch.tensor([action_list[i]]).view(-1, 1).to(self.device)
            pi = self.policy_net(state)
            log_prob = torch.log(pi.gather(1, action))
            G = self.gamma * G + reward
            loss = -log_prob * G  # 每一步的损失函数
            loss.backward()  # 反向传播计算梯度
        self.optimizer.step()  # 梯度下降

反向传播的对象应该是$J(\theta)$，但$J(\theta)$作为累积折扣奖励无法写成$\theta$相关的函数形式。  
所以在$\nabla_{\theta}J(\theta)$求积分得到代理目标，对代理目标求梯度。

In [15]:
learning_rate = 1e-3
num_episodes = 1000
hidden_dim = 128
gamma = 0.98
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")

env_name = "CartPole-v0"
env = gym.make(env_name)
observation, info = env.reset(seed=0)

torch.manual_seed(0)
# 状态空间大小
state_dim = env.observation_space.shape[0]
# 动作空间大小
action_dim = env.action_space.n

agent = REINFORCE(state_dim, hidden_dim, action_dim, learning_rate, gamma,
                  device)

return_list = []
for i in range(10):
    with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar:
        # 回合
        for i_episode in range(int(num_episodes / 10)):
            episode_return = 0
            transition_dict = {
                'states': [],
                'actions': [],
                'next_states': [],
                'rewards': [],
                'dones': []
            }
            state, info = env.reset(seed=0)
            terminated = False
            while not (terminated or truncated):
                # 根据策略获取行动
                action = agent.take_action(state)
                # 根据行动执行，得到下一状态、奖励，是否停止.
                next_state, reward, terminated, truncated, info = env.step(action)
                transition_dict['states'].append(state)
                transition_dict['actions'].append(action)
                transition_dict['next_states'].append(next_state)
                transition_dict['rewards'].append(reward)
                transition_dict['dones'].append(done)
                state = next_state
                # 回合累积奖励。用来计算回合平均回报。
                episode_return += reward
            return_list.append(episode_return)
            # 策略提升，回合结束一次性输入。
            agent.update(transition_dict)
            if (i_episode + 1) % 10 == 0:
                pbar.set_postfix({
                    'episode':
                    '%d' % (num_episodes / 10 * i + i_episode + 1),
                    'return':
                    '%.3f' % np.mean(return_list[-10:])
                })
            pbar.update(1)

# Iteration 0: 100%|██████████| 100/100 [00:04<00:00, 23.88it/s, episode=100,
# return=55.500]
# Iteration 1: 100%|██████████| 100/100 [00:08<00:00, 10.45it/s, episode=200,
# return=75.300]
# Iteration 2: 100%|██████████| 100/100 [00:16<00:00,  4.75it/s, episode=300,
# return=178.800]
# Iteration 3: 100%|██████████| 100/100 [00:20<00:00,  4.90it/s, episode=400,
# return=164.600]
# Iteration 4: 100%|██████████| 100/100 [00:21<00:00,  4.58it/s, episode=500,
# return=156.500]
# Iteration 5: 100%|██████████| 100/100 [00:21<00:00,  4.73it/s, episode=600,
# return=187.400]
# Iteration 6: 100%|██████████| 100/100 [00:22<00:00,  4.40it/s, episode=700,
# return=194.500]
# Iteration 7: 100%|██████████| 100/100 [00:23<00:00,  4.24it/s, episode=800,
# return=200.000]
# Iteration 8: 100%|██████████| 100/100 [00:23<00:00,  4.33it/s, episode=900,
# return=200.000]
# Iteration 9: 100%|██████████| 100/100 [00:22<00:00,  4.14it/s, episode=1000,
# return=186.100]

cpu


Iteration 0: 100%|██████████| 100/100 [00:01<00:00, 94.95it/s, episode=100, return=32.300]
Iteration 1: 100%|██████████| 100/100 [00:01<00:00, 51.90it/s, episode=200, return=65.400]
Iteration 2: 100%|██████████| 100/100 [00:00<00:00, 135.64it/s, episode=300, return=0.000]
Iteration 3: 100%|██████████| 100/100 [00:00<00:00, 7988.09it/s, episode=400, return=0.000]
Iteration 4: 100%|██████████| 100/100 [00:00<00:00, 9514.56it/s, episode=500, return=0.000]
Iteration 5: 100%|██████████| 100/100 [00:00<00:00, 8167.91it/s, episode=600, return=0.000]
Iteration 6: 100%|██████████| 100/100 [00:00<00:00, 8503.40it/s, episode=700, return=0.000]
Iteration 7: 100%|██████████| 100/100 [00:00<00:00, 9633.88it/s, episode=800, return=0.000]
Iteration 8: 100%|██████████| 100/100 [00:00<00:00, 9497.97it/s, episode=900, return=0.000]
Iteration 9: 100%|██████████| 100/100 [00:00<00:00, 9859.90it/s, episode=1000, return=0.000]


In [None]:
episodes_list = list(range(len(return_list)))
plt.plot(episodes_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('REINFORCE on {}'.format(env_name))
plt.show()

mv_return = rl_utils.moving_average(return_list, 9)
plt.plot(episodes_list, mv_return)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('REINFORCE on {}'.format(env_name))
plt.show()

#### 平均状态价值重推导
状态价值定义：  
$v_{\pi}(s)=\mathbb{E}[G_t|S_t=s]$

$G_t = R_t + \gamma G_{t+1}$

$= R_t + \gamma (R_{t+1}+\gamma G_{t+2}) = R_t + \gamma R_{t+1} + \gamma^2 G_{t+2}$

$= \sum_{t=0}^\infty \gamma^t R_t$

所以：  
$v_{\pi}(s) = \mathbb{E}[\sum_{t=0}^{\infty} \gamma^t R_t|S_t=s]$

状态价值针对的是某个状态，如果对所有状态的改路进行加权，就得到了策略梯度法的目标，平均状态价值。

$\sum_{s\in\mathcal{S}} v_{\pi}(s) = \sum_{s\in\mathcal{S}} d(s)\mathbb{E}[\sum_{t=0}^{\infty}\gamma^t R_t | S_t =s]=\bar{v}_{\pi}$

#### 为什么状态分布$d(s)$可以选择和策略无关的分布，然后还可以作为目标来提升

策略是$\theta$的函数，目标只要是$\theta$的函数，就可以通过梯度上升来调整$\theta$提升。  
平均状态价值中，两部分，其中$v_{\pi}(s)$已经是$\theta$的函数了。因为策略决定了状态分布，也就决定了获取的奖励及状态价值。所以状态价值是$\theta$的函数。  
至于稳定状态分布$d$，理论上说，它和$\theta$有关，但不是一个可以通过$\theta$估计出来的值，而是一个长期迭代后收敛的值。所以，本身它也没法作为提升的一部分写进目标公式里，必须要通过蒙特卡洛法来模拟这个分布。  
另一方面，调整分布可以设定出不同的目标，让$\theta$朝着不同的方向更新。比如如果d是所有状态平均，可能这个要解决的问题本身就隐含了希望所有状态的分布是均匀的、同权的，设置目标为平均状态价值，就会让$\theta$朝着这个方向前进。

#### average state value 和average reward 有什么区别

average state value:  
$\bar{v}_{\pi}(s) = \sum_{s\in\mathcal{S}}d_{\pi}(s)v_{\pi}(s)$

average reward:  
$\bar{r}_{\pi}(s) = \sum_{s\in\mathcal{S}} d_{\pi}(s)r_{\pi}(s)$

猛一看，以为这俩好像没什么区别。好像就是换了个字母。实际上仔细想下状态价值和$r_{\pi}(s)$含义，就能发现区别。  
简单说，状态价值包含即时奖励和未来折扣奖励。也就是说，它的信息里包含了未来的奖励信息。  
而$r_{\pi}(s)$中仅有即时奖励，没有未来的奖励。

$v_{\pi}(s) = \mathbb{E}[R_t + \gamma G_{t+1}|S_t=s]$

$r_{\pi}(s) = \mathbb{E}[R_t | S_t=s]$

这个$r_{\pi}(s)$之前没出现过，所以突然出现有点迷

换个角度：  
$v_{\pi}(s) = \sum_{a\in\mathcal{A}}\pi(a|s,\theta)r(s,a) + \gamma\sum_{a\in\mathcal{A}}\pi(a|s,\theta)\sum_{s\in\mathcal{S}}p(s'|s,a)v_{\pi}(s')$

$r_{\pi}(s) = \sum_{a\in\mathcal{A}}\pi(a|s,\theta)r(s,a)$

也能看出来，$v_{\pi}(s)$包含了未来折扣奖励，$r_{\pi}(s)$只有即时奖励。