In [1]:
import torch

In [2]:
import torch.nn as nn

In [3]:
import torch.nn.functional as F

In [4]:
import random

In [5]:
from collections import deque

In [6]:
from torch.distributions import Categorical

In [7]:
import numpy as np

In [8]:
import gym

In [9]:
import os

In [10]:
import copy

In [11]:
import matplotlib.pyplot as plt

In [12]:
import seaborn as sns

In [13]:
class ActorSoftmax(nn.Module):
    """Three-layer MLP policy head producing action probabilities.

    Two ReLU-activated hidden layers are followed by a linear output layer
    whose result is passed through a softmax over dimension 1.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=256):
        """Create the layers.

        Args:
            input_dim: Size of each input state vector.
            output_dim: Number of discrete actions (width of the output).
            hidden_dim: Width of both hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return action probabilities for ``x``.

        The softmax is taken over ``dim=1``, so ``x`` must carry a leading
        batch dimension.
        """
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        logits = self.fc3(hidden)
        return F.softmax(logits, dim=1)

In [14]:
class Critic(nn.Module):
    """Three-layer MLP state-value estimator.

    Two ReLU-activated hidden layers are followed by a linear output layer
    with no activation.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=256):
        """Create the layers.

        Args:
            input_dim: Size of each input state vector.
            output_dim: Width of the output; must be 1.
            hidden_dim: Width of both hidden layers.

        Raises:
            AssertionError: If ``output_dim`` is not 1.
        """
        super().__init__()
        assert output_dim == 1  # critic must output a single value
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw (un-activated) output of the last layer for ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

In [15]:
class ReplayBufferQue:
    """Fixed-capacity FIFO replay buffer backed by a ``collections.deque``.

    Once ``capacity`` items are stored, pushing a new item silently drops the
    oldest one (``deque(maxlen=...)`` semantics).
    """

    def __init__(self,capacity: int) -> None:
        """Create an empty buffer that keeps at most ``capacity`` transitions."""
        self.capacity = capacity
        self.buffer = deque(maxlen=self.capacity)

    def push(self,transitions):
        """Append one transition (typically a tuple of fields) to the buffer."""
        self.buffer.append(transitions)

    def sample(self,batch_size: int, sequential: bool = False):
        """Draw a batch and return it transposed, one iterable per field.

        Args:
            batch_size: Requested number of transitions. Clamped to the
                number of stored transitions when it is larger.
            sequential: If True, return a contiguous run of transitions
                starting at a random offset; otherwise sample without
                replacement.

        Returns:
            ``zip(*batch)``: an iterator yielding one tuple per transition
            field. It is empty when the buffer is empty.
        """
        # Bug fix: the original clamped with ``len(self.deque)``, an attribute
        # that does not exist, so over-sized requests raised AttributeError.
        batch_size = min(batch_size, len(self.buffer))

        # Snapshot to a list: indexing into the middle of a deque is O(n).
        items = list(self.buffer)
        if sequential:
            start = random.randint(0, len(items) - batch_size)
            batch = items[start:start + batch_size]
        else:
            batch = random.sample(items, batch_size)
        return zip(*batch)

    def clear(self):
        """Remove every stored transition."""
        self.buffer.clear()

    def __len__(self):
        """Return the number of stored transitions."""
        return len(self.buffer)

In [16]:
class PGReplay(ReplayBufferQue):
    """Unbounded rollout buffer whose ``sample`` returns every stored transition.

    ``push``, ``clear`` and ``__len__`` are inherited from ``ReplayBufferQue``.
    """

    def __init__(self):
        """Create an empty, unbounded buffer.

        The parent constructor is not called, so ``capacity`` is never set on
        instances of this class.
        """
        self.buffer = deque()

    def sample(self):
        """Return all stored transitions in insertion order, transposed.

        Returns:
            ``zip(*transitions)``: an iterator yielding one tuple per
            transition field.
        """
        return zip(*self.buffer)

In [24]:
class Agent:
    """PPO-clip agent with a softmax actor and a value critic for discrete actions.

    Transitions are accumulated in ``self.memory`` as
    ``(state, action, log_prob, reward, done)`` tuples and consumed by
    ``update`` every ``update_freq`` calls to ``sample_action``.
    """

    def __init__(self,cfg):
        """Build networks, optimizers and the rollout memory from ``cfg``.

        ``cfg`` must expose: ``gamma``, ``device``, ``n_states``, ``n_actions``,
        ``actor_hidden_dim``, ``critic_hidden_dim``, ``actor_lr``,
        ``critic_lr``, ``k_epochs``, ``eps_clip``, ``entropy_coef`` and
        ``update_freq``.
        """
        self.gamma = cfg.gamma
        self.device = torch.device(cfg.device)
        self.actor = ActorSoftmax(cfg.n_states,cfg.n_actions,hidden_dim=cfg.actor_hidden_dim).to(self.device)
        self.critic = Critic(
            cfg.n_states,
            1,
            hidden_dim=cfg.critic_hidden_dim
        ).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),lr=cfg.actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),lr=cfg.critic_lr)
        self.memory = PGReplay()
        self.k_epochs = cfg.k_epochs
        self.eps_clip = cfg.eps_clip
        self.entropy_coef = cfg.entropy_coef
        self.sample_count = 0
        self.update_freq = cfg.update_freq

    def sample_action(self,state):
        """Sample an action for training and remember its log-probability.

        Increments ``sample_count`` and stores the detached log-probability of
        the sampled action (a one-element tensor) in ``self.log_probs``.

        Returns:
            The sampled action as a Python int.
        """
        self.sample_count += 1
        state = torch.tensor(state,device=self.device,dtype=torch.float32).unsqueeze(dim=0)
        # No gradient is needed while collecting data; the ratio in update()
        # treats the stored log-probability as a constant.
        with torch.no_grad():
            probs = self.actor(state)
        dist = Categorical(probs)
        action = dist.sample()
        self.log_probs = dist.log_prob(action).detach()
        return action.detach().cpu().numpy().item()

    @torch.no_grad()
    def predict_action(self,state):
        """Sample an action from the current policy without touching counters.

        Returns:
            The sampled action as a Python int.
        """
        state = torch.tensor(state,
                            device=self.device,
                            dtype=torch.float32).unsqueeze(dim=0)
        probs = self.actor(state)
        dist = Categorical(probs)
        action = dist.sample()
        return action.detach().cpu().numpy().item()

    def update(self):
        """Run ``k_epochs`` PPO-clip updates on the stored rollout, then clear it.

        Does nothing unless ``sample_count`` is a multiple of ``update_freq``
        and the memory holds at least one transition.
        """
        if self.sample_count % self.update_freq != 0:
            return
        # Guard: sample_count == 0 also passes the gate above, and unpacking
        # an empty rollout would raise.
        if len(self.memory) == 0:
            return

        old_states,old_actions,old_log_probs,old_rewards,old_dones = self.memory.sample()
        old_states = torch.tensor(np.array(old_states), device=self.device, dtype=torch.float32)
        # Actions are category indices, so keep them integral.
        old_actions = torch.tensor(np.array(old_actions), device=self.device, dtype=torch.long)
        # Flatten so the log-probs are (N,) regardless of how they were stored.
        old_log_probs = torch.tensor(old_log_probs, device=self.device, dtype=torch.float32).reshape(-1)

        # Discounted Monte-Carlo returns, built back-to-front and reset at
        # every episode boundary.
        returns = []
        discounted_sum = 0
        for reward, done in zip(reversed(old_rewards),reversed(old_dones)):
            if done:
                discounted_sum = 0
            discounted_sum = reward + (self.gamma * discounted_sum)
            returns.append(discounted_sum)
        returns.reverse()

        returns = torch.tensor(returns,device=self.device,dtype=torch.float32)
        # std() of a single element is NaN and would poison every weight, so
        # only normalise when there is more than one sample.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-5)

        for _ in range(self.k_epochs):
            # Bug fix: the critic returns (N, 1) while returns is (N,).
            # Without the squeeze the subtractions below broadcast to (N, N),
            # pairing every return with every state's value.
            values = self.critic(old_states).squeeze(-1)
            advantage = returns - values.detach()
            probs = self.actor(old_states)
            dist = Categorical(probs)
            new_log_probs = dist.log_prob(old_actions)
            ratio = torch.exp(new_log_probs - old_log_probs)

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio,1-self.eps_clip,1+self.eps_clip) * advantage

            # Bug fix: the entropy bonus is subtracted from the minimised loss
            # (it was added, which penalised exploration).
            actor_loss = -torch.min(surr1,surr2).mean() - self.entropy_coef * dist.entropy().mean()

            critic_loss = (returns - values).pow(2).mean()

            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()

            actor_loss.backward()
            critic_loss.backward()

            self.actor_optimizer.step()
            self.critic_optimizer.step()

        self.memory.clear()
        
        

In [28]:
def train(cfg,env,agent):
    """Train ``agent`` on ``env``, evaluating periodically and keeping the best snapshot.

    Args:
        cfg: Object exposing ``train_eps``, ``max_steps``,
            ``eval_per_episode`` and ``eval_eps``.
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``; it must
            also provide ``render()`` and ``close()``.
        agent: Object exposing ``sample_action``, ``predict_action``,
            ``update``, ``log_probs`` and ``memory.push``.

    Returns:
        ``(output_agent, {'rewards': rewards})`` where ``output_agent`` is a
        deep copy of the agent taken at its best mean evaluation reward
        (``None`` if no evaluation ever ran) and ``rewards`` lists the total
        reward of each training episode.
    """
    print('begin to train')
    rewards = []
    # Start at -inf so the first evaluation always yields a snapshot, even
    # when every reward is negative (the original 0 never did).
    best_ep_reward = float('-inf')
    output_agent = None

    for i_ep in range(cfg.train_eps):
        ep_reward = 0
        state, _ = env.reset()

        for _ in range(cfg.max_steps):
            action = agent.sample_action(state)
            # Bug fix: the original line read ``terminatec.truncated``.
            next_state, reward, terminated, truncated, _ = env.step(action)
            env.render()
            # An episode ends on termination or truncation; storing that as
            # the done flag lets the agent reset its return accumulation.
            done = terminated or truncated
            agent.memory.push((state,action,agent.log_probs,reward,done))
            state = next_state
            agent.update()
            ep_reward += reward
            if done:
                break

        if (i_ep + 1) % cfg.eval_per_episode == 0:
            sum_eval_reward = 0
            for _ in range(cfg.eval_eps):
                # Bug fix: the accumulator was initialised under a different
                # name (``eval_per_episode``), so the += below raised NameError.
                eval_ep_reward = 0
                state, _ = env.reset()
                for _ in range(cfg.max_steps):
                    action = agent.predict_action(state)
                    next_state,reward,terminated,truncated,_ = env.step(action)
                    state = next_state
                    eval_ep_reward += reward
                    if terminated or truncated:
                        break
                sum_eval_reward += eval_ep_reward
            mean_eval_reward = sum_eval_reward / cfg.eval_eps
            if mean_eval_reward >= best_ep_reward:
                best_ep_reward = mean_eval_reward
                output_agent = copy.deepcopy(agent)
                print(f"回合：{i_ep + 1}/{cfg.train_eps}，奖励：{ep_reward:.2f}，评估奖励：{mean_eval_reward:.2f}，最佳评估奖励：{best_ep_reward:.2f}，更新模型！")
            else:
                print(f"回合：{i_ep + 1}/{cfg.train_eps}，奖励：{ep_reward:.2f}，评估奖励：{mean_eval_reward:.2f}，最佳评估奖励：{best_ep_reward:.2f}")

        rewards.append(ep_reward)

    print('finished')
    env.close()
    return output_agent, {'rewards':rewards}

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# A minimal actor network: one linear layer followed by a softmax.
class SimpleActor(nn.Module):
    """Single-layer policy network that outputs action probabilities."""

    def __init__(self, input_size, output_size):
        """Create the linear layer mapping ``input_size`` features to ``output_size`` actions."""
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Return the softmax of ``fc(x)`` taken over the last dimension."""
        logits = self.fc(x)
        return torch.softmax(logits, dim=-1)

# A trivial stand-in environment.
class SimpleEnv:
    """Toy environment with three discrete actions and a constant initial state."""

    def __init__(self):
        """Define the list of valid actions."""
        self.action_space = list(range(3))  # three discrete actions: 0, 1, 2

    def reset(self):
        """Return the initial state, a one-element list."""
        initial_state = [0]
        return initial_state

# Policy wrapper around the actor network.
class Policy:
    """Holds a ``SimpleActor`` and samples actions from its output distribution."""

    def __init__(self, input_size, output_size, device):
        """Create the actor on ``device`` and reset the bookkeeping fields."""
        self.device = device
        self.actor = SimpleActor(input_size, output_size).to(device)
        self.log_probs = None
        self.sample_count = 0

    def sample_action(self, state):
        """Sample one action for ``state``, printing each intermediate value.

        Increments ``sample_count`` and stores the detached log-probability of
        the sampled action in ``self.log_probs``.

        Returns:
            The sampled action as a Python int.
        """
        self.sample_count += 1
        # Add a leading batch dimension before calling the actor.
        batched_state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
        action_probs = self.actor(batched_state)
        print('probs',action_probs)
        action_dist = Categorical(action_probs)
        print('dist',action_dist)
        sampled = action_dist.sample()
        print('action',sampled)
        self.log_probs = action_dist.log_prob(sampled).detach()
        print('log probs',self.log_probs)
        return sampled.detach().cpu().numpy().item()

# Unit test
def test_sample_action():
    """Check that one call to ``Policy.sample_action`` yields a valid action.

    Builds a ``SimpleEnv`` and a CPU ``Policy``, samples once from the initial
    state, and asserts the action is in the action space and that the sample
    counter is 1.
    """
    env = SimpleEnv()
    policy = Policy(input_size=1, output_size=len(env.action_space), device='cpu')

    # Reset the environment, then sample an action from its initial state.
    start_state = env.reset()
    action = policy.sample_action(start_state)

    # The action must be one of the environment's valid actions.
    assert action in env.action_space, f"Action {action} is not in {env.action_space}"

    # The sample counter must have been incremented exactly once.
    assert policy.sample_count == 1, "Sample count should be 1 after sampling."

    print(f"Sampled action: {action}")
    print(f"Log probability: {policy.log_probs.item()}")

# Run the test
if __name__ == "__main__":
    test_sample_action()


probs tensor([[0.1102, 0.4467, 0.4431]], grad_fn=<SoftmaxBackward0>)
dist Categorical(probs: torch.Size([1, 3]))
action tensor([1])
log probs tensor([-0.8058])
Sampled action: 1
Log probability: -0.8058449625968933


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Categorical

# A minimal actor network: one linear layer followed by a softmax.
class SimpleActor(nn.Module):
    """Single-layer policy network that outputs action probabilities."""

    def __init__(self, input_size, output_size):
        """Create the linear layer mapping ``input_size`` features to ``output_size`` actions."""
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Return the softmax of ``fc(x)`` taken over the last dimension."""
        logits = self.fc(x)
        return torch.softmax(logits, dim=-1)

# A minimal critic network: a single linear layer with one output.
class SimpleCritic(nn.Module):
    """Single-layer network mapping a state to one scalar output."""

    def __init__(self, input_size):
        """Create the linear layer mapping ``input_size`` features to one output."""
        super().__init__()
        self.fc = nn.Linear(input_size, 1)

    def forward(self, x):
        """Return ``fc(x)`` with no activation; the last dimension has size 1."""
        value = self.fc(x)
        return value

# A trivial stand-in environment.
class SimpleEnv:
    """Toy environment with three discrete actions and a constant initial state."""

    def __init__(self):
        """Define the list of valid actions."""
        self.action_space = list(range(3))  # three discrete actions: 0, 1, 2

    def reset(self):
        """Return the initial state, a one-element list."""
        initial_state = [0]
        return initial_state

# A simple rollout memory made of five parallel lists.
class Memory:
    """Stores transitions column-wise: states, actions, log-probs, rewards, dones."""

    def __init__(self):
        """Create the five empty columns."""
        self.states, self.actions, self.log_probs = [], [], []
        self.rewards, self.dones = [], []

    def store(self, state, action, log_prob, reward, done):
        """Append one transition, one field per column."""
        columns = (self.states, self.actions, self.log_probs, self.rewards, self.dones)
        fields = (state, action, log_prob, reward, done)
        for column, field in zip(columns, fields):
            column.append(field)

    def sample(self):
        """Return the five column lists themselves (not copies) as a tuple."""
        return (self.states, self.actions, self.log_probs, self.rewards, self.dones)

    def clear(self):
        """Empty every column in place."""
        # Columns are looked up at call time, so lists assigned to the
        # attributes after construction are cleared as well.
        for column in (self.states, self.actions, self.log_probs, self.rewards, self.dones):
            column.clear()

# Policy class: an actor-critic pair trained with a clipped PPO objective.
class Policy:
    """PPO-clip policy built from ``SimpleActor``, ``SimpleCritic`` and ``Memory``."""

    def __init__(self, input_size, output_size, device):
        """Create networks, optimizers, memory and fixed hyper-parameters."""
        self.actor = SimpleActor(input_size, output_size).to(device)
        self.critic = SimpleCritic(input_size).to(device)
        self.memory = Memory()
        self.device = device
        self.gamma = 0.99
        self.eps_clip = 0.2
        self.k_epochs = 4
        self.entropy_coef = 0.01
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=0.001)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=0.001)
        self.sample_count = 0
        self.update_freq = 5

    def sample_action(self, state):
        """Sample an action for ``state`` and record the transition in memory.

        The reward and done fields are stored as the placeholders ``0`` and
        ``False``; the caller is expected to fill them in afterwards.

        Returns:
            The sampled action as a Python int.
        """
        self.sample_count += 1
        state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
        # No gradient is needed while collecting data.
        with torch.no_grad():
            dist = Categorical(self.actor(state))
            action = dist.sample()
            log_prob = dist.log_prob(action)
        # Store the state without its batch axis and the log-prob as a float,
        # so stacking N transitions gives (N, input_size) and (N,).
        self.memory.store(state.squeeze(0).cpu().numpy(), action.item(), log_prob.item(), 0, False)
        return action.item()

    def update(self):
        """Run ``k_epochs`` PPO-clip updates on the stored rollout, then clear it.

        Does nothing unless ``sample_count`` is a multiple of ``update_freq``
        and the memory holds at least one transition.
        """
        if self.sample_count % self.update_freq != 0:
            return

        old_states, old_actions, old_log_probs, old_rewards, old_dones = self.memory.sample()
        n_samples = len(old_states)
        # sample_count == 0 also passes the gate above; nothing to learn from.
        if n_samples == 0:
            return

        # Flatten every per-sample field so that states are (N, input_size)
        # and actions / log-probs are (N,), even if transitions were stored
        # with an extra batch axis.
        old_states = torch.tensor(np.array(old_states), device=self.device, dtype=torch.float32).reshape(n_samples, -1)
        old_actions = torch.tensor(np.array(old_actions), device=self.device, dtype=torch.long).reshape(-1)
        old_log_probs = torch.tensor(np.array(old_log_probs), device=self.device, dtype=torch.float32).reshape(-1)

        # Discounted Monte-Carlo returns, built back-to-front and reset at
        # every episode boundary.
        returns = []
        discounted_sum = 0
        for reward, done in zip(reversed(old_rewards), reversed(old_dones)):
            if done:
                discounted_sum = 0
            discounted_sum = reward + (self.gamma * discounted_sum)
            returns.append(discounted_sum)
        returns.reverse()

        returns = torch.tensor(returns, device=self.device, dtype=torch.float32)
        # std() of a single element is NaN and would poison every weight, so
        # only normalise when there is more than one sample.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-5)

        for _ in range(self.k_epochs):
            # Bug fix: squeeze the critic's trailing size-1 axis so values is
            # (N,) like returns; otherwise the subtractions below broadcast
            # every return against every state's value.
            values = self.critic(old_states).squeeze(-1)
            advantage = returns - values.detach()
            dist = Categorical(self.actor(old_states))
            new_log_probs = dist.log_prob(old_actions)
            ratio = torch.exp(new_log_probs - old_log_probs)

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantage

            # Bug fix: the entropy bonus is subtracted from the minimised loss
            # (it was added, which penalised exploration).
            actor_loss = -torch.min(surr1, surr2).mean() - self.entropy_coef * dist.entropy().mean()
            critic_loss = (returns - values).pow(2).mean()

            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()

            actor_loss.backward()
            critic_loss.backward()

            self.actor_optimizer.step()
            self.critic_optimizer.step()

        self.memory.clear()

# Unit test
def test_policy_update():
    """Check that ``Policy.update`` really consumes a rollout.

    The original test sampled once while ``update_freq`` is 5, so ``update()``
    returned at its frequency gate and nothing was exercised. This version
    samples exactly ``update_freq`` actions, fills in rewards and done flags,
    and asserts that the update cleared the memory and left every parameter
    finite.
    """
    device = 'cpu'  # run on CPU
    env = SimpleEnv()
    policy = Policy(input_size=1, output_size=len(env.action_space), device=device)

    # Reset the environment to obtain the initial state.
    initial_state = env.reset()

    # Sample exactly update_freq actions so that update() passes its gate.
    sampled_actions = [policy.sample_action(initial_state) for _ in range(policy.update_freq)]

    # Every sampled action must be valid.
    for action in sampled_actions:
        assert action in env.action_space, f"Action {action} is not in {env.action_space}"

    # Fill in the rewards and done flags for the stored transitions; the last
    # step ends the episode.
    n_stored = len(policy.memory.states)
    assert n_stored == policy.update_freq, "One transition should be stored per sampled action."
    policy.memory.rewards = [1.0] * n_stored
    policy.memory.dones = [False] * (n_stored - 1) + [True]

    # Update the policy.
    policy.update()

    # update() clears the memory only after running its optimisation epochs,
    # so an empty memory proves the update was not skipped.
    assert len(policy.memory.states) == 0, "update() should consume and clear the rollout memory."
    for param in list(policy.actor.parameters()) + list(policy.critic.parameters()):
        assert torch.isfinite(param).all(), "Parameters must stay finite after the update."
    assert policy.actor_optimizer.param_groups[0]['lr'] > 0, "Learning rate should be greater than 0."

    print(f"Sampled action: {action}")
    print("Policy update completed successfully.")

# Run the test
if __name__ == "__main__":
    test_policy_update()


Sampled action: 0
Policy update completed successfully.
