In [2]:
import torch
from torch.distributions import Categorical
import torch.nn as nn

# 定义一个简单的 Actor 模型
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.fc = nn.Linear(4, 2)  # 假设输入维度为4，输出维度为2

    def forward(self, x):
        return torch.softmax(self.fc(x), dim=-1)

# 假设的输入状态
state = [1.0, 2.0, 3.0, 4.0]
device = 'cpu'

# 初始化模型
actor = Actor().to(device)

# 转换状态并通过模型获取概率分布
state_tensor = torch.tensor(state, device=device, dtype=torch.float32).unsqueeze(dim=0)
print('state_tensor',state_tensor)
probs = actor(state_tensor)
print('probs',probs)

# 创建分布并采样动作
dist = Categorical(probs)
print('dist',dist)
action = dist.sample()
print('action',action)

# 获取动作的对数概率
log_probs = dist.log_prob(action).detach()
print('log_probs',log_probs)
# 输出动作
action_value = action.detach().cpu().numpy().item()
print(action_value)


state_tensor tensor([[1., 2., 3., 4.]])
probs tensor([[0.6368, 0.3632]], grad_fn=<SoftmaxBackward0>)
dist Categorical(probs: torch.Size([1, 2]))
action tensor([0])
log_probs tensor([-0.4513])
0


In [3]:
from torch import nn
import torch.nn.functional as F

In [4]:
class Actor(nn.Module):
    
    def __init__(self,input_dim,output_dim,hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim,hidden_dim)
        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
        self.fc3 = nn.Linear(hidden_dim,output_dim)
    
    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        logits = self.fc3(x)
        return F.softmax(logits,dim=-1)

In [5]:
class Critic(nn.Module):
    
    def __init__(self,input_dim,output_dim,hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim,hidden_dim)
        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
        self.fc3 = nn.Linear(hidden_dim,output_dim)
    
    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        value = self.fc3(x)
        return value

In [6]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from collections import deque
from torch.distributions import Categorical
import numpy as np
import gym
import os
import copy
import matplotlib.pyplot as plt
import seaborn as sns
np.bool8 = np.bool_
from torch.optim import Adam

In [7]:
env = gym.make("CartPole-v1")  # 或 "rgb_array"
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
print(f"状态空间维度：{n_states}，动作空间维度：{n_actions}")
state,_ = env.reset()
next_state, reward, terminated, truncated, _ = env.step(0)  # 注意这里的返回值

状态空间维度：4，动作空间维度：2


In [8]:
state

array([ 0.02936008, -0.04976663, -0.01232278, -0.02797908], dtype=float32)

In [9]:
reward

1.0

In [10]:
from collections import deque

In [11]:
class ReplayQue:
    
    def __init__(self):
        self.buffer = deque()
    
    def sample(self):
        return zip(*self.buffer)
    
    def clear(self):
        self.buffer.clear()
    
    def push(self,transitions):
        self.buffer.append(transitions)
    
    def __len__(self):
        return len(self.buffer)

In [12]:
class Agent:
    
    def __init__(self):
        self.critic = Critic(n_states,1,256)
        self.actor = Actor(n_states,n_actions,256)
        self.memory = ReplayQue()
        self.actor_optimizer = Adam(self.actor.parameters())
        self.critic_optimizer = Adam(self.critic.parameters())
    
    def sample_action(self,state):
        state = torch.tensor(state)
        probs = self.actor(state)
        dist = Categorical(probs)
        action = dist.sample()
        logprob = dist.log_prob(action).detach()
        return action.detach().cpu().numpy().item(), logprob
    
    @torch.no_grad()
    def predict(self,state):
        state = torch.tensor(state)
        probs = self.actor(state)
        dist = Categorical(probs)
        action = dist.sample()
        return action.detach().cpu().numpy().item()
    
    def update(self):
        if len(self.memory) % 100 != 0:
            return
        
        
#         next_state, reward, terminated
        old_states,old_logprobs,old_rewards,old_dones,old_actions = self.memory.sample()
        old_states = torch.tensor(old_states)
        old_logprobs = torch.tensor(old_logprobs)
        old_actions = torch.tensor(old_actions)
        
        
        returns = []
        discount_sum = 0
        for reward,done in zip(reversed(old_rewards),reversed(old_dones)):
            if done:
                discount_sum = 0
            discount_sum = discount_sum * 0.99 + reward
            returns.insert(0,discount_sum)
        
        
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std()+1e-5)
        
        
        for _ in range(4):
            values = self.critic(old_states)
            advantage = returns - values.detach()
            probs = self.actor(old_states)
            dist = Categorical(probs)
            logprobs = dist.log_prob(old_actions)
#             print(logprobs.shape,old_logprobs.shape)
            ratio = torch.exp(logprobs - old_logprobs)
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio,0.8,1.2) * advantage
            actor_loss = -torch.min(surr1,surr2).mean() + dist.entropy().mean() * 0.01
            critic_loss = ((returns - values)**2).mean()
            
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            
            actor_loss.backward()
            critic_loss.backward()
            
            self.actor_optimizer.step()
            self.critic_optimizer.step()
        
        self.memory.clear()

In [13]:
def train(env,agent):
    
    
    
    for epoch in range(10000):
        
        state,_ = env.reset()
        train_reward = 0
        for _ in range(1000):
            action,logprob = agent.sample_action(state)

            next_state, reward, terminated, truncated, _ = env.step(action)  # 注意这里的返回值
    #         old_states,old_logprobs,old_rewards,old_dones,old_actions
            agent.memory.push((state,logprob,reward,terminated,action))
            agent.update()
            state = next_state
            
            train_reward += reward
            if terminated:
                break

#         print('train reward.....', train_reward)
        #### eval the reward
        if (epoch+1) % 100 == 0:
            state,_ = env.reset()
            eval_reward = 0
            for _ in range(1000):
                action = agent.predict(state)
                next_state, reward, terminated, truncated, _ = env.step(action)  # 注意这里的返回值
                state = next_state
                eval_reward += reward
                if terminated:
                    break
            print('eval reward.....', eval_reward)

In [14]:
agent = Agent()
train(env,agent)

  old_states = torch.tensor(old_states)


eval reward..... 99.0
eval reward..... 926.0
eval reward..... 715.0
eval reward..... 135.0
eval reward..... 121.0
eval reward..... 152.0
eval reward..... 109.0
eval reward..... 103.0
eval reward..... 123.0
eval reward..... 103.0
eval reward..... 504.0
eval reward..... 213.0
eval reward..... 102.0
eval reward..... 208.0
eval reward..... 350.0
eval reward..... 199.0
eval reward..... 77.0
eval reward..... 205.0
eval reward..... 112.0
eval reward..... 63.0
eval reward..... 78.0
eval reward..... 119.0
eval reward..... 99.0
eval reward..... 120.0
eval reward..... 112.0
eval reward..... 118.0
eval reward..... 141.0
eval reward..... 118.0
eval reward..... 179.0
eval reward..... 288.0
eval reward..... 400.0
eval reward..... 78.0
eval reward..... 61.0
eval reward..... 111.0
eval reward..... 79.0
eval reward..... 98.0
eval reward..... 198.0
eval reward..... 406.0
eval reward..... 1000.0
eval reward..... 184.0
eval reward..... 196.0
eval reward..... 1000.0
eval reward..... 334.0
eval reward..... 1

In [15]:
train(env,agent)

eval reward..... 667.0
eval reward..... 1000.0
eval reward..... 361.0
eval reward..... 361.0
eval reward..... 508.0
eval reward..... 392.0
eval reward..... 458.0
eval reward..... 576.0
eval reward..... 343.0
eval reward..... 307.0
eval reward..... 257.0
eval reward..... 253.0
eval reward..... 196.0
eval reward..... 196.0
eval reward..... 154.0
eval reward..... 188.0
eval reward..... 177.0
eval reward..... 138.0
eval reward..... 140.0
eval reward..... 138.0
eval reward..... 135.0
eval reward..... 151.0
eval reward..... 171.0
eval reward..... 147.0
eval reward..... 133.0
eval reward..... 170.0
eval reward..... 152.0
eval reward..... 200.0
eval reward..... 175.0
eval reward..... 179.0
eval reward..... 167.0
eval reward..... 185.0
eval reward..... 146.0
eval reward..... 156.0
eval reward..... 158.0
eval reward..... 188.0
eval reward..... 166.0
eval reward..... 134.0
eval reward..... 170.0
eval reward..... 142.0
eval reward..... 154.0
eval reward..... 170.0
eval reward..... 171.0
eval rewar

In [22]:
env = gym.make('CartPole-v1', render_mode="human")  # 或 "rgb_array"

In [23]:
state,_ = env.reset()
eval_reward = 0
for _ in range(1000):
    action = agent.predict(state)
    next_state, reward, terminated, truncated, _ = env.step(action)  # 注意这里的返回值
    env.render()
    state = next_state
    eval_reward += reward
    if terminated:
        break

2025-03-13 09:15:04.864 python[97966:67564696] _TIPropertyValueIsValid called with 16 on nil context!
2025-03-13 09:15:04.864 python[97966:67564696] imkxpc_getApplicationProperty:reply: called with incorrect property value 16, bailing.
2025-03-13 09:15:04.864 python[97966:67564696] Text input context does not respond to _valueForTIProperty:


In [24]:
env.close()

2025-03-13 09:15:22.097 python[97966:67564696] _TIPropertyValueIsValid called with 16 on nil context!
2025-03-13 09:15:22.097 python[97966:67564696] imkxpc_getApplicationProperty:reply: called with incorrect property value 16, bailing.
2025-03-13 09:15:22.097 python[97966:67564696] Text input context does not respond to _valueForTIProperty:
