In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
from kine import kine
# def kine(posture):
#     ...
#     return position

# 环境类
class RobotArmEnv:
    def __init__(self, target_position):
        self.target_position = np.array(target_position)
        self.state = np.random.rand(6)  # 初始随机位姿
        self.position = kine(self.state)  # 计算初始末端位置

    def reset(self):
        self.state = np.random.rand(6)
        self.position = kine(self.state)
        return self.state

    def step(self, action):
        # 更新位姿
        self.state += action
        new_position = kine(self.state)
        reward = -np.linalg.norm(self.target_position - new_position)  # 奖励为负距离
        done = np.linalg.norm(self.target_position - new_position) < 0.1
        self.position = new_position
        return self.state, reward, done




In [2]:
# Actor网络
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(6, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 6)  # 6自由度机械臂的控制

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        action = torch.tanh(self.fc3(x))  # 使用tanh来保证输出范围为[-1, 1]
        return action

# Critic网络
class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(12, 128)  # 状态和动作输入
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 1)  # 输出Q值

    def forward(self, state, action):
        x = torch.relu(self.fc1(torch.cat([state, action], dim=1)))
        x = torch.relu(self.fc2(x))
        q_value = self.fc3(x)
        return q_value


In [3]:

# 初始化环境和网络
env = RobotArmEnv(target_position=[1, 1, 1.5])
actor = Actor()
critic = Critic()
target_actor = Actor()
target_critic = Critic()

# 将目标网络参数设置为与主网络相同
target_actor.load_state_dict(actor.state_dict())
target_critic.load_state_dict(critic.state_dict())

# 优化器
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-4)
critic_optimizer = optim.Adam(critic.parameters(), lr=1e-3)

# 柔性更新参数
tau = 0.005

# 经验回放
memory = deque(maxlen=10000)
batch_size = 64

# 训练参数
gamma = 0.99  # 折扣因子
max_episodes = 1000
max_steps = 200
update_every = 10  # 每隔一定步数更新一次网络参数
learn_steps = 0


In [4]:
# 训练循环
training_rewards = []  # 存储每个episode的奖励值
for episode in range(max_episodes):
    state = env.reset()
    episode_reward = 0
    for step in range(max_steps):
        # 使用Actor网络选择动作
        with torch.no_grad():
            action = actor(torch.tensor(state, dtype=torch.float32)).numpy()
        # 在环境中执行动作
        next_state, reward, done = env.step(action)
        episode_reward += reward
        # 将经验存储到回放池中
        memory.append((state, action, reward, next_state, done))
        # 如果回放池中的经验足够，开始训练网络
        if len(memory) >= batch_size:
            learn_steps += 1
            if learn_steps % update_every == 0:
                # 从经验回放中抽样一个小批量样本
                batch = random.sample(memory, batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)
                states = torch.tensor(states, dtype=torch.float32)
                actions = torch.tensor(actions, dtype=torch.float32)
                rewards = torch.tensor(rewards, dtype=torch.float32).view(-1, 1)
                next_states = torch.tensor(next_states, dtype=torch.float32)
                dones = torch.tensor(dones, dtype=torch.float32).view(-1, 1)
                # 计算目标Q值
                with torch.no_grad():
                    target_actions = target_actor(next_states)
                    target_q_values = target_critic(next_states, target_actions)
                    target_q_values = rewards + gamma * target_q_values * (1 - dones)
                # 更新Critic网络
                critic_optimizer.zero_grad()
                q_values = critic(states, actions)
                critic_loss = nn.functional.mse_loss(q_values, target_q_values)
                critic_loss.backward()
                critic_optimizer.step()
                # 更新Actor网络
                actor_optimizer.zero_grad()
                predicted_actions = actor(states)
                actor_loss = -critic(states, predicted_actions).mean()
                actor_loss.backward()
                actor_optimizer.step()
                # 软更新目标网络参数
                for target_param, param in zip(target_actor.parameters(), actor.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
                for target_param, param in zip(target_critic.parameters(), critic.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
        # 更新状态
        state = next_state
        # 判断是否结束本轮训练
        if done:
            break
    training_rewards.append(episode_reward)
    print("Episode: {}, Reward: {:.2f}".format(episode + 1, episode_reward))

  states = torch.tensor(states, dtype=torch.float32)


Episode: 1, Reward: -414.98
Episode: 2, Reward: -417.11
Episode: 3, Reward: -412.54
Episode: 4, Reward: -419.36
Episode: 5, Reward: -412.34
Episode: 6, Reward: -409.63
Episode: 7, Reward: -400.53
Episode: 8, Reward: -407.35
Episode: 9, Reward: -407.24
Episode: 10, Reward: -402.94
Episode: 11, Reward: -421.82
Episode: 12, Reward: -402.47
Episode: 13, Reward: -395.14
Episode: 14, Reward: -412.94
Episode: 15, Reward: -443.32
Episode: 16, Reward: -389.51
Episode: 17, Reward: -390.48
Episode: 18, Reward: -383.96
Episode: 19, Reward: -403.69
Episode: 20, Reward: -398.96
Episode: 21, Reward: -407.07
Episode: 22, Reward: -411.60
Episode: 23, Reward: -413.10
Episode: 24, Reward: -391.02
Episode: 25, Reward: -379.62
Episode: 26, Reward: -413.54
Episode: 27, Reward: -409.00
Episode: 28, Reward: -413.49
Episode: 29, Reward: -409.78
Episode: 30, Reward: -413.21
Episode: 31, Reward: -418.95
Episode: 32, Reward: -414.15
Episode: 33, Reward: -412.34
Episode: 34, Reward: -408.32
Episode: 35, Reward: -4

In [5]:
# 保存训练结果
np.save('./model_libs/training_rewards.npy', np.array(training_rewards))
torch.save(actor.state_dict(), './model_libs/actor.pth')
torch.save(critic.state_dict(), './model_libs/critic.pth')

In [8]:
# test
# 加载训练好的模型
actor = Actor()
actor.load_state_dict(torch.load('./model_libs/actor.pth'))
actor.eval()  # 设置为评估模式，关闭dropout和batch normalization等层的随机性

# 输入目标位置
target_position = [1, 1, 1.5]  

# 使用模型预测机器人位姿
with torch.no_grad():
    target_position_tensor = torch.tensor(target_position, dtype=torch.float32).unsqueeze(0)
    predicted_posture = actor(target_position_tensor).numpy()

print("Predicted posture:", predicted_posture)



RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x3 and 6x128)