In [1]:
import cv2
import numpy as np
import mediapipe as mp
import torch
import torch.nn as nn
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [2]:
class LSTM(nn.Module):
    
    def __init__(self,input_dim,hidden_dim,output_dim,layer_num):
        super(LSTM,self).__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.lstm = torch.nn.LSTM(input_dim,hidden_dim,layer_num,batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim,output_dim)
        self.bn = nn.BatchNorm1d(30)
        
    def forward(self,inputs):
        x = self.bn(inputs)
        lstm_out,(hn,cn) = self.lstm(x)
        out = self.fc(lstm_out[:,-1,:])
        return out

In [3]:
# 加载模型
n_hidden = 128  # 隐藏层维度
n_joints = 132  # 输入维度，每帧的特征数量
n_categories = 3  # 输出维度，类别数量
n_layer = 3  # LSTM 层数
model = LSTM(n_joints,n_hidden,n_categories,n_layer)  # 确保 LSTM 模型的架构与训练时相同
model.load_state_dict(torch.load('lstm_6_bn.pkl'))
model.to(device)
model.eval()

LSTM(
  (lstm): LSTM(132, 128, num_layers=3, batch_first=True)
  (fc): Linear(in_features=128, out_features=3, bias=True)
  (bn): BatchNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [4]:
def extract_keypoints(results):
    """
    Processes and organizes the keypoints detected from the pose estimation model 
    to be used as inputs for the exercise decoder models
    
    """
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    return pose

In [5]:
# Mediapipe 模型初始化
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# 相机对象初始化
cap = cv2.VideoCapture(0)

sequence = []
sequence_length = 30  # 确保这与训练时使用的序列长度相同

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [6]:
actions = np.array(['curl', 'press', 'squat'])


In [7]:


while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # 使用 Mediapipe 进行姿势估计
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = pose.process(frame_rgb)

    # 绘制姿势估计结果
    if results.pose_landmarks:
        mp.solutions.drawing_utils.draw_landmarks(
            frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)

    # 预测动作
    keypoints = extract_keypoints(results)
    sequence.append(keypoints)
    sequence = sequence[-sequence_length:]

    if len(sequence) == sequence_length:
        # 转换为适当的输入格式
        inputs = torch.tensor([sequence], dtype=torch.float32).to(device)
        with torch.no_grad():
            output = model(inputs)
            action_idx = torch.argmax(output, dim=1).item()
            action_name = actions[action_idx]  # 确保 actions 列表已定义

        # 在屏幕上显示预测的动作名称
        cv2.putText(frame, action_name, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

    # 显示图像
    cv2.imshow('OpenCV Feed', frame)

    # 退出循环
    if cv2.waitKey(10) & 0xFF == ord('q'):
        break

# 释放资源并关闭窗口
cap.release()
cv2.destroyAllWindows()

  inputs = torch.tensor([sequence], dtype=torch.float32).to(device)


In [8]:
cap.release()
cv2.destroyAllWindows()

: 