In [31]:
from collections import defaultdict, deque
import numpy as np
import torch
import cv2
from ultralytics import YOLO
import torch.nn as nn  # nn 모듈 import

# YOLO Pose 모델 로드
pose_model = YOLO("yolov8n-pose.pt")

# 학습된 클래스 이름
class_names = ["running", "walking", "sitting", "lying"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMPoseClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, dropout=0.5):
        super(LSTMPoseClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        hidden = hidden[-1]
        return self.fc(hidden)

# LSTM 모델 로드
lstm_model = LSTMPoseClassifier(input_dim=17 * 2, hidden_dim=128, output_dim=len(class_names)).to(device)
lstm_model.load_state_dict(torch.load("lstm_pose_classifier2.0.pth", map_location=device))
lstm_model.eval()

# 사람별 랜드마크 시퀀스 버퍼
sequence_buffers = defaultdict(lambda: deque(maxlen=96))  # person_id별 버퍼 관리

def extract_landmarks(frame, model):
    """
    YOLO Pose 모델로 랜드마크 추출
    Args:
        frame (np.array): 프레임 이미지
        model: YOLO Pose 모델
    Returns:
        list: 각 사람의 랜드마크 [(person_id, landmarks)]
    """
    results = model(frame)
    height, width, _ = frame.shape
    people_landmarks = []

    if len(results) > 0 and hasattr(results[0], 'keypoints'):
        keypoints = results[0].keypoints.xy.cpu().numpy()  # (N, 17, 2)
        for person_id, landmarks in enumerate(keypoints):
            keypoints_normalized = landmarks / [width, height]  # 정규화
            people_landmarks.append((person_id, keypoints_normalized))
    return people_landmarks

def predict_class(sequence, model):
    """
    LSTM 모델로 클래스 예측
    Args:
        sequence (deque): 랜드마크 시퀀스 (deque 형태)
        model (nn.Module): 학습된 LSTM 모델
    Returns:
        str: 예측된 클래스 이름
    """
    if len(sequence) < 96:
        return "Waiting for data..."

    # 시퀀스를 텐서로 변환
    input_tensor = torch.tensor(np.array(sequence), dtype=torch.float32).unsqueeze(0).to(device)
    input_tensor = input_tensor.view(input_tensor.size(0), input_tensor.size(1), -1)  # (batch_size, seq_len, 34)

    # 모델 예측
    with torch.no_grad():
        output = model(input_tensor)
        _, predicted = torch.max(output, 1)
    return class_names[predicted.item()]

# 웹캠 시작
cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        break

    # 프레임 크기 조정
    frame = cv2.resize(frame, (640, 480))

    # 랜드마크 추출
    people_landmarks = extract_landmarks(frame, pose_model)

    # 각 사람별 버퍼에 랜드마크 추가
    for person_id, landmarks in people_landmarks:
        sequence_buffers[person_id].append(landmarks)

    # 각 사람별로 클래스 예측
    for person_id, buffer in sequence_buffers.items():
        predicted_class = predict_class(buffer, lstm_model)
        cv2.putText(
            frame,
            f"Person {person_id}: {predicted_class}",
            (10, 30 + person_id * 30),  # 사람마다 다른 위치에 표시
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (0, 255, 0),
            2,
        )

    # 화면에 출력
    cv2.imshow("Real-Time Pose Classification", frame)

    # 'q' 키를 누르면 종료
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# 종료
cap.release()
cv2.destroyAllWindows()



0: 480x640 1 person, 35.9ms
Speed: 3.2ms preprocess, 35.9ms inference, 125.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.1ms
Speed: 1.6ms preprocess, 6.1ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 10.2ms
Speed: 1.8ms preprocess, 10.2ms inference, 2.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 9.0ms
Speed: 2.9ms preprocess, 9.0ms inference, 1.8ms postprocess per image at shape (1, 3, 480, 640)



qt.qpa.plugin: Could not find the Qt platform plugin "wayland" in "/home/hyun/venv/torch_venv/lib/python3.12/site-packages/cv2/qt/plugins"


0: 480x640 1 person, 7.4ms
Speed: 1.3ms preprocess, 7.4ms inference, 1.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 20.7ms
Speed: 2.5ms preprocess, 20.7ms inference, 1.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 14.4ms
Speed: 2.0ms preprocess, 14.4ms inference, 2.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.0ms
Speed: 1.0ms preprocess, 7.0ms inference, 2.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 11.1ms
Speed: 3.0ms preprocess, 11.1ms inference, 2.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 9.3ms
Speed: 3.0ms preprocess, 9.3ms inference, 2.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.3ms
Speed: 1.2ms preprocess, 7.3ms inference, 2.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.5ms
Speed: 1.2ms preprocess, 6.5ms inference, 1.7ms postprocess per image at shape (1, 3, 480, 640)

0:

In [32]:
from collections import defaultdict, deque
import numpy as np
import torch
import cv2
from ultralytics import YOLO
import torch.nn as nn  # nn 모듈 import

# YOLO Pose 모델 로드
pose_model = YOLO("yolov8n-pose.pt")

# 학습된 클래스 이름
class_names = ["running", "walking", "sitting", "lying"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMPoseClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, dropout=0.5):
        super(LSTMPoseClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        hidden = hidden[-1]
        return self.fc(hidden)

# LSTM 모델 로드
lstm_model = LSTMPoseClassifier(input_dim=17 * 2, hidden_dim=128, output_dim=len(class_names)).to(device)
lstm_model.load_state_dict(torch.load("lstm_pose_classifier2.0.pth", map_location=device))
lstm_model.eval()

# 사람별 랜드마크 시퀀스 버퍼
sequence_buffers = defaultdict(lambda: deque(maxlen=96))  # person_id별 버퍼 관리

def extract_landmarks_and_boxes(frame, model):
    """
    YOLO Pose 모델로 랜드마크와 바운딩 박스 추출
    Args:
        frame (np.array): 프레임 이미지
        model: YOLO Pose 모델
    Returns:
        list: 각 사람의 [(person_id, landmarks, box)]
    """
    results = model(frame)
    height, width, _ = frame.shape
    people_data = []

    if len(results) > 0 and hasattr(results[0], 'keypoints'):
        keypoints = results[0].keypoints.xy.cpu().numpy()  # (N, 17, 2)
        boxes = results[0].boxes.xyxy.cpu().numpy()  # (N, 4) 바운딩 박스 좌표
        for person_id, (landmarks, box) in enumerate(zip(keypoints, boxes)):
            # 랜드마크 정규화
            keypoints_normalized = landmarks / [width, height]
            people_data.append((person_id, keypoints_normalized, box))
    return people_data

def predict_class(sequence, model):
    """
    LSTM 모델로 클래스 예측
    Args:
        sequence (deque): 랜드마크 시퀀스 (deque 형태)
        model (nn.Module): 학습된 LSTM 모델
    Returns:
        str: 예측된 클래스 이름
    """
    if len(sequence) < 96:
        return "Waiting for data..."

    # 시퀀스를 텐서로 변환
    input_tensor = torch.tensor(np.array(sequence), dtype=torch.float32).unsqueeze(0).to(device)
    input_tensor = input_tensor.view(input_tensor.size(0), input_tensor.size(1), -1)  # (batch_size, seq_len, 34)

    # 모델 예측
    with torch.no_grad():
        output = model(input_tensor)
        _, predicted = torch.max(output, 1)
    return class_names[predicted.item()]

# 웹캠 시작
cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        break

    # 프레임 크기 조정
    frame = cv2.resize(frame, (640, 480))

    # 랜드마크와 박스 추출
    people_data = extract_landmarks_and_boxes(frame, pose_model)

    # 각 사람별 데이터 처리
    for person_id, landmarks, box in people_data:
        # 랜드마크 추가
        sequence_buffers[person_id].append(landmarks)

        # 클래스 예측
        predicted_class = predict_class(sequence_buffers[person_id], lstm_model)

        # 바운딩 박스 그리기
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)  # 파란색 박스

        # 박스 위에 클래스 표시
        cv2.putText(
            frame,
            f"{predicted_class}",
            (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            (255, 0, 0),
            2,
        )

    # 화면 출력
    cv2.imshow("Real-Time Pose Classification", frame)

    # 'q' 키를 누르면 종료
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# 종료
cap.release()
cv2.destroyAllWindows()



0: 480x640 1 person, 9.2ms
Speed: 0.8ms preprocess, 9.2ms inference, 1.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 9.3ms
Speed: 3.3ms preprocess, 9.3ms inference, 1.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.7ms
Speed: 1.4ms preprocess, 6.7ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.5ms
Speed: 1.2ms preprocess, 7.5ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 9.4ms
Speed: 1.9ms preprocess, 9.4ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 9.7ms
Speed: 2.9ms preprocess, 9.7ms inference, 2.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.7ms
Speed: 2.6ms preprocess, 7.7ms inference, 1.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 8.4ms
Speed: 1.5ms preprocess, 8.4ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x

In [33]:
from collections import defaultdict, deque
import numpy as np
import torch
import cv2
from ultralytics import YOLO
import torch.nn as nn

# YOLO 모델 로드
pose_model = YOLO("yolov8n-pose.pt")  # YOLO Pose 모델
object_model = YOLO("yolov8n.pt")  # YOLO 기본 객체 탐지 모델

# 학습된 클래스 이름
class_names = ["running", "walking", "sitting", "lying"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMPoseClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, dropout=0.5):
        super(LSTMPoseClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        hidden = hidden[-1]
        return self.fc(hidden)

# LSTM 모델 로드
lstm_model = LSTMPoseClassifier(input_dim=17 * 2, hidden_dim=128, output_dim=len(class_names)).to(device)
lstm_model.load_state_dict(torch.load("lstm_pose_classifier2.0.pth", map_location=device))
lstm_model.eval()

# 사람별 랜드마크 시퀀스 버퍼
sequence_buffers = defaultdict(lambda: deque(maxlen=96))

def extract_landmarks_and_boxes(frame, model):
    """
    YOLO Pose 모델로 랜드마크와 바운딩 박스 추출
    Args:
        frame (np.array): 프레임 이미지
        model: YOLO Pose 모델
    Returns:
        list: 각 사람의 [(person_id, landmarks, box)]
    """
    results = model(frame)
    height, width, _ = frame.shape
    people_data = []

    if len(results) > 0 and hasattr(results[0], 'keypoints'):
        keypoints = results[0].keypoints.xy.cpu().numpy()  # (N, 17, 2)
        boxes = results[0].boxes.xyxy.cpu().numpy()  # (N, 4)
        for person_id, (landmarks, box) in enumerate(zip(keypoints, boxes)):
            keypoints_normalized = landmarks / [width, height]
            people_data.append((person_id, keypoints_normalized, box))
    return people_data

def detect_objects(frame, model):
    """
    YOLO 기본 모델로 객체 탐지
    Args:
        frame (np.array): 프레임 이미지
        model: YOLO 모델
    Returns:
        list: [(class_name, confidence, box)]
    """
    results = model(frame)
    detections = []

    if len(results) > 0:
        for box, conf, cls in zip(results[0].boxes.xyxy.cpu().numpy(),
                                  results[0].boxes.conf.cpu().numpy(),
                                  results[0].boxes.cls.cpu().numpy()):
            class_name = model.names[int(cls)]  # 클래스 이름
            detections.append((class_name, conf, box))
    return detections

def predict_class(sequence, model):
    """
    LSTM 모델로 클래스 예측
    Args:
        sequence (deque): 랜드마크 시퀀스 (deque 형태)
        model (nn.Module): 학습된 LSTM 모델
    Returns:
        str: 예측된 클래스 이름
    """
    if len(sequence) < 96:
        return "Waiting for data..."

    input_tensor = torch.tensor(np.array(sequence), dtype=torch.float32).unsqueeze(0).to(device)
    input_tensor = input_tensor.view(input_tensor.size(0), input_tensor.size(1), -1)

    with torch.no_grad():
        output = model(input_tensor)
        _, predicted = torch.max(output, 1)
    return class_names[predicted.item()]

# 웹캠 시작
cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        break

    # 프레임 크기 조정
    frame = cv2.resize(frame, (640, 480))

    # YOLO Pose로 랜드마크와 박스 추출
    people_data = extract_landmarks_and_boxes(frame, pose_model)

    # YOLO Object Detection으로 객체 탐지
    object_detections = detect_objects(frame, object_model)

    # 사람별 데이터 처리
    for person_id, landmarks, box in people_data:
        sequence_buffers[person_id].append(landmarks)
        predicted_class = predict_class(sequence_buffers[person_id], lstm_model)

        # 사람 박스와 클래스 표시
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
        cv2.putText(
            frame,
            f"{predicted_class}",
            (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            (255, 0, 0),
            2,
        )

    # 객체 탐지 결과 표시
    for class_name, conf, box in object_detections:
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(
            frame,
            f"{class_name} ({conf:.2f})",
            (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            (0, 255, 0),
            2,
        )

    # 화면 출력
    cv2.imshow("YOLO Pose + Object Detection", frame)

    # 'q' 키를 누르면 종료
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# 종료
cap.release()
cv2.destroyAllWindows()


Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:01<00:00, 4.15MB/s]



0: 480x640 1 person, 7.6ms
Speed: 3.4ms preprocess, 7.6ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 tv, 19.0ms
Speed: 2.6ms preprocess, 19.0ms inference, 1.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.5ms
Speed: 1.6ms preprocess, 6.5ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 tv, 6.9ms
Speed: 1.1ms preprocess, 6.9ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 8.3ms
Speed: 3.4ms preprocess, 8.3ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 tv, 11.8ms
Speed: 1.6ms preprocess, 11.8ms inference, 2.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 10.5ms
Speed: 2.8ms preprocess, 10.5ms inference, 2.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 tv, 10.0ms
Speed: 2.1ms preprocess, 10.0ms inference, 1.4ms postprocess per image at

In [1]:
from collections import defaultdict, deque
import numpy as np
import torch
import cv2
from ultralytics import YOLO
import torch.nn as nn

# YOLO 모델 로드
pose_model = YOLO("yolov8n-pose.pt")  # YOLO Pose 모델
object_model = YOLO("yolov8n.pt")  # YOLO 기본 객체 탐지 모델

# 학습된 클래스 이름
class_names = ["running", "walking", "sitting", "lying"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMPoseClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, dropout=0.5):
        super(LSTMPoseClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        hidden = hidden[-1]
        return self.fc(hidden)

# LSTM 모델 로드
lstm_model = LSTMPoseClassifier(input_dim=17 * 2, hidden_dim=128, output_dim=len(class_names)).to(device)
lstm_model.load_state_dict(torch.load("lstm_pose_classifier2.0.pth", map_location=device))
lstm_model.eval()

# 사람별 랜드마크 시퀀스 버퍼
sequence_buffers = defaultdict(lambda: deque(maxlen=96))

def extract_landmarks_and_boxes(frame, model):
    """
    YOLO Pose 모델로 랜드마크와 바운딩 박스 추출
    Args:
        frame (np.array): 프레임 이미지
        model: YOLO Pose 모델
    Returns:
        list: 각 사람의 [(person_id, landmarks, box)]
    """
    results = model(frame)
    height, width, _ = frame.shape
    people_data = []

    if len(results) > 0 and hasattr(results[0], 'keypoints'):
        keypoints = results[0].keypoints.xy.cpu().numpy()  # (N, 17, 2)
        boxes = results[0].boxes.xyxy.cpu().numpy()  # (N, 4)
        for person_id, (landmarks, box) in enumerate(zip(keypoints, boxes)):
            keypoints_normalized = landmarks / [width, height]
            people_data.append((person_id, keypoints_normalized, box))
    return people_data

def detect_objects(frame, model):
    """
    YOLO 기본 모델로 사람 탐지 (사람만 필터링)
    Args:
        frame (np.array): 프레임 이미지
        model: YOLO 모델
    Returns:
        list: [(class_name, confidence, box)]
    """
    results = model(frame)
    detections = []

    if len(results) > 0:
        for box, conf, cls in zip(results[0].boxes.xyxy.cpu().numpy(),
                                  results[0].boxes.conf.cpu().numpy(),
                                  results[0].boxes.cls.cpu().numpy()):
            class_name = model.names[int(cls)]  # 클래스 이름
            if class_name == "person":  # 사람만 필터링
                detections.append((class_name, conf, box))
    return detections

def predict_class(sequence, model):
    """
    LSTM 모델로 클래스 예측
    Args:
        sequence (deque): 랜드마크 시퀀스 (deque 형태)
        model (nn.Module): 학습된 LSTM 모델
    Returns:
        str: 예측된 클래스 이름
    """
    if len(sequence) < 96:
        return "Waiting for data..."

    input_tensor = torch.tensor(np.array(sequence), dtype=torch.float32).unsqueeze(0).to(device)
    input_tensor = input_tensor.view(input_tensor.size(0), input_tensor.size(1), -1)

    with torch.no_grad():
        output = model(input_tensor)
        _, predicted = torch.max(output, 1)
    return class_names[predicted.item()]

def compute_iou(box1, box2):
    """
    바운딩 박스 간 IoU 계산
    Args:
        box1, box2: [x1, y1, x2, y2] 형태의 바운딩 박스
    Returns:
        float: IoU 값
    """
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    inter_area = max(0, x2 - x1) * max(0, y2 - y1)
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union_area = box1_area + box2_area - inter_area

    return inter_area / union_area if union_area > 0 else 0

# 웹캠 시작
cap = cv2.VideoCapture(0)

# 메인 루프 수정
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        break

    # 프레임 크기 조정
    frame = cv2.resize(frame, (640, 480))

    # YOLO Pose로 랜드마크와 박스 추출
    people_data = extract_landmarks_and_boxes(frame, pose_model)

    # YOLO Object Detection으로 사람 탐지
    object_detections = detect_objects(frame, object_model)

    # Pose에서 감지된 박스와 Object Detection에서 감지된 박스 비교
    used_object_indices = set()  # 이미 처리된 Object Detection 인덱스 저장
    for person_id, landmarks, pose_box in people_data:
        sequence_buffers[person_id].append(landmarks)
        predicted_class = predict_class(sequence_buffers[person_id], lstm_model)

        # 사람 박스와 포즈 클래스 표시 (YOLO Pose)
        x1, y1, x2, y2 = map(int, pose_box)

        # Object Detection 결과와 중복 체크
        confidence = None  # 해당 박스에 대응하는 신뢰도 저장
        detected_class = None  # Object Detection의 클래스 이름 저장
        for i, (class_name, conf, object_box) in enumerate(object_detections):
            if i in used_object_indices:
                continue  # 이미 처리된 박스는 스킵
            iou = compute_iou(pose_box, object_box)
            if iou > 0.5 and class_name == "person":
                used_object_indices.add(i)  # 중복된 박스 기록
                confidence = conf  # "person"의 신뢰도 저장
                detected_class = class_name  # 감지된 클래스 저장
                break

        # 박스와 텍스트 출력
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)  # 파란색 박스
        label = f"Pose: {predicted_class}"
        if detected_class is not None and confidence is not None:
            label += f" | Detected: {detected_class} ({confidence:.2f})"  # Detected Class 추가
        cv2.putText(
            frame,
            label,
            (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            (255, 0, 0),
            2,
        )

    # 중복되지 않은 Object Detection 결과 표시
    for i, (class_name, conf, object_box) in enumerate(object_detections):
        if i in used_object_indices or class_name != "person":
            continue  # 중복된 박스 또는 사람이 아닌 경우 스킵

        x1, y1, x2, y2 = map(int, object_box)
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)  # 초록색 박스
        cv2.putText(
            frame,
            f"Detected: {class_name} ({conf:.2f})",
            (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            (0, 255, 0),
            2,
        )

    # 화면 출력
    cv2.imshow("YOLO Pose + Object Detection (Unified)", frame)

    # 'q' 키를 누르면 종료
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# 종료
cap.release()
cv2.destroyAllWindows()



0: 480x640 1 person, 66.2ms
Speed: 6.0ms preprocess, 66.2ms inference, 217.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 26.0ms
Speed: 3.8ms preprocess, 26.0ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 12.0ms
Speed: 3.4ms preprocess, 12.0ms inference, 2.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 13.2ms
Speed: 3.4ms preprocess, 13.2ms inference, 4.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 19.6ms
Speed: 5.4ms preprocess, 19.6ms inference, 3.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 21.0ms
Speed: 6.3ms preprocess, 21.0ms inference, 4.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 26.2ms
Speed: 7.4ms preprocess, 26.2ms inference, 3.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 20.6ms
Speed: 3.2ms preprocess, 20.6ms inference, 3.4ms postprocess per image at shape (1, 3, 