### 필요한 라이브러리 로드

In [1]:
from ultralytics import YOLO
import mediapipe as mp
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split
import cv2
import numpy as np
import os
import random
import platform
import yaml
from tqdm import tqdm
import gc
from torch.cuda import empty_cache

if platform.system() == "Darwin":
    print("your system is mac os")
    device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"
else:
    print("your system is cuda")
    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

E0000 00:00:1743765691.188555    6368 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743765691.197117    6368 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743765691.226496    6368 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743765691.226631    6368 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743765691.226635    6368 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743765691.226638    6368 computation_placer.cc:177] computation placer already registered. Please check linka

your system is cuda


### YOLO-POSE 모델 로드

In [2]:
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

yolo_best_model = '/home/pepsi/dev_ws/deeplearning-repo-2/src/video_ai_server/models/extin_per_fire.pt'
yolo_model = YOLO(yolo_best_model).to(device)

In [3]:
def get_YOLO_box(img, yolo_model, detect_cls):
    box_results = yolo_model.predict(img, conf=0.6, verbose=False, show=False)
    box_results = box_results[0].boxes

    boxes = box_results.xyxy.cpu().tolist()
    box_class = box_results.cls.cpu().tolist()

    p1x1, p1y1, p1x2, p1y2 = 0, 0, 0, 0
    p2x1, p2y1, p2x2, p2y2 = 0, 0, 0, 0
    for idx, cls in enumerate(box_class):
        if int(cls) == detect_cls:
            p1x1, p1y1, p1x2, p1y2 = boxes[0]
            p1x1, p1y1, p1x2, p1y2 = int(p1x1), int(p1y1), int(p1x2), int(p1y2)

            if len(boxes) > 1:
                p2x1, p2y1, p2x2, p2y2 = boxes[1]
                p2x1, p2y1, p2x2, p2y2 = int(p2x1), int(p2y1), int(p2x2), int(p2y2)

    return p1x1, p1y1, p1x2, p1y2, p2x1, p2y1, p2x2, p2y2

In [4]:
def get_pose_landmarks(results):
    xyz_list = []
    for landmark in results.pose_landmarks.landmark:
        xyz_list.append(landmark.x)
        xyz_list.append(landmark.y)
        xyz_list.append(landmark.z)
    return xyz_list 

In [5]:
def append_data_xyz_list_list(xyz_list_list, p1x1, p1y1, p1x2, p1y2, p2x1, p2y1, p2x2, p2y2, xyz_list):
    xyz_list.append(abs(p1x1 - p2x1) / 640)
    xyz_list.append(abs(p1x2 - p2x2) / 640)
    xyz_list.append(abs(p1y1 - p2y1) / 640)
    xyz_list.append(abs(p1y2 - p2y2) / 640)
    xyz_list_list.append(xyz_list)

    return xyz_list_list

In [6]:
def generate_dataset(mp_pose, video_path, detect_cls):
    xyz_list_list = []
    poses = mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5)

    cap = cv2.VideoCapture(video_path)

    if cap.isOpened():
        while True:
            ret, img = cap.read()
            if ret == True:
                xyz_list = []
                img = cv2.resize(img, (640, 640))
                results = poses.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

                if not results.pose_landmarks: continue

                xyz_list = get_pose_landmarks(results)
                p1x1, p1y1, p1x2, p1y2, p2x1, p2y1, p2x2, p2y2 = get_YOLO_box(img, yolo_model, detect_cls)

                if (p1x1 == 0 and p1y1 == 0 and p1x2 == 0 and p1y2== 0) and (p2x1 == 0 and p2y1 == 0 and p2x2 == 0 and p2y2== 0): continue

                xyz_list_list = append_data_xyz_list_list(xyz_list_list, p1x1, p1y1, p1x2, p1y2, p2x1, p2y1, p2x2, p2y2, xyz_list)

                cv2.waitKey(1)
            else:
                break

    cap.release()
    gc.collect()
    torch.cuda.empty_cache()
    
    return xyz_list_list

In [7]:
Video_path = '/home/pepsi/dev_ws/deeplearning-repo-2/src/video_ai_server/datasets/pose/train2'
video_name_list = os.listdir(Video_path)
dataset = []
length = 18
detect_cls = 1

for video_name in tqdm(video_name_list):
    if 'normal' in video_name: label = 0
    elif 'fighting' in video_name: label = 1
    elif 'lying' in video_name: label = 2
    elif 'smoking' in video_name: label = 3

    pose_data = generate_dataset(mp_pose, '{}/{}'.format(Video_path, video_name), detect_cls)

    for idx in range(0, len(pose_data), int(length)):
        seq_list = pose_data[idx : idx + length]
        if len(seq_list) == length:
            dataset.append({'key' : label, 'value': seq_list})

random.shuffle(dataset)

I0000 00:00:1743665825.956175   17200 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1743665825.960872   17507 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1743665826.074159   17497 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1743665826.147130   17495 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1743665826.176504   17502 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
  6%|▋         | 1/16 [00:32<08:00, 32.03s/it]I0000 00:00:1743665857.916477   17200 gl_context_egl

In [3]:
class MyDataset(Dataset):
    def __init__(self, seq_list):
        self.X = []
        self.y = []
        for dic in seq_list:
            self.y.append(dic['key'])
            self.X.append(dic['value'])

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        data = self.X[index]
        label = self.y[index]
        return torch.Tensor(np.array(data)), torch.tensor(np.array(int(label)))

In [9]:
split_ratio = [0.7, 0.2, 0.1]
train_len = int(len(dataset) * split_ratio[0])
val_len = int(len(dataset) * split_ratio[1])
test_len = len(dataset) - train_len - val_len

train_dataset = MyDataset(dataset)
train_data, valid_data, test_data = random_split(train_dataset, [train_len, val_len, test_len])

train_loader = DataLoader(train_data, batch_size=8)
val_loader = DataLoader(valid_data, batch_size=8)
test_loader = DataLoader(test_data, batch_size=8)

In [4]:
class LSTM(nn.Module):
    def __init__(self, num_layers=1):
        super(LSTM, self).__init__()
        self.lstm1 = nn.LSTM(103, 128, num_layers, batch_first=True, bidirectional=True)
        self.layer_norm1 = nn.LayerNorm(256)
        self.dropout1 = nn.Dropout(0.1)

        self.lstm2 = nn.LSTM(256, 64, num_layers, batch_first=True, bidirectional=True)
        self.layer_norm2 = nn.LayerNorm(128)
        self.dropout2 = nn.Dropout(0.1)

        self.lstm3 = nn.LSTM(128, 32, num_layers, batch_first=True, bidirectional=True)
        self.layer_norm3 = nn.LayerNorm(64)
        self.dropout3 = nn.Dropout(0.1)

        self.attention = nn.Linear(64, 1)
        self.fc = nn.Linear(64, 4)


    def forward(self, x):
        x, _ = self.lstm1(x)
        x = self.layer_norm1(x)
        x = self.dropout1(x)

        x, _ = self.lstm2(x)
        x = self.layer_norm2(x)
        x = self.dropout2(x)

        x, _ = self.lstm3(x)
        x = self.layer_norm3(x)
        x = self.dropout3(x)

        attention_weights = torch.softmax(self.attention(x), dim=1)
        x = torch.sum(attention_weights * x, dim=1)

        x = self.fc(x)
        return x

In [11]:
def init_model():
    global net, loss_fn, optim
    net = LSTM().to(device)
    loss_fn = nn.CrossEntropyLoss()
    optim = AdamW(net.parameters(), lr = 0.001)

def init_epoch():
    global epoch_cnt
    epoch_cnt = 0

def init_log():
    global iter_log, tloss_log, tacc_log, vloss_log, vacc_log, log_stack, time_log
    iter_log, tloss_log, tacc_log, vloss_log, vacc_log = [], [], [], [], []
    log_stack, time_log = [], []

def record_train_log(_tloss, _tacc, _time):
    time_log.append(_time)
    tloss_log.append(_tloss)
    tacc_log.append(_tacc)
    iter_log.append(epoch_cnt)

def record_valid_log(_vloss, _vacc):
    vloss_log.append(_vloss)
    vacc_log.append(_vacc)

def last(log_list):
    if len(log_list) > 0:
        return log_list[len(log_list) - 1]
    else:
        return -1
    
def print_log():
    train_loss = round(float(last(tloss_log)), 3)
    train_acc = round(float(last(tacc_log)), 3)
    val_loss = round(float(last(vloss_log)), 3)
    val_acc = round(float(last(vacc_log)), 3)
    time_spent = round(float(last(time_log)), 3)

    log_str = 'Epoch: {:3}| T_Loss {:5} | T_acc {:5}| V_Loss {:5}| V_acc {:5} | {:5}'.format(last(iter_log), train_loss, train_acc, val_loss, val_acc, time_spent)

    log_stack.append(log_str) #프린트 준비

    for idx in reversed(range(len(log_stack))):
        print(log_stack[idx])

def clear_memory():
    if device != 'cpu':
        empty_cache()
    gc.collect()
    
def epoch_not_finished():
    return epoch_cnt < maximum_epoch


def epoch(data_loader, mode='train'):
    global epoch_cnt

    iter_loss, iter_acc, last_grad_performed = [], [], False

    for _data, _label in data_loader:
        data, label = _data.to(device), _label.type(torch.LongTensor).to(device)

        if mode == 'train' : net.train()
        else: net.eval()

        result = net(data)
        _, out = torch.max(result, 1)

        loss = loss_fn(result, label)
        iter_loss.append(loss.item())

        if mode == 'train':
            optim.zero_grad()
            loss.backward()
            optim.step()
            last_grad_performed = True

        acc_partial = (out == label).float().sum()
        acc_partial = acc_partial / len(label)
        iter_acc.append(acc_partial.item())


    if last_grad_performed:
        epoch_cnt += 1

    clear_memory()

    return np.average(iter_loss), np.average(iter_acc)

In [12]:
import time

init_model()
init_epoch()
init_log()
maximum_epoch = 50

while epoch_not_finished():
    start_time = time.time()
    tloss, tacc = epoch(train_loader, mode='train')
    end_time = time.time()
    time_taken = end_time - start_time
    record_train_log(tloss, tacc, time_taken)
    with torch.no_grad():
        vloss, vacc = epoch(val_loader, mode= 'val')
        record_valid_log(vloss, vacc)
    print_log()

print('\n Training completed!')

Epoch:   1| T_Loss 1.123 | T_acc 0.511| V_Loss  0.64| V_acc 0.732 | 1.363
Epoch:   2| T_Loss 0.586 | T_acc 0.739| V_Loss 0.476| V_acc 0.786 | 2.517
Epoch:   1| T_Loss 1.123 | T_acc 0.511| V_Loss  0.64| V_acc 0.732 | 1.363
Epoch:   3| T_Loss 0.533 | T_acc 0.687| V_Loss 0.473| V_acc  0.75 | 0.834
Epoch:   2| T_Loss 0.586 | T_acc 0.739| V_Loss 0.476| V_acc 0.786 | 2.517
Epoch:   1| T_Loss 1.123 | T_acc 0.511| V_Loss  0.64| V_acc 0.732 | 1.363
Epoch:   4| T_Loss 0.458 | T_acc 0.732| V_Loss 0.394| V_acc 0.732 |  2.84
Epoch:   3| T_Loss 0.533 | T_acc 0.687| V_Loss 0.473| V_acc  0.75 | 0.834
Epoch:   2| T_Loss 0.586 | T_acc 0.739| V_Loss 0.476| V_acc 0.786 | 2.517
Epoch:   1| T_Loss 1.123 | T_acc 0.511| V_Loss  0.64| V_acc 0.732 | 1.363
Epoch:   5| T_Loss 0.367 | T_acc 0.793| V_Loss  0.34| V_acc 0.821 | 1.328
Epoch:   4| T_Loss 0.458 | T_acc 0.732| V_Loss 0.394| V_acc 0.732 |  2.84
Epoch:   3| T_Loss 0.533 | T_acc 0.687| V_Loss 0.473| V_acc  0.75 | 0.834
Epoch:   2| T_Loss 0.586 | T_acc 0.739

In [13]:
with torch.no_grad():
    test_loss, test_acc = epoch(test_loader, mode='test')
    test_acc = round(test_acc, 4)
    test_loss = round(test_loss, 4)
    print('Test Acc: {}'.format(test_acc))
    print('Test Loss: {}'.format(test_loss))

Test Acc: 1.0
Test Loss: 0.009


In [14]:
model_path = './models/lstm_model.pth'

torch.save(net.state_dict(), model_path)
print('모델이 저장되었습니다.')

모델이 저장되었습니다.


In [5]:
import socket

REC_IP = "192.168.0.85"
REC_PORT = 5001

rec_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
rec_socket.connect((REC_IP, REC_PORT))

In [8]:
model_path = './models/lstm_model.pth'

lstm_model = LSTM().to(device)
lstm_model.load_state_dict(torch.load(model_path, map_location=device))
lstm_model.eval()

length = 18
detect_cls = 1

lstm_model.eval()
dataset = []
status = 'None'
prev_action = None

mp_pose = mp.solutions.pose
poses = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

xyz_list_list = []
status_dict = {0: 'normal', 1: 'fighting', 2: 'lying', 3: 'smoking'}
rec_start_action = "REC_ON"
rec_end_action = "REC_OFF"


cap = cv2.VideoCapture(0)

if not cap.isOpened():
    RuntimeError("카메라 열기 실패")

while True:
    ret, frame = cap.read()
    if not ret:  # 프레임 읽기 실패 시 종료
        break

    # 프레임 크기 조정 (선택 사항, YOLO 모델에 따라 필요)
    frame = cv2.resize(frame, (640, 640))

    # Mediapipe 포즈 추출
    results = poses.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    xyz_list = []

    if results.pose_landmarks:

        # 포즈 랜드마크 추출 및 그리기
        for landmark in results.pose_landmarks.landmark:
            xyz_list.append(landmark.x)
            xyz_list.append(landmark.y)
            xyz_list.append(landmark.z)

        # YOLO 박스 예측
        box_results = yolo_model.predict(frame, conf=0.6, verbose=False, show=False)[0].boxes
        boxes = box_results.xyxy.cpu().tolist()
        box_class = box_results.cls.cpu().tolist()

        p1x1, p1y1, p1x2, p1y2 = 0, 0, 0, 0
        p2x1, p2y1, p2x2, p2y2 = 0, 0, 0, 0
        for idx, cls in enumerate(box_class):
            if int(cls) == detect_cls:
                p1x1, p1y1, p1x2, p1y2 = map(int, boxes[0])
                if len(boxes) > 1:
                    p2x1, p2y1, p2x2, p2y2 = map(int, boxes[1])

                break

        # YOLO 박스 좌표 정규화 후 추가
        xyz_list.extend([abs(p1x1 - p2x1) / 640, abs(p1x2 - p2x2) / 640, abs(p1y1 - p2y1) / 640, abs(p1y2 - p2y2) / 640])
        xyz_list_list.append(xyz_list)

    # 시퀀스 길이에 도달하면 LSTM 예측 수행
    if len(xyz_list_list) == length:
        dataset = [{'key': 0, 'value': xyz_list_list}]  # 임시 라벨 0 사용
        dataset = MyDataset(dataset)
        dataset_loader = DataLoader(dataset, batch_size=1)

        for data, _ in dataset_loader:
            data = data.to(device)
            with torch.no_grad():
                result = lstm_model(data)
                _, out = torch.max(result, 1)
                status = status_dict.get(out.item(), 'Unknown')

        xyz_list_list = []  # 시퀀스 초기화

    if status != "normal" and status != "None":
        if prev_action != rec_start_action:
            rec_socket.send(rec_start_action.encode("utf-8"))
            prev_action = rec_start_action
    else:
        if prev_action != rec_end_action:
            rec_socket.send(rec_end_action.encode("utf-8"))
            prev_action = rec_end_action

    # 상태 텍스트 표시
    cv2.putText(frame, status, (10, 50), cv2.FONT_HERSHEY_COMPLEX, 1.5, (255, 0, 0), 2)

    # 프레임 표시
    cv2.imshow('frame', frame)

    # 'q' 키로 종료
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# 리소스 해제
cap.release()
cv2.destroyAllWindows()

I0000 00:00:1743766246.185763    6368 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1743766246.187532    6831 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1743766246.261271    6818 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1743766246.327719    6819 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
