In [6]:
import time
import numpy as np
import mediapipe as mp
import torch
from torchvision.transforms import functional as F
import cv2
from ultralytics import YOLO
from src.utils import intersection_over_union
from src.elements import face_detect, head_Pose
import pyttsx3
import threading
engine = pyttsx3.init()

def speak(text):
    engine.say(text)
    engine.runAndWait()

# Check if CUDA is available and set the device accordingly
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# load Detection model & facemesh
coco128 = open('./coco128.txt', 'r')
data = coco128.read()
class_list = data.split('\n')
coco128.close()

model = YOLO('./yolov8s.pt')
model.to(device)

mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# camera setting 
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# hyperparameter setting
CONFIDENCE_THRESHOLD = 0.8    # 최소 정확도 이하의 객체는 화면에 출력하지 않음
iou_threshold = 0.8           # 유사도 IoU 기준
stable_threshold_time = 1.5   # 움직임 안정 여부 시간(s) 기준
pct_threshold = 10            # 얼굴 비중 안내 기준

# constants
GREEN = (0, 255, 0)       
WHITE = (255, 255, 255)   
BLUE = (0, 0, 255)

last_change_time = time.time()
last_bbox = None
max_area = 0
max_box = None

original_dim = (1280, 720)
resize_dim = (640, 640)
# Scale factors for coordinates
x_scale = original_dim[0] / resize_dim[0]
y_scale = original_dim[1] / resize_dim[1]

# 말을 한번만 하기 위해서 상태를 나타내는 변수 도입 -> 1일 때만 말해줄거야
state_loc_variable = 1

# DETECTION
while True:
    start = time.time()
    success, frame = cap.read()
    if not success:
        print('Cam Error')
        break

    # 요소 3 : head pose estimation
    #headpose = head_Pose(image=frame, face_mesh=face_mesh)
    # 좌우반전
    frame = cv2.flip(frame, 1)
    ##### TODO : 요소3 안내 조건 조정 및 추가 #####
    #cv2.putText(frame, headpose, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.5, GREEN, 2)
    
    # resize and move the frame to the same device as the model
    frame_resized = cv2.resize(frame, (640, 640))
    frame_tensor = F.to_tensor(frame_resized).unsqueeze(0).to(device)

    detection = model.predict(frame,conf=CONFIDENCE_THRESHOLD)[0]
    for data in detection.boxes.data.tolist(): 
        # data : [xmin, ymin, xmax, ymax, confidence_score, class_id]
        xmin, ymin, xmax, ymax, conf, label = int(data[0]), int(data[1]), int(data[2]), int(data[3]), float(data[4]), int(data[5])
        xmin = int(xmin * x_scale)
        ymin = int(ymin * y_scale)
        xmax = int(xmax * x_scale)
        ymax = int(ymax * y_scale)

        if label!=0:
             continue
        
        # 가장 큰 사이즈의 박스만 pass
        area = (xmax-xmin)*(ymax-ymin)
        if area > max_area:
             max_area = area
             max_box = [xmin, ymin, xmax, ymax, conf, label]
    
    if max_box:
        # bbox 표시
        new_bbox = max_box[:3]
        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), GREEN, 2)
        cv2.putText(frame, class_list[label]+' '+str(round(conf, 2)), (xmin, ymin-10), cv2.FONT_ITALIC, 1, WHITE, 2)

        if last_bbox is not None:
            iou = intersection_over_union(last_bbox, new_bbox)
            if iou < iou_threshold:
                 last_change_time = time.time()
                 state_loc_variable = 1

            elif (time.time() - last_change_time) > stable_threshold_time and state_loc_variable == 1:

                # 요소 1 & 2 탐지 시작
                pct, loc = face_detect(xmin,ymin,xmax,ymax,frame)
                # (요소1)
                if pct > pct_threshold: # TODO: 요소1 조건 조정
                    pct_text = f"object percentage: {pct}%"
                    cv2.putText(frame, pct_text, (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.5, GREEN, 2)
                # (요소2)
                loc_text = f"object location : {loc}"
                cv2.putText(frame, loc_text, (20, 150), cv2.FONT_HERSHEY_SIMPLEX, 0.5, GREEN, 2)
                # (TTS)
                threading.Thread(target=speak, args=(f"현재 얼굴 비중은 {pct}%이고 얼굴 위치는 {loc}이다.",)).start()

                # 안내 완료 후 타이머 리셋
                last_change_time = time.time()
                # state_loc_variable 리셋
                state_loc_variable = 0 
        
        last_bbox = new_bbox

    max_area = 0
    max_box = None

    # fps 계산
    end = time.time()
    totalTime = (end - start)
    fps = f'FPS: {1 / totalTime:.2f}'
    print(f'Time to process 1 frame: {totalTime * 1000:.0f} milliseconds')
    print(fps)
        
    cv2.putText(frame, fps, (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, BLUE, 2) # fps 표시
    cv2.imshow('frame', frame)
        
    if cv2.waitKey(1) == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()

Using device: cpu

0: 384x640 1 person, 268.8ms
Speed: 4.2ms preprocess, 268.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)
Time to process 1 frame: 769 milliseconds
FPS: 1.30

0: 384x640 1 person, 269.3ms
Speed: 2.0ms preprocess, 269.3ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)


IndexError: list index out of range

In [5]:
model = YOLO('./best.pt')
model

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(96, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_s

In [7]:
import requests
from PIL import Image
from ultralytics import YOLO
import cv2
import numpy as np

# load model
model = YOLO('./best.pt')

# perform inference
results = model('./selfie.jpg')

# Convert the PIL segmentation mask to numpy array
segmentation_array = results[0].plot()

# Display the segmentation mask using OpenCV
cv2.imshow('YOLO Segmentation', segmentation_array)
cv2.waitKey(0)
cv2.destroyAllWindows()


image 1/1 c:\Users\happy\Desktop\camera_ObjectDetection\\YOLOv8\selfie.jpg: 448x640 2 FACEs, 342.3ms
Speed: 3.6ms preprocess, 342.3ms inference, 1.8ms postprocess per image at shape (1, 3, 448, 640)


In [14]:
detection.boxes

ultralytics.engine.results.Boxes object with attributes:

cls: tensor([0., 0.])
conf: tensor([0.8811, 0.8806])
data: tensor([[516.8550, 138.4368, 655.0471, 337.2710,   0.8811,   0.0000],
        [407.4715, 108.2561, 510.6542, 241.8674,   0.8806,   0.0000]])
id: None
is_track: False
orig_shape: (667, 1000)
shape: torch.Size([2, 6])
xywh: tensor([[585.9510, 237.8539, 138.1921, 198.8342],
        [459.0628, 175.0618, 103.1827, 133.6113]])
xywhn: tensor([[0.5860, 0.3566, 0.1382, 0.2981],
        [0.4591, 0.2625, 0.1032, 0.2003]])
xyxy: tensor([[516.8550, 138.4368, 655.0471, 337.2710],
        [407.4715, 108.2561, 510.6542, 241.8674]])
xyxyn: tensor([[0.5169, 0.2076, 0.6550, 0.5057],
        [0.4075, 0.1623, 0.5107, 0.3626]])

In [15]:
detection = model.predict('./selfie.jpg', conf = 0.5)[0]
for data in detection.boxes.data.tolist(): 
    # data : [xmin, ymin, xmax, ymax, confidence_score, class_id]
    xmin, ymin, xmax, ymax, conf, label = int(data[0]), int(data[1]), int(data[2]), int(data[3]), float(data[4]), int(data[5])
    print(label)
        



image 1/1 c:\Users\happy\Desktop\camera_ObjectDetection\\YOLOv8\selfie.jpg: 448x640 2 FACEs, 378.4ms
Speed: 3.0ms preprocess, 378.4ms inference, 1.0ms postprocess per image at shape (1, 3, 448, 640)
0
0


In [11]:
detection = model.predict(frame,conf=CONFIDENCE_THRESHOLD)[0]
    for data in detection.boxes.data.tolist(): 
        # data : [xmin, ymin, xmax, ymax, confidence_score, class_id]
        xmin, ymin, xmax, ymax, conf, label = int(data[0]), int(data[1]), int(data[2]), int(data[3]), float(data[4]), int(data[5])
        xmin = int(xmin * x_scale)
        ymin = int(ymin * y_scale)
        xmax = int(xmax * x_scale)
        ymax = int(ymax * y_scale)

        if label!=0:
             continue
        
        # 가장 큰 사이즈의 박스만 pass
        area = (xmax-xmin)*(ymax-ymin)
        if area > max_area:
             max_area = area
             max_box = [xmin, ymin, xmax, ymax, conf, label]
    

ultralytics.engine.results.Results object with attributes:

boxes: ultralytics.engine.results.Boxes object
keypoints: None
masks: None
names: {0: 'FACE'}
obb: None
orig_img: array([[[251, 246, 243],
        [251, 246, 243],
        [251, 246, 243],
        ...,
        [252, 254, 254],
        [252, 254, 254],
        [252, 254, 254]],

       [[251, 246, 243],
        [251, 246, 243],
        [251, 246, 243],
        ...,
        [252, 254, 254],
        [252, 254, 254],
        [252, 254, 254]],

       [[251, 246, 243],
        [251, 246, 243],
        [251, 246, 243],
        ...,
        [252, 254, 254],
        [252, 254, 254],
        [252, 254, 254]],

       ...,

       [[130, 130, 130],
        [130, 128, 134],
        [127, 129, 130],
        ...,
        [ 91, 120, 187],
        [ 91, 123, 189],
        [ 90, 122, 187]],

       [[128, 128, 128],
        [129, 127, 133],
        [126, 128, 129],
        ...,
        [ 89, 118, 185],
        [ 88, 120, 186],
        [ 88, 1

In [2]:
import time
import numpy as np
import mediapipe as mp
import cv2
from ultralytics import YOLO
from src.utils import intersection_over_union
from src.elements import face_detect, head_Pose
import torch
from torchvision.transforms import functional as F

from gtts import gTTS
import pygame
import tempfile
import threading
import os

import pyttsx3
import threading
engine = pyttsx3.init()

def speak(text):
    engine.say(text)
    engine.runAndWait()

# Check if CUDA is available and set the device accordingly
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load Detection model & facemesh
# coco128 = open('camera_ObjectDetection\yolov8\coco128.txt', 'r')
# data = coco128.read()
# class_list = data.split('\n')
# coco128.close()
class_list = ['FACE']

model = YOLO('./best.pt')
model.to(device)  # Use the device set above

mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Camera setting
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# Hyperparameter setting
CONFIDENCE_THRESHOLD = 0.5
iou_threshold = 0.7
stable_threshold_time = 1.7
pct_threshold = 10

# Constants
GREEN = (0, 255, 0)
WHITE = (255, 255, 255)
BLUE = (0, 0, 255)

last_change_time = time.time()
last_bbox = None
max_area = 0
max_box = None

original_dim = (1280, 720)
resize_dim = (640, 640)
# Scale factors for coordinates
x_scale = original_dim[0] / resize_dim[0]
y_scale = original_dim[1] / resize_dim[1]

# 말을 한번만 하기 위해서 상태를 나타내는 변수 도입 -> 1일 때만 말해줄거야
state_loc_variable = 1

# Detection loop
while True:
    start = time.time()
    success, frame = cap.read()
    
    if not success:
        print('Cam Error')
        break
    
    # Head pose estimation
    headpose = head_Pose(image=frame, face_mesh=face_mesh)
    # Left/Right Inversion
    frame = cv2.flip(frame, 1)
    cv2.putText(frame, headpose, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.5, GREEN, 2)
    
    # Convert resized frame to tensor and move to the same device as the model
    frame_resized = cv2.resize(frame, (640, 640))
    frame_tensor = F.to_tensor(frame_resized).unsqueeze(0).to(device)

    # Use the model for prediction
    detection = model.predict(frame_tensor, conf=CONFIDENCE_THRESHOLD)[0].cpu()

    # 모든 박스 검사 -> 최대 박스 선정
    for data in detection.boxes.data.tolist():
        xmin, ymin, xmax, ymax, conf, label = int(data[0]), int(data[1]), int(data[2]), int(data[3]), float(data[4]), int(data[5])
        xmin = int(xmin * x_scale)
        ymin = int(ymin * y_scale)
        xmax = int(xmax * x_scale)
        ymax = int(ymax * y_scale)

        # Calculate area of the box
        area = (xmax - xmin) * (ymax - ymin)

        # Check if this box is bigger than the previously found ones
        if area > max_area and label == 0:
            max_area = area
            max_box = [xmin, ymin, xmax, ymax, conf, label]

    # 조건을 충족하는 가장 큰 박스에서 작업 수행
    if max_box:
        xmin, ymin, xmax, ymax, conf, label = max_box
        new_bbox = [xmin, ymin, xmax, ymax]
        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)  # GREEN
        cv2.putText(frame, f"{class_list[label]} {round(conf, 2)}", (xmin, ymin - 10), cv2.FONT_ITALIC, 1, (255, 255, 255), 2)  # WHITE

        if last_bbox is not None:
            iou = intersection_over_union(last_bbox, new_bbox)
            if iou < iou_threshold:
                last_change_time = time.time()
                state_loc_variable = 1
                # voice_assistant.shutdown()
            elif (time.time() - last_change_time) > stable_threshold_time and state_loc_variable == 1:
                pct, loc = face_detect(xmin, ymin, xmax, ymax, frame)
                #if pct > pct_threshold:
                text = f"현재 얼굴 비중은 {pct}%이고 얼굴 위치는 {loc}입니다."
                speak(text)
                last_change_time = time.time()
                state_loc_variable = 0

        last_bbox = new_bbox
    
    max_area = 0
    max_box = None
    
    # FPS calculation
    end = time.time()
    totalTime = end - start
    fps = f'FPS: {1 / totalTime:.2f}'
    print(f'Time to process 1 frame: {totalTime * 1000:.0f} milliseconds')
    print(fps)

    cv2.putText(frame, fps, (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, BLUE, 2)
    cv2.imshow('frame', frame)

    if cv2.waitKey(1) == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

Using device: cpu

0: 640x640 (no detections), 489.8ms
Speed: 0.0ms preprocess, 489.8ms inference, 6.5ms postprocess per image at shape (1, 3, 640, 640)
Time to process 1 frame: 1203 milliseconds
FPS: 0.83

0: 640x640 1 FACE, 535.2ms
Speed: 0.0ms preprocess, 535.2ms inference, 5.9ms postprocess per image at shape (1, 3, 640, 640)
Time to process 1 frame: 598 milliseconds
FPS: 1.67

0: 640x640 1 FACE, 462.1ms
Speed: 0.0ms preprocess, 462.1ms inference, 5.5ms postprocess per image at shape (1, 3, 640, 640)
Time to process 1 frame: 507 milliseconds
FPS: 1.97

0: 640x640 1 FACE, 471.7ms
Speed: 0.0ms preprocess, 471.7ms inference, 5.7ms postprocess per image at shape (1, 3, 640, 640)
Time to process 1 frame: 518 milliseconds
FPS: 1.93

0: 640x640 1 FACE, 437.0ms
Speed: 0.0ms preprocess, 437.0ms inference, 6.1ms postprocess per image at shape (1, 3, 640, 640)
Time to process 1 frame: 481 milliseconds
FPS: 2.08

0: 640x640 1 FACE, 445.5ms
Speed: 0.0ms preprocess, 445.5ms inference, 6.7ms post

In [18]:
head_Pose('./selfie.jpg',face_mesh=face_mesh)

error: OpenCV(4.9.0) :-1: error: (-5:Bad argument) in function 'flip'
> Overload resolution failed:
>  - src is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'src'


In [1]:
import time
import numpy as np
import mediapipe as mp
import cv2
from ultralytics import YOLO
from src.utils import intersection_over_union
from src.elements import face_detect, head_Pose
import torch
from torchvision.transforms import functional as F

from gtts import gTTS
import pygame
import tempfile
import threading
import os

import pyttsx3
import threading
engine = pyttsx3.init()

def speak(text):
    engine.say(text)
    engine.runAndWait()

# Check if CUDA is available and set the device accordingly
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load Detection model & facemesh
# coco128 = open('camera_ObjectDetection\yolov8\coco128.txt', 'r')
# data = coco128.read()
# class_list = data.split('\n')
# coco128.close()
class_list = ['FACE']

model = YOLO('./best.pt')
model.to(device)  # Use the device set above

mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Camera setting
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# Hyperparameter setting
CONFIDENCE_THRESHOLD = 0.5
iou_threshold = 0.7
stable_threshold_time = 1.7
pct_threshold = 10

# Constants
GREEN = (0, 255, 0)
WHITE = (255, 255, 255)
BLUE = (0, 0, 255)

last_change_time = time.time()
last_bbox = None
max_area = 0
max_box = None

original_dim = (1280, 720)
resize_dim = (640, 640)
# Scale factors for coordinates
x_scale = original_dim[0] / resize_dim[0]
y_scale = original_dim[1] / resize_dim[1]

# 말을 한번만 하기 위해서 상태를 나타내는 변수 도입 -> 1일 때만 말해줄거야
state_loc_variable = 1

# Detection loop
while True:
    start = time.time()
    success, frame = cap.read()
    
    if not success:
        print('Cam Error')
        break
    
    # Head pose estimation
    headpose = head_Pose(image=frame, face_mesh=face_mesh)
    # Left/Right Inversion
    frame = cv2.flip(frame, 1)
    cv2.putText(frame, headpose, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.5, GREEN, 2)
    
    # Convert resized frame to tensor and move to the same device as the model
    frame_resized = cv2.resize(frame, (640, 640))
    frame_tensor = F.to_tensor(frame_resized).unsqueeze(0).to(device)

    # Use the model for prediction
    detection = model.predict(frame_tensor, conf=CONFIDENCE_THRESHOLD)[0].cpu()

    # 모든 박스 검사 -> 최대 박스 선정
    for data in detection.boxes.data.tolist():
        xmin, ymin, xmax, ymax, conf, label = int(data[0]), int(data[1]), int(data[2]), int(data[3]), float(data[4]), int(data[5])
        xmin = int(xmin * x_scale)
        ymin = int(ymin * y_scale)
        xmax = int(xmax * x_scale)
        ymax = int(ymax * y_scale)

    
        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)  # GREEN
        cv2.putText(frame, f"{class_list[label]} {round(conf, 2)}", (xmin, ymin - 10), cv2.FONT_ITALIC, 1, (255, 255, 255), 2)  # WHITE


    
    # FPS calculation
    end = time.time()
    totalTime = end - start
    fps = f'FPS: {1 / totalTime:.2f}'
    print(f'Time to process 1 frame: {totalTime * 1000:.0f} milliseconds')
    print(fps)

    cv2.putText(frame, fps, (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, BLUE, 2)
    cv2.imshow('frame', frame)

    if cv2.waitKey(1) == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

pygame 2.5.2 (SDL 2.28.3, Python 3.8.19)
Hello from the pygame community. https://www.pygame.org/contribute.html
Using device: cpu

0: 640x640 (no detections), 499.9ms
Speed: 0.0ms preprocess, 499.9ms inference, 5.8ms postprocess per image at shape (1, 3, 640, 640)
Time to process 1 frame: 2680 milliseconds
FPS: 0.37

0: 640x640 1 FACE, 487.2ms
Speed: 0.0ms preprocess, 487.2ms inference, 5.5ms postprocess per image at shape (1, 3, 640, 640)
Time to process 1 frame: 529 milliseconds
FPS: 1.89

0: 640x640 1 FACE, 430.6ms
Speed: 0.0ms preprocess, 430.6ms inference, 6.0ms postprocess per image at shape (1, 3, 640, 640)
Time to process 1 frame: 469 milliseconds
FPS: 2.13

0: 640x640 1 FACE, 486.4ms
Speed: 0.0ms preprocess, 486.4ms inference, 6.0ms postprocess per image at shape (1, 3, 640, 640)
Time to process 1 frame: 529 milliseconds
FPS: 1.89

0: 640x640 1 FACE, 537.3ms
Speed: 0.0ms preprocess, 537.3ms inference, 7.6ms postprocess per image at shape (1, 3, 640, 640)
Time to process 1 fra

In [5]:
import cv2

In [6]:
cv2.__version__

'4.9.0'