In [2]:
import cv2
import torch
import time
import numpy as np
from ultralytics import YOLO
import mediapipe as mp

In [4]:
def initialize_label_map():
    """Initialize the label map for gestures detected by the YOLO model"""
    return {
        0: 'grabbing', 1: 'grip', 2: 'holy', 3: 'point', 4: 'call', 5: 'three3', 
        6: 'timeout', 7: 'xsign', 8: 'hand_heart', 9: 'hand_heart2', 10: 'little_finger', 
        11: 'middle_finger', 12: 'take_picture', 13: 'dislike', 14: 'fist', 15: 'four', 
        16: 'like', 17: 'mute', 18: 'ok', 19: 'one', 20: 'palm', 21: 'peace', 
        22: 'peace_inverted', 23: 'rock', 24: 'stop', 25: 'stop_inverted', 26: 'three', 
        27: 'three2', 28: 'two_up', 29: 'two_up_inverted', 30: 'three_gun', 
        31: 'thumb_index', 32: 'thumb_index2', 33: 'no_gesture'
    }

In [6]:
class HandGestureRecognizer:
    def __init__(self, model_path='./YOLOv10n_gestures.pt'):
        # Initialize YOLO model
        self.device = torch.device("cuda" if torch.cuda.is_available() else 
                                 "mps" if torch.backends.mps.is_available() else "cpu")
        print(f"Using device: {self.device}")
        self.model = YOLO(model_path).to(self.device)
        
        # Initialize MediaPipe
        self.mp_hands = mp.solutions.hands
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=2,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        
        # Initialize label map
        self.label_map = initialize_label_map()
        
    def process_frame(self, frame):
        # Convert the BGR image to RGB
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Process the frame with MediaPipe
        results_mp = self.hands.process(rgb_frame)
        
        # Run YOLO detection
        results_yolo = self.model(frame)
        
        # Draw MediaPipe hand landmarks
        if results_mp.multi_hand_landmarks:
            for hand_landmarks in results_mp.multi_hand_landmarks:
                self.mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    self.mp_hands.HAND_CONNECTIONS,
                    self.mp_drawing_styles.get_default_hand_landmarks_style(),
                    self.mp_drawing_styles.get_default_hand_connections_style()
                )
                
        # Process YOLO results
        for result in results_yolo:
            boxes = result.boxes
            for box in boxes:
                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                confidence = box.conf[0].cpu().numpy()
                class_id = int(box.cls[0].cpu().numpy())
                
                if confidence > 0.5:
                    predicted_gesture = self.label_map.get(class_id, "Unknown Gesture")
                    
                    # Draw bounding box
                    cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
                    
                    # Add prediction text
                    text = f"{predicted_gesture} ({confidence:.2f})"
                    cv2.putText(frame, text, (int(x1), int(y1) - 10),
                              cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
        
        return frame

    def real_time_recognition(self):
        cap = cv2.VideoCapture(0)
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
        
        prev_time = 0
        
        print("Starting real-time gesture recognition. Press 'q' to quit.")
        print("Available signs:", ", ".join(self.label_map.values()))
        
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    print("Failed to grab frame")
                    break
                
                # Calculate FPS
                current_time = time.time()
                fps = 1 / (current_time - prev_time)
                prev_time = current_time
                
                # Process frame
                frame = self.process_frame(frame)
                
                # Display FPS
                cv2.putText(frame, f"FPS: {int(fps)}", (550, 30),
                           cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)
                
                # Display the frame
                cv2.imshow('Hand Gesture Recognition', frame)
                
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
                
        finally:
            cap.release()
            cv2.destroyAllWindows()

In [8]:
try:
    recognizer = HandGestureRecognizer()
    recognizer.real_time_recognition()
except Exception as e:
    print(f"An error occurred: {str(e)}")
    cv2.VideoCapture(0).release()
    cv2.destroyAllWindows()

Using device: cuda
Starting real-time gesture recognition. Press 'q' to quit.
Available signs: grabbing, grip, holy, point, call, three3, timeout, xsign, hand_heart, hand_heart2, little_finger, middle_finger, take_picture, dislike, fist, four, like, mute, ok, one, palm, peace, peace_inverted, rock, stop, stop_inverted, three, three2, two_up, two_up_inverted, three_gun, thumb_index, thumb_index2, no_gesture

0: 480x640 (no detections), 102.9ms
Speed: 9.1ms preprocess, 102.9ms inference, 19.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 45.1ms
Speed: 4.9ms preprocess, 45.1ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 46.9ms
Speed: 4.0ms preprocess, 46.9ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 50.8ms
Speed: 3.9ms preprocess, 50.8ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 51.8ms
Speed: 3.0ms prepro



0: 480x640 1 four, 1 palm, 30.0ms
Speed: 4.0ms preprocess, 30.0ms inference, 33.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 rock, 30.2ms
Speed: 3.1ms preprocess, 30.2ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 rock, 77.5ms
Speed: 6.0ms preprocess, 77.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 rock, 30.0ms
Speed: 3.9ms preprocess, 30.0ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 rock, 34.5ms
Speed: 4.0ms preprocess, 34.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 rock, 29.5ms
Speed: 4.1ms preprocess, 29.5ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 rock, 30.9ms
Speed: 5.0ms preprocess, 30.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 rock, 30.0ms
Speed: 3.0ms preprocess, 30.0ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
