<a href="https://colab.research.google.com/github/admorsy/hand_gesture_detection/blob/main/gesture_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing the packages (with pinned versions) needed by the code in this notebook

In [None]:
# Install the exact package versions this notebook pins (IPython shell command).
!pip install mediapipe==0.10.21 mediapipe-model-maker==0.2.1.4 numpy==1.23.5 opencv-python==4.11.0.86 scikit-learn==1.6.1 tensorflow==2.15.1


Creating, training, and evaluating the custom model

In [None]:
from mediapipe_model_maker import gesture_recognizer

# Train a custom gesture recognizer on the ASL dataset, evaluate it, and
# export it as a MediaPipe ".task" bundle.
# NOTE(review): the paths below live under /content/drive, so Google Drive
# presumably has to be mounted before this cell runs -- confirm.

# Load dataset from your ASL gesture dataset
data = gesture_recognizer.Dataset.from_folder(
    dirname="/content/drive/MyDrive/Colab_Notebooks/asl_dataset"
)

# Split dataset into 80% training, 20% validation
train_data, validation_data = data.split(0.8)

# Define options for the Gesture Recognizer; export_dir is where the
# exported model is written.
options = gesture_recognizer.GestureRecognizerOptions(
    hparams=gesture_recognizer.HParams(
        export_dir="/content/drive/MyDrive/Colab_Notebooks/exported_model"
    )
)

# Create and train the model
model = gesture_recognizer.GestureRecognizer.create(
    train_data=train_data,
    validation_data=validation_data,
    options=options,  # Pass the GestureRecognizerOptions object
)

# Evaluate the model
# NOTE(review): this reuses the validation split that was also passed to
# training, so the reported metric is not from a held-out test set.
metric = model.evaluate(validation_data)
print("Model Evaluation:", metric)

# Export the trained model
model.export_model(model_name="sign2number.task")

# Report success only once the export has actually happened; previously this
# message was printed before export_model() ran.
print("🎉 Training complete! The model has been saved.")

This code is responsible for opening the webcam to detect hand gestures.

Each detected gesture is recognized by gesture_recognizer and converted to text that represents the recognized gesture, displayed in the top-left corner of the screen.

In [None]:
import cv2
import mediapipe as mp
from mediapipe.tasks.python.vision import GestureRecognizer, GestureRecognizerOptions
# Live webcam demo: draw hand landmarks on a grayscale preview and overlay
# the label predicted by the exported gesture recognizer. Press "q" to quit.
# NOTE(review): cv2.VideoCapture(0) / cv2.imshow need a local camera and a
# display, so this cell presumably has to run on a local machine -- confirm.
model_path = "/content/drive/MyDrive/Colab_Notebooks/exported_model/sign2number.task"

# MediaPipe helpers for landmark detection and drawing
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Overlay styling never changes between frames, so build it once up front.
window_name = "Hand Tracking & Gesture Recognition"
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 2
font_thickness = 3
text_color = (255, 255, 255)  # text color
bg_color = (255, 10, 255)  # background color
landmark_spec = mp_drawing.DrawingSpec(color=(0, 100, 255), thickness=8, circle_radius=4)
connection_spec = mp_drawing.DrawingSpec(color=(255, 10, 255), thickness=4, circle_radius=4)

# Options for loading the trained gesture recognition model
options = GestureRecognizerOptions(
    base_options=mp.tasks.BaseOptions(model_asset_path=model_path)
)

# Open webcam and fail loudly instead of silently skipping the loop
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    cap.release()
    raise RuntimeError("Could not open the webcam (device index 0).")

try:
    # Context managers close the landmark detector and the recognizer even
    # when the loop exits through an exception.
    with mp_hands.Hands(
        static_image_mode=False,
        model_complexity=1,
        min_detection_confidence=0.75,
        min_tracking_confidence=0.75,
        max_num_hands=1,
    ) as hands, GestureRecognizer.create_from_options(options) as recognizer:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Flip the frame horizontally (mirror effect)
            frame = cv2.flip(frame, 1)

            # Convert frame to RGB for landmark detection and gesture recognition
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Process hand landmarks
            results = hands.process(rgb_frame)

            # Grayscale preview, converted back to 3 channels so the colored
            # landmarks and label can be drawn on top of it
            gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            gray_frame = cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2BGR)

            # Draw hand landmarks if detected (on grayscale frame)
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    mp_drawing.draw_landmarks(
                        gray_frame,
                        hand_landmarks,
                        mp_hands.HAND_CONNECTIONS,
                        landmark_spec,
                        connection_spec,
                    )

            # Convert frame to MediaPipe Image and run gesture recognition
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
            recognition_result = recognizer.recognize(mp_image)

            # Display recognized gestures on screen with a background
            if recognition_result.gestures:
                label_index = 0
                for gesture in recognition_result.gestures:
                    if not gesture:
                        # No category reported for this hand; nothing to draw
                        continue

                    text = f"{gesture[0].category_name}"  # first listed category
                    text_size, _ = cv2.getTextSize(text, font, font_scale, font_thickness)
                    text_w, text_h = text_size

                    # First label sits in the top-left corner; any further
                    # labels are stacked below it instead of overlapping.
                    x = 10
                    y = 50 + label_index * (text_h + 30)

                    # Draw background rectangle, then the text on top of it
                    cv2.rectangle(
                        gray_frame,
                        (x - 10, y - text_h - 10),
                        (x + text_w + 10, y + 10),
                        bg_color,
                        cv2.FILLED,
                    )
                    cv2.putText(
                        gray_frame, text, (x, y), font, font_scale,
                        text_color, font_thickness, cv2.LINE_AA,
                    )
                    label_index += 1

            # Show the grayscale webcam feed with colored landmarks and text background
            cv2.imshow(window_name, gray_frame)

            # Poll the keyboard for 10 ms; "q" quits
            if cv2.waitKey(10) & 0xFF == ord("q"):
                break
finally:
    # Always free the camera and close the preview window
    cap.release()
    cv2.destroyAllWindows()