In [6]:
import numpy as np

def normalize_landmarks(landmarks, handedness):
    """Return hand landmarks in a wrist-centred, scale-normalised frame.

    Args:
        landmarks: Sequence of per-landmark coordinate lists. Entry 0 is
            treated as the wrist and entry 9 as the middle-finger MCP.
        handedness: Hand label; the exact string "Left" triggers mirroring.

    Returns:
        Nested Python list with the same layout as the input, where the
        wrist sits at the origin, the wrist-to-MCP distance is 1 (unless
        that distance is zero, in which case no scaling is applied), and
        left hands have their first coordinate negated.
    """
    points = np.array(landmarks)

    # Move the wrist (landmark 0) to the origin.
    centered = points - points[0]

    # Use the wrist -> middle-finger MCP (landmark 9) length as the unit.
    reference_length = np.linalg.norm(centered[9])
    if reference_length > 0:
        centered = centered / reference_length

    # Flip left hands along the first axis so both hands share one frame.
    if handedness == "Left":
        centered[:, 0] = -centered[:, 0]

    return centered.tolist()

In [None]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from pathlib import Path
import cv2
from tensorflow import keras
import time

# Short aliases for the MediaPipe Tasks classes used in this cell.
BaseOptions = mp.tasks.BaseOptions
HandLandmarker = mp.tasks.vision.HandLandmarker
HandLandmarkerOptions = mp.tasks.vision.HandLandmarkerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Hand-landmark detector bundle and the Keras gesture classifier to load.
DETECTOR_PATH = "utils/mediapipe_cropper/hand_landmarker.task"
# Alternative: pin a specific saved model instead of the latest one.
# MODEL_PATH = 'shared_artifacts/models/gesture_model_20251221_184630.keras'
MODEL_PATH = "shared_artifacts/models/latest.keras"

# Gesture labels, looked up by the index of the classifier's largest output.
# NOTE(review): order presumably matches the label encoding used at training
# time; confirm against the training code.
class_names = ["stop", "like", "two_up"]

model = keras.models.load_model(MODEL_PATH)

# Detect at most one hand, using the VIDEO running mode.
options = HandLandmarkerOptions(
    base_options=BaseOptions(model_asset_path=str(DETECTOR_PATH)),
    num_hands=1,
    running_mode=VisionRunningMode.VIDEO,
)


# Live gesture recognition: read webcam frames, detect a hand, normalise its
# landmarks, classify the gesture and print the result until 'q' is pressed
# or a frame cannot be read.
with HandLandmarker.create_from_options(options) as landmarker:
    # Open default camera (0)
    cap = cv2.VideoCapture(0)

    try:
        if not cap.isOpened():
            # Report and fall through to the cleanup below. Calling exit()
            # from a notebook cell would ask the kernel to shut down and
            # would skip releasing the capture device.
            print("Cannot open camera")
        else:
            last_timestamp_ms = -1

            while True:
                ret, frame = cap.read()
                if not ret:
                    print("Failed to grab frame")
                    break

                # Frames are converted from BGR to RGB to match the SRGB
                # image format declared for the MediaPipe image.
                image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image_rgb)

                # detect_for_video expects increasing timestamps. A monotonic
                # clock cannot jump backwards like the wall clock can, and the
                # max() guard keeps two frames in the same millisecond distinct.
                timestamp_ms = max(int(time.monotonic() * 1000), last_timestamp_ms + 1)
                last_timestamp_ms = timestamp_ms

                results = landmarker.detect_for_video(mp_image, timestamp_ms)

                if results.hand_landmarks:
                    # Only the first detected hand is classified.
                    landmarks = results.hand_landmarks[0]
                    handedness = results.handedness[0][0].category_name

                    # Keep only the x/y coordinates of each landmark.
                    landmark_list = [[lm.x, lm.y] for lm in landmarks]

                    normalized_landmarks = normalize_landmarks(landmark_list, handedness)

                    # Flatten to one row: (num_landmarks, 2) -> (1, num_landmarks * 2)
                    input_vector = np.array(normalized_landmarks, dtype=np.float32).reshape(1, -1)

                    predictions = model.predict(input_vector, verbose=0)

                    predicted_idx = int(np.argmax(predictions[0]))
                    confidence = predictions[0][predicted_idx]
                    predicted_gesture = class_names[predicted_idx]

                    print(predictions)
                    print(f"Predicted: {predicted_gesture}; Confidence: {confidence}")

                cv2.imshow("Camera", frame)

                # Press 'q' to quit
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        # Always free the camera and close the preview window, even when the
        # loop is interrupted or an exception is raised mid-frame.
        cap.release()
        cv2.destroyAllWindows()


W0000 00:00:1766411481.950607   55708 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1766411481.965032   55708 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[[0.0046099  0.9842898  0.01110024]]
Predicted: like; Confidence: 0.984289824962616
[[0.00161893 0.9960007  0.00238033]]
Predicted: like; Confidence: 0.9960007071495056
[[5.1636045e-04 9.9912030e-01 3.6337343e-04]]
Predicted: like; Confidence: 0.99912029504776
[[5.5243005e-04 9.9912947e-01 3.1817469e-04]]
Predicted: like; Confidence: 0.9991294741630554
[[4.9629284e-04 9.9924266e-01 2.6110152e-04]]
Predicted: like; Confidence: 0.9992426633834839
[[7.1987940e-04 9.9894005e-01 3.4007372e-04]]
Predicted: like; Confidence: 0.9989400506019592
[[8.3623797e-04 9.9879020e-01 3.7363425e-04]]
Predicted: like; Confidence: 0.9987902045249939
[[7.6016953e-04 9.9894780e-01 2.9213596e-04]]
Predicted: like; Confidence: 0.99894779920578
[[5.4932036e-04 9.9922073e-01 2.2989647e-04]]
Predicted: like; Confidence: 0.9992207288742065
[[5.3199794e-04 9.9925631e-01 2.1165695e-04]]
Predicted: like; Confidence: 0.9992563128471375
[[5.5104803e-04 9.9923277e-01 2.1619085e-04]]
Predicted: like; Confidence: 0.999232