In [8]:
import numpy as np

def normalize_landmarks(landmarks, handedness):
    """Bring 2-D hand landmarks into a canonical pose.

    Steps, in order: translate so landmark 0 (wrist) sits at the origin,
    scale so the wrist -> landmark 9 (middle finger MCP) distance is 1,
    mirror the x axis when ``handedness`` is ``"Left"``, then rotate so the
    wrist -> middle-MCP vector points up in image coordinates.

    Args:
        landmarks: sequence of ``[x, y]`` pairs; index 0 is used as the wrist
            and index 9 as the middle finger MCP.
        handedness: handedness label; only the exact value ``"Left"``
            triggers mirroring.

    Returns:
        The normalized landmarks as a nested Python list.
    """
    middle_mcp = 9  # Middle finger MCP landmark index

    # Work on a fresh array so the caller's data is never modified.
    points = np.array(landmarks)

    # Wrist becomes the origin.
    centered = points - points[0]

    # With the wrist at the origin, the norm of the MCP point is the
    # wrist -> MCP distance. A zero distance leaves the points unscaled.
    palm_length = np.linalg.norm(centered[middle_mcp])
    if palm_length > 0:
        centered = centered / palm_length

    # Flip left hands horizontally so both hands share one orientation.
    if handedness == "Left":
        centered[:, 0] = -centered[:, 0]

    # Finally align the hand so that it points up.
    return normalize_rotation(centered).tolist()

def normalize_rotation(landmarks):
    """Rotate landmarks about the origin so landmark 9 points straight up.

    The wrist is expected to already be at the origin, so landmark 9
    (middle finger MCP) doubles as the wrist -> MCP reference vector.

    Args:
        landmarks: array of ``(x, y)`` rows; row 9 is the reference point.

    Returns:
        Array of the same shape holding the rotated points.
    """
    ref_x, ref_y = landmarks[9][0], landmarks[9][1]

    # In image coordinates Y grows downward, so "up" is the -Y direction,
    # i.e. an angle of -pi/2. Rotate by the difference between that target
    # and the reference vector's current angle.
    theta = -np.pi / 2 - np.arctan2(ref_y, ref_x)

    c, s = np.cos(theta), np.sin(theta)
    rotation = np.array([[c, -s], [s, c]])

    # Points are stored as rows: transpose to columns, rotate, transpose back.
    return np.matmul(rotation, landmarks.T).T

In [13]:
import math

def compute_direction(landmark_list):
    """Return the angle, in degrees, in which the index/middle fingers point.

    The direction is the vector from the palm center (mean of wrist, index
    MCP and middle MCP) to the midpoint of the index and middle fingertips.

    Args:
        landmark_list: sequence of ``[x, y]`` pairs; indices 0, 5, 8, 9 and
            12 are read (wrist, index MCP, index tip, middle MCP, middle tip).

    Returns:
        ``atan2`` angle of that vector converted to degrees, in [-180, 180].
    """
    wrist, index_mcp, index_tip, middle_mcp, middle_tip = (
        np.array(landmark_list[i]) for i in (0, 5, 8, 9, 12)
    )

    palm_center = (wrist + index_mcp + middle_mcp) / 3
    tips_midpoint = (index_tip + middle_tip) / 2

    dx, dy = tips_midpoint - palm_center
    return math.degrees(math.atan2(dy, dx))

def retrieve_direction(angle_degrees):
    """Map an angle in degrees to one of "Right", "Left", "Down" or "Up".

    The angle is first wrapped into [-180, 180). Each direction owns a
    90-degree sector: within 45 degrees of 0 is "Right", within 45 degrees
    of +/-180 is "Left", the remaining positive angles are "Down" and the
    remaining negative angles are "Up".

    Args:
        angle_degrees: angle in degrees; any real value is accepted.

    Returns:
        The direction label as a string.
    """
    wrapped = (angle_degrees + 180) % 360 - 180

    if abs(wrapped) <= 45:
        return "Right"
    if abs(wrapped) >= 135:
        return "Left"

    # Only the two diagonal-bounded vertical sectors remain at this point.
    return "Down" if wrapped > 0 else "Up"
    

In [10]:
import cv2

def draw_normalized_landmarks(norm_landmarks, size=300):
    """Render normalized landmarks as green dots on a square debug canvas.

    Coordinates are mapped linearly so that -2 lands on pixel 0 and +2 on
    pixel ``size``; the origin therefore sits at the canvas center, where
    two grey axis lines are drawn. The Y axis is not flipped, so it keeps
    the image convention of growing downward.

    Args:
        norm_landmarks: iterable of ``(x, y)`` pairs.
        size: width and height of the canvas in pixels.

    Returns:
        A ``(size, size, 3)`` uint8 image with the axes and landmarks drawn.
    """
    canvas = np.zeros((size, size, 3), dtype=np.uint8)

    # Axes through the middle of the canvas.
    mid = size // 2
    axis_color = (50, 50, 50)
    cv2.line(canvas, (mid, 0), (mid, size), axis_color, 1)
    cv2.line(canvas, (0, mid), (size, mid), axis_color, 1)

    for x, y in norm_landmarks:
        px, py = (int((coord + 2) / 4 * size) for coord in (x, y))

        # Points that land outside the canvas are skipped, not clamped.
        if px < 0 or px >= size or py < 0 or py >= size:
            continue

        cv2.circle(canvas, (px, py), 4, (0, 255, 0), -1)

    return canvas


In [None]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from pathlib import Path
import cv2
from tensorflow import keras
import time

# Paths to the MediaPipe hand-landmarker asset and the gesture classifier.
DETECTOR_PATH = 'utils/mediapipe_cropper/hand_landmarker.task'
MODEL_PATH = 'shared_artifacts/models/latest.keras'

# Labels indexed by the classifier's output position.
# NOTE(review): assumed to match the label order used at training time - confirm.
class_names = ['stop', 'like', 'two_up']

# Gesture classifier; it is fed one flattened landmark vector per frame.
model = keras.models.load_model(MODEL_PATH)

# Short aliases for the MediaPipe Tasks classes used below.
BaseOptions = mp.tasks.BaseOptions
VisionRunningMode = mp.tasks.vision.RunningMode
HandLandmarkerOptions = mp.tasks.vision.HandLandmarkerOptions
HandLandmarker = mp.tasks.vision.HandLandmarker

# Track a single hand in VIDEO mode (frames are passed with timestamps).
options = HandLandmarkerOptions(
    base_options=BaseOptions(model_asset_path=str(DETECTOR_PATH)),
    num_hands=1,
    running_mode=VisionRunningMode.VIDEO,
)


# Live demo: grab webcam frames, detect one hand, classify its gesture and
# estimate the pointing direction. Press 'q' in an OpenCV window to stop.
with HandLandmarker.create_from_options(options) as landmarker:
    # Open default camera (0)
    cap = cv2.VideoCapture(0)

    camera_ready = cap.isOpened()
    if not camera_ready:
        # Report and fall through to the cleanup below. exit() is avoided on
        # purpose: it terminates the whole interpreter session (the kernel,
        # in a notebook) instead of just ending this cell.
        print("Cannot open camera")

    # detect_for_video() expects increasing timestamps. Wall-clock time can
    # jump backwards or repeat within one millisecond, so timestamps are
    # derived from a monotonic clock and forced to be strictly increasing.
    stream_start = time.monotonic()
    last_timestamp_ms = -1

    try:
        while camera_ready:
            ret, frame = cap.read()
            if not ret:
                print("Failed to grab frame")
                break

            # OpenCV delivers BGR frames; the detector is given an SRGB image.
            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image_rgb)

            elapsed_ms = int((time.monotonic() - stream_start) * 1000)
            timestamp_ms = max(elapsed_ms, last_timestamp_ms + 1)
            last_timestamp_ms = timestamp_ms

            results = landmarker.detect_for_video(mp_image, timestamp_ms)

            if results.hand_landmarks:
                landmarks = results.hand_landmarks[0]
                handedness_category = results.handedness[0][0]
                handedness = handedness_category.category_name
                # Kept under its own name so it is not clobbered by the
                # gesture confidence computed further down.
                handedness_score = handedness_category.score

                # Only x/y are used; coordinates are scaled by the frame size
                # below, i.e. treated as fractions of the image dimensions.
                landmark_list = [[lm.x, lm.y] for lm in landmarks]

                angle = compute_direction(landmark_list)
                direction = retrieve_direction(angle)

                # Debugging purposes: overlay the raw landmarks on the frame.
                h, w, _ = frame.shape
                for lm in landmark_list:
                    lx = int(lm[0] * w)
                    ly = int(lm[1] * h)
                    cv2.circle(frame, (lx, ly), 5, (0, 255, 0), -1)

                normalized_landmarks = normalize_landmarks(landmark_list, handedness)

                # Debugging purposes: show what the classifier actually sees.
                norm_canvas = draw_normalized_landmarks(normalized_landmarks)
                cv2.imshow("Normalized landmarks", norm_canvas)

                input_vector = np.array(normalized_landmarks, dtype=np.float32).flatten()  # (42,)
                input_vector = np.expand_dims(input_vector, axis=0)  # (1, 42)

                predictions = model.predict(input_vector, verbose=0)

                predicted_idx = np.argmax(predictions[0])
                confidence = predictions[0][predicted_idx]

                predicted_gesture = class_names[predicted_idx]

                print("Handedness: ", handedness)
                print(f"Predicted: {predicted_gesture}; Confidence: {confidence}; Direction: {direction}")

            cv2.imshow("Camera", frame)

            # Press 'q' to quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close the windows, even when the loop is
        # left through an exception or a kernel interrupt.
        cap.release()
        cv2.destroyAllWindows()


W0000 00:00:1766670640.959937  170956 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1766670640.973714  170959 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Handedness:  Left
Predicted: like; Confidence: 0.9724203944206238; Direction: Right
Handedness:  Left
Predicted: like; Confidence: 0.9857919216156006; Direction: Right
Handedness:  Left
Predicted: like; Confidence: 0.9879341125488281; Direction: Up
Handedness:  Left
Predicted: like; Confidence: 0.9898760914802551; Direction: Up
Handedness:  Left
Predicted: like; Confidence: 0.9949231743812561; Direction: Up
Handedness:  Left
Predicted: like; Confidence: 0.9985150694847107; Direction: Up
Handedness:  Left
Predicted: like; Confidence: 0.9990962743759155; Direction: Right
Handedness:  Left
Predicted: like; Confidence: 0.9989767074584961; Direction: Up
Handedness:  Left
Predicted: like; Confidence: 0.9990860223770142; Direction: Up
Handedness:  Left
Predicted: like; Confidence: 0.9990307092666626; Direction: Up
Handedness:  Left
Predicted: like; Confidence: 0.9989355206489563; Direction: Up
Handedness:  Left
Predicted: like; Confidence: 0.9991236329078674; Direction: Up
Handedness:  Left
P