## Common Functions (Needed for most steps)

### Visualization

In [1]:
import cv2
import numpy as np
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2

In [2]:
MARGIN = 30  # pixels
FONT_SIZE = 1
FONT_THICKNESS = 2
HANDEDNESS_TEXT_COLOR = (88, 205, 54) # vibrant green

def draw_landmarks_on_image(rgb_image, detection_result, label, score = ""):
  """Return a copy of the image with every detected hand drawn and captioned.

  Args:
    rgb_image: Image array with shape (height, width, channels). It is copied,
      never modified.
    detection_result: Object exposing parallel `hand_landmarks` and
      `handedness` lists (one entry per detected hand).
    label: Text (or value) shown in the caption.
    score: Optional numeric confidence. When a number is given, including 0.0,
      the caption is "<label> <handedness> <score:.2f>". When left at the
      default "" (or None / any string) the caption is
      "<handedness> Adding <label>".

  Returns:
    The annotated copy of `rgb_image`.

  Note:
    Uses `cv2` as a module-level name, so it must be imported before the call.
  """
  annotated_image = np.copy(rgb_image)

  # A numeric 0.0 is a real score; only the "" default (or None) means
  # "no score supplied". Testing truthiness would wrongly drop 0.0.
  show_score = score is not None and not isinstance(score, str)

  # Loop through the detected hands to visualize.
  for hand_landmarks, handedness in zip(detection_result.hand_landmarks,
                                        detection_result.handedness):
    # Draw the hand landmarks.
    hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
    hand_landmarks_proto.landmark.extend([
      landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
    ])
    solutions.drawing_utils.draw_landmarks(
      annotated_image,
      hand_landmarks_proto,
      solutions.hands.HAND_CONNECTIONS,
      solutions.drawing_styles.get_default_hand_landmarks_style(),
      solutions.drawing_styles.get_default_hand_connections_style())

    # Get the top left corner of the detected hand's bounding box.
    height, width, _ = annotated_image.shape
    left = min(landmark.x for landmark in hand_landmarks)
    top = min(landmark.y for landmark in hand_landmarks)
    # Clamp so the caption stays inside the image when the hand touches the
    # top or left edge (otherwise the text would be drawn out of view).
    text_x = max(int(left * width), 0)
    text_y = max(int(top * height) - MARGIN, MARGIN)

    # Draw handedness (left or right hand) on the image.
    hand_name = handedness[0].category_name
    if show_score:
      text_str = f"{label} {hand_name} {score:.2f}"
    else:
      text_str = f"{hand_name} Adding {label}"

    cv2.putText(annotated_image, text_str,
                (text_x, text_y), cv2.FONT_HERSHEY_DUPLEX,
                FONT_SIZE, HANDEDNESS_TEXT_COLOR, FONT_THICKNESS, cv2.LINE_AA)

  return annotated_image

## Data Generation

### Database Functions

In [3]:
def add_detection_object(base, current_gesture, detection):
    """Append the first detected hand of `detection` to `base["data"]`.

    The stored record is ``{"gesture": current_gesture, "landmarks": [[x, y, z], ...]}``
    built from `detection.hand_world_landmarks[0]`, so `base` can later be
    dumped to a JSON file.

    Args:
        base: Dict with a "data" list; it is mutated in place.
        current_gesture: Label stored with the sample.
        detection: Detection result, or None when no result is available yet.

    Returns:
        None. Nothing is appended when `detection` is None or contains no hand.
    """
    # The live-stream result is None until the first callback fires, so a key
    # press can arrive before any detection exists.
    if detection is None or not detection.hand_world_landmarks:
        return

    # NOTE(review): only the first hand is stored and x is not mirrored for
    # left hands, whereas `predict` mirrors x for "Left" - confirm the training
    # data and inference preprocessing are meant to differ.
    first_hand = detection.hand_world_landmarks[0]
    landmarks = [[landmark.x, landmark.y, landmark.z] for landmark in first_hand]

    base["data"].append({
        "gesture": current_gesture,
        "landmarks": landmarks
    })

### Loop

In [4]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import cv2
import time
import json

In [5]:
# Choose gestures
# Surrounding whitespace is stripped and empty entries are skipped, so an
# answer such as "up, down," produces the classes ["up", "down"] instead of
# [" down"] or a blank class.
gestures = input('add comma seperated classes. EG: up,down,right')
gestures = [name.strip() for name in gestures.split(',') if name.strip()]
gesture_index = 0  # position in `gestures` of the class currently being recorded

# Most recent hand-landmarker result; stays None until the first callback fires.
latest_detection_result = None


def result_callback(result, output_image, timestamp_ms):
    """Remember the newest detection result for the capture loop.

    Handed to HandLandmarkerOptions as `result_callback`. Only `result` is
    used; `output_image` and `timestamp_ms` are accepted and ignored.
    """
    global latest_detection_result
    latest_detection_result = result

base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
options = vision.HandLandmarkerOptions(
    base_options=base_options,
    num_hands=2,
    running_mode=vision.RunningMode.LIVE_STREAM,
    result_callback=result_callback
)

# Creating variables for database
base = {
    "data": [],
    "labels": gestures
}

# Live-stream detection expects increasing timestamps (see the MediaPipe
# docs). The wall clock can be adjusted backwards, so a monotonic clock is
# used and each value is forced to be larger than the previous one.
last_timestamp_ms = 0

# The `with` block closes the detector and the `finally` releases the camera
# and the window, even if the loop raises.
with vision.HandLandmarker.create_from_options(options) as detector:
    cap = cv2.VideoCapture(0)
    try:
        while gesture_index < len(gestures):
            ret, frame = cap.read()
            if not ret:
                break

            current_gesture = gestures[gesture_index]

            # OpenCV delivers BGR frames, while the image is declared as SRGB,
            # so the channels are reordered before detection.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
            timestamp_ms = max(int(time.monotonic() * 1000), last_timestamp_ms + 1)
            last_timestamp_ms = timestamp_ms

            # Perform detection asynchronously
            detector.detect_async(image, timestamp_ms=timestamp_ms)

            # Draw landmarks on the original (BGR) frame, which is what imshow expects
            if latest_detection_result:
                annotated_image = draw_landmarks_on_image(frame, latest_detection_result, current_gesture)
            else:
                annotated_image = frame

            # Display the resulting frame
            cv2.imshow('frame', annotated_image)

            # Controls
            key = cv2.waitKey(1) & 0xFF
            if key == ord(' '): # Space: store the latest detection as a sample
                # No result exists until the first callback has fired.
                if latest_detection_result is not None:
                    add_detection_object(base, current_gesture, latest_detection_result)

            elif key == 13: # Enter to next gesture
                gesture_index += 1

            elif key == 27: # Escape to quit
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

# Overwrites any existing database.json with the samples collected in this run.
with open("database.json", "w") as f:
    json.dump(base, f)

I0000 00:00:1722348984.143572 11327719 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89), renderer: Apple M1 Pro
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1722348984.149196 11328124 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1722348984.154037 11328129 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


## Model Creation

In [6]:
import numpy as np
import json
import random
from tensorflow.keras.utils import to_categorical

def get_data(filepath, seed=None):
    """Load, shuffle and encode the gesture samples stored in a JSON file.

    The file must contain a "labels" list and a "data" list whose items have a
    "gesture" (one of the labels) and a "landmarks" list.

    Args:
        filepath: Path to the JSON file.
        seed: Optional seed for the shuffle. With the default None the global
            `random` module state is used, as before; pass a value to get
            the same sample order (and therefore the same train/test split)
            on every run.

    Returns:
        A tuple ``(data, data_labels, all_labels)``:
            data: NumPy array built from every sample's "landmarks".
            data_labels: one-hot encoded class indices with
                ``len(all_labels)`` classes, in the same order as `data`.
            all_labels: the "labels" list from the file.

    Raises:
        ValueError: If a sample's gesture is not listed in "labels".
    """
    with open(filepath, 'r') as f:
        payload = json.load(f)

    all_labels = payload["labels"]

    # One dict lookup per sample instead of a list scan; the first occurrence
    # of a duplicated label wins, matching list.index().
    label_to_index = {}
    for position, name in enumerate(all_labels):
        label_to_index.setdefault(name, position)

    samples = payload["data"]
    if seed is None:
        random.shuffle(samples)
    else:
        random.Random(seed).shuffle(samples)

    data = []
    data_labels = []
    for item in samples:
        gesture = item['gesture']
        if gesture not in label_to_index:
            raise ValueError(
                f"gesture {gesture!r} in {filepath!r} is not one of the labels {all_labels!r}"
            )
        data_labels.append(label_to_index[gesture])
        data.append(item['landmarks'])

    data_labels = to_categorical(np.array(data_labels), num_classes=len(all_labels))
    data = np.array(data)

    return data, data_labels, all_labels

data, labels, all_labels = get_data('database.json')

# 80/20 train/test split. get_data has already shuffled the samples, so a
# plain slice at the 80% mark is enough.
split_at = int(len(data) * 0.8)
train, test = data[:split_at], data[split_at:]
train_labels, test_labels = labels[:split_at], labels[split_at:]

num_labels = len(all_labels)

print(f"train: data {train.shape}, labels {train_labels.shape}")
print(f"test: data {test.shape}, labels {test_labels.shape}")


train: data (29, 21, 3), labels (29, 9)
test: data (8, 21, 3), labels (8, 9)


In [7]:
import tensorflow as tf
from tensorflow.keras import layers, models

def create_point_classification_model(num_classes=None):
    """Build and compile a dense classifier for 21 three-dimensional hand points.

    Args:
        num_classes: Size of the softmax output layer. Defaults to the
            notebook-level `num_labels` when omitted, so existing
            zero-argument calls behave as before.

    Returns:
        A compiled `Sequential` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    if num_classes is None:
        num_classes = num_labels

    # Input shape: 21 3D points, each point has x, y, z coordinates
    input_shape = (21, 3)

    model = models.Sequential([
        # An explicit Input layer replaces `Flatten(input_shape=...)`, which
        # Keras warns about for Sequential models.
        layers.Input(shape=input_shape),
        layers.Flatten(),

        # Dense layers with increasing width
        layers.Dense(128, activation='relu'),
        layers.Dense(256, activation='relu'),
        layers.Dense(512, activation='relu'),

        # Dropout for regularization
        layers.Dropout(0.3),

        # Output layer: one unit per gesture class
        layers.Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Create the model
# (every run of this cell builds and compiles a new network, replacing `model`)
model = create_point_classification_model()

# Display the model summary
model.summary()

  super().__init__(**kwargs)


In [13]:
# Train on the training split. A fraction of it (validation_split) is held
# back by Keras to produce the val_* metrics shown in the epoch log.
# Re-running this cell continues training the existing `model`.
history = model.fit(
    x=train,
    y=train_labels,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.8696 - loss: 0.3418 - val_accuracy: 0.6667 - val_loss: 0.6149
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.9565 - loss: 0.2879 - val_accuracy: 0.6667 - val_loss: 0.5958
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 1.0000 - loss: 0.3025 - val_accuracy: 0.6667 - val_loss: 0.5754
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.9130 - loss: 0.2802 - val_accuracy: 0.6667 - val_loss: 0.5571
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.9565 - loss: 0.2528 - val_accuracy: 0.6667 - val_loss: 0.5463
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.9565 - loss: 0.2715 - val_accuracy: 0.6667 - val_loss: 0.5407
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━

In [14]:
# Score the model on the held-out test split; with the loss and metric chosen
# at compile time the displayed list is [loss, accuracy].
model.evaluate(test, test_labels)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 1.0000 - loss: 0.2976


[0.29764124751091003, 1.0]

In [15]:
# Persist the trained model to "model.keras".
# NOTE(review): the "Running Model" section loads 'sign_all.keras', not this
# file - confirm which model is meant to be used for inference.
model.save("model.keras")

## Running Model

In [3]:
from tensorflow.keras.models import load_model
import json

# NOTE(review): the training section saves "model.keras" but this cell loads
# 'sign_all.keras' - confirm the two are meant to differ and that the labels
# read from database.json below match the loaded model's output order.
loaded_model = load_model('sign_all.keras')

# Getting label names from file
with open('database.json', 'r') as f:
    file = json.load(f)
all_labels = file["labels"]

def predict(detection_result, threshold):
    """Classify the first detected hand with `loaded_model`.

    Only the first hand's world landmarks are used. The model input is one
    sample of that hand's [x, y, z] points, and feeding the points of several
    hands at once would change the input shape. For a hand reported as "Left"
    the x coordinates are negated before prediction.

    Args:
        detection_result: Detection result exposing `hand_world_landmarks`
            and `handedness`, or None.
        threshold: Minimum top probability required to accept a prediction.

    Returns:
        ``('No hand detected', 0.0)`` when there is no hand,
        ``('Unknown', 0.0)`` when the top probability is below `threshold`,
        otherwise ``(class_index, probability)`` where `class_index` is the
        argmax of the model output.
    """
    if detection_result is None or not detection_result.hand_world_landmarks:
        return ('No hand detected', 0.0)

    first_hand = detection_result.hand_world_landmarks[0]
    if len(first_hand) == 0:
        return ('No hand detected', 0.0)

    # Mirror left hands along x, using the handedness of the same (first) hand.
    change = -1 if detection_result.handedness[0][0].category_name == "Left" else 1

    out_landmarks = np.array(
        [[landmark.x * change, landmark.y, landmark.z] for landmark in first_hand]
    )
    out_landmarks = np.expand_dims(out_landmarks, axis=0)  # add the batch dimension
    prediction = loaded_model.predict(out_landmarks, verbose=0)

    best_score = prediction.max()
    if best_score < threshold:
        return ('Unknown', 0.0)

    # NOTE(review): the class index is returned, not its name; mapping it via
    # all_labels[...] was left disabled in the original - confirm that the
    # label list matches the loaded model before enabling it.
    return (np.argmax(prediction), best_score)

In [4]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import cv2
import time

# Most recent hand-landmarker result shared with the display loop; it stays
# None until the first callback fires. (This cell re-declares the variable and
# the callback so that it can run without the data-generation cell.)
latest_detection_result = None


def result_callback(result, output_image, timestamp_ms):
    """Remember the newest detection result for the display loop.

    Handed to HandLandmarkerOptions as `result_callback`. Only `result` is
    used; `output_image` and `timestamp_ms` are accepted and ignored.
    """
    global latest_detection_result
    latest_detection_result = result

# STEP 2: Create a HandLandmarker object.
base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
options = vision.HandLandmarkerOptions(
    base_options=base_options,
    num_hands=1,
    running_mode=vision.RunningMode.LIVE_STREAM,
    result_callback=result_callback
)

label,score = ('No hand detected', 0.0)

# Live-stream detection expects increasing timestamps (see the MediaPipe
# docs). The wall clock can be adjusted backwards, so a monotonic clock is
# used and each value is forced to be larger than the previous one.
last_timestamp_ms = 0

# The `with` block closes the detector and the `finally` releases the camera
# and the window, even if the loop raises.
with vision.HandLandmarker.create_from_options(options) as detector:
    cap = cv2.VideoCapture(0)
    try:
        while True:
            # Capture frame-by-frame
            ret, frame = cap.read()
            if not ret:
                break

            # STEP 3: Load the input image.
            # OpenCV delivers BGR frames, while the image is declared as SRGB,
            # so the channels are reordered before detection.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
            timestamp_ms = max(int(time.monotonic() * 1000), last_timestamp_ms + 1)
            last_timestamp_ms = timestamp_ms

            # Perform detection asynchronously
            detector.detect_async(image, timestamp_ms=timestamp_ms)

            # Classify the latest result and draw it on the original (BGR)
            # frame, which is what imshow expects
            if latest_detection_result:
                label, score = predict(latest_detection_result, 0.5)
                annotated_image = draw_landmarks_on_image(frame, latest_detection_result, label, score)
            else:
                annotated_image = frame

            # Display the resulting frame
            cv2.imshow('frame', annotated_image)

            # Press q to quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # When everything is done, release the capture
        cap.release()
        cv2.destroyAllWindows()

I0000 00:00:1723032280.723216 13210735 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89), renderer: Apple M1 Pro
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1723032280.745492 13211450 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1723032280.761013 13211454 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
