In [2]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import mediapipe as mp
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import pickle
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import collections
from tensorflow.keras.regularizers import l2

In [9]:
# Short aliases for the MediaPipe solution modules used by the cells below:
# - mp_hands: hand-landmark detector (provides Hands and HAND_CONNECTIONS)
# - mp_drawing: helper that draws landmarks/connections onto a frame
# - mp_drawing_styles: default landmark/connection drawing styles
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

In [10]:
def normalize_landmarks(landmarks):
    """Make a flat landmark vector translation- and scale-invariant.

    The input is interpreted as consecutive (x, y, z) triples. Every point is
    re-expressed relative to the first triple (the wrist), then all points are
    divided by the largest wrist-to-point distance so that the farthest
    landmark lies at distance 1 from the origin.

    Args:
        landmarks: Flat sequence whose length is a multiple of 3.

    Returns:
        1-D numpy array of the same length holding the normalized coordinates.
        When every point coincides with the wrist (largest distance is 0), the
        centered coordinates are returned without scaling.
    """
    points = np.array(landmarks).reshape(-1, 3)

    # Translate so the wrist (first triple) becomes the origin.
    offsets = points - points[0]

    # Largest Euclidean distance from the wrist sets the scale.
    reach = np.linalg.norm(offsets, axis=1).max()

    # Skip the division for a degenerate hand to avoid dividing by zero.
    if reach > 0:
        offsets = offsets / reach

    return offsets.flatten()

In [11]:
def testing(model=None, camera_index=1, model_path='../best_hand_gesture_model.h5'):
    """Real-time hand gesture recognition with prediction smoothing using MediaPipe.

    Opens a webcam, detects a single hand per frame, normalizes its landmarks
    with ``normalize_landmarks`` and classifies them with ``model``. The label
    drawn on the frame is the most frequent class among the last 5 predictions.
    The loop ends when 'q' is pressed in the preview window or a frame cannot
    be read.

    Args:
        model: Keras model used for prediction. When ``None``, the model is
            loaded from ``model_path``; if that file does not exist a message
            is printed and the function returns without opening the camera.
        camera_index: Device index passed to ``cv2.VideoCapture``. Defaults to
            1 (the value this notebook was written with); use 0 for the
            system's default camera.
        model_path: Location of the saved model used when ``model`` is ``None``.

    Returns:
        None.
    """
    if model is None:
        if os.path.exists(model_path):
            model = load_model(model_path)
        else:
            print("Model not found. Please train the model first.")
            return

    print("Starting real-time recognition. Press 'q' to exit.")
    cap = cv2.VideoCapture(camera_index)
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        cap.release()
        return

    # Buffer for smoothing predictions (last 5 predictions)
    prediction_buffer = collections.deque(maxlen=5)

    # try/finally guarantees the camera and the preview window are released
    # even if the loop is interrupted (kernel interrupt) or predict() raises;
    # otherwise the device stays locked until the kernel is restarted.
    try:
        with mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.7) as hands:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    print("Error: Could not read frame.")
                    break

                # Mirror the image so on-screen movement matches the user's hand.
                frame = cv2.flip(frame, 1)
                # OpenCV delivers BGR frames; the detector is fed RGB.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(rgb_frame)

                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        mp_drawing.draw_landmarks(
                            frame, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                            mp_drawing_styles.get_default_hand_landmarks_style(),
                            mp_drawing_styles.get_default_hand_connections_style()
                        )

                        # Flatten to [x0, y0, z0, x1, y1, z1, ...] and normalize.
                        landmarks = []
                        for landmark in hand_landmarks.landmark:
                            landmarks.extend([landmark.x, landmark.y, landmark.z])
                        landmarks = normalize_landmarks(landmarks)

                        prediction = model.predict(np.array([landmarks]), verbose=0)
                        pred_digit = np.argmax(prediction[0])
                        confidence = np.max(prediction[0])
                        prediction_buffer.append(pred_digit)

                        # Use the mode of the buffer for a smoothed prediction
                        smoothed_pred = max(set(prediction_buffer), key=prediction_buffer.count)

                        # The confidence shown is that of the current frame's raw
                        # prediction, which may differ from the smoothed label.
                        cv2.putText(
                            frame,
                            f"Digit: {smoothed_pred} ({confidence:.2f})",
                            (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            1.0,
                            (0, 255, 0),
                            2
                        )

                cv2.imshow('Hand Gesture Recognition', frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        cap.release()
        cv2.destroyAllWindows()

In [12]:
# No model is passed, so testing() loads the saved model from disk; the call
# blocks until 'q' is pressed in the preview window or a frame cannot be read.
testing()

Starting real-time recognition. Press 'q' to exit.


I0000 00:00:1743893712.966333 1283878 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
2025-04-06 04:25:13.855 python[86763:1283878] +[IMKClient subclass]: chose IMKClient_Modern
2025-04-06 04:25:13.855 python[86763:1283878] +[IMKInputSession subclass]: chose IMKInputSession_Modern
2025-04-06 04:25:15.410590: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


: 

In [6]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score

# Evaluate the saved classifier on a 20% split of the collected landmark dataset
# and cross-check Keras' reported accuracy against sklearn's accuracy_score.
model = load_model('../best_hand_gesture_model.h5')

# Load dataset previously collected (adjust the file path as needed).
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# open dataset files that were produced locally by this project.
dataset_path = '../hand_landmarks_dataset/hand_landmarks_dataset.pkl'
with open(dataset_path, 'rb') as f:
    data = pickle.load(f)

X = np.array(data['features'])
y = np.array(data['labels'])

# NOTE(review): this is only a held-out set if training used the same
# test_size/random_state split — confirm against the training notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Size the one-hot encoding from the model's output layer instead of a
# hard-coded 10, so the cell keeps working if the number of gestures changes.
num_classes = model.output_shape[-1]
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# NOTE(review): the recorded run of this cell reported 0.20 accuracy. The live
# path in testing() applies normalize_landmarks() before model.predict, while
# the features here are fed exactly as stored — verify that the pickled
# features are already normalized the same way the model was trained.
loss, accuracy = model.evaluate(X_test, y_test_cat, verbose=0)
print("Model Evaluate Test Accuracy:", accuracy)

# Independent accuracy computation from the arg-max class of each prediction.
predictions = model.predict(X_test, verbose=0)
predicted_classes = np.argmax(predictions, axis=1)
acc = accuracy_score(y_test, predicted_classes)
print("Sklearn Accuracy Score:", acc)

Model Evaluate Test Accuracy: 0.20000000298023224
Sklearn Accuracy Score: 0.2
