In [3]:
import cv2
import mediapipe as mp
import numpy as np
import os
import tensorflow as tf
import joblib
import time
import json
import collections

# === 1. Load model + encoder ===
model = tf.keras.models.load_model('/Users/atchudhansreekanth/Desktop/University/Proj/Dataset prep/gesture_transformer_model.keras')
label_encoder = joblib.load('/Users/atchudhansreekanth/Desktop/University/Proj/Dataset prep/label_encoder.pkl')
print("✅ Model and encoder loaded.")

# === 2. Setup Mediapipe ===
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, model_complexity=0, min_detection_confidence=0.5)
mp_draw = mp.solutions.drawing_utils
frame_buffer = collections.deque(maxlen=5)

# === 3. Normalize Landmarks for Both Hands ===
def normalize_landmarks(landmarks):
    try:
        points = np.array(landmarks).reshape(21, 2)
        base_x, base_y = points[0]
        points -= [base_x, base_y]
        max_dist = np.linalg.norm(points, axis=1).max()
        if max_dist > 0:
            points /= max_dist
        return points.flatten()  # This returns 42 values (21 points × 2 coordinates)
    except:
        return None

# === 4. Predict Gesture ===
def predict_gesture(frames):
    try:
        # Debug info
        print(f"Frame buffer length: {len(frames)}")
        print(f"Frame buffer item length: {len(frames[0])}")
        print(f"Total elements: {len(frames) * len(frames[0])}")
        
        input_data = np.array(frames).reshape(1, 5, 84)  # 84 features for 2 hands
        preds = model.predict(input_data, verbose=0)
        confidence = float(np.max(preds))
        label = label_encoder.inverse_transform([np.argmax(preds)])[0]
        return label, confidence
    except Exception as e:
        print(f"❌ Prediction error: {e}")
        print(f"Buffer shape: {np.array(frames).shape}")
        return "Error", 0.0

# === 5. Start Webcam ===
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("❌ Could not access webcam.")

last_prediction = {"gesture": "None", "confidence": 0.0}
last_print_time = time.time()
PRINT_INTERVAL = 2  # seconds

print("🎥 Webcam started. Press 'q' to quit.")

while True:
    ret, frame = cap.read()
    if not ret:
        print("⚠️ Failed to grab frame.")
        break

    frame = cv2.flip(frame, 1)
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(rgb)

    if result.multi_hand_landmarks:
        # Initialize an array for this frame's combined hand data
        combined_hand_data = []
        
        for hand in result.multi_hand_landmarks:
            lm = [(pt.x, pt.y) for pt in hand.landmark]
            norm = normalize_landmarks(lm)
            if norm is not None:
                combined_hand_data.extend(norm)  # Add this hand's data
            
            # Draw landmarks
            mp_draw.draw_landmarks(frame, hand, mp_hands.HAND_CONNECTIONS)
        
        # If we have exactly two hands, we should have 84 features
        if len(combined_hand_data) == 84:  # Ensure we have the right number of features
            frame_buffer.append(combined_hand_data)
        
        # If we have less than 2 hands, pad with zeros to maintain consistent shape
        elif len(result.multi_hand_landmarks) == 1 and len(combined_hand_data) == 42:
            # We have one hand (42 features), so pad with zeros for the second hand
            combined_hand_data.extend([0.0] * 42)  # Add 42 zeros for the missing hand
            frame_buffer.append(combined_hand_data)
            
        if len(frame_buffer) == 5:
            gesture, conf = predict_gesture(frame_buffer)
            if gesture != "Error":  # Only update if prediction was successful
                last_prediction = {"gesture": gesture, "confidence": round(conf, 4)}

    # Display prediction on screen
    label = last_prediction["gesture"]
    conf = last_prediction["confidence"]
    cv2.putText(frame, f"{label} ({conf*100:.1f}%)", (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    # Print to console every few seconds
    if time.time() - last_print_time >= PRINT_INTERVAL:
        print(json.dumps(last_prediction, indent=4))
        last_print_time = time.time()

    # Display the frame
    cv2.imshow("Gesture Recognition", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()


✅ Model and encoder loaded.


I0000 00:00:1743006932.654056  216139 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1743006932.680670  223044 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1743006932.687340  223044 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


🎥 Webcam started. Press 'q' to quit.
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420


I0000 00:00:1743006934.838175  216414 service.cc:152] XLA service 0x6000002f8400 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1743006934.838198  216414 service.cc:160]   StreamExecutor device (0): Host, Default Version
I0000 00:00:1743006935.489827  216414 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


{
    "gesture": "CLASS",
    "confidence": 0.7051
}
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffe

Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
{
    "gesture": "CLASS",
    "confidence": 0.9997
}
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffe

Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
{
    "gesture": "APPRECIATE",
    "confidence": 0.9878
}
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
{
    "gesture": "APPRECIATE",
    "confidence": 0.861
}
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 

{
    "gesture": "AFRAID",
    "confidence": 0.6733
}
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buff

Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
{
    "gesture": "APPRECIATE",
    "confidence": 0.9027
}
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame buffer length: 5
Frame buffer item length: 84
Total elements: 420
Frame 

KeyboardInterrupt: 