In [2]:
import cv2
import mediapipe as mp
import numpy as np

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "false"
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["TF_MLIR_ENABLE_METAL"] = "0"

import tensorflow as tf
import joblib
import time
import json
import collections

# === 1. Load model + encoder ===
# Directory holding the trained artifacts. The default is the original
# location; set the GESTURE_MODEL_DIR environment variable to run this cell on
# another machine without editing the code.
MODEL_DIR = os.environ.get(
    "GESTURE_MODEL_DIR",
    '/Users/atchudhansreekanth/Desktop/University/Proj/Dataset prep',
)
MODEL_PATH = os.path.join(MODEL_DIR, 'gesture_transformer_model.keras')
ENCODER_PATH = os.path.join(MODEL_DIR, 'label_encoder.pkl')

model = tf.keras.models.load_model(MODEL_PATH)
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only point ENCODER_PATH at a file you created or otherwise trust.
label_encoder = joblib.load(ENCODER_PATH)
print("✅ Model and encoder loaded.")

# === 2. Setup Mediapipe ===
mp_hands = mp.solutions.hands
# Video-stream mode (static_image_mode=False), tracking at most one hand.
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, model_complexity=0, min_detection_confidence=0.5)
mp_draw = mp.solutions.drawing_utils
# Rolling window of the 5 most recent normalized landmark vectors; the size
# must match the (1, 5, 42) reshape performed in predict_gesture.
frame_buffer = collections.deque(maxlen=5)

# === 3. Normalize ===
def normalize_landmarks(landmarks):
    """Translate and scale 21 (x, y) hand landmarks into a flat feature vector.

    The first landmark becomes the origin, then every point is divided by the
    largest distance from that origin. When all points coincide with the first
    one (largest distance is 0) the scaling step is skipped and zeros are
    returned.

    Args:
        landmarks: Array-like holding 42 numeric values that can be reshaped
            to (21, 2), e.g. a list of 21 (x, y) tuples. Integer input is
            converted to float64; floating-point input keeps its dtype. The
            caller's data is never modified.

    Returns:
        A 1-D array of 42 values (x0, y0, x1, y1, ...), or None when the input
        cannot be converted to a numeric 21 x 2 array.
    """
    try:
        points = np.array(landmarks)
        # In-place division on an integer array raises a casting error, which
        # previously surfaced as a silent None; work in floating point instead.
        if not np.issubdtype(points.dtype, np.floating):
            points = points.astype(np.float64)
        points = points.reshape(21, 2)
    except (TypeError, ValueError):
        # Only malformed input is treated as "no landmarks"; anything else
        # (including KeyboardInterrupt) propagates to the caller.
        return None

    points = points - points[0]
    max_dist = np.linalg.norm(points, axis=1).max()
    if max_dist > 0:
        points = points / max_dist
    return points.flatten()

# === 4. Predict Gesture ===
def predict_gesture(frames):
    """Classify one window of normalized landmark frames.

    Args:
        frames: Sequence of 5 frames holding 42 values each; it is reshaped
            into a single batch of shape (1, 5, 42) before inference.

    Returns:
        A (label, confidence) tuple. `confidence` is the largest value in the
        model output as a Python float; `label` is that class index decoded
        through `label_encoder`.
    """
    batch = np.reshape(np.array(frames), (1, 5, 42))
    scores = model.predict(batch, verbose=0)
    top_score = float(np.max(scores))
    top_index = np.argmax(scores)
    decoded = label_encoder.inverse_transform([top_index])
    return decoded[0], top_score

# === 5. Start Webcam ===
# Capture loop: grab a frame, detect hand landmarks, feed the rolling window
# of normalized landmarks to the classifier, and overlay the latest prediction.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("❌ Could not access webcam.")

last_prediction = {"gesture": "None", "confidence": 0.0}
# Monotonic clock so the print throttle is unaffected by system clock changes.
last_print_time = time.monotonic()
PRINT_INTERVAL = 2  # seconds

print("🎥 Webcam started. Press 'q' to quit.")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("⚠️ Failed to grab frame.")
            break

        # Mirror the image so on-screen motion matches the user's motion.
        frame = cv2.flip(frame, 1)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb)

        if result.multi_hand_landmarks:
            for hand in result.multi_hand_landmarks:
                lm = [(pt.x, pt.y) for pt in hand.landmark]
                norm = normalize_landmarks(lm)
                if norm is not None:
                    frame_buffer.append(norm)

                    # Predict only when the window actually changed; a full
                    # buffer without a new frame would just repeat the
                    # previous inference.
                    if len(frame_buffer) == 5:
                        try:
                            gesture, conf = predict_gesture(frame_buffer)
                            last_prediction = {"gesture": gesture, "confidence": round(conf, 4)}
                        except Exception as e:
                            # Best effort: keep the video running if a single
                            # inference fails.
                            print("❌ Prediction error:", e)

                mp_draw.draw_landmarks(frame, hand, mp_hands.HAND_CONNECTIONS)
        else:
            # The hand left the view: drop buffered frames so the next window
            # is built only from contiguous frames. last_prediction is kept,
            # so the overlay still shows the most recent result.
            frame_buffer.clear()

        # Display prediction on screen
        label = last_prediction["gesture"]
        conf = last_prediction["confidence"]
        cv2.putText(frame, f"{label} ({conf*100:.1f}%)", (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Print to console every few seconds
        now = time.monotonic()
        if now - last_print_time >= PRINT_INTERVAL:
            print(json.dumps(last_prediction, indent=4))
            last_print_time = now

        # Display the frame
        cv2.imshow("Gesture Recognition", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Runs on normal exit, on errors, and on a kernel interrupt, so the camera
    # and preview window are never left open.
    cap.release()
    cv2.destroyAllWindows()
    # Free the MediaPipe graph; re-run the setup section to use `hands` again.
    hands.close()


✅ Model and encoder loaded.


I0000 00:00:1743003828.064798  150152 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1743003828.091318  150721 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1743003828.098147  150721 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


🎥 Webcam started. Press 'q' to quit.


2025-03-26 21:13:50.011 python[2788:150152] +[IMKClient subclass]: chose IMKClient_Legacy
2025-03-26 21:13:50.011 python[2788:150152] +[IMKInputSession subclass]: chose IMKInputSession_Legacy
W0000 00:00:1743003830.864212  150721 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
I0000 00:00:1743003831.616416  150712 service.cc:152] XLA service 0x600002571e00 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1743003831.616430  150712 service.cc:160]   StreamExecutor device (0): Host, Default Version
I0000 00:00:1743003832.004072  150712 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


{
    "gesture": "BORED",
    "confidence": 0.9551
}
{
    "gesture": "COLLEGE_SCHOOL",
    "confidence": 0.8237
}
{
    "gesture": "ABUSE",
    "confidence": 0.8614
}
{
    "gesture": "ABUSE",
    "confidence": 0.6685
}
{
    "gesture": "COLLEGE_SCHOOL",
    "confidence": 0.9936
}
{
    "gesture": "COLLEGE_SCHOOL",
    "confidence": 0.8775
}
{
    "gesture": "ANGRY",
    "confidence": 0.9103
}
{
    "gesture": "COLLEGE_SCHOOL",
    "confidence": 0.7348
}
{
    "gesture": "COLLEGE_SCHOOL",
    "confidence": 0.9334
}
{
    "gesture": "ABUSE",
    "confidence": 0.573
}
{
    "gesture": "COLLEGE_SCHOOL",
    "confidence": 0.9997
}
{
    "gesture": "BORED",
    "confidence": 0.8385
}
{
    "gesture": "COLLEGE_SCHOOL",
    "confidence": 0.8216
}
{
    "gesture": "AFRAID",
    "confidence": 0.7745
}
{
    "gesture": "AFRAID",
    "confidence": 0.7007
}
{
    "gesture": "AFRAID",
    "confidence": 0.7007
}
{
    "gesture": "AFRAID",
    "confidence": 0.7007
}
{
    "gesture": "BORED",
    "co

KeyboardInterrupt: 