In [None]:
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
import pickle
import time
import json
import collections
import joblib

# -------------------------------------------------------------------
# ✅ 1) Load Model + Label Encoder
# -------------------------------------------------------------------
# Locations of the trained artifacts (absolute paths on the author's machine).
label_encoder_path = "/Users/atchudhansreekanth/Desktop/University/Proj/Dataset prep/label_encoder.pkl"
model_path = "/Users/atchudhansreekanth/Desktop/University/Proj/Dataset prep/gesture_transformer_model.keras"

# Load the Keras model first, then the label encoder used to turn class
# indices back into gesture names.
model = tf.keras.models.load_model(model_path)
label_encoder = joblib.load(label_encoder_path)

print("✅ Model and label encoder loaded.")

# -------------------------------------------------------------------
# ✅ 2) Mediapipe Hand Detection
# -------------------------------------------------------------------
# Rolling window of the most recent normalized landmark vectors; the deque
# drops the oldest entry automatically once it holds 5 items.
frame_buffer = collections.deque(maxlen=5)

# Shorthand handles into the MediaPipe solutions namespace.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Hand tracker configured for a video stream, at most one hand,
# model_complexity 0 and a 0.5 detection-confidence threshold.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    model_complexity=0,
    min_detection_confidence=0.5,
)

# -------------------------------------------------------------------
# ✅ 3) Normalize Landmarks
# -------------------------------------------------------------------
def normalize_landmarks(landmarks):
    """Return translation- and scale-normalized hand landmarks as a flat vector.

    The first landmark is moved to the origin and every point is divided by
    the largest distance from that origin, so the farthest point ends up at
    unit distance.

    Args:
        landmarks: 21 ``(x, y)`` pairs (anything ``np.array`` can reshape to
            ``(21, 2)``). Integer coordinates are accepted.

    Returns:
        A float ``numpy.ndarray`` of shape ``(42,)`` laid out as
        ``[x0, y0, x1, y1, ...]``.

    Raises:
        ValueError: if ``landmarks`` cannot be reshaped to ``(21, 2)``.
    """
    # Force a float array: with integer input the in-place division below
    # would otherwise fail with a casting error.
    points = np.array(landmarks, dtype=np.float64).reshape(21, 2)

    # Re-center on the first landmark (out-of-place, so the row being
    # subtracted is never modified while it is still being read).
    points = points - points[0]

    max_dist = np.linalg.norm(points, axis=1).max()
    # All points coincide -> nothing to scale; leave the zeros as they are.
    if max_dist > 0:
        points /= max_dist
    return points.flatten()

# -------------------------------------------------------------------
# ✅ 4) Inference & Output Formatting
# -------------------------------------------------------------------
def predict_gesture(frames):
    """Classify a window of landmark vectors with the loaded model.

    Args:
        frames: iterable of per-frame feature vectors; together they must
            reshape to ``(1, 5, 42)``.

    Returns:
        ``(label, confidence)`` where ``label`` is the decoded class for the
        highest-scoring entry of the model output and ``confidence`` is that
        highest score as a Python ``float``.
    """
    window = np.array(list(frames)).reshape(1, 5, 42)
    scores = model.predict(window, verbose=0)

    # argmax/max run over the flattened output; with a single-sample batch
    # that is the class axis (assumes output shape (1, n_classes) — TODO confirm).
    top_score = float(np.max(scores))
    top_index = np.argmax(scores)
    decoded = label_encoder.inverse_transform([top_index])
    return decoded[0], top_score

# -------------------------------------------------------------------
# ✅ 5) Real-Time Video Loop
# -------------------------------------------------------------------
# Open the default camera (index 0) and request a 320x240 capture size.
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 320)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 240)

# Most recent prediction; shown on every frame and printed periodically.
last_prediction = {"gesture": "None", "confidence": 0.0}
PRINT_INTERVAL = 3  # seconds between JSON prints
last_print_time = time.time()

# try/finally guarantees the camera and the window are released even when the
# loop is left through an exception (e.g. interrupting the notebook kernel).
try:
    if not cap.isOpened():
        # Without this the loop below would just exit silently on first read.
        print("❌ Could not open camera 0.")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Mirror the image, then convert BGR -> RGB for MediaPipe.
        frame = cv2.flip(frame, 1)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb)

        if result.multi_hand_landmarks:
            for hand_landmarks in result.multi_hand_landmarks:
                landmarks = [(lm.x, lm.y) for lm in hand_landmarks.landmark]
                frame_buffer.append(normalize_landmarks(landmarks))

                # Predict only once the rolling window is full.
                # NOTE(review): the buffer is not cleared when the hand
                # disappears, so a window can span a gap — confirm intended.
                if len(frame_buffer) == frame_buffer.maxlen:
                    gesture, conf = predict_gesture(frame_buffer)
                    last_prediction = {
                        "gesture": gesture,
                        "confidence": round(conf, 4),
                    }

                mp_drawing.draw_landmarks(
                    frame, hand_landmarks, mp_hands.HAND_CONNECTIONS
                )

        # Show on screen
        gesture_text = f"{last_prediction['gesture']} ({last_prediction['confidence']*100:.1f}%)"
        cv2.putText(frame, gesture_text, (20, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # JSON logging, throttled to one print per PRINT_INTERVAL seconds.
        now = time.time()
        if now - last_print_time >= PRINT_INTERVAL:
            print(json.dumps(last_prediction, indent=4))
            last_print_time = now

        cv2.imshow("Gesture Recognition", frame)
        # Press "q" in the preview window to quit.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


I0000 00:00:1742964018.246183   98817 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1742964018.246567   98817 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


✅ Model and label encoder loaded.


I0000 00:00:1742964019.208501   98817 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1742964019.234217  103730 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1742964019.242031  103730 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
2025-03-26 10:10:23.409 python[2613:98817] +[IMKClient subclass]: chose IMKClient_Legacy
2025-03-26 10:10:23.409 python[2613:98817] +[IMKInputSession subclass]: chose IMKInputSession_Legacy
W0000 00:00:1742964024.469721  103724 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


{
    "gesture": "None",
    "confidence": 0.0
}
