In [1]:
import tensorflow as tf
import numpy as np

# === Embedding Model Definition ===
def create_embedding_model(input_shape):
    """Build the encoder that turns one input sequence into a unit-length vector.

    Pipeline: bidirectional LSTM (64 units per direction) -> Dense(128, ReLU)
    -> L2 normalisation along axis 1.

    Args:
        input_shape: Per-sample shape (no batch axis), forwarded unchanged to
            ``tf.keras.Input``.

    Returns:
        A ``tf.keras.Model`` mapping the input to a 128-wide, L2-normalised
        embedding.
    """
    sequence_in = tf.keras.Input(shape=input_shape)

    recurrent = tf.keras.layers.LSTM(64)
    features = tf.keras.layers.Bidirectional(recurrent)(sequence_in)
    features = tf.keras.layers.Dense(128, activation='relu')(features)

    # Scale every row to unit L2 norm so dot products equal cosine similarity.
    unit_embedding = tf.keras.layers.Lambda(
        lambda t: tf.nn.l2_normalize(t, axis=1)
    )(features)

    return tf.keras.Model(inputs=sequence_in, outputs=unit_embedding)

# === Triplet Loss (Only used for original training, not needed during loading) ===
def triplet_loss(alpha=0.2):
    """Create a triplet margin loss usable as a Keras ``loss=`` callable.

    Args:
        alpha: Margin added to (positive distance - negative distance) before
            the hinge at zero.

    Returns:
        A function ``loss(y_true, y_pred)``. ``y_pred`` is sliced along axis 1
        into anchor (columns 0-127), positive (128-255) and negative (256
        onward) embeddings; ``y_true`` is ignored.
    """
    def loss(y_true, y_pred):
        anchor = y_pred[:, :128]
        positive = y_pred[:, 128:256]
        negative = y_pred[:, 256:]

        def squared_distance(other):
            # Squared Euclidean distance from the anchor, one value per row.
            return tf.reduce_sum(tf.square(anchor - other), axis=1)

        margin_gap = squared_distance(positive) - squared_distance(negative) + alpha
        # Hinge: only triplets that violate the margin contribute.
        return tf.reduce_mean(tf.maximum(margin_gap, 0.0))
    return loss

# === Build Triplet Model (weights are loaded separately below) ===
def create_triplet_model(input_shape):
    """Assemble the three-branch triplet network around one shared encoder.

    The same embedding model is applied to the anchor, positive and negative
    inputs (in that order) and the three embeddings are concatenated into a
    single output tensor, which is the layout ``triplet_loss`` slices apart.

    Args:
        input_shape: Per-sample shape (no batch axis) used for every branch.

    Returns:
        ``(model, embedding_model)``: the compiled triplet model and the shared
        encoder it wraps.
    """
    shared_encoder = create_embedding_model(input_shape)

    # Order matters: anchor, positive, negative.
    branch_inputs = [tf.keras.Input(shape=input_shape) for _ in range(3)]
    branch_embeddings = [shared_encoder(branch) for branch in branch_inputs]

    merged = tf.keras.layers.Concatenate()(branch_embeddings)

    triplet_net = tf.keras.Model(inputs=branch_inputs, outputs=merged)
    triplet_net.compile(optimizer='adam', loss=triplet_loss())
    return triplet_net, shared_encoder

# === Load Full Weights, Extract Embedding Model ===
# Per-sample shape handed to tf.keras.Input (no batch axis).
input_shape = (60, 126)

# The weights file is loaded into the full triplet model; embedding_model is
# the encoder shared by its three branches, so it receives the weights too.
# NOTE(review): the absolute path below is specific to one machine.
triplet_model, embedding_model = create_triplet_model(input_shape)
triplet_model.load_weights(
    filepath="/Users/atchudhansreekanth/Downloads/similarity.weights.h5"
)

# Both status lines are printed only once loading has succeeded.
print(
    "✅ Full weights loaded into triplet model",
    "✅ Embedding model ready for single-gesture inference",
    sep="\n",
)


✅ Full weights loaded into triplet model
✅ Embedding model ready for single-gesture inference


  saveable.load_own_variables(weights_store.get(inner_path))


In [2]:
import tensorflow as tf
import joblib

# === Load Label Encoder ===
# NOTE(review): joblib.load unpickles the file, so only load encoders that come
# from a trusted source. The absolute path is specific to one machine.
label_encoder = joblib.load(
    filename="/Users/atchudhansreekanth/Downloads/label_encoder.pkl"
)
print("✅ Label encoder loaded.")

# === Redefine Custom Layer Used in Model ===
class TemporalPositionalEncoding(tf.keras.layers.Layer):
    """Add a fixed sinusoidal position signal to every sequence in a batch.

    The layer defines no weights of its own. The encoding is laid out as all
    sine columns followed by all cosine columns (concatenated, not
    interleaved).
    """

    def call(self, inputs):
        """Return ``inputs`` plus the positional encoding.

        Args:
            inputs: Tensor whose axis 1 is the sequence axis and whose last
                axis is the feature axis; the last dimension must be
                statically known.

        Returns:
            A tensor with the same shape as ``inputs``.
        """
        seq_len = tf.shape(inputs)[1]
        d_model = inputs.shape[-1]
        position = tf.cast(tf.range(seq_len)[:, tf.newaxis], tf.float32)
        div_term = tf.exp(
            tf.range(0, d_model, 2, dtype=tf.float32)
            * -(tf.math.log(10000.0) / tf.cast(d_model, tf.float32))
        )
        angles = position * div_term
        pe = tf.concat([tf.sin(angles), tf.cos(angles)], axis=-1)
        # For an odd d_model the sin/cos halves together are d_model + 1 wide,
        # which would make the addition below fail; trim to the feature width.
        # This is a no-op when d_model is even.
        pe = pe[:, :d_model]
        pe = pe[tf.newaxis, :, :]  # leading axis of size 1 broadcasts over the batch
        return inputs + pe

# === Load Model with Custom Objects ===
# The custom layer class is passed in so Keras can resolve it by name while
# deserialising the saved model.
# NOTE(review): the absolute path is specific to one machine.
classification_model = tf.keras.models.load_model(
    filepath="/Users/atchudhansreekanth/Downloads/gesture_model.keras",
    custom_objects={"TemporalPositionalEncoding": TemporalPositionalEncoding},
)

print("✅ Model loaded successfully.")

✅ Label encoder loaded.
✅ Model loaded successfully.


In [4]:
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
import joblib
import time
import json
import collections
from sklearn.metrics.pairwise import cosine_similarity

# === 2. Setup Mediapipe ===
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

# Hand tracker configured for a video stream, reporting at most two hands.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    model_complexity=0,
    min_detection_confidence=0.5,
)

# Rolling window: once five frames are stored, each append drops the oldest.
frame_buffer = collections.deque(maxlen=5)

# === 3. Normalize Landmarks ===
def normalize_landmarks(landmarks):
    """Translate and scale 21 (x, y) landmarks relative to the first landmark.

    The first point (index 0) is moved to the origin and all points are divided
    by the largest distance from that origin, so the farthest point ends up at
    distance 1. If every point coincides with the first one, the result is all
    zeros (no scaling is applied).

    Args:
        landmarks: Anything NumPy can convert to 42 numbers, e.g. a list of 21
            ``(x, y)`` pairs. Integer coordinates are accepted. The input is
            never modified.

    Returns:
        A flat ``float64`` array of length 42, or ``None`` when the input
        cannot be interpreted as 21 numeric 2-D points.
    """
    try:
        # Force a float64 copy: with an integer array the in-place division
        # below raises a casting error, which the old bare ``except`` turned
        # into a silent ``None``.
        points = np.array(landmarks, dtype=np.float64).reshape(21, 2)
    except (TypeError, ValueError):
        # Wrong element count or non-numeric content; callers test for None.
        return None

    points -= points[0].copy()
    max_dist = np.linalg.norm(points, axis=1).max()
    if max_dist > 0:
        points /= max_dist
    return points.flatten()

# === 4. Predict Classification & Similarity ===
# NOTE: These must be loaded before running this script
# classification_model
# embedding_model
# label_encoder

# Add reference embeddings as needed (optional)
# Maps a gesture label to its reference embedding vector; predict() looks the
# predicted label up here. Empty by default. Example of adding an entry:
#   reference_embeddings['hello'] = np.load('ref_hello.npy')
reference_embeddings = dict()

def predict(frames):
    """Classify a short window of landmark frames and score its embedding.

    The window is zero-padded to the ``(60, 126)`` input the models are fed,
    then run through ``classification_model`` and ``embedding_model``.

    Args:
        frames: Between 1 and 60 per-frame feature vectors of 84 values each
            (e.g. the ``frame_buffer`` deque). The number of frames is taken
            from the data rather than assumed to be exactly 5.

    Returns:
        ``(label, confidence, similarity)`` where ``label`` is the decoded
        class with the highest score, ``confidence`` is that score as a float,
        and ``similarity`` is the cosine similarity to the label's entry in
        ``reference_embeddings`` (or the fallback described below).

    Raises:
        ValueError: If ``frames`` is empty, is not a whole number of 84-value
            frames, or holds more than 60 frames.
    """
    frame_width = 84                    # values per captured frame
    model_steps, model_width = 60, 126  # (timesteps, features) fed to the models

    input_seq = np.array(frames).reshape(-1, frame_width)
    n_frames = input_seq.shape[0]
    if not 0 < n_frames <= model_steps:
        raise ValueError(
            f"predict() needs between 1 and {model_steps} frames, got {n_frames}"
        )

    # Zero-pad trailing timesteps and trailing feature columns.
    padded_seq = np.pad(
        input_seq,
        ((0, model_steps - n_frames), (0, model_width - frame_width)),
        mode='constant',
    )  # (60, 126)
    model_input = np.expand_dims(padded_seq, axis=0)  # (1, 60, 126)

    # === Classification
    preds = classification_model.predict(model_input, verbose=0)
    confidence = float(np.max(preds))
    label = label_encoder.inverse_transform([np.argmax(preds)])[0]

    # === Embedding
    embedding = embedding_model.predict(model_input, verbose=0)[0]

    # === Similarity with Reference (if available)
    if label in reference_embeddings:
        ref = reference_embeddings[label]
        similarity = float(cosine_similarity([embedding], [ref])[0][0])
    else:
        # Fallback self-similarity.
        # NOTE(review): when the embedding model L2-normalises its output (as
        # create_embedding_model in this notebook does) this is always ~1.0 and
        # carries no information; consider registering reference embeddings.
        similarity = float(np.dot(embedding, embedding))

    return label, confidence, similarity


# === 5. Start Webcam ===
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("❌ Could not access webcam.")

last_prediction = {"gesture": "None", "confidence": 0.0, "similarity": 0.0}
last_print_time = time.time()
PRINT_INTERVAL = 2  # seconds between forced console prints

print("🎥 Webcam started. Press 'q' to quit.")

# try/finally guarantees the camera and the preview window are released even
# when the loop is left through an exception or a kernel interrupt
# (KeyboardInterrupt) rather than through the 'q' key.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("⚠️ Failed to grab frame.")
            break

        frame = cv2.flip(frame, 1)  # mirror horizontally
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb)

        if result.multi_hand_landmarks:
            # Concatenate the normalised (x, y) landmarks of every detected hand.
            combined_hand_data = []
            for hand in result.multi_hand_landmarks:
                lm = [(pt.x, pt.y) for pt in hand.landmark]
                norm = normalize_landmarks(lm)
                if norm is not None:
                    combined_hand_data.extend(norm)
                mp_draw.draw_landmarks(frame, hand, mp_hands.HAND_CONNECTIONS)

            if len(combined_hand_data) == 84:
                frame_buffer.append(combined_hand_data)
            elif len(result.multi_hand_landmarks) == 1 and len(combined_hand_data) == 42:
                # Single hand: zero-fill the slot of the missing second hand.
                combined_hand_data.extend([0.0] * 42)
                frame_buffer.append(combined_hand_data)

            if len(frame_buffer) == 5:
                gesture, conf, sim = predict(frame_buffer)

                # Update when the gesture, confidence or similarity changes
                # noticeably, or when the print interval has elapsed.
                significant_change = (
                    gesture != last_prediction["gesture"] or
                    abs(conf - last_prediction["confidence"]) > 0.02 or
                    abs(sim - last_prediction["similarity"]) > 0.05
                )

                time_elapsed = time.time() - last_print_time >= PRINT_INTERVAL

                if significant_change or time_elapsed:
                    last_prediction = {
                        "gesture": gesture,
                        "confidence": round(conf, 4),
                        "similarity": round(sim, 4)
                    }

                    # Print to console
                    print(json.dumps(last_prediction, indent=4))
                    last_print_time = time.time()

        # === Display Info ===
        label = last_prediction["gesture"]
        conf = last_prediction["confidence"]
        sim = last_prediction["similarity"]
        cv2.putText(frame, f"{label} ({conf*100:.1f}%, Sim: {sim*100:.1f}%)", (20, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

        # Periodic re-print of the last prediction, also while no hand is visible.
        if time.time() - last_print_time >= PRINT_INTERVAL:
            print(json.dumps(last_prediction, indent=4))
            last_print_time = time.time()

        cv2.imshow("Gesture Recognition", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


I0000 00:00:1743185314.841403   59018 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1743185314.852919   68186 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1743185314.857362   68186 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


🎥 Webcam started. Press 'q' to quit.
{
    "gesture": "None",
    "confidence": 0.0,
    "similarity": 0.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
    "similarity": 1.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
    "similarity": 1.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
    "similarity": 1.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
    "similarity": 1.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
    "similarity": 1.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
    "similarity": 1.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
    "similarity": 1.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
    "similarity": 1.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
    "similarity": 1.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
    "similarity": 1.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
    "similarity": 1.0
}
{
    "gesture": "hello",
    "confidence": 0.9433,
   

KeyboardInterrupt: 