In [1]:
# Real-time recognition with OpenCV and MediaPipe
import cv2
import mediapipe as mp
import numpy as np
from keras.models import load_model

class_to_letter = {
    0: "A", 1: "B", 2: "C", 3: "D", 4: "E",
    5: "F", 6: "G", 7: "H", 8: "I", 9: "K",
    10: "L", 11: "M", 12: "N", 13: "O", 14: "P",
    15: "Q", 16: "R", 17: "S", 18: "T", 19: "U",
    20: "V", 21: "W", 22: "X", 23: "Y", 24: "Z"
}

# MediaPipe Hand model initialization
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.8)

# Load pre-trained model
model = load_model('new_model_yanna.h5')

# Video capture initialization
cap = cv2.VideoCapture(0)

# Stability variables
previous_prediction = None
stable_prediction = None
stability_counter = 0
stability_threshold = 5

# Previous bounding box for smoothing
prev_coords = None

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Convert frame to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            h, w, _ = frame.shape
            hand_coords = [(int(lm.x * w), int(lm.y * h)) for lm in hand_landmarks.landmark]
            x_min, y_min = np.min(hand_coords, axis=0)
            x_max, y_max = np.max(hand_coords, axis=0)

            # Increase the bounding box size by adding padding
            padding = 20  # Adjust this value to increase or decrease the padding
            x_min = max(0, x_min - padding)
            y_min = max(0, y_min - padding)
            x_max = min(w, x_max + padding)
            y_max = min(h, y_max + padding)

            # Enforce a minimum bounding box size
            if x_max - x_min < 10 or y_max - y_min < 10:
                continue  # Skip small or invalid detections

            # Smooth bounding box coordinates
            if prev_coords:
                smoothing_factor = 0.5
                x_min = int(smoothing_factor * prev_coords[0] + (1 - smoothing_factor) * x_min)
                y_min = int(smoothing_factor * prev_coords[1] + (1 - smoothing_factor) * y_min)
                x_max = int(smoothing_factor * prev_coords[2] + (1 - smoothing_factor) * x_max)
                y_max = int(smoothing_factor * prev_coords[3] + (1 - smoothing_factor) * y_max)

            prev_coords = [x_min, y_min, x_max, y_max]

            # Crop and preprocess the hand region
            hand_img = frame[y_min:y_max, x_min:x_max]

            # Convert the cropped hand image to grayscale
            hand_img_gray = cv2.cvtColor(hand_img, cv2.COLOR_BGR2GRAY)

            # Adaptive thresholding for better background removal
            hand_mask = cv2.adaptiveThreshold(
                hand_img_gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
            )

            # Create a red background
            red_background = np.zeros_like(hand_img)
            red_background[:, :, 2] = 255  # Red channel

            # Combine hand region with red background
            hand_img_no_bg = cv2.bitwise_and(hand_img, hand_img, mask=hand_mask)
            alpha = 0.5
            hand_bg_red = cv2.addWeighted(hand_img_no_bg, 1 - alpha, red_background, alpha, 0)

            # Resize and normalize the grayscale hand image
            hand_img_gray_resized = cv2.resize(hand_img_gray, (28, 28)) / 255.0

            # Reshape to match the model input shape
            hand_img_reshaped = hand_img_gray_resized.reshape(1, 28, 28, 1)

            # Predict the ASL gesture
            prediction = model.predict(hand_img_reshaped)
            gesture_class = np.argmax(prediction)
            gesture_letter = class_to_letter[gesture_class]  # Convert class number to letter

            # Smooth predictions to avoid flickering
            if previous_prediction == gesture_class:
                stability_counter += 1
            else:
                stability_counter = 0

            if stability_counter >= stability_threshold:
                stable_prediction = gesture_letter

            previous_prediction = gesture_class

            # Display the letter and bounding box
            if stable_prediction:
                cv2.putText(frame, f"Gesture: {stable_prediction}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

            # Show the cropped hand for debugging
            cv2.imshow('Cropped Hand', hand_img_no_bg)

    # Show the processed frame
    cv2.imshow('ASL Gesture Recognition', frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2