In [1]:
import cv2
import numpy as np
import mediapipe as mp
from tensorflow.keras.models import load_model
import time

# Short aliases for the two MediaPipe solution modules used throughout this notebook:
# the holistic module (Holistic model class and the *_CONNECTIONS constants used for drawing)...
mp_holistic = mp.solutions.holistic
# ...and the drawing helpers (draw_landmarks) used to render landmarks onto frames.
mp_drawing = mp.solutions.drawing_utils


In [2]:
def mediapipe_detection(image, model):
    """Run a MediaPipe model on one BGR frame.

    The frame is converted BGR -> RGB, marked read-only while ``model.process``
    runs, made writeable again and converted back RGB -> BGR.

    Args:
        image: Frame as a NumPy array in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)`` (here a Holistic instance).

    Returns:
        ``(bgr_image, results)`` where ``bgr_image`` is the round-tripped frame
        and ``results`` is whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Toggle the writeable flag off only for the duration of inference.
    rgb_frame.flags.writeable = False
    detection = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), detection

def draw_styled_landmarks(image, results):
    """Draw face, pose and both hand landmark sets onto ``image`` in place.

    Each landmark group on ``results`` is passed to ``mp_drawing.draw_landmarks``
    together with its matching connection set from ``mp_holistic``. Groups are
    drawn in the order face, pose, left hand, right hand. Returns ``None``.
    """
    # (attribute on `results`, connection constant on `mp_holistic`), in draw order.
    layers = (
        ("face_landmarks", "FACEMESH_TESSELATION"),
        ("pose_landmarks", "POSE_CONNECTIONS"),
        ("left_hand_landmarks", "HAND_CONNECTIONS"),
        ("right_hand_landmarks", "HAND_CONNECTIONS"),
    )
    for landmarks_attr, connections_attr in layers:
        mp_drawing.draw_landmarks(
            image,
            getattr(results, landmarks_attr),
            getattr(mp_holistic, connections_attr),
        )

def extract_keypoints(results):
    """Flatten a holistic detection result into a single 1-D feature vector.

    The vector is the concatenation, in this order, of:
      * pose:       (x, y, z, visibility) per landmark, or 33*4 zeros if absent
      * face:       (x, y, z) per landmark, or 1404 zeros if absent
      * left hand:  (x, y, z) per landmark, or 21*3 zeros if absent
      * right hand: (x, y, z) per landmark, or 21*3 zeros if absent

    Args:
        results: Object with ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks`` attributes;
            each is either falsy or has a ``.landmark`` iterable.

    Returns:
        1-D ``numpy.ndarray`` (1662 values when every group is missing).
    """
    def flatten_or_zeros(landmark_list, fields, fallback_size):
        # A missing landmark group is replaced by a fixed-size block of zeros.
        if not landmark_list:
            return np.zeros(fallback_size)
        rows = [[getattr(point, field) for field in fields] for point in landmark_list.landmark]
        return np.array(rows).flatten()

    xyz = ("x", "y", "z")
    pose_vec = flatten_or_zeros(results.pose_landmarks, xyz + ("visibility",), 33 * 4)
    face_vec = flatten_or_zeros(results.face_landmarks, xyz, 1404)
    left_vec = flatten_or_zeros(results.left_hand_landmarks, xyz, 21 * 3)
    right_vec = flatten_or_zeros(results.right_hand_landmarks, xyz, 21 * 3)
    return np.concatenate([pose_vec, face_vec, left_vec, right_vec])


def recognize_hand_sign_language(action_labels=None, model_path='action.h5', camera_index=0,
                                 sequence_length=30, sentence_delay=5):
    """Run live sign recognition from a camera and return the accumulated sentence.

    Each captured frame is passed through MediaPipe Holistic, its keypoints are
    appended to a sliding window, and once the window holds ``sequence_length``
    frames the Keras model is asked for a prediction on every new frame. The
    current prediction is overlaid on the preview; at most once every
    ``sentence_delay`` seconds the current prediction is also appended to the
    sentence. The loop ends when 'q' is pressed in the preview window, when the
    capture is no longer open, or when a frame cannot be read.

    Args:
        action_labels: Sequence of class names indexed by the model's argmax.
            When ``None`` the module-level ``actions`` list is used, so existing
            no-argument calls keep working.
        model_path: Path of the saved Keras model to load.
        camera_index: Device index passed to ``cv2.VideoCapture``.
        sequence_length: Number of consecutive frames fed to the model.
        sentence_delay: Minimum seconds between two words added to the sentence.

    Returns:
        str: The recognized words, each followed by a single space.

    Raises:
        NameError: If ``action_labels`` is ``None`` and no global ``actions`` exists.
    """
    # Resolve labels first so a missing `actions` global fails before the camera is opened.
    labels = actions if action_labels is None else action_labels

    sequence = []  # Sliding window of per-frame keypoint vectors, oldest first
    recognized_sentence = ""

    cap = cv2.VideoCapture(camera_index)
    try:
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            model = load_model(model_path)
            last_recognition_time = time.time()

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret or frame is None:
                    # A failed grab yields no image; stop instead of crashing in cvtColor.
                    break

                image, results = mediapipe_detection(frame, holistic)
                draw_styled_landmarks(image, results)
                keypoints = extract_keypoints(results)

                # Keep the window in chronological order (oldest -> newest), which is the
                # order a sequence model trained on recorded clips expects.
                # NOTE(review): confirm against how the model in `model_path` was trained.
                sequence.append(keypoints)
                sequence = sequence[-sequence_length:]

                if len(sequence) == sequence_length:
                    batch = np.expand_dims(np.array(sequence), axis=0)
                    # verbose=0 keeps Keras from printing a progress bar for every frame.
                    res = model.predict(batch, verbose=0)
                    recognized_action = labels[np.argmax(res[0])]

                    cv2.putText(image, f"Recognized Action: {recognized_action}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

                    # Throttle: only extend the sentence once per `sentence_delay` seconds.
                    now = time.time()
                    if now - last_recognition_time >= sentence_delay:
                        recognized_sentence += recognized_action + " "
                        last_recognition_time = now

                    cv2.putText(image, f"Recognized Sentence: {recognized_sentence}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

                cv2.imshow('OpenCV Feed', image)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
    finally:
        # Always free the camera and close the preview, even if something above raised.
        cap.release()
        cv2.destroyAllWindows()

    return recognized_sentence

# Entry point for the cell. In a notebook kernel __name__ is "__main__", so this runs with the cell.
if __name__ == "__main__":
    # Class labels. recognize_hand_sign_language() reads this module-level name and indexes it
    # with the argmax of the model output, so it must exist before the call below.
    # NOTE(review): presumably listed in the same order as the model's output units; verify against training.
    actions = ['hello', 'thanks', 'iloveyou', 'beautiful', 'happy', 'loud']
    # Runs the webcam loop until it exits (e.g. 'q' pressed) and keeps the resulting sentence.
    recognized_sentence = recognize_hand_sign_language()




In [3]:
 # Show the sentence produced by the recognition run in the previous cell.
 print("Recognized Sentence:", recognized_sentence)

Recognized Sentence: hello hello hello iloveyou iloveyou loud iloveyou happy thanks hello hello thanks thanks loud iloveyou iloveyou 
