In [None]:
!pip install tensorflow mediapipe opencv-python scikit-learn matplotlib

In [149]:
import tensorflow
import mediapipe as mp
import cv2
import numpy as np
import sklearn
import matplotlib
import os

In [150]:
# Initialize the mediapipe elements used throughout the notebook:
#   mp_hands   - Hands solution module (provides the Hands model and HAND_CONNECTIONS)
#   mp_drawing - drawing utilities used to render the detected landmarks on a frame
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

## Landmarks Detection Function

In [151]:
# Mediapipe Hands Detection
def hand_detection(image, model):
    """Run the hands model on one OpenCV frame.

    Args:
        image: Frame in BGR channel order (OpenCV's default).
        model: Object whose `process` method extracts the landmarks from an RGB frame.

    Returns:
        Tuple `(bgr_image, results)`: the frame converted back to BGR, and the
        value returned by `model.process`.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)    # The model is fed RGB frames

    # The frame is marked read-only only for the duration of the model call
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)    # Back to the OpenCV color space
    return bgr_frame, results

In [152]:
def draw_landmarks(image, result):
    """Draw the joints and connections of every detected hand onto `image`.

    Args:
        image: Frame to draw on; the drawing is applied to this object.
        result: Output of the hands model for that frame. Nothing is drawn
            when its `multi_hand_landmarks` is empty or None.
    """
    # Read from the argument, not from the notebook-global `results`, which may be
    # undefined (fresh kernel) or belong to a different frame
    if not result.multi_hand_landmarks:
        return
    for hand in result.multi_hand_landmarks:    # One entry per detected hand
        mp_drawing.draw_landmarks(image, hand, mp_hands.HAND_CONNECTIONS)

In [153]:
# Testing the Webcam Stream

cap = cv2.VideoCapture(0)    # Webcam stream component

try:
    # Using the mediapipe hands model
    with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5, max_num_hands=2) as hands:
        while cap.isOpened():
            ret, frame = cap.read()    # Get a frame from webcam stream
            if not ret:    # No frame delivered (camera busy/unplugged): stop instead of crashing in cvtColor
                break

            # Detections
            image, results = hand_detection(frame, hands)

            # Draw Landmarks
            draw_landmarks(image, results)

            cv2.imshow('Hand Tracking', image)    # Frame after applying the mediapipe drawings

            if cv2.waitKey(10) & 0xFF == ord('q'):    # Press 'Q' to exit
                break
finally:
    # Always free the camera and close the preview window, even if the loop raised
    cap.release()
    cv2.destroyAllWindows()

I0000 00:00:1744639037.291349    8238 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1744639037.293606   17965 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: Mesa Intel(R) Graphics (RPL-P)


In [None]:
# Inspect the world landmarks of the first hand in the last processed frame
# (evaluates to None when no hand was detected, instead of raising)
results.multi_hand_world_landmarks[0].landmark if results.multi_hand_world_landmarks else None

## Extracting Keypoints from Mediapipe output

In [154]:
def extract_keypoints(results, max_hands=2, landmarks_per_hand=21):
    """Flatten the world landmarks of the detected hands into one fixed-length vector.

    Each hand contributes `landmarks_per_hand` (x, y, z) triples, written in the order
    the hands appear in `results.multi_hand_world_landmarks`. Slots of hands that were
    not detected stay zero and detections beyond `max_hands` are ignored, so the
    result has the same length no matter how many hands were found.

    Args:
        results: Output of the hands model for one frame.
        max_hands: Number of hand slots in the output vector.
        landmarks_per_hand: Number of landmarks stored per hand.

    Returns:
        1-D numpy array of length `max_hands * landmarks_per_hand * 3`
        (126 with the defaults).
    """
    values_per_hand = landmarks_per_hand * 3
    keypoints = np.zeros(max_hands * values_per_hand)    # Zero-filled: missing hands need no extra padding

    detected_hands = results.multi_hand_world_landmarks or []    # None when nothing was detected
    for slot, hand_landmarks in enumerate(detected_hands):
        if slot >= max_hands:    # Extra detections would otherwise grow the vector (e.g. to 189 values)
            break
        coords = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]).flatten()
        coords = coords[:values_per_hand]
        start = slot * values_per_hand
        keypoints[start:start + coords.size] = coords
    return keypoints

In [155]:
# Sanity check on the last processed frame: the keypoint vector should have shape (126,)
extract_keypoints(results).shape

(126,)

## Setup Directories

In [164]:
# Root folder that holds the collected keypoint arrays
data_path = 'Data'

# Vocabulary of the 23 signs to collect and recognise
signs = np.array([
    'what', 'hello', 'how are you', 'me', 'student', 'nice to meet you',
    'you', 'name', 'this', 'project', 'our', 'sentences',
    'understand', 'computer', 'classroom', 'chairs', 'many', 'yes',
    'no', 'thank you', 'good', 'morning', 'afternoon',
])

# Number of videos recorded per sign
# NOTE(review): an earlier note mentions 25 left-hand + 25 right-hand videos collected
# in two passes, which does not match the value below - confirm the intended protocol
no_sequences = 30

# Number of frames per video collected
sequence_length = 50

In [157]:
# Make directories to store the numPy arrays, one folder per video: <data_path>/<sign>/<sequence>/

for sign in signs:
    for sequence in range(no_sequences):
        # exist_ok makes re-runs harmless while real failures (e.g. permissions) still surface
        os.makedirs(os.path.join(data_path, sign, str(sequence)), exist_ok=True)

## Extract Data and store in directories

In [165]:
cap = cv2.VideoCapture(0)
stop_requested = False    # Becomes True when 'Q' is pressed or the camera stops delivering frames

try:
    # Using the mediapipe hands model
    with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5, max_num_hands=2) as hands:
        for sign in signs:
            flag = True    # Ask for 'S' once per sign, before its first video

            for sequence in range(no_sequences):

                # Iteration -1 is the "get ready" step; iterations 0..sequence_length-1 are saved
                for frame_number in range(-1, sequence_length):

                    ret, frame = cap.read()    # Get a frame from webcam stream
                    if not ret:    # No usable frame: abort instead of crashing in cvtColor/putText
                        stop_requested = True
                        break

                    if frame_number == -1:
                        while flag:
                            ret, frame = cap.read()
                            if not ret:
                                stop_requested = True
                                break
                            cv2.putText(frame, 'START Collection \'{}\', press S.'.format(sign), (10, 50),
                                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
                            cv2.imshow('Frames Collection', frame)
                            if cv2.waitKey(10) & 0xFF == ord('s'):    # Press 'S' to start capturing data for the particular sign
                                flag = False
                        if stop_requested:
                            break
                        continue

                    # Detections
                    image, results = hand_detection(frame, hands)
                    draw_landmarks(image, results)

                    cv2.putText(image, 'Collecting \'{}\''.format(sign), (10, 50),
                                cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
                    cv2.putText(image, 'Video number {}'.format(sequence), (10, 100),
                                cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
                    cv2.imshow('Frames Collection', image)    # Frame after applying the mediapipe drawings

                    # Extracting and saving numPy arrays
                    keypoints = extract_keypoints(results)

                    npy_path = os.path.join(data_path, sign, str(sequence), str(frame_number))    # Path to store the array at
                    np.save(npy_path, keypoints)    # Save arrays for individual frames (np.save appends '.npy')

                    if cv2.waitKey(10) & 0xFF == ord('q'):    # Press 'Q' to abort the whole collection
                        stop_requested = True
                        break

                # A plain `break` only leaves the innermost loop, so propagate the stop outwards
                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Always free the camera and close the window, even if collection raised
    cap.release()
    cv2.destroyAllWindows()

I0000 00:00:1744646947.349596    8238 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1744646947.351123   23495 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: Mesa Intel(R) Graphics (RPL-P)


In [163]:
# Manual cleanup: release the webcam and close any OpenCV windows
# (useful when the collection cell above was interrupted before reaching its own cleanup)
cap.release()
cv2.destroyAllWindows()

## Pre-Processing data and Creating Labels

In [166]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [167]:
# Map every sign to its integer class index (its position in `signs`)
label_map = dict(zip(signs, range(len(signs))))

In [168]:
# Merge labels with the corresponding sequence data
expected_keypoints = 126    # 2 hands * 21 landmarks * (x, y, z)

sequences, labels = [], []
for sign in signs:
    for sequence in range(no_sequences):
        window = []
        # Frames 1..sequence_length-1 are loaded (49 per video); frame 0 is not used
        for frame_number in range(1, sequence_length):
            res = np.load(os.path.join(data_path, sign, str(sequence), "{}.npy".format(frame_number)))
            if res.shape[0] > expected_keypoints:
                # Oversized frames (e.g. 189 values) would make the dataset ragged; keep the first 126,
                # which is what the manual clean-up (deleting indices 126-188) did
                print("Trimming {} values to {}: {}/{}/{}".format(res.shape[0], expected_keypoints, sign, sequence, frame_number))
                res = res[:expected_keypoints]
            window.append(res)
        sequences.append(window)
        labels.append(label_map[sign])

## FIXING THE MESS

In [275]:
# index = []
# for i in range(126, 189):
#     index.append(i)

In [252]:
# sequences[123][16] = np.delete(sequences[123][16], index)

<!-- 690, 49, 126 -->

In [314]:
# for i in range(0, 690):
#     for j in range(0, 49):
#         if sequences[i][j].shape != (126,):
#             print(f"At {i}, {j}")

At 486, 48


In [276]:
# for i in range(0, 689):
#     for j in range(0, 48):
#         if len(sequences[i][j]) != 126:
#             print("At ", i, ", ", j, "length is: ", len(sequences[i][j]))
#             # print(i, ", ", j)
#             # sequences[i][j] = np.delete(sequences[i][j], index)

## Back to labelling

In [319]:
np.array(sequences).shape    # (total_videos, frames_per_video, keypoints_per_frame)

(690, 49, 126)

In [320]:
X = np.array(sequences)    # The coordinate data to train the model with: (videos, frames, keypoints)

In [321]:
X.shape    # (a, b, c) => (b, c) is the per-sample input shape for the first LSTM layer

(690, 49, 126)

In [322]:
# One-hot labels for the corresponding X elements: y[i] has a 1 at the class index of X[i]
# num_classes is pinned to the vocabulary size so y always matches the model's output layer
y = to_categorical(labels, num_classes=len(signs)).astype(int)

In [323]:
# Split the dataset into training (95%) and testing (5%)
# A fixed random_state makes the split, and therefore the reported results, reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

## Building and Training the model

In [324]:
from tensorflow.keras.models import Sequential

In [325]:
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [340]:
# (Optional) TensorBoard callback for evaluating the model; it is given the 'Logs' directory
log_dir = 'Logs'
tb_callback = TensorBoard(log_dir=log_dir)

In [341]:
# Stacked-LSTM classifier: three LSTM layers followed by a dense head
# with one softmax output per sign
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(49, 126)),
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),    # Only the last time step is passed on
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(signs.shape[0], activation='softmax'),
])

In [342]:
X.shape    # (_, a, b) where (a, b) is the input to the first LSTM layer => in this case (49, 126)

(690, 49, 126)

In [343]:
# Categorical cross-entropy loss and categorical accuracy metric, matching the one-hot labels in y
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
# Load the pre-trained model (h5 or keras format). Use only one!
# The load is skipped when no saved model exists yet (e.g. fresh checkout), so the notebook
# can continue to the training cell with the freshly built model
if os.path.exists('asl.keras'):
    model = tensorflow.keras.models.load_model('asl.keras')
else:
    print("'asl.keras' not found - keeping the untrained model defined above")
# model.load_weights('asl.h5')

In [345]:
# !!!! Do not run if model is already trained !!!!
# Train the model with the dataset, change epochs based on size of dataset
# EarlyStopping ends the run once the training loss has stopped improving and restores the
# best weights seen, so training no longer has to be stopped by interrupting the kernel
early_stop = tensorflow.keras.callbacks.EarlyStopping(monitor='loss', patience=20, restore_best_weights=True)
model.fit(X_train, y_train, epochs=1000, callbacks=[tb_callback, early_stop])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

KeyboardInterrupt: 

In [358]:
# Print the layer-by-layer overview (output shapes and parameter counts)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_9 (LSTM)               (None, 49, 64)            48896     
                                                                 
 lstm_10 (LSTM)              (None, 49, 128)           98816     
                                                                 
 lstm_11 (LSTM)              (None, 64)                49408     
                                                                 
 dense_8 (Dense)             (None, 64)                4160      
                                                                 
 dense_9 (Dense)             (None, 32)                2080      
                                                                 
 dense_10 (Dense)            (None, 23)                759       
                                                                 
Total params: 204119 (797.34 KB)
Trainable params: 204

## Making Predictions and Saving Model

In [360]:
# Model outputs for every sample of the test split (one row of class scores per sample)
res = model.predict(X_test)



In [365]:
# Testing - Check if the model can predict the test split accurately
predicted = np.argmax(res, axis=1)    # Predicted class index per test sample
actual = np.argmax(y_test, axis=1)    # True class index per test sample

for i in range(len(actual)):    # Covers the whole test split, whatever its size
    print(i, ":")
    print(predicted[i] == actual[i])

print("Accuracy:", np.mean(predicted == actual))

0 :
True
1 :
True
2 :
True
3 :
True
4 :
True
5 :
True
6 :
True
7 :
True
8 :
True
9 :
True
10 :
True
11 :
True
12 :
True
13 :
True
14 :
True
15 :
True
16 :
True
17 :
True
18 :
True
19 :
True
20 :
True
21 :
True
22 :
True
23 :
True
24 :
True
25 :
True
26 :
True
27 :
True
28 :
True
29 :
True
30 :
True
31 :
True
32 :
True
33 :
True
34 :
True


In [366]:
# Save the trained model twice: as 'asl.h5' and as 'asl.keras' (the file loaded by the load cell above)
model.save('asl.h5')
model.save('asl.keras')

  saving_api.save_model(


## Testing the Model

In [384]:
sequence = []       # Sliding window with the most recent keypoint vectors
threshold = 0.8     # Minimum score of the top class for a prediction to be shown/counted
res = np.array([0])
sign = ' '          # Sign currently shown in the "Prediction" bar
sentence = []       # Accepted signs (and manual spaces) shown in the "Input" bar
window = 0          # Consecutive frames whose top prediction exceeded the threshold

# Feed the model exactly as many frames as it was built for (49 for the model defined above,
# not the 50 recorded per video); fall back to sequence_length if the time dimension is unspecified
frames_per_window = model.input_shape[1] or sequence_length

cap = cv2.VideoCapture(0)

try:
    # Using the mediapipe hands model
    with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5, max_num_hands=2) as hands:
        while cap.isOpened():
            ret, frame = cap.read()    # Get a frame from webcam stream
            if not ret:    # No usable frame: stop instead of crashing in cvtColor
                break

            # Detections
            image, results = hand_detection(frame, hands)

            # Draw Landmarks
            draw_landmarks(image, results)

            # Testing Logic
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-frames_per_window:]

            if len(sequence) == frames_per_window:
                # verbose=0: predict runs on every frame, so its progress output would flood the cell
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = np.argmax(res)

                # Rendering result in OpenCV
                if res[best] > threshold:
                    window += 1
                    if window == 20:    # Accept the sign after 20 confident frames in a row
                        sentence.append(signs[best])
                        window = 0

                    sign = signs[best]
                else:
                    sign = "none"
                    window = 0

            # Keep only the 15 most recent entries (also applies to manually added spaces)
            if len(sentence) > 15:
                sentence = sentence[-15:]

            cv2.rectangle(image, (850, 640), (1280, 720), (255, 255, 255), -1)
            cv2.putText(image, "Press 's' to add space", (853, 670),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)
            cv2.putText(image, "Press 'q' to exit", (853, 700),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)

            cv2.rectangle(image, (0, 0), (640, 40), (208, 231, 245), -1)
            cv2.putText(image, "Prediction: ", (3, 30),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)
            cv2.putText(image, str(sign), (200, 30),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)

            cv2.rectangle(image, (0, 50), (640, 100), (255, 255, 255), -1)
            cv2.putText(image, "Input: ", (3, 85),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)
            cv2.putText(image, ''.join(sentence), (115, 85),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)

            cv2.imshow('Hand Tracking', image)    # Frame after applying the mediapipe drawings

            # Read the keyboard once per frame so a key press cannot be consumed by a different check
            key = cv2.waitKey(33) & 0xFF
            if key == ord('q'):
                break

            if key == ord('s'):
                sentence.append(' ')
finally:
    # Always free the camera and close the window, even if the loop raised
    cap.release()
    cv2.destroyAllWindows()

I0000 00:00:1744655748.631330    8238 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1744655748.633525   51796 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: Mesa Intel(R) Graphics (RPL-P)




In [None]:
# Manual cleanup: release the webcam and close any OpenCV windows
# (useful when the inference cell above was interrupted before reaching its own cleanup)
cap.release()
cv2.destroyAllWindows()