In [None]:

#pip install tensorflow==2.12.0 tensorflow-cpu==2.12.0  opencv-python-headless mediapipe==0.10.5 scikit-learn matplotlib
#pip install ipykernel
#python -m ipykernel install --user --name=asl_interpreter_env --display-name "Python (asl_interpreter_env)"


In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp



In [2]:
mp_holistic = mp.solutions.holistic # Holistic solution module: provides the Holistic model class and the *_CONNECTIONS sets used below
mp_drawing = mp.solutions.drawing_utils # drawing utilities (draw_landmarks, DrawingSpec) used to render detections

In [3]:
def mediapipe_detection(image, model):
    """Run ``model`` on a BGR frame and return the frame together with the detections.

    Args:
        image: frame in OpenCV's BGR channel order.
        model: object exposing ``process(rgb_image)`` (here a MediaPipe Holistic instance).

    Returns:
        ``(image, results)`` where ``image`` is a BGR frame produced by converting the
        RGB working copy back, and ``results`` is whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The array is flagged read-only only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

# We grab the image, convert its colour format for the model, and toggle write access around the model call to save memory

In [5]:
def draw_landmarks(image, results):
    """Draw every landmark group present in ``results`` onto ``image`` in place.

    Each group (left hand, right hand, pose, face) is drawn only when its attribute on
    ``results`` is truthy, so frames where a body part is missing are handled silently.
    In each call the first ``DrawingSpec`` styles the landmark points and the second
    styles the connections between them.

    Args:
        image: BGR frame to draw on (modified in place).
        results: detection result exposing ``left_hand_landmarks``,
            ``right_hand_landmarks``, ``pose_landmarks`` and ``face_landmarks``.

    Returns:
        None.
    """
    if results.left_hand_landmarks:
        mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                                    mp_drawing.DrawingSpec(color=(60,10,10), thickness=1, circle_radius=2),
                                    mp_drawing.DrawingSpec(color=(210,206,121), thickness=1, circle_radius=2)
                                    )

    if results.right_hand_landmarks:
        # Colour channels are 8-bit, so the maximum valid value is 255 (was 256).
        mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                                    mp_drawing.DrawingSpec(color=(210,200,60), thickness=1, circle_radius=2),
                                    mp_drawing.DrawingSpec(color=(10,255,121), thickness=1, circle_radius=2)
                                    )

    if results.pose_landmarks:
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                                    mp_drawing.DrawingSpec(color=(98,13,49), thickness=1, circle_radius=2),
                                    mp_drawing.DrawingSpec(color=(109,201,0), thickness=1, circle_radius=2)
                                    )

    if results.face_landmarks:
        mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
                                    mp_drawing.DrawingSpec(color=(98,13,49), thickness=1, circle_radius=2),
                                    mp_drawing.DrawingSpec(color=(109,201,0), thickness=1, circle_radius=2)
                                    )

          

In [None]:

# we want to extract keypoints from hands, pose, face, but if they are not in frame they will
# be the same shape but just zeros
def extract_keypoints(result):
    """Flatten one detection result into a single 1-D feature vector.

    The vector is the concatenation, in this order, of:
    pose (x, y, z, visibility per landmark), face, left hand, right hand
    (x, y, z per landmark). A group whose attribute on ``result`` is falsy is
    replaced by zeros of length 33*4 (pose), 468*3 (face) or 21*3 (each hand).

    Args:
        result: object exposing ``pose_landmarks``, ``right_hand_landmarks``,
            ``left_hand_landmarks`` and ``face_landmarks``; each present group
            has a ``landmark`` iterable.

    Returns:
        1-D ``numpy`` array with all groups concatenated.
    """
    def _flatten(group, fields, fallback_len):
        # Missing group -> zero block, so the layout of the vector never shifts.
        if not group:
            return np.zeros(fallback_len)
        rows = [[getattr(point, name) for name in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ("x", "y", "z")

    pose = _flatten(result.pose_landmarks, xyz + ("visibility",), 33 * 4)
    rh = _flatten(result.right_hand_landmarks, xyz, 21 * 3)
    lh = _flatten(result.left_hand_landmarks, xyz, 21 * 3)
    face = _flatten(result.face_landmarks, xyz, 468 * 3)

    return np.concatenate([pose, face, lh, rh])



In [None]:
# Sanity check of the feature-vector length.
# This used to read `results`, which only exists after the webcam capture loop further
# down has run, so the cell failed on a fresh kernel. A stand-in with no detections
# exercises the zero-fill path instead: 33*4 + 468*3 + 21*3 + 21*3 = 1662.
from types import SimpleNamespace

_no_detections = SimpleNamespace(
    pose_landmarks=None,
    face_landmarks=None,
    left_hand_landmarks=None,
    right_hand_landmarks=None,
)
extracted_keypoints = extract_keypoints(_no_detections)
print(extracted_keypoints.shape)

(1662,)


In [7]:
# Root directory for the exported keypoint arrays
DATA_PATH = os.path.join('MP_Data')

# Actions (signs) we are going to detect
actions = np.array(['hello','thanks','iloveyou'])

# Number of videos (sequences) recorded per action
no_sequences = 30

# Each of those videos is 30 frames long
sequence_length = 30

In [75]:
# Create one directory per (action, sequence) pair under DATA_PATH.
# exist_ok=True makes the cell safe to re-run while still surfacing real failures
# (e.g. permission errors) that a bare `except: pass` would have hidden.
for action in actions:
    for sequence in range(no_sequences):
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)



In [76]:

# Record training data: for every action, capture `no_sequences` videos of
# `sequence_length` frames each from the webcam and save the keypoints of every
# frame to DATA_PATH/<action>/<sequence>/<frame_num>.npy.
cap = cv2.VideoCapture(0)
stop_requested = False  # set when the camera fails or the user presses 'q'

try:
    # Holistic makes an initial detection and then tracks it across frames.
    # Bound to `holistic` (not `model`) so it is not confused with the Keras model.
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for action in actions:
            for sequence in range(no_sequences):
                for frame_num in range(sequence_length):

                    # Read feed, grabbing frame
                    ret, frame = cap.read()

                    # Check the grab BEFORE touching the frame: on failure `frame`
                    # is None and the colour conversion would raise.
                    if not ret:
                        print("Failed to grab frame")
                        stop_requested = True
                        break

                    # Making detection
                    image, results = mediapipe_detection(frame, holistic)
                    draw_landmarks(image, results)

                    cv2.putText(image, 'Collecting frames for {} video number {}'.format(action, sequence), (15,12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
                    if frame_num == 0:
                        cv2.putText(image, 'STARTING COLLECTION', (120,200),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 1, cv2.LINE_AA)

                    # Render first, then wait, so the banner is actually visible
                    # during the 2 s pause at the start of each video.
                    cv2.imshow("OpenCV Feed", image)
                    key = cv2.waitKey(2000 if frame_num == 0 else 10)

                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                    np.save(npy_path, keypoints)

                    # Breaking
                    if key & 0xFF == ord('q'):
                        stop_requested = True
                        break

                # Propagate the stop out of all three loops, not just the innermost.
                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Always free the camera and windows, even if an exception interrupts capture.
    cap.release()
    cv2.destroyAllWindows()

In [8]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [33]:
# Assign each action label its integer class id (its position in `actions`): 0, 1, 2
label_map = {name: class_id for class_id, name in enumerate(actions)}

In [None]:
# Load the saved data back from disk: for every action there are `no_sequences` videos,
# each made of `sequence_length` frames stored as one .npy keypoint array per frame.
# `sequences` collects one list of frame arrays per video; `labels` holds the matching class id.
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        frames = [
            np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(idx)))
            for idx in range(0, sequence_length)
        ]
        sequences.append(frames)
        labels.append(label_map[action])

In [80]:
# Stack the per-video frame lists into a single array.
# Expected shape: (len(actions) * no_sequences, sequence_length, 1662) — TODO confirm with X.shape
X = np.array(sequences)

In [None]:
# One-hot encode the integer labels: 0 -> [1,0,0], 1 -> [0,1,0], 2 -> [0,0,1]
# The labels come from label_map, one per video, so each video is tagged as hello, thanks, or iloveyou
Y = to_categorical(labels).astype(int)

In [None]:
# Split the data into training and test sets:
# 95% is used for training and the remaining 5% is held out for evaluation, so the
# model can be checked on data it has not seen during training.
# random_state pins the shuffle so the split (and everything computed from it) is
# reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.05, random_state=42)


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [None]:
# Enable visualization during training: the TensorBoard callback is given the 'Logs' directory
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)


In [None]:

# Stacked-LSTM sequence classifier, declared as one ordered list of layers.
# Input: sequences of 30 timesteps with 1662 features each.
model = Sequential([
    # Recurrent stack. The first two LSTM layers return the full output sequence
    # (return_sequences=True) so the next LSTM layer receives one vector per timestep.
    LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1662)),
    LSTM(128, return_sequences=True, activation='relu'),
    # The last LSTM returns only its final timestep, collapsing the sequence
    # into a single fixed-size vector for the dense layers.
    LSTM(64, return_sequences=False, activation='relu'),

    # Fully connected head that narrows the representation step by step.
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),

    # One unit per action; softmax turns the outputs into a probability
    # distribution over the classes.
    Dense(actions.shape[0], activation='softmax'),
])

In [86]:
# Multiclass classification with one-hot targets: use categorical cross-entropy as the loss
# and report categorical accuracy as the metric
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
# Train the model. 2000 epochs is only an upper bound: you don't need to run all of them, but you can.
# Training progress is logged through the TensorBoard callback.
model.fit(X_train, Y_train, epochs=2000, callbacks=[tb_callback])

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

<keras.callbacks.History at 0x175c92aa890>

In [None]:
# Persist the trained model to 'trained.h5' in the current working directory
model.save('trained.h5')

In [89]:
# Predict on the held-out test split; `res` holds one row of model outputs per test sample
res = model.predict(X_test)



In [95]:
# Predicted action for the first test sample (class with the highest output)
actions[np.argmax(res[0])]

'thanks'

In [94]:
# True action for the first test sample, for comparison with the prediction
actions[np.argmax(Y_test[0])]

'thanks'

In [96]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [102]:
# NOTE(review): this predicts on the TRAINING split (X_train), so metrics computed from
# `yhat` describe fit to the training data, not performance on unseen data.
yhat = model.predict(X_train)



In [103]:
# Convert one-hot targets and model outputs to integer class ids.
# The predictions are recomputed from the model here instead of overwriting `yhat`
# with its own argmax: the old form turned `yhat` into a 1-D list, so running the
# cell a second time failed on `axis=1`.
# NOTE(review): both arrays come from the training split; swap in X_test / Y_test
# to measure held-out performance.
ytrue = np.argmax(Y_train, axis=1).tolist()
yhat = np.argmax(model.predict(X_train), axis=1).tolist()

In [None]:
# Per-class confusion matrices: detecting true negatives, false positives, false negatives, true positives
# The more values we have in the top left and bottom right of each matrix, the better
multilabel_confusion_matrix(ytrue,yhat)

array([[[53,  3],
        [ 1, 28]],

       [[58,  0],
        [ 1, 26]],

       [[55,  1],
        [ 2, 27]]], dtype=int64)

In [None]:
# Accuracy of the model on the samples behind ytrue / yhat
accuracy_score(ytrue, yhat)

0.9529411764705882

In [11]:
# Reload the weights stored in 'trained.h5' into the already-constructed `model`
# (presumably so inference can be run without retraining — `model` must be defined first)
model.load_weights('trained.h5')

In [None]:
# Visualization of the predictions for each action / sign.
# One BGR bar colour per action (this was misspelled `olors`, which left `colors`
# undefined for the live-detection cell).
colors = [(245,117,16),(117,245,16),(16,116,245)]
def prob_viz(res, actions, input_frame, colors):
    """Overlay one horizontal probability bar and label per action on a copy of the frame.

    Args:
        res: iterable of per-action probabilities, in the same order as ``actions``.
        actions: label text for each action.
        input_frame: frame to annotate; it is copied, not modified.
        colors: bar colours; reused cyclically if there are more actions than colours.

    Returns:
        The annotated copy of ``input_frame``.
    """
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        # Bar length is prob*100 pixels; bars are stacked 40 px apart starting at y=60.
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num % len(colors)], -1)
        cv2.putText(output_frame, actions[num], (0,85+num*40), cv2.FONT_HERSHEY_COMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return output_frame

In [32]:
# Live detection: keep a rolling window of keypoints from the webcam, classify it
# with the trained model, and overlay the result on the video feed.

# Detection state
sequence = []             # most recent keypoint vectors (at most sequence_length)
predictions = []          # most recent predicted class ids (at most smoothing_window)
threshold = 0.4           # minimum probability required to update the displayed sign
smoothing_window = 10     # number of consecutive predictions that must agree
predicted_sign = "hello"
predicting = False        # becomes True once the first full window has been scored

cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():

            ret, frame = cap.read()
            if not ret:
                print("Failed to grab frame")
                break  # Exit if frame not read successfully

            # Making detection
            image, results = mediapipe_detection(frame, holistic)
            draw_landmarks(image, results)

            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]

            if len(sequence) == sequence_length:
                predicting = True
                # verbose=0: no per-call progress bar, this runs on every frame.
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = int(np.argmax(res))
                predictions.append(best)
                # Only the recent history matters; trimming keeps the list bounded.
                predictions = predictions[-smoothing_window:]
                print(predictions[-1])

                # Update the sign only when the whole recent window agrees with the
                # current prediction. The old check compared against
                # np.unique(...)[0], i.e. the smallest class id seen, which could
                # pass while the window still contained other classes.
                stable = (len(predictions) == smoothing_window
                          and all(p == best for p in predictions))
                if stable and res[best] > threshold:
                    predicted_sign = actions[best]

            cv2.putText(image, predicted_sign, (3, 30), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            if predicting:
                image = prob_viz(res, actions, image, colors)

            cv2.imshow('OpenCV Feed', image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Release resources even if an exception interrupts the loop
    cap.release()
    cv2.destroyAllWindows()


2
2
2
2
2
2
2
2
2
2
2
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
2
2
2
2
2
2
2
2
0
0
0
0
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
2
2
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
1
1
1
2
2
2
2
2
2
0
0
0
0
0
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
0


In [None]:
# Manual cleanup: release the camera handle and close all OpenCV windows
# (presumably for use when a capture cell above was interrupted — requires `cap` to exist)
cap.release()
cv2.destroyAllWindows()