In [1]:
#Some dependensies and version matching
#pip install tensorflow-cpu==2.12.0 opencv-python mediapipe==0.10.5 scikit-learn matplotlib
#pip install ipykernel
#python -m ipykernel install --user --name=env --display-name "Python (asl_env)"


In [2]:
import numpy as np
import os
from matplotlib import pyplot as plt
import cv2
import time
import mediapipe as mp



In [3]:
mp_holistic = mp.solutions.holistic #holistic model
mp_drawing = mp.solutions.drawing_utils #drawing utilities 

In [4]:
def draw_landmarks(image, results):

    if results.left_hand_landmarks:
        mp_drawing.draw_landmarks(image,results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                                    mp_drawing.DrawingSpec(color=(60,10,10), thickness=1, circle_radius=2),
                                    mp_drawing.DrawingSpec(color=(210,206,121), thickness=1, circle_radius=2)
                                    )

    if results.right_hand_landmarks:
        mp_drawing.draw_landmarks(image,results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                                        mp_drawing.DrawingSpec(color=(210,200,60), thickness=1, circle_radius=2),
                                        mp_drawing.DrawingSpec(color=(10,256,121), thickness=1, circle_radius=2)
                                        )
    
    if results.pose_landmarks:
        mp_drawing.draw_landmarks(image,results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                                    mp_drawing.DrawingSpec(color=(98,13,49), thickness=1, circle_radius=2),
                                    mp_drawing.DrawingSpec(color=(109,201,0), thickness=1, circle_radius=2)
                                    )
    if results.face_landmarks:   
        mp_drawing.draw_landmarks(image,results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
                                    mp_drawing.DrawingSpec(color=(98,13,49), thickness=1, circle_radius=2),
                                    mp_drawing.DrawingSpec(color=(109,201,0), thickness=1, circle_radius=2)
                                    )

          

In [5]:
def mediapipe_detection(image,model):
    image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB) #converting color
    image.flags.writeable = False
    results = model.process(image)  #detection, prediction
    image.flags.writeable = True 
    image = cv2.cvtColor(image,cv2.COLOR_RGB2BGR) #converting back
    return image, results

# we are grabbing image, converting its format for model, changing write access for memory saving

In [6]:

# we want to extract keypoints from hands, pose, face, but if they are not in frame they will
# be the same shape but just zeros
def extract_keypoints(result):
    pose = []
    
    if result.pose_landmarks:
        for res in result.pose_landmarks.landmark:
            pose.append(np.array([res.x, res.y, res.z, res.visibility]))
        pose = np.array(pose).flatten()
    else:
        pose = np.zeros(33*4)

    rh = []

    if result.right_hand_landmarks:
        for res in result.right_hand_landmarks.landmark:
            rh.append(np.array([res.x, res.y, res.z]))
        rh = np.array(rh).flatten()
    else:
        rh = np.zeros(21*3)

    lh = []

    if result.left_hand_landmarks:
        for res in result.left_hand_landmarks.landmark:
            lh.append(np.array([res.x, res.y, res.z]))
        lh = np.array(lh).flatten()
    else:
        lh = np.zeros(21*3)

    face = []

    if result.face_landmarks:
        for res in result.face_landmarks.landmark:
            face.append(np.array([res.x, res.y, res.z]))
        face = np.array(face).flatten()
    else:
        face = np.zeros(468*3)

    res = np.concatenate([pose, face, lh, rh])
    return res  # Return the final array



In [43]:
# path for exported data
DATA_PATH = os.path.join('MP_Data')

# actions we are going to detect
actions = np.array(['hello','yes','no','iloveyou','please','thanks', "i/me", "you", "bye"])

# 70 videos worth of data
no_sequences = 70

# each of thsoe videos are going to be 30 frames in length
sequence_length = 40

In [17]:
for action in actions:
    for sequence in range(no_sequences):
        try:
            os.makedirs(os.path.join(DATA_PATH,action,str(sequence)))
        except:
            pass



In [29]:
redo_action = "thanks"

for sequence in range(no_sequences):
    try:
        os.makedirs(os.path.join(DATA_PATH,redo_action,str(sequence)))
    except:
        pass

In [41]:
redo_action = "you"

for sequence in range(no_sequences):
    try:
        os.makedirs(os.path.join(DATA_PATH,redo_action,str(sequence)))
    except:
        pass

In [18]:

cap = cv2.VideoCapture(2)

#set the media pipe model
# we make a initial detection and then tracks it
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.6) as model:
    for action in actions:
        for sequence in range(no_sequences):
            for frame_num in range(sequence_length):
                
                # Read feed, grabbing frame
                ret, frame = cap.read()
            
                #making detection
                image, results = mediapipe_detection(frame, model)
                draw_landmarks(image,results)

                if not ret:
                    print("Failed to read frame")  
                    break  
                
                if frame_num == 0:
                    cv2.putText(image, 'Processing {} video number{}'.format(action,sequence), (15,12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
                    cv2.waitKey(300)
                else:
                    cv2.putText(image,'Collecting frames for {} video number {}'.format(action,sequence),(15,12),
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5,(0,0,255),1,cv2.LINE_AA)
                  
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH,action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)
                
                # Render to screen
                cv2.imshow("ASL_Interpreter", image)

                # Breaking
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break

    cap.release()
    cv2.destroyAllWindows()

KeyboardInterrupt: 

In [46]:
cap = cv2.VideoCapture(2)

#actions -> (['hello','yes','no','iloveyou','please','thanks', "i/me", "you", "bye"])


# to process specific actions, data processing is taking a long time to do in one sitting
# now that I am increasing the number of signs to process
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.6) as model:
    
    # processing specific range of signs to continue processing
    for action in actions[8:]:
        for sequence in range(no_sequences):
            for frame_num in range(sequence_length):
                
                # Read feed, grabbing frame
                ret, frame = cap.read()
            
                #making detection
                image, results = mediapipe_detection(frame, model)
                draw_landmarks(image,results)

                if not ret:
                    print("Failed to read frame")  
                    break  
                
                if frame_num == 0:
                    cv2.putText(image, 'Processing {} video number{}'.format(action,sequence), (15,12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
                    cv2.waitKey(300)
                else:
                    cv2.putText(image,'Collecting frames for {} video number {}'.format(action,sequence),(15,12),
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5,(0,0,255),1,cv2.LINE_AA)
                  
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH,action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)
                
                # Render to screen
                cv2.imshow("ASL_Interpreter", image)

                # Breaking
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break

    cap.release()
    cv2.destroyAllWindows()

In [33]:
cap.release()
cv2.destroyAllWindows()

In [47]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [48]:
# asign each label a digit, 0 , 1  , 2 ...
label_map = {label:num for num,label in enumerate(actions)}

In [49]:
label_map

{'hello': 0,
 'yes': 1,
 'no': 2,
 'iloveyou': 3,
 'please': 4,
 'thanks': 5,
 'i/me': 6,
 'you': 7,
 'bye': 8}

In [50]:
# creating folders for out data, we will have 30 videos per gesture, each video having 30 frames, each frame will be 
# represented by a npy array that we will train out data on
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        window = []
        for frame_num in range(0,sequence_length):
            path = os.path.join(DATA_PATH,action, str(sequence), "{}.npy".format(frame_num))
            res = np.load(path)
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [51]:
X = np.array(sequences)

In [52]:
X

array([[[ 0.57304472,  0.30909419, -0.72445041, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.5676344 ,  0.30874875, -0.7605356 , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.56345373,  0.30581823, -0.75220895, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.51964313,  0.30530462, -0.76329499, ...,  0.07696582,
          0.10980567, -0.03430143],
        [ 0.51964283,  0.30538416, -0.75322348, ...,  0.07719659,
          0.11012232, -0.03758257],
        [ 0.51994562,  0.3055757 , -0.75474364, ...,  0.07757088,
          0.10955711, -0.03929814]],

       [[ 0.5201841 ,  0.30558914, -0.75098741, ...,  0.07808821,
          0.10954332, -0.03614534],
        [ 0.52120614,  0.30542672, -0.73388386, ...,  0.07997476,
          0.11059047, -0.03670026],
        [ 0.5216592 ,  0.30524984, -0.72416341, ...,  0.08008858,
          0.10934004, -0.03851278],
        ...,
        [ 0.52179706,  0.32521373, -0.74535441, ...,  

In [53]:
# makes integer labels from 0,1,2 to 1,0,0 , 0,1,0 , 0,0,1
# our labels will be derived from our map, and the videos so we label each video as hello, thanks, or iloveyou
Y = to_categorical(labels).astype(int)

In [54]:
Y

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [55]:
#splits our train and test data
# 95% of our data will be for training, the 5% being for evaluating our model
# we do this because we want to train our model on data , then test it on unseen data to see how well it works
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.05)


In [56]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard


In [58]:
# enabling visualization during training
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)


In [84]:
# initializing a Sequential model, which lets us add layers step by step
# Sequential models are suitable for simple, feed-forward networks where layers are stacked in sequence
model = Sequential()

# setting return_sequences=True so that this layer will return the full sequence of outputs for each input sequence
# using ReLU as the activation function, which helps with vanishing gradient issues by setting negative values to zero
# input_shape=(60, 1662) specifies that the input data has sequences of 60 timesteps, each with 1662 features
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(40, 1662)))

# again, setting return_sequences=True to output the entire sequence for each input sequence, which allows stacking LSTM layers
# this layer will receive the sequence output from the first LSTM layer and further process it to learn more complex patterns
model.add(LSTM(128, return_sequences=True, activation='relu'))

# with return_sequences=False, this layer will output only the last timestep, which condenses the sequence into a single output
# this allows the model to distill the sequence into a fixed-size vector, useful for passing to dense layers
model.add(LSTM(64, return_sequences=False, activation='relu'))

# dense layers are fully connected, meaning each unit in this layer is connected to every unit in the previous layer
# the 64 units here enable the model to start learning non-sequential features by combining information from the LSTM layers
model.add(Dense(64, activation='relu'))

# by adding another dense layer with fewer units, we allow the model to gradually reduce feature complexity
# this reduction can help focus on the most important features before reaching the final output layer
model.add(Dense(32, activation='relu'))

# the number of units in this layer is equal to the number of actions, defined by actions.shape[0], representing the output classes
# softmax activation is used to output a probability distribution across classes, helpful for multi-class classification tasks
model.add(Dense(actions.shape[0], activation='softmax'))


In [85]:
# multiclass classification model so we have to use categorical accuracy
#model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])


In [None]:
# the training :)     
model.fit(X_train, Y_train, epochs=2000, callbacks=[tb_callback])

In [124]:
# NEW MODEL <---------


from tensorflow.keras.layers import Dropout
from keras.layers import TimeDistributed

model = Sequential()


model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(40, 1662)))

model.add(LSTM(64, return_sequences=False, activation='relu'))

#model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))

model.add(Dense(actions.shape[0], activation='softmax'))

model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])


In [None]:
model.fit(X_train, Y_train, epochs=2000, callbacks=[tb_callback])

In [146]:
# testing bidirectional


from tensorflow.keras.layers import Dropout, Bidirectional
from keras.layers import TimeDistributed

model = Sequential()


model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(40, 1662)))

model.add(LSTM(64, return_sequences=False, activation='relu'))

model.add(Dropout(0.05))
model.add(Dense(64, activation='relu'))

model.add(Dense(actions.shape[0], activation='softmax'))

model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [147]:
model.fit(X_train, Y_train, epochs=2000, callbacks=[tb_callback])

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

KeyboardInterrupt: 

In [148]:
model.save('model3.h5')

In [149]:
res = model.predict(X_test)



In [150]:
actions[np.argmax(res[0])]

'please'

In [151]:
actions[np.argmax(Y_test[0])]

'please'

In [152]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [153]:
yhat = model.predict(X_train)



In [154]:
ytrue = np.argmax(Y_train, axis=1).tolist()
yhat = np.argmax(yhat,axis=1).tolist()

In [155]:
#testing our accuracy, detecting false positives, true positives, false negatives, true nagatives
# the more values we have in the top left and bottom right of each matrix the better
multilabel_confusion_matrix(ytrue,yhat)

array([[[535,   0],
        [  0,  63]],

       [[527,   4],
        [  4,  63]],

       [[522,  10],
        [  6,  60]],

       [[531,   2],
        [  6,  59]],

       [[530,   0],
        [  0,  68]],

       [[530,   0],
        [  0,  68]],

       [[531,   0],
        [  0,  67]],

       [[533,   0],
        [  0,  65]],

       [[529,   0],
        [  0,  69]]], dtype=int64)

In [156]:
# our accuracy of our model from data
accuracy_score(ytrue, yhat)

0.9732441471571907

In [135]:
model.load_weights('model3.h5')

In [157]:
# visualization of the predictions for each action / sign
colors = [(25,127,126),(17,24,111),(225,16,45),(25,0,245),(255,233,45),(253,127,126),(253,17,126),(17,2,1),(253,7,126)]

def visualize(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    
    #starting point and dimensions for the bars
    bar_x = 125
    bar_height = 30
    bar_spacing = 50  # space between each bar
    
    for num, prob in enumerate(res):
        # calc the width based on probability
        bar_width = int(prob * 150)  

        #  filling rectangle for the probability
        start_y = 60 + num * bar_spacing
        end_y = start_y + bar_height

        print(num)
        cv2.rectangle(output_frame, (bar_x, start_y), (bar_x + bar_width, end_y), colors[num], -1)

        # drawing white border around each bar
        cv2.rectangle(output_frame, (bar_x, start_y), (bar_x + bar_width, end_y), (255, 255, 255), 1)

        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.7
        thickness = 2

        action_text = actions[num]
        action_position = (10, end_y - 5)

        
        cv2.putText(output_frame, action_text, action_position, font, font_scale, (255, 255, 255), thickness, cv2.LINE_AA)

        percentage_text = f"{int(prob * 100)}%"
        percent_position = (bar_x + bar_width + 10, end_y - 5)

    
        cv2.putText(output_frame, percentage_text, percent_position, font, font_scale, (255, 255, 255), thickness, cv2.LINE_AA)
    
    return output_frame

In [158]:
#detection vars
sequence = []
predictions = []

cap = cv2.VideoCapture(2)


predicting = False


with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        ret, frame = cap.read()
        if not ret:
            print("Failed to read frame")  
            break  

        # Making detection
        image, results = mediapipe_detection(frame, holistic)
        draw_landmarks(image, results)
        
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-20:]

        if len(sequence) == 20:
            predicting = True
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            predictions.append(np.argmax(res))
        else:
            predicting = False

        if predicting:
            image = visualize(res, actions, image, colors)

        cv2.imshow('ASL_Interpreter', image)

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

# Release resources
cap.release()
cv2.destroyAllWindows()


0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4


KeyboardInterrupt: 

In [159]:
cap.release()
cv2.destroyAllWindows()