Importing and Installing dependencies

In [1]:
%pip install mediapipe

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp 

Keypoints using mp_holistic and mp_drawing

In [3]:
# used for bringing the holistic model through .holistic
mp_holistic = mp.solutions.holistic # Holistic model
# used for drawing the utilities - points and structure from the midiapipe which is its main function through .drawing_utils
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [4]:
# creating a function so that I do not have to write the above cell again and agian
# passing two variables image and model . image from the user and the model for mediapipe utilization

def mediapipe_detection(image, model):
    # so opencv reads the image in form of bgr but for detection using mediapipe we require the format to be RGB
    # so cv2.cvtcolor helps in recolouring the image
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable this helps us in saving a bit of memory
    # so here image is going to be a frame from video
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    # so that opencv can produce results in bgr format
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [5]:

def draw_landmarks(image, results):
    # so we are passing it into the .draw_landmarks resulting the producing a structure to the image in the video
    # passing the image and results with respect to the lists of the various types of landmarks like face, left hand or right hand etc
    # this will provide us with the items present in the lists with a comprehensive details for the perticular landmarks section
    # mp_holistic is allowing us to pass the image via the connection map for a perticular landmark 
    # draw_landmark func does not return the image but rather applies the landmark visualizations to the current image in place.
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS) # Draw face connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS) # Draw pose connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw right hand connections

In [6]:
def draw_styled_landmarks(image, results):
    # Draw face connections
    # comes with the mediapipe a helper function mp_drawing Draws the landmarks and the connections on the image.
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS, 
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), # color the joint 
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1) #color the connection
                             ) 
    
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

In [7]:
mp_holistic.POSE_CONNECTIONS

frozenset({(0, 1),
           (0, 4),
           (1, 2),
           (2, 3),
           (3, 7),
           (4, 5),
           (5, 6),
           (6, 8),
           (9, 10),
           (11, 12),
           (11, 13),
           (11, 23),
           (12, 14),
           (12, 24),
           (13, 15),
           (14, 16),
           (15, 17),
           (15, 19),
           (15, 21),
           (16, 18),
           (16, 20),
           (16, 22),
           (17, 19),
           (18, 20),
           (23, 24),
           (23, 25),
           (24, 26),
           (25, 27),
           (26, 28),
           (27, 29),
           (27, 31),
           (28, 30),
           (28, 32),
           (29, 31),
           (30, 32)})

In [9]:
cap = cv2.VideoCapture(0)
# Set mediapipe model 
# so we are accesing the mediapipe model using the with mp_holistic.Holistic 
# so how the mediapipe model works is that it actuallly makes an initial detection using the min_detection_confidence 
# then track the key points with min_tracking_confidence=0.5 we can change it as well
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        # for entering the function
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # # Draw landmarks
        # helps in accessing the draw_landmarks func allowing to draw landmarks through mediapipe 
        draw_styled_landmarks(image, results)

        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break 
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [10]:
cap.release()
cv2.destroyAllWindows()

Extracting keypoint values

In [11]:
# results.
# here we have acquired the results parameters from the model like the mediapipe is effectively working
# as for eg:
results.pose_landmarks.landmark[0].visibility
# which helps in confirming the mediapipe model
# further here we are also conforming the "visibility" parameter of the landmark from pose through the test array formed  


0.9999323487281799

In [12]:
len(results.pose_landmarks.landmark)
# getting 33 because mediapipe offers 33 types of landmarks for the pose_landmarks func
# like nose , ears, shoulders wrists,etc

33

In [13]:
# pose=[]
# for res in results.pose_landmarks.landmark:
#     test=np.array([res.x,res.y,res.z,res.visibility])
#     pose.append(test)


# so here we are basically forming an array pose which basically contains the parameters for the landmarks in the res variable obtained from the results from the pose_landmark
# again pose_landmarks denotes the func from the mediapipe lib and "landmark" shows the perticular landmark value out of those 33 parameters
# so we used .flatten to make it compatible for the LSTM model used further
pose=np.array([[res.x,res.y,res.z,res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)

In [14]:
pose.shape
# conforming the shape of the array i.e 1 d

(132,)

In [15]:
# Handle left hand detection
lh = (np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() 
      if results.left_hand_landmarks else np.zeros(21 * 3))

# Handle right hand detection
rh = (np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() 
      if results.right_hand_landmarks else np.zeros(21 * 3))

In [16]:
# similarly for face
face_all_parameters=len(results.face_landmarks.landmark)*3
print(face_all_parameters)
face=np.array([[res.x,res.y,res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)

1404


In [17]:
def extract_keypoints(results):
    pose=np.array([[res.x,res.y,res.z,res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    lh=np.array([[res.x,res.y,res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh=np.array([[res.x,res.y,res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    face=np.array([[res.x,res.y,res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    return np.concatenate([pose,lh,rh,face])
# concatenating for the model to detect the sign language


In [18]:
extract_keypoints(results).shape

(1662,)

Setting up folders for collection

In [None]:
# Path for exported data, numpy arrays
DATA_PATH=os.path.join('./tempdata')


# so what here we are going to do is that here the data will be collected and for that
# 30 number of sequences are taken into consideration which means that 30 videos worth of data
# for each sequence as well here 30 frames in length are taken into consideration which means 30*30 data
# again here we have for ex 3 gestures so the data becomes 30*30*3
# again here we have 1662 keypoints for the landmarks obtained earlier as a result the final data = 30*30*3*1662
# Actions that we try to detect
actions = np.array(['hello', 'my', 'name','Abhay','Soham', 'Subhadeep', 'Thank you', 'I love you'])

# Thirty videos worth of data
no_of_sequences = 30

# Videos are going to be 30 frames in length
sequence_length = 30

In [20]:
# just creating the folders and sub folders
# action and seq in nested loop for forming folders

for action in actions: 
    for sequence in range(no_of_sequences):
        try: 
# makedirs used for making the sub directories
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

Collecting keypoint values for Training nd Testing

In [None]:
# Set mediapipe model 
cap = cv2.VideoCapture(0)
# 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    # NEW LOOP
    # Loop through actions
    # this loop is for specifications of the number of times the frames need to be saved for every sequence w.r.t every action
    for action in actions:
        # Looping through sequences aka videos
        for sequence in range(no_of_sequences):
            # Looping through video length aka sequence length (mentioned above) - can be changed
            for frame_num in range(sequence_length):

                # Reading the frame from the video. This frame will be used for further analysis
                ret, frame = cap.read() 
                # Making the detections using the mediapipe_detection func where the model will be able to process the frame like BGR to RGB 
                image, results = mediapipe_detection(frame, holistic)
                # print(results)
                # Drawing landmarks on the acquired frame
                draw_styled_landmarks(image, results)


                # logic is for the formating portion
                if frame_num == 0: 
                    cv2.putText(image, 'STARTING COLLECTION', (120,200), 
                               cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                    # providing the break for adjusting the posture
                    cv2.waitKey(2000) #2 sec
                else: 
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                

                # NEW Export keypoints
                # now for this frame or results obtained from mediapipe_extraction which will be RGB the keypoints will be extracted from the extract_keypoint func in the form of a 1 d array.
                # again in ideal senario the no of extracted points will be 1662 as found
                keypoints = extract_keypoints(results)
                # providing the path for the save
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                # saving the array in the location
                np.save(npy_path, keypoints)

                # Break for this frame and continue for next respective iteration
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
                    
    # when loop ends the window closes
    cap.release()
    cv2.destroyAllWindows()

In [22]:
cap.release()
cv2.destroyAllWindows()

Preprocessing data and creating labels w.r.t. actions

In [14]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [15]:
# creating a dict where label is mapped with a num by def starting from 0
# enumerate calling the next label from the actions list , earlier created as the array
label_map = {label:num for num, label in enumerate(actions)}

In [16]:
label_map

{'hello': 0, 'thanks': 1, 'iloveyou': 2, 'my': 3, 'name': 4}

In [17]:

# forming two arrays named as mentioned
# just as a graph where seq will contain all the data with respect to all the videos and frames recorded during the training
# and labels for denoting the actions

# now again this is our main task 
sequences, labels = [], []

# 3 actions so 3 iterations
for action in actions:
    # 30 videos with respect to each action so 30 iterations 
    for sequence in range(no_of_sequences):
        # forming a blank array for storing that x data of all the collection done till now
        window = []
        # for each frame recorded in each of the seq
        for frame_num in range(sequence_length):
            # so res is basically helping in loading the data for each frame through the os lib
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num))) # frame_num shows the exact name to counter in the loop
            # appending the window array with res
            window.append(res)
        # now the loop for the frames for a perticular seq is over
        # adding the window data in the sequences array as a 2 d array with one parameter as each frame and second one as keypoints -1662
        sequences.append(window)

        # appending the labels array only once with only action i.e the action running the loop
        # values are going to be 1d because action is just a label which is currently 0 or 1 or 2 running action * seq times , here 30 
        # not any data containing 2 d array
        labels.append(label_map[action])

In [18]:
np.array(sequences).shape

(150, 30, 1662)

In [19]:
X = np.array(sequences)

In [20]:
X.shape

(150, 30, 1662)

In [21]:
# changing the labels from 0,1,2 to categorical data for easier accessebility
y = to_categorical(labels).astype(int)
y

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0,

In [22]:
# so spliting the data into train and test with 5 percent of testing 
# data contains the seq with frames and keypoints respectively in form of a 3 d array   
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)
X_test.shape


(8, 30, 1662)

Building and training LSTM neural network

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [24]:
# adding the logs folder
log_dir = os.path.join('Logs')
# tensorboard is a part of tensorflow monitoring the model training using a web app
# will help to track the accuracy during the training
tb_callback = TensorBoard(log_dir=log_dir)

In [25]:
# neural network

# adding sequential API cuz it will allow in building the model fluidly
model = Sequential()
# adding the three layers of LSTM consisting of 3 positional argument and 1 keyword argument
# positional arg - depends on the position of the value. wrong position wrong output
# keyword arg - depends w.r.t the value assigned with the variable
# returning sequence is necessery because here if not then the next lstm layer will not follow the prev layer
# adding 65 units in first layer and so on . activation is relu
# input shape is 30,1662 for each video i.e 30 frames and 1662 keypoints
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
# return seq as false cuz next is dense layer so not required
model.add(LSTM(64, return_sequences=False, activation='relu'))

# adding 64 units for dense layer
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
# actions is having three values so the actions.shape of [0] is also 3 in shape 
# using softmax so that the values are confined in 0 to 1 the value will sum up and provide 1
model.add(Dense(actions.shape[0], activation='softmax'))

  super().__init__(**kwargs)


In [26]:
# eg
eg_res = [0.2, 0.1, 0.3,0.2,0.2]
actions[np.argmax(eg_res)]

'iloveyou'

In [27]:
# using the adam optimizer
# categorical_crossentropy for multiclasss classification 
# metrics for evaluation
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [28]:
model.fit(X_train, y_train, epochs=330, callbacks=[tb_callback])
# tensorboard --logdir=.

Epoch 1/330
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - categorical_accuracy: 0.1934 - loss: 1.7574
Epoch 2/330
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - categorical_accuracy: 0.1959 - loss: 1.6152
Epoch 3/330
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - categorical_accuracy: 0.2789 - loss: 1.6037
Epoch 4/330
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - categorical_accuracy: 0.2881 - loss: 1.6072
Epoch 5/330
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - categorical_accuracy: 0.2157 - loss: 1.8349
Epoch 6/330
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - categorical_accuracy: 0.2135 - loss: 1.5551
Epoch 7/330
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - categorical_accuracy: 0.1716 - loss: 7.6548
Epoch 8/330
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - cate

<keras.src.callbacks.history.History at 0x2b4eb4cf940>

In [29]:
model.summary()

8. Making the predictions

In [30]:
res=model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 228ms/step


In [31]:
# again the actions with the max value provided by softmax is returned
actions[np.argmax(res[0])]

'thanks'

In [32]:
actions[np.argmax(y_test[4])]

'hello'

Saving weights for future accessability

In [33]:
model.save('./action.h5')



In [34]:
model.load_weights('./action.h5')

Evaluation using Confusion Matrix and Accuracy score

In [35]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [36]:
yhat = model.predict(X_train)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step 


In [37]:
# so here we are check the results w.r.t the axis - 1 i.e the row no 1 having the values of actions i.e 3 values
# then converting them in list format and finding the max value
ytrue = np.argmax(y_train, axis=1).tolist()
# one hot encoding
yhat = np.argmax(yhat, axis=1).tolist()

In [38]:
yhat

[0,
 3,
 4,
 0,
 4,
 2,
 3,
 2,
 0,
 4,
 3,
 3,
 2,
 2,
 0,
 3,
 2,
 2,
 4,
 1,
 1,
 4,
 2,
 0,
 0,
 3,
 3,
 4,
 4,
 1,
 1,
 1,
 4,
 0,
 4,
 4,
 0,
 1,
 3,
 2,
 4,
 3,
 4,
 2,
 0,
 1,
 2,
 0,
 1,
 1,
 1,
 3,
 2,
 1,
 0,
 3,
 4,
 4,
 0,
 2,
 0,
 3,
 4,
 1,
 4,
 3,
 0,
 4,
 0,
 4,
 1,
 1,
 1,
 3,
 0,
 2,
 3,
 1,
 2,
 3,
 1,
 1,
 2,
 3,
 3,
 3,
 4,
 0,
 2,
 2,
 4,
 2,
 1,
 3,
 0,
 3,
 0,
 1,
 1,
 4,
 1,
 1,
 2,
 2,
 2,
 3,
 2,
 1,
 0,
 2,
 0,
 3,
 4,
 4,
 1,
 3,
 1,
 0,
 2,
 1,
 4,
 4,
 1,
 1,
 2,
 0,
 0,
 4,
 0,
 4,
 0,
 2,
 1,
 4,
 3,
 1,
 2,
 3,
 4,
 2,
 0,
 0]

In [39]:
# confution matrix
multilabel_confusion_matrix(ytrue, yhat)

array([[[114,   0],
        [  0,  28]],

       [[111,   2],
        [  0,  29]],

       [[112,   0],
        [  2,  28]],

       [[116,   0],
        [  0,  26]],

       [[113,   0],
        [  0,  29]]], dtype=int64)

In [40]:
accuracy_score(ytrue, yhat)

0.9859154929577465

FINAL Testing in real time

In [41]:
# for coloring the actions
colors = [(245,117,16), (117,245,16), (16,117,245)]

# results from the model prediction, actions, image from the video, colors from above
def prob_viz(res, actions, input_frame, colors):
    
    output_frame = input_frame.copy()
    # so here prob can be obtained from the softmax from earlier - have 3 values
    for num, prob in enumerate(res):
        # .rectangle for formation of rectangle
        # here the 2nd parameter denotes the position of the color where num can be 0, 1, 2 based on the action and changes the y axis accordingly
        # int prob for x input will help in setting the bar length based on the accuracy of the model prediction and y axis same as above and
        # colors will call the colors function based on the num (of action)
        # -1 for filling up the box
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

In [None]:
import cv2
import numpy as np
import mediapipe as mp
import tensorflow as tf


# Initialize variables
sequence = []
sentence = []
predictions = []
threshold = 0.7

cap = cv2.VideoCapture(0)

# Load your trained model
model = tf.keras.models.load_model("./action.h5")  # Ensure model is loaded

# Set up Mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        # Read frame
        ret, frame = cap.read()

        # Perform detections
        image, results = mediapipe_detection(frame, holistic)
        draw_styled_landmarks(image, results)

        # Extract keypoints
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]  # Keep only the last 30 frames

        # Predict when we have 30 frames
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]  # Model prediction
            predicted_action = actions[np.argmax(res)]

            # Debugging prints
            print("Raw Predictions:", res)
            print("Predicted Index:", np.argmax(res))
            print("Predicted Action:", predicted_action)

            # Ensure stable prediction before displaying
            predictions.append(np.argmax(res))
            predictions = predictions[-10:]  # Keep only last 10 predictions

            if len(predictions) > 0 and np.bincount(predictions).argmax() == np.argmax(res):
                if res[np.argmax(res)] > threshold:
                    if len(sentence) == 0 or (predicted_action != sentence[-1]):
                        sentence = [predicted_action]  # Show only the latest correct label
                        print("Updated Sentence:", sentence)

        # Display the predicted label
        cv2.putText(image, sentence[0] if sentence else "Waiting...", (3, 30),
            cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        # Show video feed
        cv2.imshow('OpenCV Feed', image)

        # Exit on pressing 'q'
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step
Raw Predictions: [9.9953735e-01 7.9614627e-05 3.8311866e-04 2.2966116e-08 3.3960984e-10]
Predicted Index: 0
Predicted Action: hello
Updated Sentence: ['hello']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Raw Predictions: [9.9693203e-01 4.4441494e-04 2.6233466e-03 1.4162698e-07 3.4636343e-09]
Predicted Index: 0
Predicted Action: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Raw Predictions: [9.6630180e-01 2.9707926e-03 3.0726375e-02 9.9352985e-07 2.7089126e-08]
Predicted Index: 0
Predicted Action: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Raw Predictions: [9.3376118e-01 6.7612664e-03 5.9475727e-02 1.8096079e-06 4.9363223e-08]
Predicted Index: 0
Predicted Action: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Raw Predictions: [8.2322854e-01 1.9001169e-02 1.5776451e-01 5.6437902e-06 1.6460720

In [46]:
cap.release()
cv2.destroyAllWindows()

In [58]:
import pickle
with open('./model.pkl','wb') as file:
    pickle.dump(model,file)