In [1]:
!pip install tensorflow  opencv-python mediapipe scikit-learn matplotlib






In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp





In [12]:
mp_holistic=mp.solutions.holistic
mp_drawing=mp.solutions.drawing_utils

def mediapipe_detection(image,model):
    image=cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    image.flags.writeable=False
    results=model.process(image)
    image.flags.writeable=True
    image=cv2.cvtColor(image,cv2.COLOR_RGB2BGR)
    return image,results 

def draw_landmarks(image,results):
    mp_drawing.draw_landmarks(image,results.face_landmarks,mp_holistic.FACEMESH_TESSELATION)
    mp_drawing.draw_landmarks(image,results.pose_landmarks,mp_holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(image,results.right_hand_landmarks,mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image,results.left_hand_landmarks,mp_holistic.HAND_CONNECTIONS)
    
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

# data_path=os.path.join("new_data")
data_path=os.path.join("trial_data")
# actions=np.array(['hello','thanks','iloveyou','stop','yes','ready','hungry','nice','meetyou'])
actions=np.array(['hello','thanks','iloveyou','ready','explain','help','putaway'])
no_sequences=30
sequence_length=30

In [21]:
# setup folders -run only once
for action in actions:
    for sequence in range(no_sequences):
        try:
            os.makedirs(os.path.join(data_path,action,str(sequence)))
        except:
            pass

In [15]:
# collect keypoint values from training and testing
cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    
    # NEW LOOP
    # Loop through actions
    for action in actions:
        # Loop through sequences aka videos
        for sequence in range(no_sequences):
            # Loop through video length aka sequence length
            for frame_num in range(sequence_length):

                # Read feed
                ret, frame = cap.read()

                # Make detections
                image, results = mediapipe_detection(frame, holistic)

                # Draw landmarks
                draw_landmarks(image, results)
                
                # NEW Apply wait logic
                if frame_num == 0: 
                    cv2.putText(image, 'STARTING COLLECTION', (120,200), 
                               cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                    cv2.waitKey(2000)
                else: 
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                
                # NEW Export keypoints
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(data_path, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)

                # Break gracefully
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
                    
    cap.release()
    cv2.destroyAllWindows()

error: OpenCV(4.8.1) D:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'


In [10]:
             
#     cap.release()
#     cv2.destroyAllWindows()

In [13]:

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
label_map = {label:num for num, label in enumerate(actions)}
print(label_map)

sequences, labels = [], []
for action in actions:
    for sequence in np.array(os.listdir(os.path.join(data_path, action))).astype(int):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(data_path, action, str(sequence), "{}.npy".format(frame_num))) #res -an array size 1662
            window.append(res) #for each sequence 30 res in window
        sequences.append(window) #actions*sequences window total
        labels.append(label_map[action]) #30 0's 30 1's...30 6's =210 labels
        
X=np.array(sequences) #210*30*1662
y = to_categorical(labels).astype(int) #210*7
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05) #199-11

{'hello': 0, 'thanks': 1, 'iloveyou': 2, 'ready': 3, 'explain': 4, 'help': 5, 'putaway': 6}


In [4]:
X_test

array([[[ 0.51667857,  0.3724618 , -0.64619881, ...,  0.14148998,
          0.84127384, -0.01789512],
        [ 0.54065043,  0.37469491, -0.92258853, ...,  0.31682423,
          0.91324341, -0.0307733 ],
        [ 0.54902685,  0.37632662, -0.95348948, ...,  0.3231051 ,
          0.90720206, -0.02734156],
        ...,
        [ 0.54784667,  0.37943533, -1.05931878, ...,  0.05213603,
          0.75316554, -0.03584174],
        [ 0.5486396 ,  0.37908044, -1.05486798, ...,  0.05159093,
          0.7548449 , -0.03947632],
        [ 0.54884547,  0.37902883, -1.05417991, ...,  0.04727267,
          0.75384855, -0.01715948]],

       [[ 0.56943029,  0.37783888, -0.98650235, ...,  0.51728547,
          0.76390177, -0.03871305],
        [ 0.56881905,  0.38081461, -0.53021473, ...,  0.55616784,
          0.48600978, -0.03383533],
        [ 0.56846213,  0.38246253, -0.49263066, ...,  0.55499595,
          0.48524114, -0.03166963],
        ...,
        [ 0.56728244,  0.37880409, -0.9272877 , ...,  

In [5]:
np.array(sequences).shape


(210, 30, 1662)

In [21]:
# model training
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.optimizers import Adam

log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

model = Sequential()
model.add(LSTM(128, return_sequences=True, activation='relu', input_shape=(30,1662)))
model.add(LSTM(256, return_sequences=True, activation='relu'))
model.add(LSTM(256, return_sequences=False, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))
# custom_optimizer = Adam(learning_rate=0.01)
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])




In [33]:
model.fit(X_train, y_train, epochs=300, callbacks=[tb_callback])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
1/7 [===>..........................] - ETA: 0s - loss: 3.4086e-04 - categorical_accuracy: 1.0000

KeyboardInterrupt: 

In [34]:
 model.save('myactionssigns.h5')

  saving_api.save_model(


In [14]:
import tensorflow as tf
from tensorflow import keras
model=tf.keras.models.load_model('myactionssigns.h5')

In [6]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 30, 128)           916992    
                                                                 
 lstm_1 (LSTM)               (None, 30, 256)           394240    
                                                                 
 lstm_2 (LSTM)               (None, 256)               525312    
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 7)                 455       
                                                                 
Total params: 1878151 (7.16 MB)
Trainable params: 187815

In [26]:
res=model.predict(X_test)



In [27]:
actions[np.argmax(y_test)]

'thanks'

In [7]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
yhat = model.predict(X_test) #give max value index for 11 sequences in X_test
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()
multilabel_confusion_matrix(ytrue, yhat)



array([[[ 9,  0],
        [ 0,  2]],

       [[10,  0],
        [ 0,  1]],

       [[10,  0],
        [ 0,  1]],

       [[ 9,  0],
        [ 0,  2]],

       [[ 7,  0],
        [ 0,  4]],

       [[10,  0],
        [ 0,  1]]], dtype=int64)

In [8]:
# print(multilabel_confusion_matrix(ytrue, yhat).shape)
print(np.argmax(y_test,axis=1))
print(model.predict(X_test))
# X_test.shape
# ytrue

[5 5 1 3 3 6 0 0 2 5 5]
[[6.11577185e-15 2.45193341e-08 1.95285780e-12 2.06134904e-14
  2.06890483e-09 9.99999881e-01 6.76768153e-08]
 [1.43692956e-14 2.03024783e-07 4.94177778e-13 1.78388950e-14
  4.30559544e-09 9.99995351e-01 4.39974383e-06]
 [3.62462197e-05 9.99219656e-01 3.55220936e-06 1.88218124e-04
  3.20376595e-04 5.42174093e-05 1.77766997e-04]
 [4.40257708e-07 1.48701371e-08 2.73642642e-03 9.97256815e-01
  5.22979826e-06 1.01943681e-06 7.11270758e-08]
 [1.95183929e-06 5.32656008e-09 3.03029306e-02 9.69676673e-01
  1.69986524e-05 1.19323795e-06 2.77906508e-07]
 [1.27984027e-12 7.79262521e-08 3.34558159e-08 3.65087710e-10
  1.10130641e-05 6.60824735e-05 9.99922872e-01]
 [9.99999642e-01 4.14807327e-10 3.86934175e-07 6.09738891e-16
  1.54928910e-17 2.19882681e-18 7.11603415e-11]
 [9.99999762e-01 3.88560711e-10 2.82480357e-07 5.49417757e-16
  1.84286824e-17 4.29539961e-18 5.66967723e-11]
 [6.27746398e-04 2.10826467e-08 9.99350369e-01 1.55023031e-06
  1.27788007e-06 1.71025150e-08 1.

In [9]:
accuracy_score(ytrue, yhat)

1.0

In [15]:
import pyttsx3

In [17]:
def prob_viz(res, actions, input_frame):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), (245,117,16), -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

talks={'hello':'hello','thanks':'thank you','iloveyou':'i love you','explain':'explain','ready':'ready','putaway':'put away','help':'help'}
sequence = []
sentence = []
predictions = []
threshold = 0.5
engine = pyttsx3.init()
cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
#         print(results)
        
        # Draw landmarks
        draw_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
#             print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
            
        #3. Viz logic
            if np.unique(predictions[-10:])[0]==np.argmax(res): 
                if res[np.argmax(res)] > threshold: 
                    
                    if len(sentence) > 0: 
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                            print(actions[np.argmax(res)])
                            engine.say(talks[actions[np.argmax(res)]])
                            engine.runAndWait()
                    else:
                        sentence.append(actions[np.argmax(res)])
            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image)
            
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()


iloveyou
thanks
ready
explain
putaway
help
putaway


In [18]:
cap.release()
cv2.destroyAllWindows()