In [1]:
#importing dependencies
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
import time
import mediapipe as mp

Creating keypoints with MediaPipe Holistic

In [2]:
# Shorthand handles to the MediaPipe solution modules used throughout the notebook:
# `mp_holistic` supplies the Holistic model and the landmark connection sets,
# `mp_drawing` supplies the landmark drawing helpers.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [3]:
# Run a MediaPipe model on a single OpenCV frame
def mediapipe_detection(image, model):
    """Run ``model`` on a BGR frame and return the frame together with the results.

    Args:
        image: Frame in OpenCV's BGR channel order (e.g. from ``cap.read()``).
        model: Object exposing ``process(rgb_image)``, such as an
            ``mp_holistic.Holistic`` instance.

    Returns:
        tuple: ``(image, results)`` where ``image`` is a writeable BGR copy of the
        input frame and ``results`` is whatever ``model.process`` returned.
    """
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)   # the model is fed an RGB copy
    image.flags.writeable = False                    # read-only while the model processes it
    results = model.process(image)
    image.flags.writeable = True                     # callers draw on the returned frame
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)   # back to BGR for cv2.imshow
    return image, results


In [7]:
# Preview the webcam feed while running MediaPipe Holistic on every frame
# NOTE(review): this cell opens device index 1 while later cells open index 0 — confirm the intended camera
cap = cv2.VideoCapture(1)
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as model:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered (camera unavailable or stream ended): stop
                # instead of handing an empty frame to cv2.cvtColor.
                break
            image, results = mediapipe_detection(frame, model)
            cv2.imshow('OpenCV Feed', image)
            # Leave the loop when 'q' is pressed
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if processing a frame fails
    cap.release()
    cv2.destroyAllWindows()


In [4]:
def draw_landmarks(image, results):
    """Draw the face, pose and hand landmarks found in ``results`` onto ``image``.

    Args:
        image: Frame to draw on; the drawing is applied to this array directly and
            nothing is returned.
        results: Holistic output exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    spec = mp_drawing.DrawingSpec
    # One row per body part: (landmarks, connections, landmark style, connection style).
    # Colour channels are kept inside the 8-bit 0-255 range.
    layers = (
        # Face
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
         spec(color=(80, 110, 10), thickness=1, circle_radius=1),
         spec(color=(80, 255, 121), thickness=1, circle_radius=1)),
        # Pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         spec(color=(80, 22, 10), thickness=2, circle_radius=4),
         spec(color=(80, 44, 121), thickness=2, circle_radius=2)),
        # Left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(121, 22, 76), thickness=2, circle_radius=4),
         spec(color=(121, 44, 250), thickness=2, circle_radius=2)),
        # Right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(245, 117, 66), thickness=2, circle_radius=4),
         spec(color=(245, 66, 230), thickness=2, circle_radius=2)),
    )
    for landmark_list, connections, landmark_spec, connection_spec in layers:
        mp_drawing.draw_landmarks(image, landmark_list, connections, landmark_spec, connection_spec)

In [5]:
# Preview the webcam feed with the detected landmarks drawn on every frame
cap = cv2.VideoCapture(0)
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as model:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered (camera unavailable or stream ended): stop
                # instead of handing an empty frame to cv2.cvtColor.
                break
            image, results = mediapipe_detection(frame, model)
            draw_landmarks(image, results)
            cv2.imshow('OpenCV Feed', image)
            # Leave the loop when 'q' is pressed
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if processing a frame fails
    cap.release()
    cv2.destroyAllWindows()

Extract Key Points

In [6]:
# Inspect the landmarks left over from the last processed frame:
# face landmark count and contents, pose landmark count, left-hand landmark count
face_points = results.face_landmarks.landmark
print(len(face_points))
print(face_points)
print(len(results.pose_landmarks.landmark))
len(results.left_hand_landmarks.landmark)
    




468
[x: 0.516160786151886
y: 0.5951870083808899
z: -0.02162105031311512
, x: 0.517796516418457
y: 0.5591045618057251
z: -0.03590644896030426
, x: 0.5148395895957947
y: 0.5704966187477112
z: -0.019225072115659714
, x: 0.5077155828475952
y: 0.5297714471817017
z: -0.02720894291996956
, x: 0.5177580118179321
y: 0.5494559407234192
z: -0.03796325996518135
, x: 0.5160744190216064
y: 0.5374327301979065
z: -0.034858208149671555
, x: 0.510517418384552
y: 0.5082076787948608
z: -0.015541011467576027
, x: 0.45104900002479553
y: 0.513178288936615
z: 0.0010578763904049993
, x: 0.5080863237380981
y: 0.4832828938961029
z: -0.010587790980935097
, x: 0.5076390504837036
y: 0.4695852994918823
z: -0.01151118241250515
, x: 0.5043689012527466
y: 0.4257459342479706
z: -0.0029619086999446154
, x: 0.5162693858146667
y: 0.6000678539276123
z: -0.020802604034543037
, x: 0.5158750414848328
y: 0.6042227149009705
z: -0.01841549761593342
, x: 0.515047013759613
y: 0.6063788533210754
z: -0.015296352095901966
, x: 0.51514

21

In [7]:
# Flatten the pose landmarks into one (x, y, z, visibility) vector, falling back to
# zeros of the same length when there is no pose. The guard tests
# `results.pose_landmarks` itself (as extract_keypoints does): testing
# `.landmark` would raise on a missing pose before the fallback could be used.
pose = (
    np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten()
    if results.pose_landmarks
    else np.zeros(132)
)


In [8]:
# Print the flattened pose vector, then show its shape as the cell result
print(pose)
pose.shape

[ 5.39036214e-01  5.67712843e-01 -5.80048203e-01  9.99975383e-01
  5.36509871e-01  5.16183138e-01 -5.24142385e-01  9.99951541e-01
  5.41461885e-01  5.12822449e-01 -5.24175525e-01  9.99953032e-01
  5.46626210e-01  5.09840786e-01 -5.23827136e-01  9.99947846e-01
  5.11198103e-01  5.24213135e-01 -5.61782360e-01  9.99959409e-01
  4.96892393e-01  5.26811063e-01 -5.61780572e-01  9.99958456e-01
  4.81839091e-01  5.30165076e-01 -5.62042475e-01  9.99947965e-01
  5.33664227e-01  5.24067581e-01 -2.00534731e-01  9.99952316e-01
  4.40670431e-01  5.44790804e-01 -3.69742870e-01  9.99975979e-01
  5.47881603e-01  6.06287003e-01 -4.57600892e-01  9.99965429e-01
  5.19115567e-01  6.18767798e-01 -5.05434275e-01  9.99978542e-01
  6.21519864e-01  7.27773070e-01  1.71266519e-03  9.99829948e-01
  3.41467857e-01  7.79762447e-01 -3.13083738e-01  9.98788774e-01
  8.38973701e-01  8.15454721e-01 -3.65726709e-01  9.93364811e-01
  3.63452584e-01  1.03067148e+00 -3.76413703e-01  6.87026739e-01
  7.91589618e-01  4.84915

(132,)

In [9]:
# Shape of the flattened pose vector
pose.shape

(132,)

In [10]:
# Flatten the face and hand landmarks into (x, y, z) vectors, substituting zeros of
# the same length when a part is missing. Each face landmark contributes three
# values here, so the fallback is 468*3 (not 468*4); otherwise the vector length
# would change depending on whether a face was found.
face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)

In [11]:
# Convert one frame's Holistic results into a single flat feature vector
def extract_keypoints(results):
    """Flatten the pose, face and hand landmarks of ``results`` into one 1-D array.

    Parts are concatenated in the order pose, face, left hand, right hand. Pose
    landmarks contribute ``x, y, z, visibility``; the others contribute ``x, y, z``.
    A part whose landmark attribute is falsy is replaced by zeros (33*4, 468*3,
    21*3 and 21*3 values respectively).

    Args:
        results: Object exposing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is either
            falsy or has a ``landmark`` iterable.

    Returns:
        numpy.ndarray: The concatenated 1-D keypoint vector.
    """
    def _flatten(part, fields, fallback_size):
        # Missing part -> zero vector of the expected length
        if not part:
            return np.zeros(fallback_size)
        rows = [[getattr(point, field) for field in fields] for point in part.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    pose = _flatten(results.pose_landmarks, xyz + ('visibility',), 33 * 4)
    face = _flatten(results.face_landmarks, xyz, 468 * 3)
    lh = _flatten(results.left_hand_landmarks, xyz, 21 * 3)
    rh = _flatten(results.right_hand_landmarks, xyz, 21 * 3)
    return np.concatenate([pose, face, lh, rh])

In [12]:
# Try extract_keypoints on the results left over from the last processed frame
result_test = extract_keypoints(results)

In [13]:
# Length of the combined keypoint vector
result_test.shape

(1662,)

In [14]:
# Display the combined keypoint vector
result_test

array([ 0.53903621,  0.56771284, -0.5800482 , ...,  0.        ,
        0.        ,  0.        ])

In [15]:
# Data-collection settings
# Root folder under which the extracted keypoints are stored
DATA_PATH = os.path.join('MP_Data') 
# Gesture names; used as folder names and as class labels
actions = np.array(['hello', 'thanks', 'iloveyou'])
# Number of recorded sequences per action
no_sequences = 30
# Number of frames in each sequence
sequence_length = 30

In [16]:
# Create one folder per (action, sequence) pair: DATA_PATH/<action>/<sequence>/
for action in actions:
    for sequence in range(no_sequences):
        # exist_ok=True keeps the cell re-runnable when the folders already exist,
        # while real failures (e.g. permissions) are no longer silently swallowed.
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

In [22]:
# Record the training data: for every action capture `no_sequences` clips of
# `sequence_length` frames and save each frame's keypoints as a .npy file under
# DATA_PATH/<action>/<sequence>/<frame_num>.npy
cap = cv2.VideoCapture(0)
# Set when the user presses 'q' or the camera stops delivering frames; checked at
# every loop level so the whole collection stops, not just the current clip.
stop_requested = False
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as model:
        for action in actions:
            for sequence in range(no_sequences):
                for frame_num in range(sequence_length):
                    ret, frame = cap.read()
                    if not ret:
                        # No frame delivered: stop rather than pass an empty frame on
                        stop_requested = True
                        break
                    image, results = mediapipe_detection(frame, model)
                    draw_landmarks(image, results)

                    if frame_num == 0:
                        # Banner announcing the start of a new clip
                        cv2.putText(image, 'starting collection', (120, 200),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 4, cv2.LINE_AA)
                    cv2.putText(image, 'collecting frames for {} video number{}'.format(action, sequence), (15, 12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 4, cv2.LINE_AA)

                    # Show the frame first so the banner is on screen during the pause
                    cv2.imshow('OpenCV Feed', image)
                    if frame_num == 0:
                        cv2.waitKey(500)

                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                    np.save(npy_path, keypoints)

                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        stop_requested = True
                        break
                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Always free the camera and close the window, even if a frame fails to process
    cap.release()
    cv2.destroyAllWindows()

In [17]:
# Map each action name to its integer class index. Derived from `actions` so the
# mapping cannot drift out of sync when gestures are added, removed or reordered.
label_map = {str(action): index for index, action in enumerate(actions)}
label_map

{'hello': 0, 'thanks': 1, 'iloveyou': 2}

In [18]:
# Load every saved frame back from disk: one list of per-frame keypoint arrays per
# recorded sequence, plus the matching integer label
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        sequence_dir = os.path.join(DATA_PATH, action, str(sequence))
        window = [
            np.load(os.path.join(sequence_dir, '{}.npy'.format(frame_num)))
            for frame_num in range(sequence_length)
        ]
        sequences.append(window)
        labels.append(label_map[action])

In [19]:
# Display the integer label collected for each sequence
labels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2]

In [20]:
# Shape of the stacked sequences array
np.array(sequences).shape


(90, 30, 1662)

In [21]:
# Shape of the labels array: one entry per sequence
np.array(labels).shape

(90,)

In [22]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical




In [23]:
# Model inputs: the loaded sequences stacked into one array
x = np.array(sequences)
# Model targets: integer labels converted with to_categorical and cast to int
y = to_categorical(labels).astype(int)

In [24]:
# Display the encoded targets
y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0,

In [25]:
# Display the input array
x

array([[[ 0.53726774,  0.61049461, -0.61498922, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.54437214,  0.61255985, -0.58813274, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.55315059,  0.61294138, -0.58959508, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.5688566 ,  0.56717813, -0.61203039, ...,  0.24393383,
          0.32617205, -0.01151416],
        [ 0.56843287,  0.56834823, -0.62186176, ...,  0.23621726,
          0.33446598, -0.01460908],
        [ 0.56781775,  0.56850201, -0.69872022, ...,  0.2302855 ,
          0.34916097, -0.01292336]],

       [[ 0.56854254,  0.56806189, -0.62884849, ...,  0.22869411,
          0.35270432, -0.0083457 ],
        [ 0.56837875,  0.56707501, -0.6407662 , ...,  0.22861171,
          0.33769938, -0.01621867],
        [ 0.56771058,  0.56617433, -0.60354775, ...,  0.21258108,
          0.32423174, -0.02233068],
        ...,
        [ 0.58086652,  0.56772846, -0.69758809, ...,  

In [26]:
# Hold out 5% of the sequences for testing. A fixed random_state makes the split
# (and therefore the reported metrics) reproducible across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)

In [27]:
# Shape of the training inputs after the split
x_train.shape

(85, 30, 1662)

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.callbacks import TensorBoard

In [29]:
# Directory handed to the TensorBoard callback as its log_dir
log_dir = os.path.join('logs')
# Callback passed to model.fit below
tb_callback = TensorBoard(log_dir=log_dir)

In [30]:
# Network: three stacked LSTM layers followed by two dense layers and a softmax
# output with one unit per action
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1662)),
    LSTM(128, return_sequences=True, activation='relu'),
    # Last recurrent layer returns only its final output
    LSTM(256, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(actions.shape[0], activation='softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 30, 64)            442112    
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 128)           98816     
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 64)                16448     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 99        
Total params: 953,795
Trainable params: 953,795
Non-trainable params: 0
__________________________________________________

In [34]:
# Configure the optimiser, loss and metric, then train for 50 epochs with the
# TensorBoard callback attached
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(x_train, y_train, epochs=50, callbacks=[tb_callback])

Train on 85 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x24412259188>

In [39]:
# Save the trained model to 'lstm_model_1.h5', then load weights back from that file
# NOTE(review): reloading right after saving looks like a round-trip check — confirm it is intentional
model.save('lstm_model_1.h5')
model.load_weights('lstm_model_1.h5')

In [36]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
# Convert the predicted scores and the encoded targets to class indices, then
# compute the multilabel confusion matrix on the held-out sequences
probabilities = model.predict(x_test)
yhat = np.argmax(probabilities, axis=1).tolist()
ytrue = np.argmax(y_test, axis=1).tolist()

multilabel_confusion_matrix(ytrue, yhat)


array([[[4, 0],
        [1, 0]],

       [[2, 0],
        [0, 3]],

       [[3, 1],
        [0, 1]]], dtype=int64)

In [39]:
# Accuracy of the predicted class indices against the true ones
print(accuracy_score(ytrue, yhat))

0.8
