# 1. Import and Install Dependencies

In [1]:
!pip install tensorflow==2.12.0 opencv-python mediapipe scikit-learn matplotlib pygame

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow==2.12.0
  Using cached tensorflow-2.12.0-cp311-cp311-win_amd64.whl (1.9 kB)
Collecting tensorflow-intel==2.12.0 (from tensorflow==2.12.0)
  Using cached tensorflow_intel-2.12.0-cp311-cp311-win_amd64.whl (272.9 MB)
Collecting tensorboard<2.13,>=2.12 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Using cached tensorboard-2.12.3-py3-none-any.whl (5.6 MB)
Collecting tensorflow-estimator<2.13,>=2.12.0 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Using cached tensorflow_estimator-2.12.0-py2.py3-none-any.whl (440 kB)
Collecting keras<2.13,>=2.12.0 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Using cached keras-2.12.0-py2.py3-none-any.whl (1.7 MB)
Installing collected packages: tensorflow-estimator, keras, tensorboard, tensorflow-intel, tensorflow
  Attempting uninstall: tensorflow-estimator
    Found existing installation: tensorflow-estimator 2.13.0
    Uninstalling te

In [2]:
import sys
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
import pygame

pygame 2.5.2 (SDL 2.28.3, Python 3.11.5)
Hello from the pygame community. https://www.pygame.org/contribute.html


# 2. Keypoints using MP Holistic

In [3]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [4]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [5]:
def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS) # Draw face connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS) # Draw pose connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw right hand connections

In [6]:
def draw_styled_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS, 
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             ) 
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

# 3. Extract Keypoint Values

In [7]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

# 4. Setup Folders for Collection

In [10]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('MP_Data') 

# Actions that we try to detect
actions = np.array(['No-Pose', 'L-Pose', 'X-Pose', 'Namaste-Pose'])

# Thirty videos worth of data
no_sequences = 30

# Videos are going to be 30 frames in length
sequence_length = 30

# Folder start
start_folder = 0

In [13]:
for action in actions: 
    #dirmax = np.max(np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int))
    #for sequence in range(1,no_sequences+1):
     for sequence in range(no_sequences):
        try: 
            #os.makedirs(os.path.join(DATA_PATH, action, str(dirmax+sequence)))
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

# 5. Collect Keypoint Values for Training and Testing

In [14]:
cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    
    # NEW LOOP
    # Loop through actions
    for action in actions:
        # Loop through sequences aka videos
        for sequence in range(no_sequences):
            # Loop through video length aka sequence length
            for frame_num in range(sequence_length):

                # Read feed
                ret, frame = cap.read()

                # Make detections
                image, results = mediapipe_detection(frame, holistic)
#                 print(results)

                # Draw landmarks
                draw_styled_landmarks(image, results)
                
                # NEW Apply wait logic
                if frame_num == 0: 
                    cv2.putText(image, 'STARTING COLLECTION', (120,200), 
                               cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                    # cv2.waitKey(2000)
                else: 
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                
                # NEW Export keypoints
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)

                # Break gracefully
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
                    
    cap.release()
    cv2.destroyAllWindows()

In [15]:
cap.release()
cv2.destroyAllWindows()

# 6. Preprocess Data and Create Labels and Features

In [16]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [17]:
label_map = {label:num for num, label in enumerate(actions)}

In [18]:
label_map

{'No-Pose': 0, 'L-Pose': 1}

In [19]:
sequences, labels = [], []
for action in actions:
    for sequence in np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [20]:
# Ok to skip
np.array(sequences).shape

(60, 30, 1662)

In [21]:
# Ok to skip
np.array(labels).shape

(60,)

In [22]:
# Ok to skip
X = np.array(sequences)

In [23]:
# Ok to skip
X.shape

(60, 30, 1662)

In [24]:
# Ok to skip
y = to_categorical(labels).astype(int)

In [25]:
# Ok to skip
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

In [26]:
# Ok to skip
y_test.shape

(3, 2)

# 7. Build and Train LSTM Neural Network

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [35]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [36]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [37]:
model.fit(X_train, y_train, epochs=90)

Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90
Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90
Epoch 59/90
Epoch 60/90
Epoch 61/90
Epoch 62/90
Epoch 63/90
Epoch 64/90
Epoch 65/90
Epoch 66/90
Epoch 67/90
Epoch 68/90
Epoch 69/90
Epoch 70/90
Epoch 71/90
Epoch 72/90
Epoch 73/90
Epoch 74/90
Epoch 75/90


Epoch 76/90
Epoch 77/90
Epoch 78/90
Epoch 79/90
Epoch 80/90
Epoch 81/90
Epoch 82/90
Epoch 83/90
Epoch 84/90
Epoch 85/90
Epoch 86/90
Epoch 87/90
Epoch 88/90
Epoch 89/90
Epoch 90/90


<keras.callbacks.History at 0x255b06d3ed0>

In [38]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 30, 64)            442112    
                                                                 
 lstm_4 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 2)                 66        
                                                                 
Total params: 596,642
Trainable params: 596,642
Non-tr

# 8. Make Predictions

In [39]:
res = model.predict(X_test)



In [40]:
actions[np.argmax(res[2])]

'L-Pose'

In [41]:
actions[np.argmax(y_test[2])]

'L-Pose'

# 9. Save Weights

In [42]:
model.save('action.h5')

In [43]:
del model

In [44]:
# Once training is done, u do not have to train it again (no need to call the fit() finction). Just need to setup the model
# and compile it. then load the saved weights.
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [45]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [46]:
model.load_weights('action.h5')

# 10. Evaluation using Confusion Matrix and Accuracy

In [47]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [48]:
yhat = model.predict(X_test)



In [49]:
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [50]:
multilabel_confusion_matrix(ytrue, yhat)

array([[[0, 0],
        [0, 3]]], dtype=int64)

In [51]:
accuracy_score(ytrue, yhat)

1.0

# 11. Test in Real Time

In [52]:
from scipy import stats

In [53]:
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

In [56]:
sequence = []
sentence = []
threshold = 0.8

cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            
            
        #3. Viz logic
            if res[np.argmax(res)] > threshold: 
                if len(sentence) > 0: 
                    if actions[np.argmax(res)] != sentence[-1]:
                        sentence.append(actions[np.argmax(res)])
                else:
                    sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
            
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution

L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution

L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
No-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
No-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
No-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
No-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solu

<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.SolutionOutputs'>
L-Pose
<class 'mediapipe.python.solution_base.S