In [52]:

!pip3 install tensorflow opencv-python mediapipe scikit-learn matplotlib


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [53]:
import cv2 
import numpy as np 
import os # easier to acess files
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [54]:
mp_holistic = mp.solutions.holistic # holistic model - make detections
mp_drawing = mp.solutions.drawing_utils # drawing utilities - draw detections

In [55]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # color conversion
    # cv2Color converts images from one colorspace to another
    image.flags.writeable = False
    results = model.process(image) # make prediction from image grame
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # color conversion
    return image, results

In [56]:
def draw_styled_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
                                mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                                mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)) # draw face connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                                mp_drawing.DrawingSpec(color=(80,110,10), thickness=2, circle_radius=4),
                                mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)) # draw face connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                                mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                                mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)) # draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                                mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                                mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2))

In [57]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    return np.concatenate([pose, face, lh, rh])

In [58]:
# path for exported data, numpy arrays
DATA_PATH = os.path.join("MP_Data")

#Actions that we try to detect
actions = np.array(['hello', 'thanks', 'iloveyou'])

# thirty videos with of data
no_sequences = 30

# videos are going to be 30 frames of length
sequence_length = 30

#1662 datapoints, 30 

In [59]:
for action in actions:
    # for each action
    for sequence in range(no_sequences):
        try:
            # if folder already exist, will pass, else make numbered folders in action
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

In [60]:
from sklearn.model_selection import train_test_split # used for training and testing
from tensorflow.keras.utils import to_categorical # used to make labels

In [61]:
label_map = {label:num for num, label in enumerate(actions)}

In [62]:
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        window = [] # all frames for specific sequence (video)
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            # loads numpy frame 0, frame 1, .. 
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])
        # append 

In [63]:
X = np.array(sequences) # makes to np array
y = to_categorical(labels).astype(int) # uses one hot encoding to prevent bias

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05) # splits data

In [65]:
from tensorflow.keras.models import Sequential # Sequential lets you build a sequential NN
from tensorflow.keras.layers import LSTM, Dense # LSTM is temporal (involves time) and lets build model
from tensorflow.keras.callbacks import TensorBoard # allows to logging in tensor board

In [66]:
# Tensorboard is webapp to see neural network training
log_dir = os.path.join("Logs")
tb_callback = TensorBoard(log_dir=log_dir)

In [67]:
model = Sequential() # easy to make neural network

# 3 sets of lstm levels 
# first adds 64 neurons, and if it needs to pass to next level return seqs must be true, 30 frames with 1662 datapoints
# lstm layer is good for long term sequences
# dont return lstm layer on last layer
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

# actions will turn output into 3 layers, and softmax will show probabiliies from sum

In [68]:
model.compile(optimizer='Adam', loss="categorical_crossentropy", metrics=["categorical_accuracy"])

In [69]:
# model.fit(X_train, y_train, epochs=500, callbacks=[tb_callback])
model.load_weights('gen2.h5')

In [70]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 30, 64)            442112    
                                                                 
 lstm_4 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 3)                 99        
                                                                 
Total params: 596675 (2.28 MB)
Trainable params: 59667

In [71]:
# res = model.predict(X_test)
# actions[np.argmax(res[3])]
# actions[np.argmax(y_test[3])]

In [72]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [73]:
yhat = model.predict(X_test)



In [74]:
ytrue = np.argmax(y_test, axis=1).tolist() # conv
yhat = np.argmax(yhat, axis =1).tolist()

In [75]:
multilabel_confusion_matrix(ytrue, yhat)

array([[[3, 0],
        [0, 2]],

       [[3, 1],
        [0, 1]],

       [[3, 0],
        [1, 1]]])

In [76]:
accuracy_score(ytrue, yhat)

0.8

In [77]:
# needs 30 frames to make a movement
sequence = []
sentence = []
threshold = 0.4

# code to access openCV

cap = cv2.VideoCapture("../uploads/video.mp4")  # Access video file # access video cam on device port 0

# with is used to handle resource management
# set mediapipe model

# min detection is initial detection, tracking confidence is preceding tracking confidence
# if you want higher inital confidence in answer, then increase it and vice versa
action_list = []
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
	while cap.isOpened(): # while camera is on

		# Read feed (reading frame from webcam)
		ret, frame = cap.read()

		if ret == False:
			break

		

		# make detections
		image, results = mediapipe_detection(frame, holistic)
		
		# print(results)

		# draw landmarks
		draw_styled_landmarks(image, results)
		
        # 2. prediction logic
		keypoints = extract_keypoints(results)
		sequence.insert(0,keypoints)
		sequence = sequence[:30]
		
		if (len(sequence) == 30):
			res = model.predict(np.expand_dims(sequence, axis=0))[0]
			# expand dims allows us to test one sequence since its expecting (0, 30, 1662)
			print("Seen action: {}", actions[np.argmax(res)])
			action_list.append(actions[np.argmax(res)])


		# show frame to screen
		# cv2.imshow('OpenCV Feed', image)
		
		# break gracefully
		if cv2.waitKey(1) & 0xFF == ord('q'):
			break
		
OUTPUT_PATH = os.path.join("outputs") 
output = os.path.join(OUTPUT_PATH, "test")
print(output)
np.save(output, action_list)

# release cv2 and close all windows
cv2.destroyAllWindows()
cv2.waitKey(1)
cap.release()

I0000 00:00:1705822330.974061       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 83.1), renderer: Apple M1


Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen action: {} hello
Seen actio