# 1. Dependencies

In [19]:
import cv2
import re
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp  
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

# Make sure in a python 3.8 env
# pip install tensorflow==2.4.1 tensorflow-gpu==2.4.1 opencv-python mediapipe sklearn matplotlib

# 2. Functions

In [20]:
mp_holistic = mp.solutions.holistic # bringing the holstic model
mp_drawing = mp.solutions.drawing_utils # drawing utilities

def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction - detecting using mediapipe
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION) # Draw face connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS) # Draw pose connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw right hand connections
    
# CAN CHANGE THE COLOURS OF THESE TO MAKE IT DIFFERENT 
def draw_styled_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION, 
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             ) 
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

# Extracting data points
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    #face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    #return np.concatenate([pose, face, lh, rh])
    return np.concatenate([pose, lh, rh])

# 468*3+33*4+21*3+21*3 = 1662

def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

# 2. Data Gathering

In [21]:
INPUT_VIDEO_PATH = r'C:\Users\Tommaso\Google Drive\Current Courses\COMP9444\CodingTasks\Dataset'

# Path for exported data, numpy arrays
DATA_PATH = os.path.join('MP_Data') 

# Videos are going to be 30 frames in length
SEQUENCE_LENGTH = 30

label_map = {}
no_sequences = {}
actions = []
# VIDEO NAMING CONVENTION = "ACTION.SEQUENCE.mp4" 
# e.g. "V.001.mp4"
for i,vid in enumerate(os.listdir(INPUT_VIDEO_PATH)):
    print(vid,i)
    string = vid.split('.')
    action = string[0]
    sequence = int(string[1])
    cap = cv2.VideoCapture(vid)
    label_map[action] = i
    
    if action in no_sequences:
        no_sequences[action] += 1
    else:
       no_sequences[action] = 1
      
    if action not in actions:
        actions.append(action)
    try: 
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
    except:
        print('Directory Already Exists, passing file')
        continue
        
    vid_loc = os.path.join(INPUT_VIDEO_PATH,vid)
    cap = cv2.VideoCapture(vid_loc)
    # Set mediapipe model 
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        # this is code for reading from a video file
        frame_num = 0
        while(cap.isOpened() and frame_num < SEQUENCE_LENGTH):
            # Read feed
            ret, frame = cap.read()
            
            if ret:
                image, results = mediapipe_detection(frame, holistic)
                # Draw landmarks
                draw_styled_landmarks(image, results)
                cv2.waitKey(1)
                
                # UNCOMMENT THIS TO SEE THE VIDEO DISPLAYED
                cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                # # Show to screen
                cv2.imshow('OpenCV Feed', image)
                
                # NEW Export keypoints
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)
                frame_num += 1
            else:
                break
            
    cap.release()
    cv2.destroyAllWindows()



B.000.mp4 0
B.001.mp4 1
B.002.mp4 2
B.003.mp4 3
B.004.mp4 4
B.005.mp4 5
B.006.mp4 6
B.007.mp4 7
B.008.mp4 8
B.009.mp4 9
B.010.mp4 10
B.011.mp4 11
B.012.mp4 12
B.013.mp4 13
B.014.mp4 14
B.015.mp4 15
B.016.mp4 16
B.017.mp4 17
B.018.mp4 18
B.019.mp4 19
B.020.mp4 20
B.021.mp4 21
B.022.mp4 22
B.023.mp4 23
C.000.mp4 24
C.001.mp4 25
C.002.mp4 26
C.003.mp4 27
C.004.mp4 28
C.005.mp4 29
C.006.mp4 30
C.007.mp4 31
C.008.mp4 32
C.009.mp4 33
C.010.mp4 34
C.011.mp4 35
C.012.mp4 36
C.013.mp4 37
C.014.mp4 38
C.015.mp4 39
C.016.mp4 40
C.017.mp4 41
C.018.mp4 42
C.019.mp4 43
C.020.mp4 44
C.021.mp4 45
C.022.mp4 46
C.023.mp4 47
C.024.mp4 48
C.025.mp4 49
F.000.mp4 50
F.001.mp4 51
F.002.mp4 52
F.003.mp4 53
F.004.mp4 54
F.005.mp4 55
F.006.mp4 56
F.007.mp4 57
F.008.mp4 58
F.009.mp4 59
F.010.mp4 60
F.011.mp4 61
F.012.mp4 62
F.013.mp4 63
F.014.mp4 64
F.015.mp4 65
F.016.mp4 66
F.017.mp4 67
F.018.mp4 68
F.019.mp4 69
F.020.mp4 70
F.021.mp4 71
F.022.mp4 72
F.023.mp4 73
F.024.mp4 74
F.025.mp4 75
F.026.mp4 76
F.027.mp4

In [27]:
np.load(r'C:\Users\Tommaso\Google Drive\Current Courses\COMP9444\CodingTasks\Perceptron-Heros\ASL_LSTM\MP_Data\B\0\12.npy').shape

(258,)

# 3. Data Preprocessing

In [30]:
print(no_sequences)
print(actions)
label_map = {label:num for num, label in enumerate(actions)}
sequences, labels = [], []
for action in actions:
    for sequence in range(20):
        window = []
        for frame_num in range(SEQUENCE_LENGTH):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence+1), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

print(label_map)
X = np.array(sequences)
y = to_categorical(labels).astype(int)
print(np.array(sequences).shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

{'B': 24, 'C': 26, 'F': 29}
['B', 'C', 'F']
{'B': 0, 'C': 1, 'F': 2}
(60, 30, 258)


# 4. Build and Train LSTM Neural Network

In [32]:

# Reasons for doing this
# - less data to produce a hyper accurate model
# - much denser neural network (rather than 30 40 million paramters have BLANK)
# - It was a whole heap faster in detecting in real time
print(X.shape)
actions = np.asarray(actions)
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30,258)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))
model.summary()
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.fit(X_train, y_train, epochs=2000, callbacks=[tb_callback])



(60, 30, 258)
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_6 (LSTM)                (None, 30, 64)            82688     
_________________________________________________________________
lstm_7 (LSTM)                (None, 30, 128)           98816     
_________________________________________________________________
lstm_8 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_7 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_8 (Dense)              (None, 3)                 99        
Total params: 237,251
Trainable params: 237,251
Non-trainable params: 0
__________________________________

KeyboardInterrupt: 

# 5. Validating and Predicting the Model

In [33]:
res = model.predict(X_test)
print(actions[np.argmax(res[0])])
print(actions[np.argmax(y_test[0])])


F
F


# 5A. Save Model

In [34]:
model.save('action_test_noface.h5')

# 5B. Load Model

In [35]:

model.load_weights('action_test_noface.h5')

# 6. Model Evaluation

In [36]:
yhat = model.predict(X_test)
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()
multilabel_confusion_matrix(ytrue, yhat)
accuracy_score(ytrue, yhat)

1.0

# 7. Real-time Testing

In [37]:
# 1. New detection variables
sequence = []
sentence = []
threshold = 0.9
colors = [(245,117,16), (117,245,16), (16,117,245)]

cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
#         sequence.insert(0,keypoints)
#         sequence = sequence[:30]
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            
            
        #3. Viz logic
            if res[np.argmax(res)] > threshold: 
                if len(sentence) > 0: 
                    if actions[np.argmax(res)] != sentence[-1]:
                        sentence.append(actions[np.argmax(res)])
                else:
                    sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
            
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti