In [None]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import mediapipe as mp
import mediapipe.python.solutions
import string

import time

In [None]:
# Short aliases for the MediaPipe solution modules used by the cells below:
# the hand-landmark model, the landmark drawing helpers and the drawing style presets.
mp_hands, mp_drawing, mp_drawing_styles = (
    mp.solutions.hands,
    mp.solutions.drawing_utils,
    mp.solutions.drawing_styles,
)

In [None]:
def mp_hands_detect(image, model):
    """Run a MediaPipe model on one OpenCV frame.

    Args:
        image: frame as delivered by OpenCV (BGR channel order).
        model: object exposing ``process(rgb_image)``; the callers in this
            notebook pass an ``mp_hands.Hands`` instance.

    Returns:
        A ``(image, results)`` tuple: the frame converted back to BGR and
        whatever ``model.process`` returned for it.
    """
    # MediaPipe expects RGB input while OpenCV delivers BGR
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Mark the buffer read-only while the model looks at it, then restore it
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Hand the caller a BGR frame again so OpenCV drawing/display works as usual
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [None]:
def render_hand_landmarks(image, results):
    """Draw the first detected hand's landmarks and connections onto ``image``.

    Args:
        image: BGR frame to draw on (modified in place by the drawing helper).
        results: detection result exposing ``multi_hand_landmarks``.

    Nothing is drawn when ``multi_hand_landmarks`` is ``None`` or empty.
    """
    # Truthiness check (same as extract_keypoints): covers both None and an
    # empty list, where indexing [0] would otherwise raise IndexError.
    if results.multi_hand_landmarks:
        mp_drawing.draw_landmarks(image, results.multi_hand_landmarks[0], mp_hands.HAND_CONNECTIONS)


In [None]:
def style_hand_landmarks(image, results):
    """Draw the first detected hand with custom colours (styled variant of plain rendering).

    Args:
        image: BGR frame to draw on (modified in place by the drawing helper).
        results: detection result exposing ``multi_hand_landmarks``.

    Nothing is drawn when ``multi_hand_landmarks`` is ``None`` or empty.
    """
    # Truthiness check (same as extract_keypoints): covers both None and an
    # empty list, where indexing [0] would otherwise raise IndexError.
    if not results.multi_hand_landmarks:
        return

    mp_drawing.draw_landmarks(
        image,
        results.multi_hand_landmarks[0],
        mp_hands.HAND_CONNECTIONS,
        # colours are given in BGR order because the frame was converted back from RGB to BGR
        mp_drawing.DrawingSpec(color=(86,255,255), thickness=2, circle_radius=4),   # landmark points
        mp_drawing.DrawingSpec(color=(170,86,255), thickness=2, circle_radius=4),   # connections
    )

In [None]:
cap = cv2.VideoCapture(0)

# Camera smoke test: detect one hand per frame and draw its landmarks until 'q' is pressed.
# try/finally guarantees the camera and the window are released even if a frame fails mid-loop.
try:
    with mp_hands.Hands(max_num_hands = 1, min_detection_confidence=0.3, min_tracking_confidence=0.3) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # A failed read yields frame=None, which would crash cv2.cvtColor
                print('Could not read a frame from the camera; stopping.')
                break

            # Make detections
            image, results = mp_hands_detect(frame, hands)

            # Draw landmarks
            style_hand_landmarks(image, results)

            # Display to screen
            cv2.imshow('FSL Alphabet Detector Cam Test', image)
            # waitKey(10) polls the keyboard for up to 10 ms; quit when 'q' is pressed
            if (cv2.waitKey(10) & 0xFF) == ord('q'):
                break
finally:
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# Inspect the last detection produced by the camera test.
# multi_hand_landmarks is None when no hand was visible in the final frame, so guard before indexing.
last_hand = results.multi_hand_landmarks[0] if results.multi_hand_landmarks else None
# Printed explicitly: only the last expression of a cell is displayed automatically
print(len(last_hand.landmark) if last_hand is not None else 0)
last_hand


In [None]:
def extract_keypoints(results):
    """Flatten the first detected hand into a 1-D keypoint vector.

    Args:
        results: detection result exposing ``multi_hand_landmarks``; each
            landmark of the first hand must provide ``x``, ``y`` and ``z``.

    Returns:
        A 1-D numpy array ``[x0, y0, z0, x1, y1, z1, ...]`` for the first hand,
        or 63 zeros (21 landmarks * 3 coordinates) when nothing was detected.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        # no hand in this frame: emit an all-zero placeholder vector
        return np.zeros(21*3)

    coords = [[point.x, point.y, point.z] for point in detected_hands[0].landmark]
    return np.array(coords).flatten()
#len(results.multi_hand_landmarks[0].landmark)

In [None]:

# Dataset folders, all resolved relative to the current working directory
working_dir = os.path.join(os.path.abspath(''), 'Datasets')
training_dir, testing_dir = (
    os.path.join(working_dir, split_name) for split_name in ('train', 'testing')
)

In [None]:
# Root folder for the extracted keypoint arrays (one .npy file per captured frame)
DATA_PATH = os.path.join(os.path.abspath(''), 'MP_Data')

# Classes we try to detect: the uppercase letters 'A'..'Z'
alphabets = np.array([letter for letter in string.ascii_uppercase])

# 30 videos are recorded per letter, and every video is 30 frames long
no_sequences, sequence_length = 30, 30

# A
## 0
## 1
## ...29
# B
## 0
## 1
## ...29

In [None]:
# Create MP_Data/<letter>/<video index>/ for every letter and video.
# exist_ok=True makes the cell safe to re-run while still surfacing real
# failures (e.g. permission errors) that a bare `except: pass` would hide.
for alphabet in alphabets:
    for sequence in range(no_sequences):
        os.makedirs(os.path.join(DATA_PATH, alphabet, str(sequence)), exist_ok=True)

# ignore error, just happened to push the button for it

In [None]:
cap = cv2.VideoCapture(0)

# This cell is responsible for collecting training data from images, converted into numpy array containing landmark data.
# For every letter and every video it saves one keypoint vector per frame to
# DATA_PATH/<letter>/<video index>/<frame index>.npy
stop_collection = False  # set when the camera fails or the user presses 'q'

try:
    with mp_hands.Hands(max_num_hands = 1, min_detection_confidence=0.5) as hands:
        for alphabet in alphabets:
            for sequence in range(no_sequences):
                for frame_num in range(sequence_length):

                    ret, frame = cap.read()
                    if not ret:
                        # A failed read yields frame=None, which would crash cv2.cvtColor
                        print('Could not read a frame from the camera; stopping collection.')
                        stop_collection = True
                        break

                    # Make detections
                    image, results = mp_hands_detect(frame, hands)

                    # Draw landmarks
                    style_hand_landmarks(image, results)

                    # Status line shown on every frame
                    cv2.putText(image, f'Collecting frames for alphabet {alphabet} Video Number {sequence}', (15,12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)

                    if frame_num == 0:
                        # First frame of a video: show the prompt and pause until a key is pressed
                        cv2.putText(image, 'STARTING COLLECTION, Press Y to start', (120,200),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 4, cv2.LINE_AA)
                        cv2.imshow('OpenCV Feed', image)
                        # Any key resumes (as before); 'q' aborts the whole collection run
                        key = cv2.waitKey(0) & 0xFF
                        if key == ord('q'):
                            stop_collection = True
                            break
                        # NOTE(review): the frame saved below was captured before the pause,
                        # so frame 0 of each video may not show the intended pose yet.
                    else:
                        cv2.imshow('OpenCV Feed', image)

                    # export keypoints
                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(DATA_PATH, alphabet, str(sequence), str(frame_num))
                    np.save(npy_path, keypoints)

                    if (cv2.waitKey(10) & 0xFF) == ord('q'):
                        stop_collection = True
                        break

                # Propagate the stop request out of the nested loops; a plain
                # `break` in the frame loop only skipped to the next video.
                if stop_collection:
                    break
            if stop_collection:
                break
finally:
    # Always free the camera and close the preview window, even on errors
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# test read NPY file
# Build the path from DATA_PATH with os.path.join so it works on every OS
# (the previous 'MP_Data\\A\\0' literal only resolved on Windows).
test_dir = os.path.join(DATA_PATH, 'A', '0')
npy_file = os.path.join(test_dir, '4.npy')  # frame 4 of video 0 for letter 'A'
data = np.load(npy_file)
data

In [None]:
# Data preprocessing, features, and labeling
from sklearn.model_selection import train_test_split
import tensorflow as tf
# from tensorflow.keras.utils import to_categorical
 
# t = tf.keras.utils.to_categorical()

# GPU sanity checks. Only the last expression of a cell is displayed, so the
# result of is_built_with_cuda() is computed here but not shown.
tf.test.is_built_with_cuda()
tf.config.list_physical_devices('GPU')  # GPU devices visible to TensorFlow

In [None]:
# Letter -> integer class index, in the order the letters appear in `alphabets`
label_map = dict(zip(alphabets, range(len(alphabets))))
label_map

In [None]:
# Load the saved keypoints back into memory: one window (list of per-frame
# vectors) per recorded video, plus the integer label of its letter.
sequences, labels = [], []
for alphabet in alphabets:
    for sequence in range(no_sequences):
        window = []
        # Iterate over the frames of one video: the bound is sequence_length
        # (frames per video), not no_sequences (videos per letter).
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, alphabet, str(sequence), f'{frame_num}.npy'))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[alphabet])

In [None]:
np.array(sequences).shape
# Expected shape: (780, 30, 63)
# 780 = 26 alphabets * 30 videos each
# 30  = frames loaded per video
# 63  = 21 landmark points * 3 coordinates (x, y, z)

In [95]:
# Integer class label of every loaded sequence (values come from label_map)
np.array(labels)

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
        4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
        7,  7,  7,  7,  7

In [94]:
# Features: all loaded windows stacked into one array
x = np.array(sequences)
# Targets: one-hot encoding of the integer labels. num_classes is pinned to the
# number of letters so the encoding width always matches the softmax output
# layer, even if the loaded data does not contain every letter.
y = tf.keras.utils.to_categorical(labels, num_classes=len(alphabets)).astype(int)
y

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [None]:
x.shape  # value not displayed: only the last expression of a cell is shown
x.shape[1]  # second axis of x: number of frames per sequence

In [None]:
# Hold out 10% of the sequences for testing.
# random_state makes the split reproducible across kernel restarts; stratify
# keeps every letter represented proportionally in the small test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42, stratify=labels
)

In [None]:
# Shape of the training split (the part left after holding out test_size=0.1)
x_train.shape

In [None]:
# Shape of the held-out test split (test_size=0.1 of the sequences)
x_test.shape

In [None]:
# Model development and training

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard # accuracy monitoring just because


In [None]:
# TensorBoard writes its training logs under ./logs
log_dir = os.path.join(os.path.abspath(''), 'logs')
# exist_ok=True replaces the racy exists()-then-create check
os.makedirs(log_dir, exist_ok=True)
tensorboard_cb = TensorBoard(log_dir=log_dir)

In [None]:
# Number of classes; used as the width of the final softmax layer of the model
alphabets.shape[0]

In [None]:
# Stacked-LSTM classifier: a (frames, features) sequence goes in, one probability per letter comes out.
# NOTE(review): the LSTM layers use relu instead of the Keras default (tanh); kept as-is — confirm this is intentional.
model = Sequential([
    # an LSTM that feeds another LSTM must return the full sequence
    LSTM(64, return_sequences=True, activation='relu', input_shape=(x.shape[1],x.shape[2])),
    LSTM(128, return_sequences=True, activation='relu'),
    # the last recurrent layer returns only its final output because a Dense layer follows
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    # softmax: every output lies in [0, 1] and the outputs sum to 1
    Dense(alphabets.shape[0], activation='softmax'),
])

In [None]:
# Multi-class setup with one-hot targets: categorical cross-entropy loss, categorical accuracy as the reported metric
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
# Train on the training split only; training logs are written through the TensorBoard callback
model.fit(x_train, y_train, epochs=2000, callbacks=[tensorboard_cb])
# NOTE(review): 2000 epochs is probably overkill; no validation data or early stopping is configured here, so overfitting would go unnoticed during training.


In [None]:
# Print the layer-by-layer overview of the model defined above
model.summary()

In [None]:
# Model output for every held-out test sequence (softmax scores from the last layer)
res = model.predict(x_test)

In [None]:
# Sanity check on the softmax output: the scores of the first test sample should sum to ~1
np.sum(res[0])

In [None]:
# Predicted letter for the first test sample (index of the highest score)
alphabets[np.argmax(res[0])]

In [None]:
# True letter for the first test sample (position of the 1 in its one-hot target)
alphabets[np.argmax(y_test[0])]

# The accuracy seems fairly good; hopefully the model is not overfitting. Treat this result with some skepticism.

In [None]:
# Save the trained model to FSL-Alphabet.h5 in the current working directory
model.save('FSL-Alphabet.h5')

In [None]:
# model.load_weights('FSL-Alphabet.h5')

In [None]:
# Evaluation using Confusion Matrix and Accuracy, in case you need it a

from sklearn.metrics import multilabel_confusion_matrix, accuracy_score


In [None]:
# Raw model scores for the test split; converted to class indices in the next cell
yhat = model.predict(x_test)

In [None]:
# Convert one-hot targets and softmax scores into integer class indices.
# yhat is derived straight from model.predict so this cell can be re-run safely;
# taking argmax of the previous `yhat` would fail on a second run because it
# would then already be a flat list of class indices.
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(model.predict(x_test), axis=1).tolist()

In [None]:
# Confusion matrices computed from the true vs. predicted class indices
multilabel_confusion_matrix(ytrue, yhat)

In [None]:
# [1, 1]
# [2, 2]
# MATRIX ORGANIZATION (one 2x2 matrix per class)
# [TRUE N, FALSE P]
# [FALSE N, TRUE P]

# The multilabel_confusion_matrix function returns the matrices sorted by label order: 0,1,2,3,4,5,6,... so A,B,C,D,E,F,G,...

In [None]:
# Fraction of test sequences whose predicted class index equals the true one
accuracy_score(ytrue, yhat)


In [None]:
# I'm really skeptical of this result now, but I'm very sleepy. Keep in mind that we're testing on a small set (the test size was 10% when I split the data).


In [None]:
# Time to see if it works...

# Live detection: keep a rolling window of the latest keypoint vectors and
# classify it with the trained model on every frame once the window is full.
sequence = []          # rolling window of keypoint vectors, oldest first (same order as the training data)
meaning = []           # reserved for collecting recognised letters (not used yet)
currentAlphabet = ''   # last letter predicted with enough confidence; shown on screen
confidence = 0.6       # minimum softmax score required to update currentAlphabet

cap = cv2.VideoCapture(0)

# try/finally guarantees the camera and window are released even if a frame fails mid-loop
try:
    # Begin use mediapipe
    with mp_hands.Hands(max_num_hands = 1, min_detection_confidence=0.3, min_tracking_confidence=0.3) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # A failed read yields frame=None, which would crash cv2.cvtColor
                print('Could not read a frame from the camera; stopping.')
                break

            # Make detections
            image, results = mp_hands_detect(frame, hands)

            # Draw landmarks
            style_hand_landmarks(image, results)

            # Prediction block
            keypoints = extract_keypoints(results)
            # Append (not insert at the front) so frames stay in chronological
            # order, matching how the training windows were assembled.
            sequence.append(keypoints)
            # keep only the most recent sequence_length frames
            sequence = sequence[-sequence_length:]

            # Predict only once a full window is available; before that there
            # is no valid `res` for this run and the label simply stays empty.
            if len(sequence) == sequence_length:
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = int(np.argmax(res))
                print(alphabets[best])
                # check if result is above threshold
                if res[best] > confidence:
                    currentAlphabet = alphabets[best]

            # Rendering: banner across the full frame width with the current letter
            cv2.rectangle(image, (0,0), (image.shape[1],40), (245,117,16), -1)
            cv2.putText(image, f'{currentAlphabet}', (3,30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

            # Display to screen
            cv2.imshow('FSL Alphabet Detector Cam Test', image)
            # waitKey(10) polls the keyboard for up to 10 ms; quit when 'q' is pressed
            if (cv2.waitKey(10) & 0xFF) == ord('q'):
                break
finally:
    cap.release()
    cv2.destroyAllWindows()


In [None]:
# Minor checking and testing of variables below, ignore this cell
print(x_test[0].shape)  # shape of a single test sequence, without a batch axis
#model.predict(x_test[0])
# Add a leading batch axis of size 1 before passing the single sequence to predict
test = np.expand_dims(x_test[0], axis=0)
model.predict(test)

In [None]:
# Manual cleanup: release the camera and close any OpenCV windows,
# e.g. after one of the capture cells above was interrupted.
cap.release()
cv2.destroyAllWindows()