In [1]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import mediapipe as mp
import mediapipe.python.solutions
import string

import time

In [2]:
# Short aliases for the MediaPipe solution modules used by the cells below.
mp_hands = mp.solutions.hands  # provides the Hands detector and HAND_CONNECTIONS
mp_drawing = mp.solutions.drawing_utils  # provides draw_landmarks and DrawingSpec
mp_drawing_styles = mp.solutions.drawing_styles  # not referenced by the cells visible here

In [3]:
def mp_hands_detect(image, model):
    """Run a MediaPipe model on one OpenCV frame.

    Args:
        image: Frame in OpenCV's native BGR channel order.
        model: Object exposing ``process(rgb_image)``; the notebook passes an
            open ``mp_hands.Hands`` instance.

    Returns:
        ``(bgr_image, results)`` where ``bgr_image`` is a BGR copy of the
        frame (safe to draw on) and ``results`` is whatever
        ``model.process`` returned.
    """
    # MediaPipe expects RGB input, while OpenCV delivers BGR frames.
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Mark the buffer read-only while the model runs (MediaPipe's recommended
    # performance hint), then restore write access afterwards.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Convert back to BGR so OpenCV drawing and imshow show correct colours.
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [4]:
def render_hand_landmarks(image, results):
    """Draw the first detected hand's landmarks and connections onto ``image``.

    Args:
        image: BGR frame to draw on.
        results: Detection result exposing ``multi_hand_landmarks``.

    Does nothing when no hand was detected, i.e. when
    ``results.multi_hand_landmarks`` is ``None`` or empty. A truthiness check
    is used (matching ``extract_keypoints``) instead of an exact ``list`` type
    comparison, so an empty result can never reach the ``[0]`` lookup.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return
    mp_drawing.draw_landmarks(image, detected_hands[0], mp_hands.HAND_CONNECTIONS)


In [5]:
def style_hand_landmarks(image, results):
    """Draw the first detected hand onto ``image`` with custom colours.

    Same as ``render_hand_landmarks`` but with explicit drawing specs for the
    landmark points and the connecting lines.

    Args:
        image: BGR frame to draw on.
        results: Detection result exposing ``multi_hand_landmarks``.

    Does nothing when no hand was detected, i.e. when
    ``results.multi_hand_landmarks`` is ``None`` or empty. A truthiness check
    is used (matching ``extract_keypoints``) instead of an exact ``list`` type
    comparison, so an empty result can never reach the ``[0]`` lookup.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return

    # Colours are given in BGR order because the frame was converted back
    # from RGB to BGR before drawing.
    landmark_spec = mp_drawing.DrawingSpec(color=(86,255,255), thickness=2, circle_radius=4)
    connection_spec = mp_drawing.DrawingSpec(color=(170,86,255), thickness=2, circle_radius=4)

    mp_drawing.draw_landmarks(image,
                              detected_hands[0],
                              mp_hands.HAND_CONNECTIONS,
                              landmark_spec,
                              connection_spec)

In [6]:
cap = cv2.VideoCapture(0)

# Live webcam preview: detect one hand per frame and draw its landmarks.
# try/finally guarantees the camera and the window are released even if a
# frame fails to process, so the device is not left locked by the kernel.
try:
    with mp_hands.Hands(max_num_hands = 1, min_detection_confidence=0.3, min_tracking_confidence=0.3) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered (camera unplugged or busy); passing
                # the resulting None frame on would crash in cv2.cvtColor.
                break

            # Make detections
            image, results = mp_hands_detect(frame, hands)

            # Draw landmarks
            style_hand_landmarks(image, results)

            # Display to screen
            cv2.imshow('FSL Alphabet Detector Cam Test', image)

            # Poll the keyboard for up to 10 ms per frame; quit on 'q'.
            if (cv2.waitKey(10) & 0xFF) == ord('q'):
                break
finally:
    cap.release()
    cv2.destroyAllWindows()

In [14]:
# Scratch inspection of the `results` object left over from the last processed frame.
# Assumes a hand was detected in that frame; otherwise the [0] lookup fails.
# NOTE: a notebook cell only displays its final expression, so the value of this
# len(...) call is computed and then discarded.
len(results.multi_hand_landmarks[0].landmark)
# Displayed output: the landmark list of the first detected hand.
results.multi_hand_landmarks[0]
# Early flattening sketch, kept for reference; it is missing the [0] index used by
# extract_keypoints below:
# np.array([[res.x, res.y, res.z] for res in results.multi_hand_landmarks.landmark]).flatten() # get all xyz values then combine them into one array using


landmark {
  x: 0.2706824541091919
  y: 0.7579841613769531
  z: 8.316048933920683e-07
}
landmark {
  x: 0.3643667995929718
  y: 0.7712401747703552
  z: -0.04906079173088074
}
landmark {
  x: 0.44633594155311584
  y: 0.7371939420700073
  z: -0.08079937100410461
}
landmark {
  x: 0.5084257125854492
  y: 0.7030449509620667
  z: -0.10914406925439835
}
landmark {
  x: 0.5715792775154114
  y: 0.6892408132553101
  z: -0.14070983231067657
}
landmark {
  x: 0.4326872229576111
  y: 0.5544135570526123
  z: -0.08119431883096695
}
landmark {
  x: 0.4992033541202545
  y: 0.4523717164993286
  z: -0.12834380567073822
}
landmark {
  x: 0.5438634157180786
  y: 0.39030057191848755
  z: -0.1607019305229187
}
landmark {
  x: 0.5823380351066589
  y: 0.3344220221042633
  z: -0.18320265412330627
}
landmark {
  x: 0.37588247656822205
  y: 0.5195726752281189
  z: -0.08583048731088638
}
landmark {
  x: 0.41871264576911926
  y: 0.4030507802963257
  z: -0.12717150151729584
}
landmark {
  x: 0.45174598693847656
  y

In [6]:
def extract_keypoints(results):
    """Flatten the first detected hand's landmarks into a 1-D feature vector.

    Args:
        results: Detection result exposing ``multi_hand_landmarks``.

    Returns:
        A 1-D NumPy array laid out as ``[x0, y0, z0, x1, y1, z1, ...]`` for the
        first hand, or a zero vector of length ``21*3`` when no hand was
        detected.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        # Nothing detected in this frame: emit a zero placeholder.
        return np.zeros(21*3)

    xyz_rows = [[point.x, point.y, point.z] for point in detected_hands[0].landmark]
    return np.array(xyz_rows).flatten()
#len(results.multi_hand_landmarks[0].landmark)

In [7]:

# Dataset locations, resolved relative to the notebook's working directory.
working_dir = os.path.join(os.path.abspath(''), 'Datasets')
training_dir, testing_dir = (
    os.path.join(working_dir, split_name) for split_name in ('train', 'testing')
)

In [8]:
# Root folder for the exported keypoint arrays.
DATA_PATH = os.path.join(os.path.abspath(''), 'MP_Data')

# Classes to detect: the 26 uppercase letters A..Z.
alphabets = np.array([*string.ascii_uppercase])

# Number of recorded sequences ("videos") per letter.
no_sequences = 30

# Number of frames in each sequence.
sequence_length = 30

# Resulting folder layout under DATA_PATH:
#   A/0, A/1, ..., A/29
#   B/0, B/1, ..., B/29
#   ...

In [9]:
# Create one folder per (letter, sequence) pair under DATA_PATH.
# exist_ok=True makes the cell safe to re-run without hiding real failures
# (permissions, invalid path, ...) the way a bare `except: pass` would.
for alphabet in alphabets:
    for sequence in range(no_sequences):
        os.makedirs(os.path.join(DATA_PATH, alphabet, str(sequence)), exist_ok=True)

In [10]:
cap = cv2.VideoCapture(0)

# This cell collects training data: for every letter it records `no_sequences`
# sequences of `sequence_length` frames and saves each frame's hand landmarks
# as a .npy file under DATA_PATH/<letter>/<sequence>/<frame_num>.npy.
#
# Controls: press Y at the start of each sequence to begin recording it;
# press Q at any time to stop the whole collection run.
stop_requested = False  # set when the user quits or the camera stops delivering frames

# try/finally guarantees the camera and window are released even on errors.
try:
    with mp_hands.Hands(max_num_hands = 1, min_detection_confidence=0.5) as hands:
        for alphabet in alphabets:
            for sequence in range(no_sequences):
                for frame_num in range(sequence_length):

                    ret, frame = cap.read()
                    if not ret:
                        # A failed read yields frame=None, which would crash in
                        # cv2.cvtColor; stop instead of saving garbage.
                        print('Camera returned no frame; stopping collection.')
                        stop_requested = True
                        break

                    # Make detections
                    image, results = mp_hands_detect(frame, hands)

                    # Draw landmarks
                    style_hand_landmarks(image, results)

                    # collection time
                    if frame_num == 0:
                        cv2.putText(image, 'STARTING COLLECTION, Press Y to start', (120,200),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 4, cv2.LINE_AA)
                        cv2.putText(image, f'Collecting frames for alphabet {alphabet} Video Number {sequence}', (15,12),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
                        cv2.imshow('OpenCV Feed', image)

                        # Block until the user actually presses Y (or Q to abort);
                        # any other key is ignored instead of starting the recording.
                        while True:
                            key = cv2.waitKey(0) & 0xFF
                            if key in (ord('y'), ord('Y'), ord('q')):
                                break
                        if key == ord('q'):
                            stop_requested = True
                            break
                        # NOTE(review): the keypoints saved for frame 0 come from the
                        # frame grabbed before the prompt was acknowledged - confirm
                        # whether that frame should be re-captured after Y is pressed.
                    else:
                        cv2.putText(image, f'Collecting frames for alphabet {alphabet} Video Number {sequence}', (15,12),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
                        cv2.imshow('OpenCV Feed', image)

                    # export keypoints
                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(DATA_PATH, alphabet, str(sequence), str(frame_num))
                    np.save(npy_path, keypoints)

                    # Poll the keyboard for up to 10 ms; Q stops the entire run,
                    # not just the current sequence.
                    if (cv2.waitKey(10) & 0xFF) == ord('q'):
                        stop_requested = True
                        break

                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    cap.release()
    cv2.destroyAllWindows()

In [17]:
# test read NPY file
# The path is built from separate components so it resolves on every OS; the
# previous hard-coded 'MP_Data\\A\\0' only worked with Windows separators.
test_dir = os.path.join(os.path.abspath(''), 'MP_Data', 'A', '0')
npy_file = os.path.join(test_dir, '4.npy')
data = np.load(npy_file)
data

array([ 3.16032350e-01,  8.24139833e-01, -6.43885016e-07,  4.04884994e-01,
        7.96038628e-01, -2.23944895e-02,  4.64068592e-01,  6.75332665e-01,
       -2.29087770e-02,  4.77242380e-01,  5.65811634e-01, -2.29458753e-02,
        5.00718355e-01,  5.00734150e-01, -1.48792621e-02,  4.29109752e-01,
        5.80771983e-01, -3.11114057e-03,  4.47404563e-01,  5.10573030e-01,
       -3.82338762e-02,  4.32087481e-01,  6.05695963e-01, -4.62270044e-02,
        4.17190135e-01,  6.45211637e-01, -4.43341881e-02,  3.78706694e-01,
        5.80517232e-01, -4.32268996e-03,  3.99524599e-01,  5.18978179e-01,
       -4.37962227e-02,  3.88011128e-01,  6.30020082e-01, -4.24944721e-02,
        3.74779403e-01,  6.56488597e-01, -3.03943809e-02,  3.27853173e-01,
        5.89104116e-01, -1.26247751e-02,  3.48750234e-01,  5.36244929e-01,
       -5.90156801e-02,  3.43426168e-01,  6.44371092e-01, -4.07166407e-02,
        3.31948489e-01,  6.68388546e-01, -1.52929919e-02,  2.72800058e-01,
        6.02422595e-01, -

In [None]:
# Data preprocessing, features, and labeling
from sklearn.model_selection import train_test_split