In [15]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import mediapipe as mp
import mediapipe.python.solutions

import time

In [2]:
# Short aliases for the MediaPipe solution modules used by the cells below.
mp_drawing_styles = mp.solutions.drawing_styles  # not referenced in the cells shown here
mp_drawing = mp.solutions.drawing_utils          # provides draw_landmarks and DrawingSpec
mp_hands = mp.solutions.hands                    # provides Hands and HAND_CONNECTIONS

In [30]:
def mp_hands_detect(image, model):
    """Run a MediaPipe model on a single OpenCV frame.

    Args:
        image: BGR frame as delivered by OpenCV (e.g. from ``cap.read()``).
        model: object exposing ``process(rgb_image)``; the notebook passes an
            ``mp_hands.Hands`` instance.

    Returns:
        A ``(image, results)`` tuple: the frame after a BGR -> RGB -> BGR
        round trip (a new array, safe to draw on), and whatever
        ``model.process`` returned for it.
    """
    # MediaPipe works on RGB images while OpenCV hands us BGR.
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Lock the array while the model reads it, then unlock it again.
    # NOTE(review): MediaPipe's examples do this as a performance hint
    # (pass by reference) -- confirm against the library docs.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Convert back to BGR so OpenCV drawing/imshow use the expected channel order.
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

In [4]:
def render_hand_landmarks(image, results):
    """Draw the first detected hand's landmarks and connections onto ``image``.

    Args:
        image: BGR image to draw on; modified in place by ``draw_landmarks``.
        results: detection result exposing ``multi_hand_landmarks``.

    Returns:
        None. Nothing is drawn when no hand was detected.
    """
    detected_hands = results.multi_hand_landmarks
    # Truthiness covers both "no detection" (None) and an empty sequence, so
    # indexing [0] below can never fail; it also matches the check used by
    # extract_keypoints.
    if not detected_hands:
        return
    mp_drawing.draw_landmarks(image, detected_hands[0], mp_hands.HAND_CONNECTIONS)


In [5]:
def style_hand_landmarks(image, results):
    """Draw the first detected hand onto ``image`` using custom colours.

    Same as ``render_hand_landmarks`` but with explicit drawing specs.

    Args:
        image: BGR image to draw on; modified in place by ``draw_landmarks``.
        results: detection result exposing ``multi_hand_landmarks``.

    Returns:
        None. Nothing is drawn when no hand was detected.
    """
    detected_hands = results.multi_hand_landmarks
    # Truthiness covers both "no detection" (None) and an empty sequence, so
    # indexing [0] below can never fail.
    if not detected_hands:
        return

    # Colours are given in BGR order because the image was converted back
    # from RGB to BGR before drawing.
    # NOTE(review): per draw_landmarks' signature the first spec should style
    # the landmark points and the second the connections -- confirm.
    first_spec = mp_drawing.DrawingSpec(color=(86,255,255), thickness=2, circle_radius=4)
    second_spec = mp_drawing.DrawingSpec(color=(170,86,255), thickness=2, circle_radius=4)
    mp_drawing.draw_landmarks(image,
                              detected_hands[0],
                              mp_hands.HAND_CONNECTIONS,
                              first_spec,
                              second_spec)

In [7]:
# Live webcam test: run MediaPipe Hands on every frame and show the annotated
# image until 'q' is pressed or the camera stops delivering frames.
cap = cv2.VideoCapture(0)

try:
    # Begin use mediapipe
    with mp_hands.Hands(max_num_hands = 1, min_detection_confidence=0.3, min_tracking_confidence=0.3) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was grabbed (camera busy/unplugged); `frame` is not
                # a usable image, so stop instead of crashing in cvtColor.
                print("Failed to read a frame from the camera; stopping.")
                break

            # Make detections
            image, results = mp_hands_detect(frame, hands)

            # Draw landmarks
            style_hand_landmarks(image, results)

            # Display to screen
            cv2.imshow('FSL Alphabet Detector Cam Test', image)

            # waitKey(10) waits up to 10 ms for a key press. Mask to the low
            # byte so the comparison also works on platforms that set extra
            # high bits in the returned key code.
            if (cv2.waitKey(10) & 0xFF) == ord('q'):
                break
finally:
    # Always free the camera and close the preview window, even if a frame
    # fails to process, so the device is not left locked by the kernel.
    cap.release()
    cv2.destroyAllWindows()

In [14]:
# Inspect the detection left in `results` by the last frame of the webcam loop.
# NOTE(review): both lines index multi_hand_landmarks[0], so they presumably fail
# if no hand was visible in that last frame -- confirm before relying on this cell.
# Only the final expression of a cell is displayed, so this length is evaluated
# but not shown.
len(results.multi_hand_landmarks[0].landmark)
results.multi_hand_landmarks[0]
# Idea for later: collect [res.x, res.y, res.z] for every landmark of hand 0 and
# flatten them into a single 1-D array.


landmark {
  x: 0.2706824541091919
  y: 0.7579841613769531
  z: 8.316048933920683e-07
}
landmark {
  x: 0.3643667995929718
  y: 0.7712401747703552
  z: -0.04906079173088074
}
landmark {
  x: 0.44633594155311584
  y: 0.7371939420700073
  z: -0.08079937100410461
}
landmark {
  x: 0.5084257125854492
  y: 0.7030449509620667
  z: -0.10914406925439835
}
landmark {
  x: 0.5715792775154114
  y: 0.6892408132553101
  z: -0.14070983231067657
}
landmark {
  x: 0.4326872229576111
  y: 0.5544135570526123
  z: -0.08119431883096695
}
landmark {
  x: 0.4992033541202545
  y: 0.4523717164993286
  z: -0.12834380567073822
}
landmark {
  x: 0.5438634157180786
  y: 0.39030057191848755
  z: -0.1607019305229187
}
landmark {
  x: 0.5823380351066589
  y: 0.3344220221042633
  z: -0.18320265412330627
}
landmark {
  x: 0.37588247656822205
  y: 0.5195726752281189
  z: -0.08583048731088638
}
landmark {
  x: 0.41871264576911926
  y: 0.4030507802963257
  z: -0.12717150151729584
}
landmark {
  x: 0.45174598693847656
  y

In [106]:
def extract_keypoints(results):
    """Flatten the first detected hand's landmarks into one 1-D array.

    Args:
        results: detection result exposing ``multi_hand_landmarks``; each
            entry must expose ``landmark``, an iterable of points with
            ``x``, ``y`` and ``z`` attributes.

    Returns:
        1-D ``np.ndarray`` laid out as ``[x0, y0, z0, x1, y1, z1, ...]`` for
        the first hand, or 63 zeros (21 landmarks * 3 coordinates) when
        nothing was detected.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        # No detection in this frame: return an all-zero placeholder vector.
        return np.zeros(21*3)

    coordinates = [[point.x, point.y, point.z] for point in detected_hands[0].landmark]
    return np.array(coordinates).flatten()
# Sanity check: number of landmarks in the first hand of the `results` left over
# from the webcam loop (the recorded output is 21, matching the 21*3 used above).
len(results.multi_hand_landmarks[0].landmark)

21

In [104]:
# Dataset locations used when extracting landmark values for training.
# Everything lives under <current working directory>/Datasets.
working_dir = os.path.join(os.path.abspath(''), 'Datasets')
training_dir, testing_dir = (
    os.path.join(working_dir, split_name) for split_name in ('train', 'testing')
)

In [167]:
# Build the training set: for every class folder under `training_dir`, run
# MediaPipe Hands on its images and keep the flattened landmark vectors.

# Hard cap on samples kept per class. Many dataset images are too blurry for
# MediaPipe to find a hand, so classes end up with different usable counts;
# capping aims to keep every class the same length so the nested list can
# become a regular NumPy array.
MAX_SAMPLES_PER_CLASS = 310

landmarks = []  # landmarks[i] is the list of 63-value vectors belonging to labels[i]
labels = []

# `with` releases the model's resources when extraction finishes (or fails);
# `static_hands` is therefore closed after this cell.
with mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3) as static_hands:
    # os.listdir returns entries in arbitrary order; sort so label indices and
    # the selected samples are the same on every run. Skip stray files so only
    # class folders are treated as labels.
    class_names = sorted(
        name for name in os.listdir(training_dir)
        if os.path.isdir(os.path.join(training_dir, name))
    )
    for idx, i_class in enumerate(class_names):
        classpath = os.path.join(training_dir, i_class)
        labels.append(i_class)
        print(f'index: {idx}')
        class_landmarks = []
        landmarks.append(class_landmarks)
        for image in sorted(os.listdir(classpath)):
            if len(class_landmarks) == MAX_SAMPLES_PER_CLASS:
                break
            imagepath = os.path.join(classpath, image)
            frame = cv2.imread(imagepath)
            if frame is None:
                # cv2.imread returns None for unreadable or non-image files.
                print(f"Could not read image {os.path.basename(imagepath)}")
                continue
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            t_results = static_hands.process(frame_rgb)

            if t_results.multi_hand_landmarks:
                # Same flattened layout the live pipeline gets from extract_keypoints.
                class_landmarks.append(extract_keypoints(t_results))
            else:
                print(f"No hand landmarks found for {os.path.basename(imagepath)}")
        print(f"Class {os.path.basename(classpath)} finished.")


labels

index: 0
No hand landmarks found for 100.jpg
No hand landmarks found for 105_jpg.rf.e22fb36a16d379b50625db3cf56e56b0.jpg
No hand landmarks found for 108.jpg
No hand landmarks found for 112.jpg
No hand landmarks found for 115.jpg
No hand landmarks found for 116.jpg
No hand landmarks found for 126.jpg
No hand landmarks found for 127.jpg
No hand landmarks found for 128.jpg
No hand landmarks found for 149.jpg
No hand landmarks found for 150.jpg
No hand landmarks found for 151.jpg
No hand landmarks found for 152.jpg
No hand landmarks found for 153.jpg
No hand landmarks found for 155.jpg
No hand landmarks found for 160.jpg
No hand landmarks found for 31.jpg
No hand landmarks found for 36.jpg
No hand landmarks found for 37.jpg
No hand landmarks found for 39.jpg
No hand landmarks found for 63.jpg
No hand landmarks found for 65.jpg
No hand landmarks found for 82.jpg
No hand landmarks found for 96.jpg
No hand landmarks found for 97.jpg
No hand landmarks found for A_133.jpg
No hand landmarks foun

['A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z']

In [168]:
# np.array(landmarks) raises an "inhomogeneous shape" ValueError when the classes
# do not all hold the same number of samples, so list the classes whose samples
# do not form a (310, 63) array.
indexes = []
for idx, class_samples in enumerate(landmarks):
    sample_shape = np.array(class_samples).shape
    if not (sample_shape[0] == 310 and sample_shape[1] == 63):
        indexes.append(idx)
inhomo = len(indexes)  # number of offending classes
print(indexes)
inhomo

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (26,) + inhomogeneous part.