In [None]:
import cv2 as cv
import numpy as np
import os
import mediapipe as mp
from picamera2 import Picamera2
import time
import copy
import itertools

In [3]:
# Short aliases for the MediaPipe solution modules used throughout the notebook:
# the hand tracker, the drawing helpers, and the predefined drawing styles.
mp_hands, mp_drawing, mp_drawing_styles = (
    mp.solutions.hands,
    mp.solutions.drawing_utils,
    mp.solutions.drawing_styles,
)

In [None]:
# Root folder for the exported keypoint data (one .npy file per frame)
DATA_PATH = 'Gesture_Data'

# Gestures to record; each label doubles as a sub-folder name under DATA_PATH
actions = np.array([
    # words
    'hello', 'please', 'thanks', 'receipt', 'more',
    'price', 'order', 'wait', 'bag', 'water',
    # digits
    '0', '1', '2', '3', '4', '5', '(6W)', '7', '8', '(9F)',
    # letters
    'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z',
])

# Number of sequences (videos) to capture per action
no_sequences = 45

# Number of frames in each sequence
sequence_length = 15

Creating Directories

In [None]:
# Build the folder layout DATA_PATH/<action>/<sequence index> for sequence
# indices 0 .. no_sequences - 1, which are exactly the folders the collection
# loop saves into.
#
# The cell is idempotent: folders that already exist are left untouched, so
# re-running it never creates extra, empty sequence folders (which the dataset
# loading step would later fail on) and never trips over non-numeric entries
# such as ".ipynb_checkpoints". Genuine failures (e.g. permission errors) are
# raised instead of being printed and ignored.
for action in actions:
    action_path = os.path.join(DATA_PATH, action)
    # Creates DATA_PATH as well when it is missing.
    os.makedirs(action_path, exist_ok=True)

    for sequence in range(no_sequences):
        folder_path = os.path.join(action_path, str(sequence))
        if os.path.isdir(folder_path):
            continue  # already present from an earlier run
        os.makedirs(folder_path, exist_ok=True)
        print(f"Created folder: {folder_path}")

In [None]:
# Open the camera, apply a preview configuration for the main stream and start it.
picam2 = Picamera2()
main_stream_settings = {"format": "RGB888", "size": (480,640)}
preview_config = picam2.create_preview_configuration(main=main_stream_settings)
picam2.configure(preview_config)
picam2.start()

# Pause for two seconds before the first capture
# (presumably to let the sensor settle after start-up - TODO confirm).
time.sleep(2)

In [7]:
def draw_landmarks(frame, results):
    """Draw the landmarks of every detected hand onto ``frame`` in place.

    Args:
        frame: Image passed to ``mp_drawing.draw_landmarks`` as the drawing target.
        results: Detection result exposing ``multi_hand_landmarks``; nothing is
            drawn when that attribute is empty or ``None``.

    Returns:
        None.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return

    for one_hand in detected_hands:
        # connections=None: only the landmark points are drawn, no connecting lines.
        mp_drawing.draw_landmarks(
            frame,
            one_hand,
            connections=None,
            landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style(),
        )

Pre-Processing

In [8]:
def gather_landmark_list(image, landmarks):
    """Convert the landmarks of one hand into ``[x, y, z]`` triples.

    ``x`` and ``y`` are scaled by the image width and height, truncated to
    integers and capped at the last valid pixel index; ``z`` is passed through
    unchanged.

    Args:
        image: Object whose ``shape`` starts with ``(height, width)``.
        landmarks: Object whose ``landmark`` attribute is an iterable of points
            with ``x``, ``y`` and ``z`` attributes.

    Returns:
        A list with one ``[x_pixel, y_pixel, z]`` entry per landmark.
    """
    rows, cols = image.shape[0], image.shape[1]
    last_col, last_row = cols - 1, rows - 1

    return [
        [
            min(int(point.x * cols), last_col),
            min(int(point.y * rows), last_row),
            point.z,
        ]
        for point in landmarks.landmark
    ]

In [9]:
def pre_process_landmarks(landmark_list):
    """Make landmarks relative to the first point and scale them into [-1, 1].

    Every ``[x, y, z]`` point is translated so that the first point of the
    list becomes the origin, the points are flattened into one list, and all
    values are divided by the largest absolute value in that list.

    Args:
        landmark_list: List of ``[x, y, z]`` points. It is not modified.

    Returns:
        A flat list of floats ``[x0, y0, z0, x1, y1, z1, ...]``. An empty input
        yields an empty list. If every point coincides with the first one, all
        values are ``0.0`` (there is nothing to scale by).
    """
    if not landmark_list:
        # max() below would raise on an empty sequence.
        return []

    base_x, base_y, base_z = landmark_list[0][0], landmark_list[0][1], landmark_list[0][2]

    # Translate and flatten in one pass; building a new list keeps the
    # caller's data untouched without needing a deep copy.
    relative = []
    for point in landmark_list:
        relative.extend((point[0] - base_x, point[1] - base_y, point[2] - base_z))

    max_value = max(abs(value) for value in relative)
    if max_value == 0:
        # All points are identical to the base point: avoid dividing by zero.
        return [0.0] * len(relative)

    return [value / max_value for value in relative]

In [None]:
def extract_keypoints(frame, results):
    """Build one fixed-length feature vector for a frame: right hand, then left hand.

    Each detected hand is converted with ``gather_landmark_list`` and
    ``pre_process_landmarks`` and stored under its handedness label. A hand
    that was not detected is represented by zeros.

    Args:
        frame: Image the detection was run on; forwarded to
            ``gather_landmark_list``.
        results: Detection result exposing ``multi_hand_landmarks`` and
            ``multi_handedness``.

    Returns:
        A 1-D float NumPy array: the right-hand values followed by the
        left-hand values. The dtype is always float, including frames in which
        no hand is detected, so every saved frame has the same dtype.
    """
    # Width of the zero padding for a missing hand: 21 landmarks x 3 coordinates.
    values_per_hand = 21 * 3

    processed_hands = {"Right": None, "Left": None}

    if results.multi_hand_landmarks and results.multi_handedness:
        for hand_landmarks, handedness in zip(results.multi_hand_landmarks, results.multi_handedness):
            hand_label = handedness.classification[0].label
            landmark_list = gather_landmark_list(frame, hand_landmarks)
            # If two hands receive the same label, the later one overwrites the earlier.
            processed_hands[hand_label] = pre_process_landmarks(landmark_list)

    for side in ("Right", "Left"):
        if processed_hands[side] is None:
            processed_hands[side] = [0.0] * values_per_hand

    return np.array(processed_hands["Right"] + processed_hands["Left"], dtype=float)

Main Program

In [None]:
# Interactive data collection.
#
# A live preview is shown until the user presses "=" (start recording every
# action / sequence / frame) or ESC (quit). The camera and the OpenCV window
# are released in a `finally` block, so they are also freed when an exception
# or a kernel interrupt stops the cell.

KEY_START = ord('=')  # 61
KEY_ESC = 27
ZOOM_FACTOR = 1.5
WINDOW_NAME = 'OpenCV Feed'


def _capture_zoomed_frame(zoom_factor=ZOOM_FACTOR):
    """Grab one frame from ``picam2``, re-orient it and apply a centre zoom.

    The frame is flipped around both axes, channel-swapped, rotated 90 degrees
    counter-clockwise, centre-cropped by ``zoom_factor`` and resized back to
    its pre-crop size.
    """
    frame = picam2.capture_array()
    frame = cv.flip(frame, -1)
    # NOTE(review): presumably the camera buffer is in BGR order, so this swap
    # yields the RGB input the hand tracker expects - confirm. If so, cv.imshow
    # displays these frames with red and blue swapped.
    frame = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
    frame = cv.rotate(frame, cv.ROTATE_90_COUNTERCLOCKWISE)

    height, width, _ = frame.shape

    # Centre crop to 1 / zoom_factor of each dimension ...
    new_width = int(width / zoom_factor)
    new_height = int(height / zoom_factor)
    x_start = (width - new_width) // 2
    y_start = (height - new_height) // 2
    cropped_frame = frame[y_start:y_start + new_height, x_start:x_start + new_width]

    # ... then scale back up to the original frame size.
    return cv.resize(cropped_frame, (width, height), interpolation=cv.INTER_LINEAR)


def _collect_all_sequences(hands):
    """Record ``sequence_length`` frames for every sequence of every action.

    Keypoints of frame ``f`` of sequence ``s`` of action ``a`` are saved to
    ``DATA_PATH/a/s/f.npy``.

    Returns:
        True when everything was recorded, False when the user pressed ESC.
    """
    for action in actions:
        for sequence in range(no_sequences):
            sequence_dir = os.path.join(DATA_PATH, action, str(sequence))
            # Make sure the target folder exists even if the directory cell was skipped.
            os.makedirs(sequence_dir, exist_ok=True)

            for frame_num in range(sequence_length):
                zoomed_frame = _capture_zoomed_frame()
                results = hands.process(zoomed_frame)
                draw_landmarks(zoomed_frame, results)

                if frame_num == 0:
                    # First frame of a sequence: announce it and wait two
                    # seconds so the signer can get into position.
                    cv.putText(zoomed_frame, 'STARTING COLLECTION', (120, 200),
                               cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv.LINE_AA)
                    cv.putText(zoomed_frame, f'Collecting frames for {action} Video Number {sequence}', (15, 12),
                               cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 1, cv.LINE_AA)
                    cv.imshow(WINDOW_NAME, zoomed_frame)
                    cv.waitKey(2000)
                else:
                    cv.putText(zoomed_frame, f'Collecting frames for {action} Video Number {sequence}', (15, 12),
                               cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv.LINE_AA)
                    cv.imshow(WINDOW_NAME, zoomed_frame)

                # Export the keypoints of this frame (np.save appends ".npy").
                keypoints = extract_keypoints(zoomed_frame, results)
                np.save(os.path.join(sequence_dir, str(frame_num)), keypoints)

                # ESC aborts the whole collection.
                if cv.waitKey(10) & 0xFF == KEY_ESC:
                    return False
    return True


try:
    with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
        while True:
            # Live preview with the detected landmarks and the instructions.
            zoomed_frame = _capture_zoomed_frame()
            results = hands.process(zoomed_frame)
            draw_landmarks(zoomed_frame, results)
            cv.putText(zoomed_frame, 'Press "=" to start collecting data', (15, 30),
                       cv.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2, cv.LINE_AA)
            cv.imshow(WINDOW_NAME, zoomed_frame)

            key = cv.waitKey(10) & 0xFF

            if key == KEY_ESC:
                break

            if key == KEY_START:
                print("Started collecting data")
                if not _collect_all_sequences(hands):
                    break  # ESC was pressed during collection
finally:
    # Always release the window and the camera, even after an error.
    cv.destroyAllWindows()
    picam2.stop()
    picam2.close()

Building the dataset and label files

In [22]:
# Labels to load; each one names a sub-folder of DATA_PATH and must match the
# labels used during data collection.
signs = np.array([
    # words
    'hello', 'please', 'thanks', 'receipt', 'more',
    'price', 'order', 'wait', 'bag', 'water',
    # digits
    '0', '1', '2', '3', '4', '5', '(6W)', '7', '8', '(9F)',
    # letters
    'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z',
])

In [23]:
# Map every sign label to its position in `signs` (used as the integer class id).
label_map = dict(zip(signs, range(len(signs))))

In [None]:
# Show the label -> class id mapping as the cell output for a quick visual check.
label_map

In [25]:
# Load every recorded sequence into `sequences` (a list of per-frame keypoint
# arrays) and its class id into `labels`; the two lists stay index-aligned.
sequences, labels = [], []
for sign in signs:
    sign_dir = os.path.join(DATA_PATH, sign)

    # Only numerically named sub-folders are sequences; this skips stray
    # entries such as ".ipynb_checkpoints". Sorting numerically makes the
    # sample order reproducible (os.listdir order is arbitrary).
    sequence_names = sorted(
        (
            entry for entry in os.listdir(sign_dir)
            if entry.isdecimal() and os.path.isdir(os.path.join(sign_dir, entry))
        ),
        key=int,
    )

    for sequence in sequence_names:
        frame_paths = [
            os.path.join(sign_dir, sequence, "{}.npy".format(frame_num))
            for frame_num in range(sequence_length)
        ]

        # Empty or partially recorded sequences (e.g. after aborting the
        # collection with ESC) cannot form a full window: report and skip them.
        if not all(os.path.isfile(path) for path in frame_paths):
            print(f"Skipping incomplete sequence: {os.path.join(sign_dir, sequence)}")
            continue

        window = [np.load(path) for path in frame_paths]
        sequences.append(window)
        labels.append(label_map[sign])

In [26]:
# Persist the collected windows and their class ids as NumPy arrays.
for output_file, collected in (('sequences.npy', sequences), ('labels.npy', labels)):
    np.save(output_file, np.array(collected))