In [1]:
pip install opencv-python

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install mediapipe

Collecting mediapipe
  Using cached mediapipe-0.10.21-cp312-cp312-win_amd64.whl.metadata (10 kB)
Collecting absl-py (from mediapipe)
  Using cached absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting jax (from mediapipe)
  Using cached jax-0.6.0-py3-none-any.whl.metadata (22 kB)
Collecting jaxlib (from mediapipe)
  Using cached jaxlib-0.6.0-cp312-cp312-win_amd64.whl.metadata (1.2 kB)
Collecting opencv-contrib-python (from mediapipe)
  Using cached opencv_contrib_python-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Using cached sounddevice-0.5.1-py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting ml_dtypes>=0.5.0 (from jax->mediapipe)
  Using cached ml_dtypes-0.5.1-cp312-cp312-win_amd64.whl.metadata (22 kB)
Using cached mediapipe-0.10.21-cp312-cp312-win_amd64.whl (51.0 MB)
Using cached sounddevice-0.5.1-py3-none-win_amd64.whl (363 kB)
Using cached absl_py-2.2.2-py3-none-any.whl (135 kB)
Using cached jax-0.6.0-py3-none-any.whl

In [1]:
import cv2
import mediapipe as mp
import numpy as np
import os

INITIALISING MEDIAPIPE HANDS

In [2]:
# Short aliases for the MediaPipe sub-modules used in this notebook:
# drawing helpers for the preview overlay and the hands solution itself.
mp_draw = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# One shared detector instance, configured for at most one hand per frame
# and a minimum detection confidence of 0.7.
hands = mp_hands.Hands(
    min_detection_confidence=0.7,
    max_num_hands=1,
)

In [14]:
def extract_landmarks(video_path, show=True):
    """Extract flattened hand-landmark coordinates from every frame of a video.

    Each frame is converted from BGR to RGB and passed to the module-level
    MediaPipe ``hands`` detector. For every detected hand, the ``x`` and ``y``
    of each landmark are flattened into one row ``[x0, y0, x1, y1, ...]``.
    Frames without a detection contribute no row.

    Parameters
    ----------
    video_path : str
        Path handed to ``cv2.VideoCapture``.
    show : bool, default True
        When True (the original behaviour) the landmarks are drawn on each
        frame, the frame is shown in a window named 'Frame', and pressing
        'q' stops processing early. Pass False to run without any GUI
        window, e.g. on a headless machine or for batch extraction.

    Returns
    -------
    numpy.ndarray
        One row per detected hand, two values (x, y) per landmark. The array
        is empty when the video cannot be opened or no hand is detected.
    """
    cap = cv2.VideoCapture(video_path)
    landmarks_list = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream or read failure.
                break

            # The frame is converted BGR -> RGB before detection.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(frame_rgb)

            # multi_hand_landmarks is falsy when nothing was detected.
            for hand_landmarks in results.multi_hand_landmarks or ():
                row = []
                for lm in hand_landmarks.landmark:
                    row.extend((lm.x, lm.y))
                landmarks_list.append(row)

                if show:
                    # Optional visualisation: overlay the hand skeleton.
                    mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            if show:
                cv2.imshow('Frame', frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        # Always free the capture (and any preview window), even when a frame
        # raises part-way through the video.
        cap.release()
        if show:
            cv2.destroyAllWindows()

    return np.array(landmarks_list)

PROCESSING ALL VIDEOS

In [26]:
data_dir = 'data'
labels = ['hello', 'thankyou']
all_data = []
all_labels = []

# Expected layout: <data_dir>/<label>/<video file>. Each video that yields at
# least one landmark row adds one sequence to all_data and its label to
# all_labels, keeping both lists aligned by index.
for label in labels:
    label_dir = os.path.join(data_dir, label)
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order reproducible from run to run.
    for video_file in sorted(os.listdir(label_dir)):
        video_path = os.path.join(label_dir, video_file)
        if not os.path.isfile(video_path):
            # Sub-directories are not videos; skip them.
            continue
        landmarks = extract_landmarks(video_path)
        if len(landmarks) > 0:
            all_data.append(landmarks)
            all_labels.append(label)
        else:
            # Report dropped files rather than discarding them silently.
            print(f"Skipped {video_path}: no hand landmarks extracted")

STANDARDISING THE SEQUENCE LENGTH FOR THE LSTM MODEL, AS IT REQUIRES FIXED-LENGTH INPUT

In [27]:
# Number of frames every sequence is truncated or zero-padded to.
max_frames = 30


def standardised_sequence(sequence, max_frames):
    """Force a sequence to exactly ``max_frames`` entries along its first axis.

    Parameters
    ----------
    sequence : array-like
        Per-frame data with frames on the first axis, e.g. shape
        ``(n_frames, n_features)``. Lists are accepted and converted with
        ``np.asarray``; any number of trailing dimensions is supported.
    max_frames : int
        Target length of the first axis.

    Returns
    -------
    numpy.ndarray
        The first ``max_frames`` frames when the input is longer, the input
        followed by all-zero frames when it is shorter (float64 result, as
        produced by ``np.zeros``), or the input itself when the length
        already matches.
    """
    sequence = np.asarray(sequence)
    n_frames = len(sequence)

    if n_frames > max_frames:
        # Too long: keep only the leading frames.
        return sequence[:max_frames]

    if n_frames < max_frames:
        # Too short: append zero frames. shape[1:] keeps whatever trailing
        # dimensions the input has instead of assuming exactly two axes.
        padded = np.zeros((max_frames,) + sequence.shape[1:])
        padded[:n_frames] = sequence
        return padded

    return sequence

In [28]:
# Bring every video's landmark sequence to exactly `max_frames` frames.
all_data_standardised = list(
    map(lambda seq: standardised_sequence(seq, max_frames), all_data)
)

CONVERTING LABELS INTO NUMBERS (HELLO: 0, THANKYOU: 1)

In [30]:
# Integer class id assigned to each label name.
label_map = {
    'hello': 0,
    'thankyou': 1,
}

# Encode the labels in the same order as `all_labels`.
all_labels_numeric = list(map(label_map.__getitem__, all_labels))

CONVERTING TO NUMPY ARRAYS AND SAVING THEM TO DISK

In [32]:
# Features: the standardised sequences collected into one array.
X = np.asarray(all_data_standardised)
# Targets: the numeric label for each sequence, in the same order.
y = np.asarray(all_labels_numeric)

# Write both arrays to .npy files in the current working directory.
np.save(file='X_data.npy', arr=X)
np.save(file='y_data.npy', arr=y)

In [33]:
# Report how many videos contributed a landmark sequence; all_data holds one
# entry per video for which at least one landmark row was extracted.
print(f"Extracted data: {len(all_data)} videos")

Extracted data: 10 videos
