<a href="https://colab.research.google.com/github/alimomennasab/ASL-Translator/blob/main/CS4200_CNN_hands.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the WLASL input and output folders
# used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Data Preprocessing

In [2]:
!pip install mediapipe

Collecting mediapipe
  Downloading mediapipe-0.10.21-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting numpy<2 (from mediapipe)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.3-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of jax to determine which version is compatible with other requirements. This could take a while.
Collecting jax (from mediapipe)
  Downloading jax-0.8.1-py3-none-any.whl.metadata (13 kB)
Collecting jaxlib (from mediapipe)
  Downloading jaxlib-0.8.1-cp312-cp312-manylinux_2_27_x86_64.whl.metadata (1.3 kB)
Collecting jax (from mediapipe)
  Do

In [5]:
# Data preprocessing: extract left- and right-hand crops from every video in each split

import os
import cv2
import numpy as np
import mediapipe as mp
from tqdm import tqdm

# Output geometry shared by the functions in this cell.
CROP_SIZE = 112      # each hand crop is resized to CROP_SIZE x CROP_SIZE pixels
TARGET_FRAMES = 64   # each video is resampled or padded to this many frames

mp_hands = mp.solutions.hands

# A single detector instance, reused for every frame of every video.
_HANDS_OPTIONS = {
    "static_image_mode": False,
    "max_num_hands": 2,
    "min_detection_confidence": 0.5,
    "min_tracking_confidence": 0.5,
}
hands = mp_hands.Hands(**_HANDS_OPTIONS)

def extract_hand_boxes(frame, results, pad=20):
    """Compute padded pixel bounding boxes for the left and right hands.

    Args:
        frame: Image array; only ``frame.shape[:2]`` (height, width) is read.
        results: Detection result exposing ``multi_hand_landmarks`` and
            ``multi_handedness`` (the object returned by ``hands.process``).
        pad: Pixels added on every side of the tight landmark box before it
            is clamped to the frame. Defaults to 20.

    Returns:
        ``(left_box, right_box)``. Each entry is ``(x1, y1, x2, y2)`` in pixel
        coordinates with ``0 <= x1 < x2 <= w`` and ``0 <= y1 < y2 <= h``, or
        ``None`` when that hand was not detected or its box lies entirely
        outside the frame.
    """
    h, w = frame.shape[:2]
    left_box, right_box = None, None

    if results.multi_hand_landmarks and results.multi_handedness:
        for lm, handness in zip(results.multi_hand_landmarks, results.multi_handedness):
            label = handness.classification[0].label.lower()

            xs = [p.x for p in lm.landmark]
            ys = [p.y for p in lm.landmark]

            # Landmark coordinates are normalized but may fall outside [0, 1]
            # when a hand is partly off-screen, so every edge is clamped on
            # BOTH sides. Without this, x1 can exceed w or x2 can go negative,
            # and a negative slice bound silently crops the wrong region.
            x1 = min(w, max(0, int(w * min(xs)) - pad))
            y1 = min(h, max(0, int(h * min(ys)) - pad))
            x2 = max(0, min(w, int(w * max(xs)) + pad))
            y2 = max(0, min(h, int(h * max(ys)) + pad))

            # A box with no area cannot be cropped or resized; treat the hand
            # as not detected for this frame.
            if x2 <= x1 or y2 <= y1:
                continue

            box = (x1, y1, x2, y2)

            if label == "left":
                left_box = box
            else:
                right_box = box

    return left_box, right_box


def _crop_hand(frame, box):
    """Return the CROP_SIZE x CROP_SIZE crop for ``box``, or a black square if unusable."""
    if box is not None:
        x1, y1, x2, y2 = box
        region = frame[y1:y2, x1:x2]
        # cv2.resize raises on an empty source, so an empty region is treated
        # the same as a missing detection.
        if region.size > 0:
            return cv2.resize(region, (CROP_SIZE, CROP_SIZE))
    return np.zeros((CROP_SIZE, CROP_SIZE, 3), dtype=np.uint8)


def extract_crops_from_video(path):
    """Read a video and return per-frame left-hand and right-hand crops.

    The clip is normalized to exactly TARGET_FRAMES frames: longer clips are
    subsampled at evenly spaced indices, shorter clips are padded by repeating
    the last frame. Each frame is run through the module-level ``hands``
    detector; a hand that is not found (or whose box is empty) yields an
    all-zero crop.

    Args:
        path: Path of the video file to read.

    Returns:
        ``(left, right)``: two arrays stacked along a new first axis, one
        CROP_SIZE x CROP_SIZE crop per frame. Returns ``(None, None)`` when no
        frame could be read from ``path``.
    """
    frames = []
    cap = cv2.VideoCapture(path)
    try:
        while True:
            ret, f = cap.read()
            if not ret:
                break
            frames.append(f)
    finally:
        # Release the capture handle even if reading raises.
        cap.release()

    # 64 frames in each vid
    if len(frames) == 0:
        print(f"[WARNING] Empty or unreadable video: {path}")
        return None, None
    elif len(frames) > TARGET_FRAMES:
        idxs = np.linspace(0, len(frames)-1, TARGET_FRAMES).astype(int)
        frames = [frames[i] for i in idxs]
    elif len(frames) < TARGET_FRAMES:
        frames = frames + [frames[-1]]*(TARGET_FRAMES - len(frames))

    left_crops = []
    right_crops = []

    for f in frames:
        # The frame is converted BGR -> RGB before detection; crops are taken
        # from the original frame `f`.
        rgb = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        lbox, rbox = extract_hand_boxes(f, results)

        left_crops.append(_crop_hand(f, lbox))
        right_crops.append(_crop_hand(f, rbox))

    return np.stack(left_crops), np.stack(right_crops)


def process_hands(video_root, output_root):
    """Extract hand crops for every ``.mp4`` under ``video_root`` and save them as ``.npy``.

    ``video_root`` is expected to contain one sub-directory per label. For
    each video ``<label>/<name>.mp4`` two files are written:
    ``<output_root>/<label>/<name>_left.npy`` and ``..._right.npy``.
    Videos from which no frames could be read are skipped, so no placeholder
    files are written for them.

    Args:
        video_root: Directory holding the per-label video folders.
        output_root: Directory that receives the per-label ``.npy`` folders;
            label sub-directories are created as needed.
    """
    for label in tqdm(os.listdir(video_root)):
        ldir = os.path.join(video_root, label)
        if not os.path.isdir(ldir):
            continue
        out_label_dir = os.path.join(output_root, label)
        os.makedirs(out_label_dir, exist_ok=True)

        for f in os.listdir(ldir):
            if not f.endswith(".mp4"):
                continue

            video_path = os.path.join(ldir, f)
            # splitext removes only the trailing extension; str.replace would
            # also strip any ".mp4" occurring inside the file name.
            base = os.path.splitext(f)[0]

            left, right = extract_crops_from_video(video_path)

            # The extractor signals an unreadable video with (None, None).
            # Saving None would write a pickled object array that breaks
            # later loading, so skip the video instead.
            if left is None or right is None:
                continue

            np.save(os.path.join(out_label_dir, base+"_left.npy"), left)
            np.save(os.path.join(out_label_dir, base+"_right.npy"), right)

# Processing
# Input folders: one directory of labelled .mp4 clips per dataset split.
HAND_TRAIN_INPUT = "/content/drive/MyDrive/WLASL/WLASL100_train"
HAND_VAL_INPUT = "/content/drive/MyDrive/WLASL/WLASL100_val"
HAND_TEST_INPUT = "/content/drive/MyDrive/WLASL/WLASL100_test"

# Output folders: receive the *_left.npy / *_right.npy crop arrays.
HAND_TRAIN_OUTPUT = "/content/drive/MyDrive/WLASL/HAND_TRAIN"
HAND_VAL_OUTPUT = "/content/drive/MyDrive/WLASL/HAND_VAL"
HAND_TEST_OUTPUT = "/content/drive/MyDrive/WLASL/HAND_TEST"

# All three output roots are created, including the one for the disabled split.
for _out_root in (HAND_TRAIN_OUTPUT, HAND_VAL_OUTPUT, HAND_TEST_OUTPUT):
    os.makedirs(_out_root, exist_ok=True)

# NOTE(review): the train split is disabled below — presumably it was
# processed in an earlier run; confirm before relying on HAND_TRAIN_OUTPUT.
for _src_root, _dst_root in (
    #(HAND_TRAIN_INPUT, HAND_TRAIN_OUTPUT),
    (HAND_VAL_INPUT, HAND_VAL_OUTPUT),
    (HAND_TEST_INPUT, HAND_TEST_OUTPUT),
):
    process_hands(_src_root, _dst_root)

  3%|▎         | 3/100 [00:19<10:26,  6.46s/it]



100%|██████████| 100/100 [12:05<00:00,  7.25s/it]
100%|██████████| 86/86 [06:43<00:00,  4.70s/it]
