<a href="https://colab.research.google.com/github/alimomennasab/ASL-Translator/blob/main/CS4200_CNN_hands.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Data Preprocessing

In [2]:
!pip install mediapipe

Collecting mediapipe
  Downloading mediapipe-0.10.21-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting numpy<2 (from mediapipe)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.3-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of jax to determine which version is compatible with other requirements. This could take a while.
Collecting jax (from mediapipe)
  Downloading jax-0.8.1-py3-none-any.whl.metadata (13 kB)
Collecting jaxlib (from mediapipe)
  Downloading jaxlib-0.8.1-cp312-cp312-manylinux_2_27_x86_64.whl.metadata (1.3 kB)
Collecting jax (from mediapipe)
  Do

In [5]:
# Data preprocessing: extract right and left hands from both videos

import os
import cv2
import numpy as np
import mediapipe as mp
from tqdm import tqdm

CROP_SIZE = 112
TARGET_FRAMES = 64

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

def extract_hand_boxes(frame, results):
    h, w = frame.shape[:2]
    left_box, right_box = None, None

    if results.multi_hand_landmarks and results.multi_handedness:
        for lm, handness in zip(results.multi_hand_landmarks, results.multi_handedness):
            label = handness.classification[0].label.lower()

            xs = [p.x for p in lm.landmark]
            ys = [p.y for p in lm.landmark]

            x1 = int(w * min(xs))
            x2 = int(w * max(xs))
            y1 = int(h * min(ys))
            y2 = int(h * max(ys))

            # bounding box padding
            pad = 20
            x1 = max(0, x1 - pad)
            y1 = max(0, y1 - pad)
            x2 = min(w, x2 + pad)
            y2 = min(h, y2 + pad)

            box = (x1, y1, x2, y2)

            if label == "left":
                left_box = box
            else:
                right_box = box

    return left_box, right_box


def extract_crops_from_video(path):
    frames = []
    cap = cv2.VideoCapture(path)

    while True:
        ret, f = cap.read()
        if not ret:
            break
        frames.append(f)
    cap.release()

    # 64 frames in each vid
    if len(frames) == 0:
        print(f"[WARNING] Empty or unreadable video: {path}")
        return None, None
    elif len(frames) > TARGET_FRAMES:
        idxs = np.linspace(0, len(frames)-1, TARGET_FRAMES).astype(int)
        frames = [frames[i] for i in idxs]
    elif len(frames) < TARGET_FRAMES:
        frames = frames + [frames[-1]]*(TARGET_FRAMES - len(frames))

    left_crops = []
    right_crops = []

    for f in frames:
        rgb = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        lbox, rbox = extract_hand_boxes(f, results)

        # left
        if lbox is None:
            left_c = np.zeros((CROP_SIZE, CROP_SIZE, 3), dtype=np.uint8)
        else:
            x1,y1,x2,y2 = lbox
            left = f[y1:y2, x1:x2]
            left_c = cv2.resize(left, (CROP_SIZE, CROP_SIZE))

        # right
        if rbox is None:
            right_c = np.zeros((CROP_SIZE, CROP_SIZE, 3), dtype=np.uint8)
        else:
            x1,y1,x2,y2 = rbox
            right = f[y1:y2, x1:x2]
            right_c = cv2.resize(right, (CROP_SIZE, CROP_SIZE))

        left_crops.append(left_c)
        right_crops.append(right_c)

    return np.stack(left_crops), np.stack(right_crops)


def process_hands(video_root, output_root):
    for label in tqdm(os.listdir(video_root)):
        ldir = os.path.join(video_root, label)
        if not os.path.isdir(ldir):
            continue

        out_label_dir = os.path.join(output_root, label)
        os.makedirs(out_label_dir, exist_ok=True)

        for f in os.listdir(ldir):
            if not f.endswith(".mp4"):
                continue

            video_path = os.path.join(ldir, f)
            base = f.replace(".mp4","")

            left, right = extract_crops_from_video(video_path)

            if left is None:
                print(f"Video unreadable: {video_path}")
                continue

            np.save(os.path.join(out_label_dir, base+"_left.npy"), left)
            np.save(os.path.join(out_label_dir, base+"_right.npy"), right)

# Processing
HAND_TRAIN_INPUT = "/content/drive/MyDrive/WLASL/WLASL100_train"
HAND_VAL_INPUT = "/content/drive/MyDrive/WLASL/WLASL100_val"
HAND_TEST_INPUT = "/content/drive/MyDrive/WLASL/WLASL100_test"

HAND_TRAIN_OUTPUT = "/content/drive/MyDrive/WLASL/HAND_TRAIN"
HAND_VAL_OUTPUT = "/content/drive/MyDrive/WLASL/HAND_VAL"
HAND_TEST_OUTPUT = "/content/drive/MyDrive/WLASL/HAND_TEST"
os.makedirs(HAND_TRAIN_OUTPUT, exist_ok=True)
os.makedirs(HAND_VAL_OUTPUT, exist_ok=True)
os.makedirs(HAND_TEST_OUTPUT, exist_ok=True)

#process_hands(HAND_TRAIN_INPUT, HAND_TRAIN_OUTPUT)
process_hands(HAND_VAL_INPUT, HAND_VAL_OUTPUT)
process_hands(HAND_TEST_INPUT, HAND_TEST_OUTPUT)

  3%|▎         | 3/100 [00:19<10:26,  6.46s/it]



100%|██████████| 100/100 [12:05<00:00,  7.25s/it]
100%|██████████| 86/86 [06:43<00:00,  4.70s/it]


In [None]:
Data Loading

In [6]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image

CROP_SIZE = 112
SEQ_LEN = 64

img_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std =[0.229, 0.224, 0.225]),
])

class HandCropDataset(Dataset):
    def __init__(self, root_dir, class_map=None, train=True):
        self.root_dir = root_dir
        self.samples = []
        self.train = train

        # build class map
        if class_map is None:
            labels = sorted([
                d for d in os.listdir(root_dir)
                if os.path.isdir(os.path.join(root_dir, d))
            ])
            self.class_to_idx = {lbl:i for i,lbl in enumerate(labels)}
        else:
            self.class_to_idx = class_map["class_to_idx"]

        self.idx_to_class = list(self.class_to_idx.keys())

        # append samples
        for lbl in self.class_to_idx:
            lbl_dir = os.path.join(root_dir, lbl)
            idx = self.class_to_idx[lbl]

            for f in os.listdir(lbl_dir):
                if f.endswith("_left.npy"):
                    base = f.replace("_left.npy", "")
                    left_path = os.path.join(lbl_dir, f)
                    right_path = os.path.join(lbl_dir, base + "_right.npy")

                    if os.path.exists(right_path):
                        self.samples.append((left_path, right_path, idx))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        left_path, right_path, label = self.samples[idx]

        left = np.load(left_path) # (num_frames, H, W, 3)
        right = np.load(right_path)

        left_ts  = torch.stack([img_tf(Image.fromarray(f)) for f in left])
        right_ts = torch.stack([img_tf(Image.fromarray(f)) for f in right])

        return left_ts, right_ts, torch.tensor(label, dtype=torch.long)


TRAIN_DIR = "/content/drive/MyDrive/WLASL/HAND_TRAIN"
VAL_DIR = "/content/drive/MyDrive/WLASL/HAND_VAL"

train_ds = HandCropDataset(TRAIN_DIR)
val_ds = HandCropDataset(VAL_DIR, class_map={'class_to_idx': train_ds.class_to_idx})

train_loader = torch.utils.data.DataLoader(train_ds, batch_size=8, shuffle=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(val_ds, batch_size=8, shuffle=False, num_workers=2)
print(f"Train samples: {len(train_ds)}")
print(f"Val samples: {len(val_ds)}")

num_classes = len(train_ds.idx_to_class)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/WLASL/HAND_VAL/orange'

In [None]:
# Data preprocessing: extract right and left hands from both videos

import os
import cv2
import numpy as np
import mediapipe as mp
from tqdm import tqdm

CROP_SIZE = 112
TARGET_FRAMES = 64

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

def extract_hand_boxes(frame, results):
    h, w = frame.shape[:2]
    left_box, right_box = None, None

    if results.multi_hand_landmarks and results.multi_handedness:
        for lm, handness in zip(results.multi_hand_landmarks, results.multi_handedness):
            label = handness.classification[0].label.lower()

            xs = [p.x for p in lm.landmark]
            ys = [p.y for p in lm.landmark]

            x1 = int(w * min(xs))
            x2 = int(w * max(xs))
            y1 = int(h * min(ys))
            y2 = int(h * max(ys))

            # bounding box padding
            pad = 20
            x1 = max(0, x1 - pad)
            y1 = max(0, y1 - pad)
            x2 = min(w, x2 + pad)
            y2 = min(h, y2 + pad)

            box = (x1, y1, x2, y2)

            if label == "left":
                left_box = box
            else:
                right_box = box

    return left_box, right_box


def extract_crops_from_video(path):
    frames = []
    cap = cv2.VideoCapture(path)

    while True:
        ret, f = cap.read()
        if not ret:
            break
        frames.append(f)
    cap.release()

    # case: no frames were read from the video
    if not frames:
        return (np.zeros((TARGET_FRAMES, CROP_SIZE, CROP_SIZE, 3), dtype=np.uint8),
                np.zeros((TARGET_FRAMES, CROP_SIZE, CROP_SIZE, 3), dtype=np.uint8))

    # 64 frames in each vid
    if len(frames) > TARGET_FRAMES:
        idxs = np.linspace(0, len(frames)-1, TARGET_FRAMES).astype(int)
        frames = [frames[i] for i in idxs]
    elif len(frames) < TARGET_FRAMES:
        frames = frames + [frames[-1]]*(TARGET_FRAMES - len(frames))

    left_crops = []
    right_crops = []

    for f in frames:
        rgb = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        lbox, rbox = extract_hand_boxes(f, results)

        # left
        if lbox is None:
            left_c = np.zeros((CROP_SIZE, CROP_SIZE, 3), dtype=np.uint8)
        else:
            x1,y1,x2,y2 = lbox
            left = f[y1:y2, x1:x2]
            left_c = cv2.resize(left, (CROP_SIZE, CROP_SIZE))

        # right
        if rbox is None:
            right_c = np.zeros((CROP_SIZE, CROP_SIZE, 3), dtype=np.uint8)
        else:
            x1,y1,x2,y2 = rbox
            right = f[y1:y2, x1:x2]
            right_c = cv2.resize(right, (CROP_SIZE, CROP_SIZE))

        left_crops.append(left_c)
        right_crops.append(right_c)

    return np.stack(left_crops), np.stack(right_crops)


def process_hands(video_root, output_root):
    for label in tqdm(os.listdir(video_root)):
        ldir = os.path.join(video_root, label)
        if not os.path.isdir(ldir):
            continue

        out_label_dir = os.path.join(output_root, label)
        os.makedirs(out_label_dir, exist_ok=True)

        for f in os.listdir(ldir):
            if not f.endswith(".mp4"):
                continue

            video_path = os.path.join(ldir, f)
            base = f.replace(".mp4","")

            left, right = extract_crops_from_video(video_path)

            # Only save if crops are not empty (i.e., video was readable)
            if not np.all(left == 0) or not np.all(right == 0):
                np.save(os.path.join(out_label_dir, base+"_left.npy"), left)
                np.save(os.path.join(out_label_dir, base+"_right.npy"), right)
            else:
                print(f"[INFO] Skipping {video_path} due to empty or unreadable content.")

# Processing
HAND_TRAIN_INPUT = "/content/drive/MyDrive/WLASL/WLASL100_train"
HAND_VAL_INPUT = "/content/drive/MyDrive/WLASL/WLASL100_val"
HAND_TEST_INPUT = "/content/drive/MyDrive/WLASL/WLASL100_test"

HAND_TRAIN_OUTPUT = "/content/drive/MyDrive/WLASL/HAND_TRAIN"
HAND_VAL_OUTPUT = "/content/drive/MyDrive/WLASL/HAND_VAL"
HAND_TEST_OUTPUT = "/content/drive/MyDrive/WLASL/HAND_TEST"
os.makedirs(HAND_TRAIN_OUTPUT, exist_ok=True)
os.makedirs(HAND_VAL_OUTPUT, exist_ok=True)
os.makedirs(HAND_TEST_OUTPUT, exist_ok=True)

process_hands(HAND_TRAIN_INPUT, HAND_TRAIN_OUTPUT)
process_hands(HAND_VAL_INPUT, HAND_VAL_OUTPUT)
process_hands(HAND_TEST_INPUT, HAND_TEST_OUTPUT)

In [7]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image

CROP_SIZE = 112
SEQ_LEN = 64

img_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std =[0.229, 0.224, 0.225]),
])

class HandCropDataset(Dataset):
    def __init__(self, root_dir, class_map=None, train=True):
        self.root_dir = root_dir
        self.samples = []
        self.train = train

        # build class map
        if class_map is None:
            labels = sorted([
                d for d in os.listdir(root_dir)
                if os.path.isdir(os.path.join(root_dir, d))
            ])
            self.class_to_idx = {lbl:i for i,lbl in enumerate(labels)}
        else:
            self.class_to_idx = class_map["class_to_idx"]

        self.idx_to_class = list(self.class_to_idx.keys())

        # append samples
        for lbl in self.class_to_idx:
            lbl_dir = os.path.join(root_dir, lbl)
            idx = self.class_to_idx[lbl]

            # Check if the label directory exists before listing its contents
            if not os.path.isdir(lbl_dir):
                print(f"[WARNING] Label directory not found: {lbl_dir}. Skipping.")
                continue

            for f in os.listdir(lbl_dir):
                if f.endswith("_left.npy"):
                    base = f.replace("_left.npy", "")
                    left_path = os.path.join(lbl_dir, f)
                    right_path = os.path.join(lbl_dir, base + "_right.npy")

                    if os.path.exists(right_path):
                        self.samples.append((left_path, right_path, idx))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        left_path, right_path, label = self.samples[idx]

        left = np.load(left_path) # (num_frames, H, W, 3)
        right = np.load(right_path)

        left_ts  = torch.stack([img_tf(Image.fromarray(f)) for f in left])
        right_ts = torch.stack([img_tf(Image.fromarray(f)) for f in right])

        return left_ts, right_ts, torch.tensor(label, dtype=torch.long)


TRAIN_DIR = "/content/drive/MyDrive/WLASL/HAND_TRAIN"
VAL_DIR = "/content/drive/MyDrive/WLASL/HAND_VAL"

train_ds = HandCropDataset(TRAIN_DIR)
val_ds = HandCropDataset(VAL_DIR, class_map={'class_to_idx': train_ds.class_to_idx})

train_loader = torch.utils.data.DataLoader(train_ds, batch_size=8, shuffle=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(val_ds, batch_size=8, shuffle=False, num_workers=2)
print(f"Train samples: {len(train_ds)}")
print(f"Val samples: {len(val_ds)}")

num_classes = len(train_ds.idx_to_class)


Train samples: 470
Val samples: 197


Model Definition

In [8]:
import torch.nn as nn
import torch
from torchvision import models

class HandCNN(nn.Module):
    def __init__(self, num_classes, feat_dim=256):
        super().__init__()

        m = models.mobilenet_v3_small(weights=models.MobileNet_V3_Small_Weights.IMAGENET1K_V1)
        self.cnn = m.features # feature extraction layers
        self.pool = nn.AdaptiveAvgPool2d(1) # adaptive average pooling to get a fixed size feature vector (1 x 1)

        in_feats = m.classifier[0].in_features

        self.fc1 = nn.Linear(in_feats, feat_dim)
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(0.3)

        self.clf = nn.Linear(feat_dim * 2, num_classes) # feat_dim * 2 because left and right hand features are concatenated

    def forward(self, left, right):
        B, T, C, H, W = left.shape

        # combine batch size and time steps to treat each frame as an independent image
        L = left.view(B*T, C, H, W)
        R = right.view(B*T, C, H, W)

        # pass each hand's frames through the feature extractor
        Lf = self.pool(self.cnn(L)).squeeze() # (B*T, CNN_features_dim, 1, 1) -> (B*T, CNN_features_dim)
        Rf = self.pool(self.cnn(R)).squeeze() # (B*T, CNN_features_dim, 1, 1) -> (B*T, CNN_features_dim)
        Lf = self.dropout(self.relu(self.fc1(Lf)))
        Rf = self.dropout(self.relu(self.fc1(Rf)))

        # reshape features back to include the time dimension, average features over time
        Lf = Lf.view(B, T, -1).mean(dim=1) # (B, T, feat_dim) -> (B, feat_dim)
        Rf = Rf.view(B, T, -1).mean(dim=1)

        # concatenate the averaged left and right hand features
        fused = torch.cat([Lf, Rf], dim=1) # (B, feat_dim * 2)

        # classify with fused features
        return self.clf(fused)