# Sign Language Hand Detection and ROI Extraction

This notebook implements hand detection and Region of Interest (ROI) extraction from sign language videos using MediaPipe and OpenCV.

## Workflow Overview:
1. **Load Data**: Load videos from the compressed videos dataset
2. **Hand Detection**: Use MediaPipe to detect hands in each video frame
3. **ROI Extraction**: Extract the region of interest (hands) from each frame

---

## 1. Install and Import Required Libraries

Make sure you have the following packages installed:
- opencv-python
- mediapipe
- numpy
- matplotlib

In [1]:
# Install required packages (uncomment if needed). %pip installs into the running kernel's environment.
# %pip install opencv-python mediapipe==0.10.21 numpy matplotlib

In [2]:
import json
import os
import random
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import cv2
import matplotlib.pyplot as plt
import mediapipe as mp
import numpy as np

# Echo the library versions so the environment used for this run is recorded
# next to the outputs it produced.
print("✓ All libraries imported successfully!")
print(f"OpenCV version: {cv2.__version__}")
print(f"MediaPipe version: {mp.__version__}")

✓ All libraries imported successfully!
OpenCV version: 4.11.0
MediaPipe version: 0.10.21


## 2. Configuration and Path Setup

Define paths to the dataset and configuration parameters.

In [3]:
# Path configurations.
# The project root can be overridden with the SIGN_LANG_BASE_DIR environment
# variable so the notebook runs on other machines without editing this cell;
# the original workstation path stays as the default.
BASE_DIR = Path(os.environ.get(
    'SIGN_LANG_BASE_DIR',
    r'c:\Users\shifttech\Desktop\Univ_M\AV\Tp\sign_Lang',
))
DATA_DIR = BASE_DIR / 'data' / 'compressed videos'
VIDEOS_DIR = DATA_DIR  # one sub-folder per word, each holding that word's videos
OUTPUT_DIR = BASE_DIR / 'output'

# Create the output directory (and any missing parents) if it doesn't exist
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Configuration parameters
MIN_DETECTION_CONFIDENCE = 0.5  # Minimum confidence for hand detection
MIN_TRACKING_CONFIDENCE = 0.5  # Minimum confidence for hand tracking
# NOTE(review): not referenced by any of the cells shown in this notebook.
MIN_HAND_PRESENCE_CONFIDENCE = 0.5  # Minimum confidence for hand presence
MAX_VIDEOS_PER_WORD = 400  # Maximum number of videos to process per word

# Automatically discover words: every sub-directory name is treated as a word.
TARGET_WORDS = []
if VIDEOS_DIR.is_dir():
    TARGET_WORDS = sorted(d.name for d in VIDEOS_DIR.iterdir() if d.is_dir())
else:
    # Surface a misconfigured path right here instead of only showing
    # "Discovered 0 words" below.
    print(f"⚠ Videos directory does not exist: {VIDEOS_DIR}")

print("✓ Configuration complete!")
print(f"  - Videos directory: {VIDEOS_DIR}")
print(f"  - Output directory: {OUTPUT_DIR}")
print(f"  - Max videos per word: {MAX_VIDEOS_PER_WORD}")
print(f"  - Discovered {len(TARGET_WORDS)} words: {', '.join(TARGET_WORDS) if TARGET_WORDS else 'None'}")

✓ Configuration complete!
  - Videos directory: c:\Users\shifttech\Desktop\Univ_M\AV\Tp\sign_Lang\data\compressed videos
  - Output directory: c:\Users\shifttech\Desktop\Univ_M\AV\Tp\sign_Lang\output
  - Max videos per word: 400
  - Discovered 20 words: baby, eat, father, finish, good, happy, hear, house, important, love, mall, me, mosque, mother, normal, sad, stop, thanks, thinking, worry


## 3. Load Dataset from Folder Structure

Load videos from the compressed videos folder structure where each word has its own folder.

In [4]:
def select_words_from_folders(videos_dir: Path,
                              target_glosses: Optional[List[str]] = None,
                              max_videos_per_word: int = 200) -> Dict:
    """
    Select words from folder-based dataset structure.
    Each word has its own folder containing video files directly.

    Video files are recognised by extension (.mp4, .avi, .mov, matched
    case-insensitively) and are sorted by path before the per-word limit is
    applied, so the same subset is selected on every run and on every
    platform.

    Args:
        videos_dir: Directory containing word folders
        target_glosses: List of specific words to select (if None or empty,
            every sub-directory of ``videos_dir`` is used)
        max_videos_per_word: Maximum number of videos to select per word

    Returns:
        Dictionary with:
            'selected_words': words that have at least one video, in input order
            'word_to_videos': word -> {'video_paths': [Path, ...], 'video_count': int}
            'total_videos': total number of selected videos
    """
    video_suffixes = {'.mp4', '.avi', '.mov'}

    result = {
        'selected_words': [],
        'word_to_videos': {},
        'total_videos': 0
    }

    # If no target glosses specified, discover all word folders
    if not target_glosses:
        if not videos_dir.is_dir():
            print(f"✗ Videos directory does not exist: {videos_dir}")
            return result
        target_glosses = sorted(d.name for d in videos_dir.iterdir() if d.is_dir())

    print(f"\n{'='*60}")
    print("WORD SELECTION ANALYSIS")
    print(f"{'='*60}")
    print(f"Scanning directory: {videos_dir}")
    print(f"Words to process: {len(target_glosses)}")
    print(f"Max videos per word: {max_videos_per_word}")
    print(f"{'='*60}")
    print("SELECTED WORDS (Video Files)")
    print(f"{'='*60}")

    words_not_found = []
    words_no_videos = []

    for idx, gloss in enumerate(target_glosses, 1):
        # Check if word folder exists
        word_folder = videos_dir / gloss
        if not word_folder.is_dir():
            words_not_found.append(gloss)
            print(f"{idx:2d}. {gloss:20s} - ✗ NOT FOUND (folder doesn't exist)")
            continue

        # Directory listing order is filesystem-dependent; sort so the
        # truncation below always keeps the same files.
        all_video_files = sorted(
            entry for entry in word_folder.iterdir()
            if entry.is_file() and entry.suffix.lower() in video_suffixes
        )

        if not all_video_files:
            words_no_videos.append(gloss)
            print(f"{idx:2d}. {gloss:20s} - ✗ No video files found in folder")
            continue

        # Limit to max_videos_per_word
        video_files = all_video_files[:max_videos_per_word]

        # Store video file paths
        result['selected_words'].append(gloss)
        result['word_to_videos'][gloss] = {
            'video_paths': video_files,
            'video_count': len(video_files)
        }
        result['total_videos'] += len(video_files)

        if len(all_video_files) > max_videos_per_word:
            print(f"{idx:2d}. {gloss:20s} - ✓ {len(video_files)} videos (limited from {len(all_video_files)})")
        else:
            print(f"{idx:2d}. {gloss:20s} - ✓ {len(video_files)} videos")

    print(f"\n{'='*60}")
    print(f"Total: {len(result['selected_words'])} words, {result['total_videos']} videos")

    if words_not_found:
        print(f"\n⚠ Words not found: {', '.join(words_not_found)}")
    if words_no_videos:
        print(f"⚠ Words with no video files: {', '.join(words_no_videos)}")

    print(f"{'='*60}")

    return result


# Stage banner
print("\n" + "=" * 60)
print("LOADING AND SELECTING WORDS")
print("=" * 60)

# Scan the per-word folders for the configured words, capped per word.
selection_result = select_words_from_folders(
    videos_dir=VIDEOS_DIR,
    target_glosses=TARGET_WORDS,
    max_videos_per_word=MAX_VIDEOS_PER_WORD,
)

# Names consumed by the processing cells further down.
word_to_videos = selection_result['word_to_videos']
target_words = selection_result['selected_words']

print("\n✓ Dataset loaded and words selected successfully!")


LOADING AND SELECTING WORDS

WORD SELECTION ANALYSIS
Scanning directory: c:\Users\shifttech\Desktop\Univ_M\AV\Tp\sign_Lang\data\compressed videos
Words to process: 20
Max videos per word: 400
SELECTED WORDS (Video Files)
 1. baby                 - ✓ 400 videos (limited from 430)
 2. eat                  - ✓ 400 videos (limited from 440)
 3. father               - ✓ 400 videos (limited from 452)
 4. finish               - ✓ 400 videos (limited from 440)
 5. good                 - ✓ 400 videos (limited from 436)
 6. happy                - ✓ 400 videos (limited from 445)
 7. hear                 - ✓ 400 videos (limited from 433)
 8. house                - ✓ 400 videos (limited from 421)
 9. important            - ✓ 400 videos (limited from 446)
10. love                 - ✓ 400 videos (limited from 435)
11. mall                 - ✓ 400 videos (limited from 414)
12. me                   - ✓ 400 videos (limited from 430)
13. mosque               - ✓ 400 videos (limited from 427)
14. mother 

## 4. Step 2: Hand Detection with MediaPipe

Initialize MediaPipe Hands solution and create a function to detect hands in video frames.

In [5]:
# Shortcuts to the MediaPipe solution modules used by the cells below
mp_hands = mp.solutions.hands
mp_drawing_styles = mp.solutions.drawing_styles
mp_drawing = mp.solutions.drawing_utils

# Detector settings collected in one mapping so the report printed below is
# read from the same values that are handed to the constructor.
hands_settings = {
    'static_image_mode': False,  # consecutive frames are fed as a video stream
    'max_num_hands': 2,
    'min_detection_confidence': MIN_DETECTION_CONFIDENCE,
    'min_tracking_confidence': MIN_TRACKING_CONFIDENCE,
}

# Shared detector instance (closed in the cleanup cell)
hands = mp_hands.Hands(**hands_settings)

print("✓ MediaPipe Hands initialized successfully!")
print(f"  - Max hands: {hands_settings['max_num_hands']}")
print(f"  - Detection confidence: {hands_settings['min_detection_confidence']}")
print(f"  - Tracking confidence: {hands_settings['min_tracking_confidence']}")

✓ MediaPipe Hands initialized successfully!
  - Max hands: 2
  - Detection confidence: 0.5
  - Tracking confidence: 0.5


In [6]:
def detect_hands_in_frame(frame: np.ndarray, hands_detector) -> Tuple[np.ndarray, Any]:
    """
    Detect hands in a single frame using MediaPipe.

    Args:
        frame: Input frame (BGR format, as returned by OpenCV)
        hands_detector: Object exposing ``process(rgb_image)``; in this
            notebook a MediaPipe Hands detector

    Returns:
        Tuple of (frame converted to RGB, whatever ``hands_detector.process``
        returned). The RGB frame is writeable again on return.
    """
    # MediaPipe expects RGB while OpenCV decodes to BGR
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Marking the image read-only while it is being processed lets MediaPipe
    # take it by reference (the pattern used in MediaPipe's Python examples).
    frame_rgb.flags.writeable = False
    try:
        results = hands_detector.process(frame_rgb)
    finally:
        # Restore writeability even if processing raised.
        frame_rgb.flags.writeable = True

    return frame_rgb, results


def draw_hand_landmarks(frame: np.ndarray, results) -> np.ndarray:
    """
    Return a copy of ``frame`` with every detected hand's landmarks drawn on it.

    Args:
        frame: Input frame (RGB format); it is not modified
        results: MediaPipe detection results (``multi_hand_landmarks`` is read)

    Returns:
        Annotated copy of the frame; an unmodified copy when no hands were detected
    """
    canvas = frame.copy()

    # ``multi_hand_landmarks`` is falsy when nothing was detected
    for landmarks in results.multi_hand_landmarks or ():
        mp_drawing.draw_landmarks(
            canvas,
            landmarks,
            mp_hands.HAND_CONNECTIONS,
            mp_drawing_styles.get_default_hand_landmarks_style(),
            mp_drawing_styles.get_default_hand_connections_style()
        )

    return canvas


print("✓ Hand detection functions defined!")

✓ Hand detection functions defined!


## 5. Step 3: ROI Extraction

Extract the Region of Interest (ROI) containing the detected hands.

In [7]:
def get_hand_bounding_box(hand_landmarks, frame_width: int, frame_height: int,
                          padding: float = 0.1) -> Tuple[int, int, int, int]:
    """
    Calculate bounding box for a hand with padding.

    Landmark coordinates are normalised to the frame but may fall outside
    [0, 1] when part of the hand is out of view. Every edge of the box is
    therefore clamped to [0, 1] before conversion to pixels, so the result
    always satisfies ``0 <= min <= max <= frame size`` and can be used
    directly as slice bounds (a negative bound would wrap around).

    Args:
        hand_landmarks: Object whose ``landmark`` items expose normalised
            ``x`` and ``y`` attributes (MediaPipe hand landmarks)
        frame_width: Width of the frame in pixels
        frame_height: Height of the frame in pixels
        padding: Padding ratio around the hand (0.1 = 10% of the box size
            added on each side)

    Returns:
        Tuple of (x_min, y_min, x_max, y_max) in pixels; the box may be empty
        (min == max) when the hand lies entirely outside the frame.
    """
    def _clamp01(value: float) -> float:
        # Keep a normalised coordinate inside the frame.
        return min(1.0, max(0.0, value))

    # Get all landmark coordinates
    x_coords = [lm.x for lm in hand_landmarks.landmark]
    y_coords = [lm.y for lm in hand_landmarks.landmark]

    # Tight (unpadded) extent of the hand
    x_min, x_max = min(x_coords), max(x_coords)
    y_min, y_max = min(y_coords), max(y_coords)

    # Padding is proportional to the hand's own extent
    pad_x = (x_max - x_min) * padding
    pad_y = (y_max - y_min) * padding

    # Pad, then clamp BOTH edges on each axis
    x_min = _clamp01(x_min - pad_x)
    x_max = _clamp01(x_max + pad_x)
    y_min = _clamp01(y_min - pad_y)
    y_max = _clamp01(y_max + pad_y)

    # Convert to pixel coordinates
    return (
        int(x_min * frame_width),
        int(y_min * frame_height),
        int(x_max * frame_width),
        int(y_max * frame_height),
    )


def extract_roi(frame: np.ndarray, results, padding: float = 0.1) -> List[np.ndarray]:
    """
    Extract ROI (Region of Interest) for each detected hand.

    Boxes are clamped to the frame and skipped when they are empty or
    inverted. Without this, a negative pixel bound would be interpreted by
    NumPy as an index counted from the end and yield a large, wrong crop.

    Args:
        frame: Input frame (RGB format)
        results: MediaPipe detection results (``multi_hand_landmarks`` is read)
        padding: Padding ratio around the hand

    Returns:
        List of cropped hand images, in detection order. The crops are slices
        of ``frame`` (NumPy views, not copies).
    """
    rois = []

    if not results.multi_hand_landmarks:
        return rois

    frame_height, frame_width = frame.shape[:2]

    for hand_landmarks in results.multi_hand_landmarks:
        # Get bounding box
        x_min, y_min, x_max, y_max = get_hand_bounding_box(
            hand_landmarks, frame_width, frame_height, padding
        )

        # Keep the box inside the frame so slice bounds can never go negative
        x_min, x_max = max(0, x_min), min(frame_width, x_max)
        y_min, y_max = max(0, y_min), min(frame_height, y_max)

        # Hand (almost) entirely out of view: nothing to crop
        if x_max <= x_min or y_max <= y_min:
            continue

        # Extract ROI
        roi = frame[y_min:y_max, x_min:x_max]

        if roi.size > 0:  # Check if ROI is valid
            rois.append(roi)

    return rois


def draw_bounding_boxes(frame: np.ndarray, results, padding: float = 0.1) -> np.ndarray:
    """
    Return a copy of ``frame`` with a labelled rectangle around each detected hand.

    Args:
        frame: Input frame (RGB format); it is not modified
        results: MediaPipe detection results (``multi_hand_landmarks`` is read)
        padding: Padding ratio passed on to the bounding-box computation

    Returns:
        Annotated copy of the frame; an unmodified copy when no hands were detected
    """
    canvas = frame.copy()

    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return canvas

    img_height, img_width = frame.shape[:2]

    for hand_number, landmarks in enumerate(detected_hands, start=1):
        left, top, right, bottom = get_hand_bounding_box(
            landmarks, img_width, img_height, padding
        )

        # RGB colours: green for the first hand, red for any other
        if hand_number == 1:
            box_color = (0, 255, 0)
        else:
            box_color = (255, 0, 0)

        cv2.rectangle(canvas, (left, top), (right, bottom), box_color, 2)

        # Caption sits just above the box's top-left corner
        cv2.putText(canvas, f"Hand {hand_number}", (left, top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 2)

    return canvas


print("✓ ROI extraction functions defined!")

✓ ROI extraction functions defined!


## 6. Process Videos

Process each video to detect hands and extract ROIs.

In [8]:
def process_video(video_path: Path, hands_detector, output_dir: Optional[Path] = None,
                 max_frames: Optional[int] = None, visualize: bool = False) -> Optional[Dict]:
    """
    Process a single video for hand detection and ROI extraction.

    Frames are read until the capture stops returning them or ``max_frames``
    is reached. The frame count reported by the container is only recorded,
    never used as a loop bound, because it can be zero or inaccurate. The
    capture is always released, including when processing raises or is
    interrupted.

    Args:
        video_path: Path to the video file
        hands_detector: MediaPipe Hands detector
        output_dir: Directory to save output (optional). Files go into a
            sub-directory named after the video's stem.
        max_frames: Maximum number of frames to process (None or 0 = no limit)
        visualize: When True (and ``output_dir`` is given), save every hand
            ROI as ``frame_XXXX_hand_N.jpg`` and an annotated copy of every
            10th frame as ``frame_XXXX.jpg``

    Returns:
        Dictionary with 'video_name', 'fps', 'total_frames' (as reported by
        the container), 'processed_frames', 'frames_with_hands' and
        'rois_extracted'; None if the video could not be opened.
    """
    cap = cv2.VideoCapture(str(video_path))

    try:
        if not cap.isOpened():
            print(f"  ✗ Error: Could not open video {video_path.name}")
            return None

        # Results storage (fps / total_frames are container metadata)
        results_data = {
            'video_name': video_path.name,
            'fps': int(cap.get(cv2.CAP_PROP_FPS)),
            'total_frames': int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
            'processed_frames': 0,
            'frames_with_hands': 0,
            'rois_extracted': 0
        }

        # Files are only written when both an output location and the
        # visualize flag are given.
        save_outputs = bool(visualize and output_dir)
        video_output_dir = None
        if save_outputs:
            video_output_dir = output_dir / video_path.stem
            video_output_dir.mkdir(parents=True, exist_ok=True)

        frame_idx = 0

        while not max_frames or frame_idx < max_frames:
            ret, frame = cap.read()

            if not ret:  # end of stream (or decode failure)
                break

            # Detect hands
            frame_rgb, detection_results = detect_hands_in_frame(frame, hands_detector)

            # Extract ROIs
            rois = extract_roi(frame_rgb, detection_results)

            # Update statistics
            results_data['processed_frames'] += 1
            if detection_results.multi_hand_landmarks:
                results_data['frames_with_hands'] += 1
                results_data['rois_extracted'] += len(rois)

            if save_outputs:
                # Save hand ROIs for every frame (needed for training)
                for roi_idx, roi in enumerate(rois):
                    roi_path = video_output_dir / f"frame_{frame_idx:04d}_hand_{roi_idx}.jpg"
                    cv2.imwrite(str(roi_path), cv2.cvtColor(roi, cv2.COLOR_RGB2BGR))

                # Save annotated visualization frames (every 10th frame to save space)
                if frame_idx % 10 == 0:
                    annotated_frame = draw_hand_landmarks(frame_rgb, detection_results)
                    annotated_frame = draw_bounding_boxes(annotated_frame, detection_results)

                    output_path = video_output_dir / f"frame_{frame_idx:04d}.jpg"
                    cv2.imwrite(str(output_path), cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))

            frame_idx += 1

        return results_data
    finally:
        # Runs on normal return, on errors and on KeyboardInterrupt alike.
        cap.release()


print("✓ Video processing function defined!")

✓ Video processing function defined!


## 7. Process All Selected Videos

Process all selected videos and display results.

In [9]:
# Process all selected videos
print("\n" + "="*60)
print("PROCESSING VIDEOS")
print("="*60)

all_results = []
video_count = 0
total_videos = sum(len(word_to_videos[word]['video_paths']) for word in target_words)

# NOTE(review): process_video writes into OUTPUT_DIR/<video stem>, so two
# videos with the same file stem under different words would share a folder.
# Confirm file names are unique across words.

# Process videos from folder structure
for word in target_words:
    word_data = word_to_videos.get(word)
    if not word_data:
        continue

    video_paths = word_data['video_paths']
    print(f"\nProcessing class: {word} ({len(video_paths)} videos)")

    for video_path in video_paths:
        video_count += 1

        # Show progress bar (rewritten in place via the leading "\r")
        progress = video_count / total_videos * 100
        bar_length = 50
        filled = int(bar_length * video_count / total_videos)
        bar = '█' * filled + '░' * (bar_length - filled)
        print(f"\r[{bar}] {progress:.1f}% ({video_count}/{total_videos})", end='', flush=True)

        # Use a fresh detector for every clip. In video mode
        # (static_image_mode=False) the detector tracks hands from one frame
        # to the next, so a detector shared across clips would start each new
        # video from the previous video's last hand positions.
        with mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=2,
            min_detection_confidence=MIN_DETECTION_CONFIDENCE,
            min_tracking_confidence=MIN_TRACKING_CONFIDENCE
        ) as clip_hands:
            # Process video (all frames, save output for all videos)
            results = process_video(
                video_path,
                clip_hands,
                output_dir=OUTPUT_DIR,
                max_frames=None,  # Process all frames
                visualize=True  # Save hand ROIs for all videos
            )

        if results:
            all_results.append(results)

print("\n\n" + "="*60)
print("PROCESSING COMPLETE!")
print("="*60)


PROCESSING VIDEOS

Processing class: baby (400 videos)
[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.1% (10/7915)

KeyboardInterrupt: 

## 8. Summary Statistics and Visualization

Display overall statistics and sample visualizations.

In [None]:
# Calculate summary statistics over every successfully processed video
if all_results:
    total_frames = sum(r['processed_frames'] for r in all_results)
    total_frames_with_hands = sum(r['frames_with_hands'] for r in all_results)
    total_rois = sum(r['rois_extracted'] for r in all_results)
    # Share of processed frames in which at least one hand was detected
    avg_detection_rate = (total_frames_with_hands / total_frames * 100) if total_frames > 0 else 0

    print("\n" + "="*60)
    print("SUMMARY STATISTICS")
    print("="*60)
    print(f"Total videos processed: {len(all_results)}")
    print(f"Total frames processed: {total_frames}")
    print(f"Frames with hands detected: {total_frames_with_hands} ({avg_detection_rate:.1f}%)")
    print(f"Total ROIs extracted: {total_rois}")
    # Keep the label in the no-detections case too, so the line is still
    # self-explanatory instead of a bare "N/A".
    if total_frames_with_hands > 0:
        print(f"Average ROIs per frame with hands: {total_rois / total_frames_with_hands:.2f}")
    else:
        print("Average ROIs per frame with hands: N/A")
    print("="*60)
else:
    print("⚠ No processing results available - run the video processing cell first.")

In [None]:
# Visualize sample results
print("\nVisualizing sample results...")

# Sorted so the same sample is picked on every run (directory listing order
# is filesystem-dependent). Also reused by the ROI display cell below.
sample_dirs = sorted(d for d in OUTPUT_DIR.iterdir() if d.is_dir())

# Find the first video output directory that actually contains annotated
# frames (ROI crops, named *_hand_*.jpg, are excluded here).
sample_dir = None
frame_files = []
for candidate_dir in sample_dirs:
    candidate_frames = sorted(
        f for f in candidate_dir.glob('frame_*.jpg') if '_hand_' not in f.name
    )
    if candidate_frames:
        sample_dir = candidate_dir
        frame_files = candidate_frames[:4]
        break

if not sample_dirs:
    print("  No output directories found. Set visualize=True when processing videos.")
elif not frame_files:
    print("  No sample frames found for visualization.")
else:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.flatten()

    for idx, ax in enumerate(axes):
        if idx < len(frame_files):
            frame_file = frame_files[idx]
            img = cv2.imread(str(frame_file))
            if img is None:
                # imread signals failure by returning None instead of raising
                ax.set_title(f'Unreadable: {frame_file.name}', fontsize=12)
            else:
                ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
                ax.set_title(f'Frame {frame_file.stem}', fontsize=12)
        # Also hides panels left empty when fewer than four frames exist
        ax.axis('off')

    plt.suptitle(f'Sample Hand Detection Results - {sample_dir.name}', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print(f"✓ Visualization complete! Check the output directory: {OUTPUT_DIR}")

In [None]:
# Show image.png, resolved relative to the notebook's working directory.
# NOTE(review): this file is not produced by any cell shown here - confirm it
# ships alongside the notebook.
image_path = "./image.png"
img = cv2.imread(image_path)
if img is None:
    # imread returns None (it does not raise) when the file is missing or unreadable
    print(f"⚠ Could not read image: {image_path}")
else:
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(img_rgb)
    plt.axis('off')
    plt.show()

## 9. Display Sample ROIs

Show up to six extracted hand ROIs from the first output directory (each processed video has its own output directory).

In [None]:
# Display sample ROIs (relies on `sample_dirs` from the visualization cell above)
if sample_dirs:
    sample_dir = sample_dirs[0]
    # First six hand crops of that video, in file-name order
    roi_files = sorted(sample_dir.glob('*_hand_*.jpg'))[:6]

    if not roi_files:
        print("  No ROI files found for visualization.")
    else:
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.flatten()

        for idx, ax in enumerate(axes):
            if idx < len(roi_files):
                roi_file = roi_files[idx]
                # OpenCV loads BGR; matplotlib expects RGB
                img = cv2.imread(str(roi_file))
                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                ax.imshow(img_rgb)
                ax.set_title(f'{roi_file.stem}', fontsize=10)
            # Axes are hidden for filled and empty panels alike
            ax.axis('off')

        plt.suptitle('Sample Extracted Hand ROIs', fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()

        print("✓ ROI visualization complete!")

## 10. Cleanup

Release resources and clean up.

In [None]:
# Close the shared MediaPipe Hands detector created earlier in the notebook
hands.close()

print("\n✓ All resources released successfully!")
print("\n" + "="*60)
print("NOTEBOOK EXECUTION COMPLETE")
print("="*60)
print(f"\nOutput saved to: {OUTPUT_DIR}")
print("\nNext steps:")
print("1. Review the extracted ROIs in the output directory")
print("2. Use these ROIs for further processing (e.g., feature extraction, training)")
# The parameters named here are the ones defined in the configuration cell.
print("3. Adjust parameters (MAX_VIDEOS_PER_WORD, MIN_DETECTION_CONFIDENCE, etc.) as needed")