In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import cv2
import mediapipe as mp
import csv

In [20]:
# MediaPipe setup
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands_instance = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)

In [21]:
def extract_landmarks_from_image(image, hands_instance):
    """
    Extract landmarks from the given image using MediaPipe.
    Args:
        image: The input image.
        hands_instance: An instance of MediaPipe Hands solution.

    Returns:
        A flattened list of landmarks (x, y coordinates) or None if no hands detected.
    """
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands_instance.process(rgb_image)

    if results.multi_hand_landmarks:
        hand_landmarks = results.multi_hand_landmarks[0]
        landmarks = [lm.x for lm in hand_landmarks.landmark] + [lm.y for lm in hand_landmarks.landmark]
        return landmarks
    return None

In [22]:
# Convert the dataset of hand gesture images to landmark data and save it to a CSV
from torchvision import datasets, transforms

# Define the paths
dataset_path = "../data/archive/train"  # Path to the existing dataset
output_csv_path = "landmarks_dataset.csv"

# Load the dataset using ImageFolder
dataset = datasets.ImageFolder(root=dataset_path, transform=transforms.ToTensor())

# Open CSV file to write landmarks data
with open(output_csv_path, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    # Write header: landmarks (42 values) + label
    header = [f'lm_{i}' for i in range(42)] + ['label']
    csv_writer.writerow(header)

    # Iterate over the dataset to extract landmarks and write to CSV
    for i, (img, label) in enumerate(dataset):
        # Convert the PIL image to OpenCV format
        img = transforms.ToPILImage()(img)  # Convert tensor to PIL image
        img = np.array(img)  # Convert PIL image to NumPy array (OpenCV format)

        # Extract landmarks
        landmarks = extract_landmarks_from_image(img, hands_instance)

        if landmarks is not None:
            # Append the label to landmarks
            row = landmarks + [label]
            csv_writer.writerow(row)

        if i % 100 == 0:
            print(f"Processed {i} images")

print("Finished extracting landmarks from dataset.")

Processed 0 images
Processed 100 images
Processed 200 images
Processed 300 images
Processed 400 images
Processed 500 images
Processed 600 images
Processed 700 images
Processed 800 images
Processed 900 images
Processed 1000 images
Processed 1100 images
Processed 1200 images
Processed 1300 images
Processed 1400 images
Processed 1500 images
Processed 1600 images
Processed 1700 images
Processed 1800 images
Processed 1900 images
Processed 2000 images
Processed 2100 images
Processed 2200 images
Processed 2300 images
Processed 2400 images
Processed 2500 images
Processed 2600 images
Processed 2700 images
Processed 2800 images
Processed 2900 images
Processed 3000 images
Processed 3100 images
Processed 3200 images
Processed 3300 images
Processed 3400 images
Processed 3500 images
Processed 3600 images
Processed 3700 images
Processed 3800 images
Processed 3900 images
Processed 4000 images
Processed 4100 images
Processed 4200 images
Processed 4300 images
Processed 4400 images
Processed 4500 images


In [23]:
# Define a custom dataset for the landmark data
class GestureLandmarkDataset(DataLoader):
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        landmarks = row[:-1].values.astype(np.float32)
        label = int(row[-1])
        return torch.tensor(landmarks), torch.tensor(label)

# Load the landmark dataset
landmark_dataset = GestureLandmarkDataset("landmarks_dataset.csv")

In [24]:
num_classes = 20

class GestureRecognitionModel(nn.Module):
    def __init__(self):
        super(GestureRecognitionModel, self).__init__()
        self.fc1 = nn.Linear(42, 128)  # 21 landmarks * 2 (x and y)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)
        self.dropout1 = nn.Dropout(0.5)
        self.dropout2 = nn.Dropout(0.5)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

In [25]:
model = GestureRecognitionModel()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [26]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = None
        self.counter = 0

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
            return False

        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1

        if self.counter >= self.patience:
            return True

        return False

In [27]:
# Training loop
val_dataset_path = "../data/archive/test"
val_dataset = datasets.ImageFolder(root=val_dataset_path, transform=transforms.ToTensor())

# Extract landmarks for validation dataset and save to CSV
val_output_csv_path = "landmarks_val_dataset.csv"
with open(val_output_csv_path, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    header = [f'lm_{i}' for i in range(42)] + ['label']
    csv_writer.writerow(header)

    for i, (img, label) in enumerate(val_dataset):
        img = transforms.ToPILImage()(img)
        img = np.array(img)
        landmarks = extract_landmarks_from_image(img, hands_instance)

        if landmarks is not None:
            row = landmarks + [label]
            csv_writer.writerow(row)

        if i % 100 == 0:
            print(f"Processed {i} validation images")

print("Finished extracting landmarks from validation dataset.")

# Load validation landmark dataset
val_landmark_dataset = GestureLandmarkDataset("landmarks_val_dataset.csv")

train_loader = DataLoader(landmark_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_landmark_dataset, batch_size=32, shuffle=False)

def train_gesture_model(epochs=10, patience=4):
    early_stopping = EarlyStopping(patience=patience)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for data, labels in train_loader:
            # Forward pass
            outputs = model(data)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        epoch_loss = running_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{epochs}], Training Loss: {epoch_loss:.4f}")

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for data, labels in val_loader:
                outputs = model(data)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

        val_loss /= len(val_loader)
        print(f"Validation Loss: {val_loss:.4f}")

        # Early stopping check
        if early_stopping(val_loss):
            print("Early stopping")
            break

Processed 0 validation images
Processed 100 validation images
Processed 200 validation images
Processed 300 validation images
Processed 400 validation images
Processed 500 validation images




Processed 600 validation images
Processed 700 validation images
Processed 800 validation images
Processed 900 validation images
Processed 1000 validation images
Processed 1100 validation images
Processed 1200 validation images
Processed 1300 validation images
Processed 1400 validation images
Processed 1500 validation images
Processed 1600 validation images
Processed 1700 validation images
Processed 1800 validation images
Processed 1900 validation images
Processed 2000 validation images
Processed 2100 validation images
Processed 2200 validation images
Processed 2300 validation images
Processed 2400 validation images
Processed 2500 validation images
Processed 2600 validation images
Processed 2700 validation images
Processed 2800 validation images
Processed 2900 validation images
Processed 3000 validation images
Processed 3100 validation images
Processed 3200 validation images
Processed 3300 validation images
Processed 3400 validation images
Processed 3500 validation images
Processed 3600

In [28]:
train_gesture_model()

Epoch [1/10], Training Loss: 3.1159
Validation Loss: 3.0546
Epoch [2/10], Training Loss: 3.1051
Validation Loss: 3.0238
Epoch [3/10], Training Loss: 2.9896
Validation Loss: 2.9941
Epoch [4/10], Training Loss: 3.0896
Validation Loss: 2.9674
Epoch [5/10], Training Loss: 2.9735
Validation Loss: 2.9391
Epoch [6/10], Training Loss: 2.9408
Validation Loss: 2.9102
Epoch [7/10], Training Loss: 2.9349
Validation Loss: 2.8804
Epoch [8/10], Training Loss: 2.9605
Validation Loss: 2.8488
Epoch [9/10], Training Loss: 2.8625
Validation Loss: 2.8162
Epoch [10/10], Training Loss: 2.9076
Validation Loss: 2.7862


  label = int(row[-1])


In [29]:
# Gesture recognition function (for real-time prediction)
def recognize_gesture(landmarks):
    model.eval()
    with torch.no_grad():
        landmarks = torch.tensor(landmarks, dtype=torch.float32).unsqueeze(0)  # Add batch dimension
        output = model(landmarks)
        _, predicted = torch.max(output.data, 1)
        return predicted.item()

In [30]:
# Example use in the main detection loop
def gesture_recognition_integration(hand_landmarks):
    if hand_landmarks:
        landmarks_array = np.array([[lm.x, lm.y] for lm in hand_landmarks.landmark]).flatten()
        predicted_gesture = recognize_gesture(landmarks_array)
        return predicted_gesture
    return None