In [2]:
import os
import cv2
import numpy as np
import mediapipe as mp
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from collections import deque
import time


In [3]:

class GestureLSTM(nn.Module):
    def __init__(self, input_size=63, hidden_size=128, num_layers=2, num_classes=6):  # now 5 classes
        super(GestureLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size, 
            hidden_size=hidden_size, 
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        out = self.fc(hn[-1])
        return out

In [4]:
model = GestureLSTM().cuda()  # Use .to("cuda") or .to("cpu") depending on your setup

In [5]:

# Load model and move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(torch.load("trained_model.pth", map_location=device))
model.to(device)
model.eval()

# Your gesture classes
class_names = ['Swiping_Two_Fingers_Down', 'Swiping_Two_Fingers_Left', 'Swiping_Two_Fingers_Right', 'Swiping_Two_Fingers_Up','No_gesture','Zooming_In_With_Full_Hand']

# MediaPipe setup
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.7)

# Video and sequence setup
cap = cv2.VideoCapture(0)
seq_length = 37
keypoint_seq = deque(maxlen=seq_length)

# FPS calculation
prev_time = time.time()

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Flip and convert to RGB
    # image = cv2.cvtColor(cv2.flip(frame, 1), cv2.COLOR_BGR2RGB)
    image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = hands.process(image)
    image.flags.writeable = True

    keypoints = np.zeros((21, 3))  # Fallback if no hand detected

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            for i, lm in enumerate(hand_landmarks.landmark):
                keypoints[i] = [lm.x, lm.y, lm.z]
            # Draw hand landmarks
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    # Save current keypoints to sequence
    keypoint_seq.append(keypoints.flatten())

    # Run inference only if sequence is ready
    if len(keypoint_seq) == seq_length:
        sequence_array = np.array(keypoint_seq)
        input_tensor = torch.tensor(sequence_array, dtype=torch.float32).unsqueeze(0).to(device)  # (1, 37, 63)

        with torch.no_grad():
            output = model(input_tensor)
            probs = torch.softmax(output, dim=1)
            max_prob, predicted = torch.max(probs, 1)

            if max_prob.item() < 0.6:
                label = "No Gesture"
                display_text = f"Gesture: {label}"
            else:
                label = class_names[predicted.item()]
                confidence = max_prob.item() * 100
                display_text = f"{label} ({confidence:.1f}%)"

        # Display result
        cv2.putText(frame, display_text, (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

    # FPS
    curr_time = time.time()
    fps = 1 / (curr_time - prev_time)
    prev_time = curr_time
    cv2.putText(frame, f"FPS: {int(fps)}", (10, 70),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

    # Show video
    cv2.imshow("Real-Time Hand Gesture Recognition", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release everything
cap.release()
cv2.destroyAllWindows()


  model.load_state_dict(torch.load("trained_model.pth", map_location=device))
