# Gesture Recognition for Machine Automation

## Imports

In [1]:
import numpy as np
import cv2
import os
import pyautogui
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Dense

## Gestures and Gesture Functions

In [2]:
# Dataset layout: `vids` recorded videos per gesture, `frames` saved frames per video,
# all stored beneath the `data` directory.
vids = frames = 30
data = os.path.join("ges_rec_data")

# Gesture labels; a label's position in this array is used as its class index.
gestures = np.array(("minimize", "switch", "volume_down", "volume_up"))

In [30]:
def minimize_func():
    """Send the Command+M key combination through pyautogui."""
    shortcut = ('command', 'm')
    pyautogui.hotkey(*shortcut)

def switch_func():
    """Send the Command+Tab key combination through pyautogui."""
    shortcut = ('command', 'tab')
    pyautogui.hotkey(*shortcut)

def volume_down_func():
    """Press the F11 key through pyautogui.

    NOTE(review): presumably F11 is mapped to "volume down" on the target
    machine (going by the function name) -- confirm against the host keyboard
    settings.
    """
    key = 'f11'
    pyautogui.press(key)

def volume_up_func():
    """Press the F12 key through pyautogui.

    NOTE(review): presumably F12 is mapped to "volume up" on the target
    machine (going by the function name) -- confirm against the host keyboard
    settings.
    """
    key = 'f12'
    pyautogui.press(key)

## Data Collection for Features

In [4]:
# Short aliases for the MediaPipe solution modules used by the cells below:
# the holistic tracker, the landmark drawing helpers and their default styles.
mp_holistic, mp_drawing, mp_drawing_styles = (
    mp.solutions.holistic,
    mp.solutions.drawing_utils,
    mp.solutions.drawing_styles,
)

In [5]:
# Number of values used to zero-fill a missing landmark group
# (three coordinates -- x, y, z -- per landmark).
_FACE_VALUE_COUNT = 468 * 3  # 1404
_HAND_VALUE_COUNT = 21 * 3   # 63


def _flatten_landmarks(landmark_list, missing_size):
    """Return the x/y/z values of ``landmark_list`` as one flat array.

    When ``landmark_list`` is falsy (the group was not detected), an all-zero
    array of length ``missing_size`` is returned instead.
    """
    if not landmark_list:
        return np.zeros(missing_size)
    return np.array(
        [[point.x, point.y, point.z] for point in landmark_list.landmark]
    ).flatten()


def get_landmark_values(_results):
    """Build one flat feature vector from a holistic detection result.

    Args:
        _results: Object exposing ``face_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``. Each attribute is either falsy or an
            object whose ``landmark`` attribute iterates over points with
            ``x``, ``y`` and ``z`` attributes.

    Returns:
        A 1-D NumPy array holding the face values, then the left-hand values,
        then the right-hand values. A missing group contributes zeros
        (1404 for the face, 63 per hand), so the vector has 1530 entries
        whenever every detected group has the matching number of landmarks.
    """
    face_vals = _flatten_landmarks(_results.face_landmarks, _FACE_VALUE_COUNT)
    left_hand_vals = _flatten_landmarks(_results.left_hand_landmarks, _HAND_VALUE_COUNT)
    right_hand_vals = _flatten_landmarks(_results.right_hand_landmarks, _HAND_VALUE_COUNT)

    return np.concatenate((face_vals, left_hand_vals, right_hand_vals))

## Engineer Feature and Label Arrays

In [6]:
# Class index of each gesture = its position in `gestures`.
gesture_dict = dict(zip(gestures, range(len(gestures))))
features, labels = [], []

# Saved landmark arrays live at <data>/<gesture>/<video>/<frame>.npy, with
# videos and frames both numbered from 1.
for gesture in gestures:
    for video in range(1, vids + 1):
        video_dir = os.path.join(data, gesture, str(video))
        # One sample = the ordered list of this video's per-frame arrays.
        features.append([
            np.load(os.path.join(video_dir, f"{frame_no}.npy"))
            for frame_no in range(1, frames + 1)
        ])
        labels.append(gesture_dict[gesture])

In [8]:
# Targets: integer class indices expanded by to_categorical and cast to int.
y = to_categorical(labels).astype(int)
# Inputs: the nested per-video / per-frame lists stacked into one array
# (presumably shaped (videos, frames, values-per-frame) -- TODO confirm).
X = np.asarray(features)

In [9]:
# Hold out 15% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=17,
)

## Model Architecture

In [7]:
# Three stacked LSTM layers followed by two dense layers.
# input_shape=(30, 1530): 30 time steps (the value of `frames`) of 1530 landmark
# values each (1404 face + 63 left hand + 63 right hand, as produced by
# get_landmark_values). The final layer has 4 units, one per entry in `gestures`.
model = Sequential([
    LSTM(32, return_sequences=True, activation='relu', input_shape=(30, 1530)),
    LSTM(64, return_sequences=True, activation='relu'),
    # Last recurrent layer emits only its final state for the dense head.
    LSTM(128, return_sequences=False, activation='relu'),
    Dense(32, activation='relu'),
    Dense(4, activation='softmax'),
])

# One-hot targets + softmax output -> categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 30, 32)            200064    
                                                                 
 lstm_1 (LSTM)               (None, 30, 64)            24832     
                                                                 
 lstm_2 (LSTM)               (None, 128)               98816     
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dense_1 (Dense)             (None, 4)                 132       
                                                                 
Total params: 327,972
Trainable params: 327,972
Non-trainable params: 0
_________________________________________________________________


## Train Model

In [26]:
# Fit on the training split for a fixed number of epochs.
epochs = 100
history = model.fit(X_train, y_train, epochs=epochs)

# Score the held-out split; evaluate returns the loss followed by the compiled metric.
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [27]:
# Persist the trained model under the name 'alfred'.
model.save(filepath='alfred')

INFO:tensorflow:Assets written to: alfred/assets


## Load Model - ALFRED

In [8]:
# Restore the weights saved under 'alfred' into the already-built `model`.
# NOTE(review): the recorded output shows a tensor_slice_reader warning that
# 'alfred' "Is a directory", yet the call still returns a CheckpointLoadStatus --
# confirm the weights are actually restored.
model.load_weights(filepath='alfred')

2023-07-10 13:06:48.251513: W tensorflow/core/util/tensor_slice_reader.cc:97] Could not open alfred: FAILED_PRECONDITION: alfred; Is a directory: perhaps your file is in a different file format and you need to use a different restore operator?


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x28592e860>

## Realtime Gesture Detection and Task Automation

In [31]:
# Realtime loop: read webcam frames, run MediaPipe Holistic on each one, keep a
# sliding window of the most recent landmark vectors, classify the window with
# `model`, and fire the matching desktop shortcut whenever the label changes.
# Press "q" in the preview window to stop.
gesture_data = []        # sliding window of per-frame landmark vectors
confidence = 0.65        # minimum winning score for a prediction to be accepted
previous_gesture = None  # label from the previous prediction, so actions fire once per change

# Shortcuts triggered by a newly recognised gesture.
# NOTE(review): "volume_down" and "volume_up" can be predicted but are not
# dispatched here, matching this cell's previous behaviour -- confirm that
# leaving volume_down_func / volume_up_func unwired is intended.
gesture_actions = {
    "minimize": minimize_func,
    "switch": switch_func,
}

window_size = frames       # time steps per sample; the model's input_shape uses 30, equal to `frames`
hand_value_count = 63 * 2  # trailing left- plus right-hand values in each landmark vector

cap = cv2.VideoCapture(0)
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                print("Ignoring empty camera frame.")
                # If loading a video, use 'break' instead of 'continue'.
                continue

            # To improve performance, optionally mark the image as not writeable to
            # pass by reference.
            image.flags.writeable = False
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            results = holistic.process(image)

            # Draw landmark annotation on the image.
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            mp_drawing.draw_landmarks(
                image,
                results.face_landmarks,
                mp_holistic.FACEMESH_CONTOURS,
                landmark_drawing_spec=None,
                connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_contours_style(),
            )
            mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
            mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

            # Append the newest landmark vector and drop anything older than the window.
            landmarks = get_landmark_values(results)
            gesture_data.append(landmarks)
            gesture_data = gesture_data[-window_size:]

            if len(gesture_data) >= window_size:
                # verbose=0 keeps Keras from printing a progress bar on every frame.
                prediction_arr = model.predict(np.expand_dims(gesture_data, axis=0), verbose=0)
                gesture = gestures[np.argmax(prediction_arr)]

                # Reject the prediction when the OLDEST frame of the window has no
                # hand landmarks at all (both hand segments are zero-filled), or
                # when the winning score is below the confidence threshold.
                hands_missing = not np.any(gesture_data[0][-hand_value_count:])
                if hands_missing or np.max(prediction_arr) < confidence:
                    gesture = "none"

                # Trigger the shortcut only on a change of label, so a gesture held
                # across consecutive frames fires once.
                if gesture != previous_gesture:
                    action = gesture_actions.get(gesture)
                    if action is not None:
                        action()

                previous_gesture = gesture

                cv2.putText(image, '{}'.format(gesture), (800, 450),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)

            cv2.imshow('MediaPipe Holistic', image)

            # Exit out of webcam by pressing q. Masking with 0xFF keeps only the
            # low byte of the key code, which some platforms pad with modifier bits.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the preview window, even if the loop raises.
    cap.release()
    cv2.destroyAllWindows()

