# Hand Gesture Classification
### by Adrian Abraham

## 1 - Package installation
- **tensorflow**: for model creation and predictions
- **pandas**: for csv_reading
- **numpy**: for data storage and manipulation
- **mediapipe**: contains pre-made hand detection module for data collection
- **open-cv**: for live feed
- **ast**: to convert string literal lists into actual lists

In [1]:
!pip install tensorflow



In [2]:
!pip install pandas



In [3]:
!pip install mediapipe



In [4]:
!pip install opencv-python



## 2 - Importing packages

In [40]:
import tensorflow as tf
import numpy as np
import pandas as pd
import ast
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Input
from tensorflow.keras.activations import linear, relu, sigmoid
from tensorflow.keras.losses import SparseCategoricalCrossentropy

### Quick check that tensorflow is working

In [41]:
print(tf.__version__)

2.16.2


## 3 - Importing data

In [42]:
# Positive and negative data are stored in csv format.
# We can use read_csv() to get the contents of each file
peace_set = pd.read_csv('peace.csv')
heart_set = pd.read_csv('heart.csv')
shaka_set = pd.read_csv('shaka.csv')
none_set  = pd.read_csv('none.csv' )

In [43]:
peace_set.info(), heart_set.info(), shaka_set.info(), none_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1984 entries, 0 to 1983
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   coords  1984 non-null   object
dtypes: object(1)
memory usage: 15.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752 entries, 0 to 1751
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   coords  1752 non-null   object
dtypes: object(1)
memory usage: 13.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1513 entries, 0 to 1512
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   coords  1513 non-null   object
dtypes: object(1)
memory usage: 11.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1997 entries, 0 to 1996
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   coords  1997 non-null   object
dtypes: object(1)
mem

(None, None, None, None)

In [44]:
# The format of our csv has a list of data as a string, 
# to turn them into lists so we use the ast library
peace_set['coords'] = peace_set['coords'].apply(ast.literal_eval)
peace_set = np.array(peace_set['coords'].tolist())

heart_set['coords'] = heart_set['coords'].apply(ast.literal_eval)
heart_set = np.array(heart_set['coords'].tolist())

shaka_set['coords'] = shaka_set['coords'].apply(ast.literal_eval)
shaka_set = np.array(shaka_set['coords'].tolist())

none_set['coords'] = none_set['coords'].apply(ast.literal_eval)
none_set = np.array(none_set['coords'].tolist())

In [45]:
# We can concatenate them together to have all our data in one np array
data = np.concatenate((peace_set,heart_set,shaka_set,none_set))
data.shape

(7246, 21, 2)

In [46]:
gestures = ["peace","heart","shaka","none"]

In [54]:
target1 = np.zeros(peace_set.shape[0])    # Class 0 for data1
target2 = np.ones(heart_set.shape[0])     # Class 1 for data2
target3 = np.full(shaka_set.shape[0], 2)  # Class 2 for data3
target4 = np.full(none_set.shape[0], 3)

targets = np.concatenate((target1, target2, target3, target4))
targets

array([0., 0., 0., ..., 3., 3., 3.])

In [70]:
model = Sequential([
    Input(shape=data.shape[1:]),              # Define the input shape here
    Flatten(),                                # Flatten the (21, 2) input into a 1D vector
    Dense(32, activation='relu',  name='L1'), # Fully connected layer with 25 neurons
    Dense(16, activation='relu',  name='L2'), # Fully connected layer with 16 neurons
    Dense(4, activation='linear', name='L3')  # Output layer for binary classification
])
model.summary()

In [71]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=SparseCategoricalCrossentropy(from_logits=True),  # Use appropriate loss function for classification
              metrics=['accuracy'])

In [72]:
history = model.fit(
    data,targets,
    epochs=40
)

Epoch 1/40
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299us/step - accuracy: 0.5706 - loss: 1.1582
Epoch 2/40
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 294us/step - accuracy: 0.9127 - loss: 0.3738
Epoch 3/40
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295us/step - accuracy: 0.9472 - loss: 0.1796
Epoch 4/40
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293us/step - accuracy: 0.9641 - loss: 0.1234
Epoch 5/40
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295us/step - accuracy: 0.9683 - loss: 0.1003
Epoch 6/40
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293us/step - accuracy: 0.9747 - loss: 0.0816
Epoch 7/40
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293us/step - accuracy: 0.9813 - loss: 0.0648
Epoch 8/40
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293us/step - accuracy: 0.9815 - loss: 0.0587
Epoch 9/40
[1m227/227[

In [73]:
import mediapipe as mp
import cv2

In [74]:
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False,
                       max_num_hands=1,
                       min_detection_confidence=0.5,
                       min_tracking_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

I0000 00:00:1720378637.860701 15429725 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1


In [None]:
cap = cv2.VideoCapture(0)
BOX_MARGIN = 50
while cap.isOpened():
    # capturing the current frame
    ret, frame = cap.read()
    # getting window dimensions, shape contains height, width, and channels
    height, width, _ = frame.shape

    # if no frame returned, break
    if not ret:
        break
        
    frame = cv2.flip(frame, 1)
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)
    if results.multi_hand_landmarks:
        # for each hand found we calculate the coords of the landmarks
        for hand_landmarks in results.multi_hand_landmarks:
            hand_coords = np.array([ (int(landmark.x * width), int(landmark.y * height)) for landmark in hand_landmarks.landmark ])
        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
        
        hand_x_min, hand_x_max = min(x for x,y in hand_coords), max(x for x,y in hand_coords)
        hand_y_min, hand_y_max = min(y for x,y in hand_coords), max(y for x,y in hand_coords)
        
        rect_start, rect_end = (hand_x_min - BOX_MARGIN, hand_y_min - BOX_MARGIN), (hand_x_max + BOX_MARGIN, hand_y_max + BOX_MARGIN)
        rel_coords = np.array([ (x - rect_start[0], y - rect_start[1]) for x,y in hand_coords ])
        
        rel_x_max = max(x for x,y in rel_coords)
        rel_y_max = max(y for x,y in rel_coords)

        normalized_rel_coords = np.array([ (x / rel_x_max, y / rel_y_max) for x,y in rel_coords ])
        normalized_rel_coords = np.expand_dims(normalized_rel_coords, axis=0)

        predictions = model.predict(normalized_rel_coords)[0]
        prediction_p = tf.nn.softmax(predictions).numpy()
        max_prediction = np.argmax(prediction_p)
        print(max_prediction)

        if max_prediction != 3:
            text = f"{gestures[max_prediction]}, {(prediction_p[max_prediction] * 100):.2f}%"
        else:
            text = "none"

        cv2.putText(frame, text, (50,50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
        
    cv2.imshow('Hand Landmarks', frame)

    # when q pressed, we end video capture
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
        


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [78]:
type(prediction_p)

tensorflow.python.framework.ops.EagerTensor