In [1]:
import cv2
import numpy as np
import os
import sys
import tensorflow as tf
from datetime import datetime
from sklearn.model_selection import train_test_split
from tensorflow.python.ops.gen_math_ops import mod
from PIL import Image, ImageFilter
import mediapipe

In [2]:
EPOCHS = 10
IMG_WIDTH = 256
IMG_HEIGHT = 256
NUM_CATEGORIES = 6
TEST_SIZE = 0.5
GESTURE = {0:"ok", 1:"down", 2:"up", 3:"palm", 4:"fist", 5:"l"}

In [3]:
def load_data(data_dir):
    """
    Load image data from directory `data_dir`.
    Assume `data_dir` has one directory named after each category, numbered
    0 through NUM_CATEGORIES - 1. Inside each category directory will be some
    number of image files.
    Return tuple `(images, labels)`. `images` should be a list of all
    of the images in the data directory, where each image is formatted as a
    numpy ndarray with dimensions IMG_WIDTH x IMG_HEIGHT x 3. `labels` should
    be a list of integer labels, representing the categories for each of the
    corresponding `images`.
    """
    images = []
    labels = []
    
    for dir in range(0, NUM_CATEGORIES):
        # get path for each gesture like "/home/arpine/Desktop/data/0":  
        d = os.path.join(data_dir, f"{str(dir)}")
        # os.listdir(d) return the list of all names of images in that folder
        for image_path in os.listdir(d):
            # get the full path of specific image 
            full_path = os.path.join(data_dir, f"{str(dir)}", image_path)
            # Returns an image that is loaded from the specified file
            image = cv2.imread(full_path, )
            # image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            # cv2.imshow("im", image)
            # get dimension for each image
            dim = (IMG_WIDTH, IMG_HEIGHT)
            # resized the image
            image_resized = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
            
            # add image and their directory name to images and labels list
            images.append(image_resized)
            labels.append(dir)
    
    return images, labels

In [4]:
start_time = datetime.now()  
print("Loading ===========") 
images, labels = load_data("/home/arpine/Desktop/test1")   
# images, labels = load_data("/home/arpine/Desktop/Gesture/test3")  
# images, labels = load_data("/home/arpine/Desktop/Gesture/poqr")  
# images, labels = load_data("/home/arpine/Desktop/Gesture/test (copy)")
finish_loading_time = datetime.now()
print("Images load time: ", finish_loading_time - start_time)

Images load time:  0:00:01.884775


In [5]:
def get_model():
    """
    Returns a compiled convolutional neural network model. Assume that the
    `input_shape` of the first layer is `(IMG_WIDTH, IMG_HEIGHT, 3)`.
    The output layer should have `NUM_CATEGORIES` units, one for each category.
    """
    # Create a convolutional neural network
    model = tf.keras.models.Sequential(
        [
        # Convolutional layer. Learn 32 filters using a 3x3 kernel
        tf.keras.layers.Conv2D(
            32, (5, 5), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)
        ),
        # Max-pooling layer, using 2x2 pool size
        tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(
            64, (3, 3), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)
        ),
        # Max-pooling layer, using 2x2 pool size
        tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(
            128, (3, 3), activation='relu', input_shape=((IMG_WIDTH), (IMG_HEIGHT), 3)
        ),
        tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(
            256, (3, 3), activation='relu', input_shape=((IMG_WIDTH), (IMG_HEIGHT), 3)
        ),
        tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
        
        tf.keras.layers.Flatten(),
        # Add a hidden layer with dropout
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        # Add an output layer with output units for all 6 gestures
        tf.keras.layers.Dense(NUM_CATEGORIES, activation='softmax')
    ])

    # Train neural network
    model.compile(
        optimizer='adam',
        loss="categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model

In [6]:
labels = tf.keras.utils.to_categorical(labels)
x_train, x_test, y_train, y_test = train_test_split(
    np.array(images), np.array(labels), test_size=TEST_SIZE)

# Get a compiled neural network
model = get_model()

# Fit model on training data
model.fit(x_train, y_train, batch_size=64, epochs=EPOCHS)

# Evaluate neural network performance
model.evaluate(x_test, y_test, verbose=2)
fitting_time = datetime.now()

print("NN fit time: ", fitting_time - finish_loading_time)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
3/3 - 1s - loss: 1.4422 - accuracy: 0.3735
NN fit time:  0:00:50.479795


In [8]:
video = cv2.VideoCapture(0)
    
while True:
        # Capture the video frame
        ret, img = video.read()

         # Display the resulting frame
        # to flip the video with 180 degree 
        image = cv2.flip(img, 1)
        #cv2.imshow('frame', image)
        
        # save image for prediction
        image = cv2.imwrite('Frame'+str(0)+'.jpg', image)
        image_addr = "Frame0.jpg"
        
        handsModule = mediapipe.solutions.hands
        with handsModule.Hands(static_image_mode=True) as hands:

            image = cv2.imread(image_addr)
            results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            image_height, image_width, _ = image.shape

            if results.multi_hand_landmarks:
                for hand_landmark in results.multi_hand_landmarks:
                    x = [landmark.x for landmark in hand_landmark.landmark]
                    y = [landmark.y for landmark in hand_landmark.landmark]
                
                    center = np.array([np.mean(x)*image_width, np.mean(y)*image_height]).astype('int32')
                    cv2.imshow('video', image)
                    cv2.circle(image, tuple(center), 10, (255,0,0), 1) #for checking the center
                    cv2.rectangle(image, (center[0]-128,center[1]-128), (center[0]+128,center[1]+128), (255,0,0), 1)
                    hand = image[center[1]-128:center[1]+128, center[0]-128:center[0]+128]
                #     cv2.imshow('video', hand)
                    if hand.shape==(256, 256, 3):
                        cv2.imwrite(image_addr, hand)
       
        # rest, thresh = cv2.threshold(image, 70, 255, cv2.THRESH_BINARY)
        # _, contours = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
        # image = cv2.imwrite(address, thresh)
        # image = cv2.imwrite('Frame'+str(0)+'.png', image)

        # image = "Frame0.png"
        #image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        dim = (IMG_WIDTH, IMG_HEIGHT)
        
        image = tf.keras.preprocessing.image.load_img(image_addr, target_size=dim)
        # Converts a PIL Image instance to a Numpy array. Return a 3D Numpy array.
        input_arr = tf.keras.preprocessing.image.img_to_array(image)
        # Convert single image to a batch.
        input_arr = np.array([input_arr])
        input_arr = input_arr.astype('float32')/255
        # Generates output predictions for the input samples. Return Numpy array(s) of predictions.
        predictions = model.predict(input_arr)
        print(predictions)
        
        # Return the index_array of the maximum values along an axis.
        pre_class = np.argmax(predictions, axis=-1)
        print(pre_class)
        #print(GESTURE[pre_class[0]])
        text = GESTURE[pre_class[0]]
        font = cv2.FONT_HERSHEY_SIMPLEX
        image = cv2.flip(img, 1)
        if predictions[0][pre_class[0]] >0.166: 
                cv2.putText(image, 
                        text, 
                        (50, 50), 
                        font, 2, 
                        (0, 0, 0), 
                        2, 
                        cv2.LINE_4)
        cv2.imshow('video', image)
    

        # the 'q' button is set as the
        # quitting button you may use any
        # desired button of your choice
        k = cv2.waitKey(1)
        if k == ord('q'):
                break

video.release()       
cv2.destroyAllWindows()



[[0.16639152 0.1660171  0.16711429 0.16586922 0.16718695 0.167421  ]]
[5]
[[0.16639996 0.16601337 0.16710094 0.16589278 0.16717643 0.16741647]]
[5]
[[0.1663832  0.16603966 0.16707848 0.16592325 0.1671748  0.16740058]]
[5]
[[0.1664158  0.16597441 0.16709417 0.16591911 0.16717906 0.16741747]]
[5]
[[0.16639572 0.16602601 0.16709104 0.16589946 0.16717578 0.16741204]]
[5]
[[0.166389   0.16601317 0.16709958 0.16590948 0.16717935 0.1674094 ]]
[5]
[[0.16639267 0.16599797 0.16709635 0.1659047  0.16718452 0.16742378]]
[5]
[[0.16640197 0.1659701  0.1670793  0.16591871 0.16718458 0.16744536]]
[5]
[[0.16640052 0.16597994 0.16708325 0.16589707 0.16719028 0.16744894]]
[5]
[[0.16639692 0.16597953 0.16708218 0.1659045  0.16719484 0.16744198]]
[5]
[[0.16638947 0.16598804 0.16709727 0.16590807 0.16719377 0.16742343]]
[5]
[[0.16639683 0.16598618 0.16709407 0.1659117  0.16719094 0.16742031]]
[5]
[[0.16638865 0.16600242 0.16709629 0.16592044 0.16718501 0.16740716]]
[5]
[[0.16639756 0.16599144 0.16708666 0.1

error: OpenCV(4.5.2) /tmp/pip-req-build-eirhwqtr/opencv/modules/highgui/src/window.cpp:404: error: (-215:Assertion failed) size.width>0 && size.height>0 in function 'imshow'


In [None]:
# import cv2
# import numpy as np
# import os
# import sys
# import tensorflow as tf
# from datetime import datetime
# from sklearn.model_selection import train_test_split
# from tensorflow.python.ops.gen_math_ops import mod

# from PIL import Image, ImageFilter

# EPOCHS = 10
# IMG_WIDTH = 640
# IMG_HEIGHT = 480
# NUM_CATEGORIES = 6
# TEST_SIZE = 0.5
# GESTURE = {0:"ok", 1:"down", 2:"up", 3:"palm", 4:"fist", 5:"l"}

# # Open image
# def load_data(data_dir):
#     """
#     Load image data from directory `data_dir`.
#     Assume `data_dir` has one directory named after each category, numbered
#     0 through NUM_CATEGORIES - 1. Inside each category directory will be some
#     number of image files.
#     Return tuple `(images, labels)`. `images` should be a list of all
#     of the images in the data directory, where each image is formatted as a
#     numpy ndarray with dimensions IMG_WIDTH x IMG_HEIGHT x 3. `labels` should
#     be a list of integer labels, representing the categories for each of the
#     corresponding `images`.
#     """
#     images = []
#     labels = []
    
#     for dir in range(0, NUM_CATEGORIES):
#         # get path for each gesture like "/home/arpine/Desktop/data/0":  
#         d = os.path.join(data_dir, f"{str(dir)}")
#         # os.listdir(d) return the list of all names of images in that folder
#         for image_path in os.listdir(d):
#             # get the full path of specific image 
#             full_path = os.path.join(data_dir, f"{str(dir)}", image_path)
#             image = Image.open(full_path).convert("RGB")
#             image = image.filter(ImageFilter.Kernel(
#                 size=(3, 3),
#                 kernel=[-1, -1, -1, -1, 8, -1, -1, -1, -1],
#                 scale=1
#             ))
#             image.save(full_path)
#             # Returns an image that is loaded from the specified file
#             image = cv2.imread(full_path, )
#             # image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
#             # cv2.imshow("im", image)
#             # get dimension for each image
#             dim = (IMG_WIDTH, IMG_HEIGHT)
#             # # resized the image
#             image_resized = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
            
#              # add image and their directory name to images and labels list
#             images.append(image_resized)
#             labels.append(dir)
    
#     return images, labels
#     # print(images)


# # Filter image according to edge detection kernel
# # image = image.filter(ImageFilter.Kernel(
# #     size=(3, 3),
# #     kernel=[-1, -1, -1, -1, 8, -1, -1, -1, -1],
# #     scale=1
# # ))

# # # Show resulting image
# # image.show()


# # cv2.waitKey(0)
# # cv2.destroyAllWindows()
# load_data("/home/arpine/Desktop/Gesture/test")

In [None]:
# import cv2
# NUM_CATEGORIES = 6

# GESTURE = {0:"ok", 1:"down", 2:"up", 3:"palm", 4:"fist", 5:"l"}

# def load_data(data_dir):
#     for dir in range(0, NUM_CATEGORIES):
#     # adr = "/home/arpine/Desktop/Gesture/poqr/ok/bcbf8425-d850-11eb-9ec6-0ba2456509ee.png"
#         d = os.path.join(data_dir, f"{str(dir)}")
#         # os.listdir(d) return the list of all names of images in that folder
#         for image_path in os.listdir(d):
#         # get the full path of specific image 
#             full_path = os.path.join(data_dir, f"{str(dir)}", image_path)
#             image = cv2.imread(full_path, 0)
#             rest, thresh = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY)
#             _, contours = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
#             cv2.imwrite(full_path, thresh)

In [None]:
# full_path = "/home/arpine/Desktop/Gesture/image/carmen/ok/000000014.jpg"
# image = cv2.imread(full_path, 0)
# rest, thresh = cv2.threshold(image, 80, 80, cv2.THRESH_BINARY)
# _, contours = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
# cv2.imwrite("frame.png", thresh)