In [10]:
import os
import sys
import cv2
import numpy as np
import tensorflow as tf
from datetime import datetime
from sklearn.model_selection import train_test_split

In [11]:
EPOCHS = 10
IMG_WIDTH = 100
IMG_HEIGHT = 77
NUM_GESTURE = 5
TEST_SIZE = 0.4
GESTURE = {0:"ok", 1:"down", 2:"up", 3:"palm", 4:"l"}

In [12]:
def load_data(data_dir):
    """
    Load image data from directory `data_dir`.
    Assume `data_dir` has one directory named after each category, numbered
    0 through NUM_CATEGORIES - 1. Inside each category directory will be some
    number of image files.
    Return tuple `(images, labels)`. `images` should be a list of all
    of the images in the data directory, where each image is formatted as a
    numpy ndarray with dimensions IMG_WIDTH x IMG_HEIGHT x 3. `labels` should
    be a list of integer labels, representing the categories for each of the
    corresponding `images`.
    """
    images = []
    labels = []
    
    for dir in range(0, NUM_CATEGORIES):
        # get path for each gesture like "/home/arpine/Desktop/data/0":  
        d = os.path.join(data_dir, f"{str(dir)}")
        # os.listdir(d) return the list of all names of images in that folder
        for image_path in os.listdir(d):
            # get the full path of specific image 
            full_path = os.path.join(data_dir, f"{str(dir)}", image_path)
            # Returns an image that is loaded from the specified file
            image = cv2.imread(full_path, )
            # image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            # cv2.imshow("im", image)
            # get dimension for each image
            dim = (IMG_WIDTH, IMG_HEIGHT)
            # resized the image
            image_resized = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
            
            # add image and their directory name to images and labels list
            images.append(image_resized)
            labels.append(dir)
    
    return images, labels

In [6]:
start_time = datetime.now()  
print("Loading ===========")

# load data 
images, labels = load_data("/home/arpine/Desktop/Gesture/DATA")  

finish_loading_time = datetime.now()
print("Images load time: ", finish_loading_time - start_time)

Images load time:  0:00:00.575476


In [7]:
def get_model():
    """
    Returns a compiled convolutional neural network model. Assume that the
    `input_shape` of the first layer is `(IMG_WIDTH, IMG_HEIGHT, 3)`.
    The output layer should have `NUM_CATEGORIES` units, one for each category.
    """
    # Create a convolutional neural network
    model = tf.keras.models.Sequential(
        [
        # Convolutional layer. Learn 32 filters using a 3x3 kernel
        tf.keras.layers.Conv2D(
            32, (5, 5), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)
        ),
        # Max-pooling layer, using 2x2 pool size
        tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(
            64, (3, 3), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)
        ),
        # Max-pooling layer, using 2x2 pool size
        tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(
            64, (3, 3), activation='relu', input_shape=((IMG_WIDTH), (IMG_HEIGHT), 3)
        ),
        tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(
            128, (3, 3), activation='relu', input_shape=((IMG_WIDTH), (IMG_HEIGHT), 3)
        ),
        tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
        
        tf.keras.layers.Flatten(),
        # Add a hidden layer with dropout
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        # Add an output layer with output units for all 6 gestures
        tf.keras.layers.Dense(NUM_GESTURE, activation='softmax')
    ])

    # Train neural network
    model.compile(
        optimizer='adam',
        loss="categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model

In [8]:
labels = tf.keras.utils.to_categorical(labels)
# get data for train and test
x_train, x_test, y_train, y_test = train_test_split(
    np.array(images), np.array(labels), test_size=TEST_SIZE)

# Get a compiled neural network
model = get_model()

# Fit model on training data
model.fit(x_train, y_train, batch_size=64, epochs=EPOCHS)

# Evaluate neural network performance
model.evaluate(x_test, y_test, verbose=2)
fitting_time = datetime.now()

print("NN fit time: ", fitting_time - finish_loading_time)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
407/407 - 13s - loss: 0.0532 - accuracy: 0.9789
NN fit time:  0:15:14.006998


In [9]:
video = cv2.VideoCapture(0)
    
while True:
        # Capture the video frame
        ret, img = video.read()

        # Display the resulting frame
        # to flip the video with 180 degree 
        image = cv2.flip(img, 1)
        
        # save image for prediction
        image = cv2.imwrite('Frame'+str(0)+'.jpg', image)
        image_addr = "Frame0.jpg"
        image = cv2.imread(image_addr)
        
        dim = (IMG_WIDTH, IMG_HEIGHT)
        
        image = tf.keras.preprocessing.image.load_img(image_addr, target_size=dim)
        # Converts a PIL Image instance to a Numpy array. Return a 3D Numpy array.
        input_arr = tf.keras.preprocessing.image.img_to_array(image)
        # Convert single image to a batch.
        input_arr = np.array([input_arr])
        input_arr = input_arr.astype('float32')/255
        # Generates output predictions for the input samples. Return Numpy array(s) of predictions.
        predictions = model.predict(input_arr)
        print(predictions)
        # Return the index_array of the maximum values along an axis.
        pre_class = np.argmax(predictions, axis=-1)
        print(pre_class)
        # for writing in the video
        text = GESTURE[pre_class[0]]
        font = cv2.FONT_HERSHEY_SIMPLEX
        image = cv2.flip(img, 1)

        cv2.putText(image, 
                text, 
                (50, 50), 
                font, 2, 
                (0, 0, 0), 
                2, 
                cv2.LINE_4)
                
        cv2.imshow('video', image)
    

        # the 'q' button is set as the
        # quitting button you may use any
        # desired button of your choice

        k = cv2.waitKey(1)
        if k == ord('q'):
                break

video.release()       
cv2.destroyAllWindows()



 0.20668222 0.19988896]]
[3]
[[0.20579761 0.19310328 0.19416276 0.20677978 0.2001566 ]]
[3]
[[0.20570152 0.19278237 0.1943227  0.20719007 0.20000331]]
[3]
[[0.2072787  0.19199479 0.19411968 0.20687099 0.19973586]]
[0]
[[0.20640649 0.19236055 0.19382781 0.20594293 0.20146221]]
[0]
[[0.20618536 0.19283386 0.19453779 0.20629993 0.20014307]]
[3]
[[0.20537625 0.19358335 0.19505766 0.20630325 0.19967946]]
[3]
[[0.20627038 0.19290476 0.19519371 0.20596889 0.19966227]]
[0]
[[0.20679805 0.19231667 0.19444203 0.20599945 0.20044379]]
[0]
[[0.20678876 0.19221723 0.19370833 0.20754096 0.19974472]]
[3]
[[0.20667312 0.1926319  0.19413154 0.20639278 0.20017071]]
[0]
[[0.20722975 0.1924256  0.19383216 0.2062984  0.20021407]]
[0]
[[0.20785867 0.19155315 0.19381699 0.20589861 0.20087259]]
[0]
[[0.20855583 0.19045483 0.19393595 0.20552531 0.20152806]]
[0]
[[0.20825888 0.19108967 0.1940605  0.20594682 0.20064412]]
[0]
[[0.20718631 0.19224738 0.19392735 0.20617    0.20046903]]
[0]
[[0.20764638 0.1916821  0.