# Imports

In [1]:
import numpy as np
import os
import cv2

# Setup Config

In [13]:
# Root folder that holds everything this notebook produces
DATASET_DIR = "dataset"

# Folder that receives all captured images (one sub-folder per class label);
# it is created here the first time the notebook runs
IMAGE_DIR = os.path.join(DATASET_DIR, 'all_images')
if not os.path.exists(IMAGE_DIR):
    os.makedirs(IMAGE_DIR)
    print(f"Created folder of {IMAGE_DIR}")

# Class labels of the dataset: the 24 letters A-Y without "J"
# ("J" and "Z" are not part of the dataset)
CLASS_NAMES = list("ABCDEFGHIKLMNOPQRSTUVWXY")

# Number of images to collect for each class.
# Aim for at least 100 per class; more is better.
IMAGES_PER_CLASS = 500

# When True, capturing of a class stops automatically once IMAGES_PER_CLASS
# images were taken; when False, you end every recording manually.
# NOTE: keep the number of images per class balanced (roughly equal), since an
#  imbalanced dataset is a common problem in classification tasks.
LIMIT = False

# Size (in pixels) the saved images are resized to
IMAGE_SIZE = (320, 320)

# Approximate number of frames captured per second;
# raise or lower it to collect images faster or slower
FPS = 10

In [3]:
## Settings for the text overlays that OpenCV draws on the webcam feed

# Position (x, y) in pixels of the instruction / status text on the feed
TEXT_POS = (15, 20)
# Position (x, y) in pixels of the per-class image counter on the feed
COUNTER_POS = (15, 60)
# Font scale passed to cv2.putText for every overlay text
FONT_SCALE = 0.6
# Color of the overlay text; frames are handled as BGR in this notebook,
# so (0, 255, 0) is green
TEXT_COLOR = (0, 255, 0)
# Line thickness passed to cv2.putText for every overlay text
TEXT_THICKNESS = 1

- The next cell defines the Region of interest (ROI) to capture during the recording.
- We define this region to crop out specific region from the feed to save as image
- The definition of the 4 numbers: 
    - `startX` = x-coordinate of top left
    - `endX` = x-coordinate of bottom right
    - `startY` = y-coordinate of top left
    - `endY` = y-coordinate of bottom right
- Make sure to set the dimensions to be square to make things easier
- E.g. (320, 620, 100, 400) would result in 300 x 300 dimensions

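NOTE: This ROI must fit inside your webcam's frame, so make sure it does not exceed
the width and height of the webcam's resolution. OpenCV commonly opens a webcam at a
default resolution of (640, 480), but some cameras use other sizes such as (640, 360);
if unsure, check `frame.shape` of a captured frame before choosing the ROI.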

In [4]:
# ROI given as (startX, endX, startY, endY) in pixel coordinates of the frame
CAPTURE_ROI = (320, 620, 100, 400)
# split it into the horizontal bounds and the vertical bounds
(startX, endX), (startY, endY) = CAPTURE_ROI[:2], CAPTURE_ROI[2:]

You may check the ROI box position by running the cell below. This ROI region will be cropped out to be saved as our image later.

In [5]:
## Verifying webcam is working, and the position of the ROI is ok
# Press 'ESC' key to exit

# open the camera
cap = cv2.VideoCapture(0)
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # no frame was delivered (e.g. camera missing, busy or unplugged);
            # stop here instead of handing an empty frame to OpenCV
            print("[WARN] Could not read a frame from the webcam, stopping preview")
            break
        # flip it horizontally to see clearly
        frame = cv2.flip(frame, 1)
        # draw the ROI box
        cv2.rectangle(frame, (startX, startY), (endX, endY), (180, 0, 0), 3)
        cv2.putText(frame, "Press 'ESC' key to exit",
                    TEXT_POS, cv2.FONT_HERSHEY_COMPLEX,
                    FONT_SCALE, TEXT_COLOR, TEXT_THICKNESS
        )
        # label the box, slightly inside its top-left corner
        cv2.putText(frame, "ROI", (startX + 5, startY + 22),
                    cv2.FONT_HERSHEY_COMPLEX,
                    FONT_SCALE, TEXT_COLOR, TEXT_THICKNESS
        )
        cv2.imshow("Webcam", frame)
        key = cv2.waitKey(1)
        if key == 27:
            # press 'ESC' to exit
            break
except KeyboardInterrupt:
    # exit properly if the user interrupts the kernel
    pass
finally:
    # always release the camera and close the window, even after an error,
    # so that the next webcam cell can open the camera again
    cap.release()
    cv2.destroyAllWindows()

# Setup Folders

NOTE: You don't have to setup your own folders if you are collecting images using the webcam method in this notebook. If you are planning to collect images with other methods, then you should follow the approach explained in this section here.

- In general, the simplest approach is to structure your dataset into one folder per class.
- For example, for a dogs VS cats dataset, the file structure would be as follow:

```
dataset
├───dogs
└───cats
```
- Therefore, for our dataset, there will be one folder per letter of the alphabet, except for "J" and "Z". These two letters are not used in the dataset because they require specific gestures (movements of the hand) to be recognized, hence there are 24 folders in total (26 letters minus 2).

# Capture Image from Webcam

`label_arr` is created so that we can look up, for any stage index, which class label's turn it is to capture images.

Basically, every class will have 2 different stages (or turns) for 2 different actions:
- a stage for standby (wait to start capture)
- a stage for capturing images

Therefore this `label_arr` is an array to keep track of the stages (point of time) of which class label should be used. Once you try checking and running the code blocks below then you will understand.

In [6]:
# initialize the stage index read by the capture helpers
# (the capture cell resets it to 0 before it starts)
i = 0
# set the total number of stages for each class
n_stages = 2
# One entry per stage: every class label repeated n_stages times in a row.
# The array is built from the names themselves so that numpy sizes the string
# dtype to the longest class name; a fixed dtype=str buffer holds only one
# character per entry and would truncate longer names.
label_arr = np.repeat(np.asarray(CLASS_NAMES, dtype=str), n_stages)
total_steps = len(label_arr)

print(label_arr)

['A' 'A' 'B' 'B' 'C' 'C' 'D' 'D' 'E' 'E' 'F' 'F' 'G' 'G' 'H' 'H' 'I' 'I'
 'K' 'K' 'L' 'L' 'M' 'M' 'N' 'N' 'O' 'O' 'P' 'P' 'Q' 'Q' 'R' 'R' 'S' 'S'
 'T' 'T' 'U' 'U' 'V' 'V' 'W' 'W' 'X' 'X' 'Y' 'Y']


In [7]:
def show_standby():
    """
    Draw the standby prompt for the class of the current stage.

    Uses the notebook globals `label_arr` and `i` to look up the label and
    puts the prompt text on the global frame `copy`.
    """
    prompt = f"({label_arr[i]}) Hit 'Enter' to record when ready"
    cv2.putText(copy, prompt, TEXT_POS, cv2.FONT_HERSHEY_COMPLEX,
                FONT_SCALE, TEXT_COLOR, TEXT_THICKNESS)
    
def run_capture():
    """
    Capture one image for the class of the current stage.

    Increments the global `image_count`, puts the class name and the running
    counter on the global frame `copy`, and writes the global `roi` to
    `<IMAGE_DIR>/<label>/<label>_<image_count>.jpg`, creating the class
    folder first if it does not exist.
    """
    global image_count

    # count this image before anything else uses the counter
    image_count += 1
    # class label that belongs to the current stage index
    label = label_arr[i]

    # status lines shown on the feed: class being captured + image counter
    overlay_lines = (
        (f"Capturing class '{label}'", TEXT_POS),
        (f"Images captured: {image_count}", COUNTER_POS),
    )
    for text, position in overlay_lines:
        cv2.putText(copy, text, position, cv2.FONT_HERSHEY_COMPLEX,
                    FONT_SCALE, TEXT_COLOR, TEXT_THICKNESS)

    # every class gets its own folder below IMAGE_DIR
    gesture_dir = os.path.join(IMAGE_DIR, label)
    if not os.path.exists(gesture_dir):
        os.makedirs(gesture_dir)

    # the file name is built from the class name and the image count
    filename = f"{label}_{image_count}.jpg"
    cv2.imwrite(os.path.join(gesture_dir, filename), roi)

In [8]:
## Stage indices of the n_stages alternating stages of every class label

# first stage of each class: show the standby feed and wait for 'ENTER'
standby_points = np.arange(start=0, stop=total_steps, step=n_stages)
# second stage of each class: capture and save images
capture_points = np.arange(start=1, stop=total_steps, step=n_stages)

Once you run the code block below, OpenCV will open up 3 windows, and you should drag them and rearrange them side by side to see clearly what's going on.

The window showing the grayscaled ROI is the frame that we will save as our image, you may choose to not convert them to grayscale if deemed necessary, as RGB color could be an important feature in many cases. In this case, RGB color is not a distinguishing feature for recognizing alphabets from gestures.

Keep pressing the 'ENTER' key when you want to proceed to next stage every time. Or you may press 'ESC' key to exit any time before completing all the class labels, but you will have incomplete dataset saved in your directories. You may want to remove the incomplete dataset if you wish.

In [11]:
## CAPTURE IMAGES

# open the camera
cap = cv2.VideoCapture(0)
# delay between two frames in milliseconds, derived from FPS.
# Kept at 1 ms or more because cv2.waitKey() with a delay of 0 waits for a
# key press indefinitely, which would freeze the feed for very high FPS values
frametime = max(1, int(1000 / FPS))

# initialize the stage index from zero (read by show_standby / run_capture)
i = 0
# initialize image counter (updated by run_capture)
image_count = 0

# keep running while not finish running all the required capturing steps;
# the extra step i == total_steps is the final "Hit 'Enter' to exit" screen
try:
    while i <= total_steps:
        # get the frame
        ret, frame = cap.read()
        if not ret:
            # if couldn't get any frame, stop running and say why, otherwise
            # the cell would just end without any hint
            print("[WARN] Could not read a frame from the webcam, stopping capture")
            break

        # flip the frame horizontally to make it easier to see
        frame = cv2.flip(frame, 1)

        # extract the ROI that we want to save as image
        roi = frame[startY:endY, startX:endX]
        # show the ROI side by side
        cv2.imshow('ROI', roi)

        # create a grayscale ROI that we want to save.
        # you may want to remove this conversion to grayscale if you think color
        #  is a very important feature to distinguish between classes,
        #  which is not the case for alphabet classification
        roi = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)

        # resize the ROI to a specific size we want to save the images with
        roi = cv2.resize(roi, IMAGE_SIZE, interpolation = cv2.INTER_AREA)

        # show the scaled and grayscale ROI side by side
        cv2.imshow('ROI scaled and gray', roi)

        # draw a rectangle box on the frame to show where we should place our
        # object into, i.e. our hand with specific gesture in this case.
        # The box and the texts go onto a copy of the frame
        copy = frame.copy()
        cv2.rectangle(copy, (startX, startY), (endX, endY), (255, 0, 0), 5)

        if i in standby_points:
            # show the standby text on the frame
            show_standby()
        elif i in capture_points:
            # run capturing images
            run_capture()
        elif i == total_steps:
            # reached the end
            cv2.putText(copy, "Hit 'Enter' to exit", TEXT_POS, cv2.FONT_HERSHEY_COMPLEX,
                        FONT_SCALE, TEXT_COLOR, TEXT_THICKNESS)

        # show the frame that has texts and ROI box on it
        cv2.imshow('frame', copy)

        # wait for specific milliseconds for each frame
        # basically controlling the approximate FPS
        key = cv2.waitKey(frametime)

        # press 'ENTER' to continue next run.
        #  Or if LIMIT is set to True, it will automatically proceed to the
        #  next stage when the IMAGES_PER_CLASS is reached.
        #  '>=' instead of '==' so the stage still advances if the counter
        #  ever ends up above the limit
        if key == 13 or (LIMIT and image_count >= IMAGES_PER_CLASS):
            # reset the image count for new class label
            image_count = 0
            # increment the point of time to proceed to next point
            i += 1

        if key == 27:
            # press 'ESC' to exit properly
            break

except KeyboardInterrupt:
    # exit properly if user choose to interrupt the kernel
    pass
finally:
    # release the camera properly and destroy the OpenCV windows
    cap.release()
    cv2.destroyAllWindows()

In [10]:
# Run this cell manually if the camera stays busy or the OpenCV windows do not
# close. It needs `cap` from one of the webcam cells above.
cap.release()
cv2.destroyAllWindows()

# Or collect from Google Search using a quick method

1. Install the [Fatkun Batch Download Image extension](https://chrome.google.com/webstore/detail/fatkun-batch-download-ima/nnjjahlikiabnchcpehcpkdeckfgnohf?hl=en) from Google Chrome.
2. Open up a tab and search for the images you want, e.g. cloth face mask
3. Then use the extension to download all the images that are found in the Google Search tab. Beware that the more you scroll down, the more images will be downloaded.

NOTE: This requires you to perform more cleaning up and filtering to make sure the images that you've collected are of good quality! As the saying goes for machine learning, "Garbage in, Garbage Out". Good data is the core of a machine learning model with good performance.

# Split the dataset into training, validation, and test sets

In [66]:
# fraction of ALL images that goes into the training portion;
# the remaining images form the test set
TRAIN_SPLIT = 0.75
# fraction of that training portion (not of all images) that is
# set aside as the validation set
VAL_SPLIT = 0.1

In [67]:
# the folder in which we will find the images to split into 3 sets
# (displayed here only as a check before splitting)
IMAGE_DIR

'dataset\\all_images'

In [69]:
from imutils import paths
import random
import shutil
import os

# grab the paths to all input images in the original input directory.
# They are sorted before shuffling because the order in which a directory is
# listed depends on the filesystem; with a fixed starting order the seeded
# shuffle yields the same split on every machine
imagePaths = sorted(paths.list_images(IMAGE_DIR))
if not imagePaths:
    print(f"[WARN] no images found in '{IMAGE_DIR}', all splits will be empty")
random.seed(42)
random.shuffle(imagePaths)

# compute the training and testing split
# (dedicated names are used for the split offsets so that the stage index `i`
#  of the capture cells is left untouched)
n_train_total = int(len(imagePaths) * TRAIN_SPLIT)
trainPaths = imagePaths[:n_train_total]
testPaths = imagePaths[n_train_total:]

# we'll be using part of the training data for validation
n_val = int(len(trainPaths) * VAL_SPLIT)
valPaths = trainPaths[:n_val]
trainPaths = trainPaths[n_val:]

# every image must end up in exactly one of the three sets
assert len(trainPaths) + len(valPaths) + len(testPaths) == len(imagePaths)

print("Train images:", len(trainPaths))
print("Validation images:", len(valPaths))
print("Test images:", len(testPaths))

Train images: 8316
Validation images: 924
Test images: 3080


In [71]:
# output folder of every split, each one located directly inside DATASET_DIR
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.join(DATASET_DIR, split_name)
    for split_name in ("training", "validation", "testing")
)
# show the three folders, one per line
print(TRAIN_PATH, VAL_PATH, TEST_PATH, sep="\n")

dataset\training
dataset\validation
dataset\testing


In [72]:
# define the datasets that we'll be building:
# (name of the split, image paths of the split, output folder of the split)
datasets = [
    ("training", trainPaths, TRAIN_PATH),
    ("validation", valPaths, VAL_PATH),
    ("testing", testPaths, TEST_PATH)
]

# NOTE: images are only copied, nothing is removed. If the split folders
#  already contain files from an earlier run with a different split, delete
#  those folders first, otherwise the same image can sit in more than one set.

# loop over the datasets
# (the loop variable is called splitPaths so that it does not overwrite
#  `imagePaths`, the list of all images)
for (dType, splitPaths, baseOutput) in datasets:
    # show which data split we are creating
    print(f"[INFO] building '{dType}' split")

    # if the output base output directory does not exist, create it
    if not os.path.exists(baseOutput):
        print(f"[INFO] creating '{baseOutput}' directory")
        os.makedirs(baseOutput)

    # loop over the input image paths
    for inputPath in splitPaths:
        # extract the filename of the input image along with its
        # corresponding class label, which is the name of the folder
        # that contains the image
        labelDir, filename = os.path.split(inputPath)
        label = os.path.basename(labelDir)

        # build the path to the label directory
        labelPath = os.path.join(baseOutput, label)

        # if the label output directory does not exist, create it
        if not os.path.exists(labelPath):
            print(f"[INFO] creating '{labelPath}' directory")
            os.makedirs(labelPath)

        # construct the path to the destination image and then copy
        # the image itself
        shutil.copy2(inputPath, os.path.join(labelPath, filename))

[INFO] building 'training' split
[INFO] 'creating dataset\training' directory
[INFO] 'creating dataset\training\I' directory
[INFO] 'creating dataset\training\H' directory
[INFO] 'creating dataset\training\M' directory
[INFO] 'creating dataset\training\E' directory
[INFO] 'creating dataset\training\Y' directory
[INFO] 'creating dataset\training\D' directory
[INFO] 'creating dataset\training\C' directory
[INFO] 'creating dataset\training\O' directory
[INFO] 'creating dataset\training\L' directory
[INFO] 'creating dataset\training\V' directory
[INFO] 'creating dataset\training\F' directory
[INFO] 'creating dataset\training\B' directory
[INFO] 'creating dataset\training\P' directory
[INFO] 'creating dataset\training\W' directory
[INFO] 'creating dataset\training\R' directory
[INFO] 'creating dataset\training\T' directory
[INFO] 'creating dataset\training\X' directory
[INFO] 'creating dataset\training\Q' directory
[INFO] 'creating dataset\training\U' directory
[INFO] 'creating dataset\trai

# OPTIONAL - Compress them for Colab Training

In [None]:
import os
# path of the compressed archive, placed inside DATASET_DIR
ARCHIVE_PATH = os.path.join(DATASET_DIR, "archive.tar.gz")

# add the training, validation and testing datasets to a tar file
# uncomment the two lines below to do so
# !tar -czf {ARCHIVE_PATH} {TRAIN_PATH} {VAL_PATH} {TEST_PATH}
# print(f"File saved at {ARCHIVE_PATH}")

# Acknowledgement

Thanks to the resources below for inspiring this notebook.
1. 36 Building Your Own Gesture Recognition System with Your Own Data - [YouTube Video](https://www.youtube.com/watch?v=YjnGou4skGU)
2. PyImageSearch - [Fine-tuning ResNet with Keras, TensorFlow, and Deep Learning](https://www.pyimagesearch.com/2020/04/27/fine-tuning-resnet-with-keras-tensorflow-and-deep-learning/)