# Imports

In [1]:
import numpy as np
import os
import cv2

# Setup Config

In [15]:
# Root folder that the captured images are written under
IMAGE_DIR = "dataset"

# Class labels of the dataset, one single-letter label per gesture.
# Note that "J" and "Z" are not part of this list.
CLASS_NAMES = list("ABCDEFGHIKLMNOPQRSTUVWXY")

# Number of images to collect for each class (at least 30 is suggested, up to you)
# NOTE(review): the capture code visible in this notebook never reads this value;
# capturing for a stage only stops when 'ENTER' is pressed -- confirm the intent.
IMAGES_PER_CLASS = 30

# Size the ROI is resized to before it is saved as an image
IMAGE_SIZE = (320, 320)

In [3]:
## Overlay settings for the text that OpenCV draws on the live feed

# Where the two text lines are placed on the frame
TEXT_POS = (30, 30)      # status / instruction message
COUNTER_POS = (30, 60)   # counter of images captured for the current label

# Appearance shared by every piece of overlay text
TEXT_COLOR = (0, 255, 0)
FONT_SCALE = 0.6
TEXT_THICKNESS = 1

In [4]:
# Approximate capture rate in frames per second.
# Raise it to collect images faster, lower it to slow the capture down.
FPS = 10

- The next cell defines the Region of interest (ROI) to capture during the recording.
- We define this region to crop a specific area out of the feed and save it as an image
- The definition of the 4 numbers: 
    - `startX` = x-coordinate of top left
    - `endX` = x-coordinate of bottom right
    - `startY` = y-coordinate of top left
    - `endY` = y-coordinate of bottom right
- Make sure to set the dimensions to be square to make things easier
- E.g. (320, 620, 100, 400) would result in 300 x 300 dimensions

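NOTE: This ROI must fit inside your webcam's resolution, so make sure it does not exceed
the width and height of the frames your webcam delivers. The default resolution depends on
the camera and backend (640 x 480 is common); you can check it with
`cap.get(cv2.CAP_PROP_FRAME_WIDTH)` and `cap.get(cv2.CAP_PROP_FRAME_HEIGHT)`. The example
ROI above needs frames that are at least 620 pixels wide and 400 pixels high.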

In [5]:
# Crop window on the webcam frame, ordered as (startX, endX, startY, endY)
CAPTURE_ROI = (320, 620, 100, 400)
# Split the tuple into the horizontal and the vertical bounds
startX, endX = CAPTURE_ROI[:2]
startY, endY = CAPTURE_ROI[2:]

# Capture Image from Webcam

`label_arr` maps every stage index to the class label whose turn it is to be captured.

Basically, every label will have 3 different stages (or turns) for 3 different actions:
- a stage for standby (wait to start capture)
- a stage for capturing train set images
- a stage for capturing test set images

Therefore, `label_arr` is an array that tracks which class label belongs to each stage (index). Running the code blocks below and inspecting their output should make this clear.

In [6]:
# Every class label owns three consecutive stages:
# standby -> train capture -> test capture.
# np.repeat repeats each label three times in order and sizes the string dtype
# from the longest label, so labels longer than one character stay intact
# (np.empty(..., dtype=str) allocates 1-character slots and would silently
# truncate them). It also avoids leaving a loop index behind in the namespace.
label_arr = np.repeat(CLASS_NAMES, 3)
total_steps = len(label_arr)

print(label_arr)

['A' 'A' 'A' 'B' 'B' 'B' 'C' 'C' 'C' 'D' 'D' 'D' 'E' 'E' 'E' 'F' 'F' 'F'
 'G' 'G' 'G' 'H' 'H' 'H' 'I' 'I' 'I' 'K' 'K' 'K' 'L' 'L' 'L' 'M' 'M' 'M'
 'N' 'N' 'N' 'O' 'O' 'O' 'P' 'P' 'P' 'Q' 'Q' 'Q' 'R' 'R' 'R' 'S' 'S' 'S'
 'T' 'T' 'T' 'U' 'U' 'U' 'V' 'V' 'V' 'W' 'W' 'W' 'X' 'X' 'X' 'Y' 'Y' 'Y']


In [7]:
def show_standby():
    """
    Draw the standby prompt on the display frame.

    Looks up the label of the current stage (global index `i` into `label_arr`)
    and writes a message onto the global frame `copy` asking the user to hit
    'Enter' once ready to record that label.
    """
    prompt = f"({label_arr[i]}) Hit 'Enter' to record when ready"
    cv2.putText(copy, prompt, TEXT_POS, cv2.FONT_HERSHEY_COMPLEX,
                FONT_SCALE, TEXT_COLOR, TEXT_THICKNESS)
    
def run_capture(data="Train"):
    """
    Save the current ROI as one image of the label being captured and draw the
    capture status on the display frame.

    Reads the globals `label_arr`, `i`, `roi` and `copy`, and updates the
    global `image_count`. The counter only advances when the image was actually
    written, so the number shown on the feed matches the files on disk and a
    failed write is retried under the same file name on the next frame.

    Args:
        data: Name of the split being captured. It is shown on the feed as
            given and lower-cased to form the sub-directory name
            (e.g. "Train" -> "train").
    """
    global image_count

    # get the current label from label_arr to know which class this stage is for
    current_label = label_arr[i]

    # make sure the directory for this split and label exists
    gesture_dir = os.path.join(IMAGE_DIR, data.lower(), current_label)
    os.makedirs(gesture_dir, exist_ok=True)

    # save the image with a name based on the class name and image count;
    # cv2.imwrite reports success through its boolean return value
    next_count = image_count + 1
    image_path = os.path.join(gesture_dir, f"{current_label}_{next_count}.jpg")
    if cv2.imwrite(image_path, roi):
        image_count = next_count

    # display on the feed what class we are capturing, and for which data (train/test)
    cv2.putText(copy, f"Capturing '{current_label}' gesture - {data} set",
                TEXT_POS, cv2.FONT_HERSHEY_COMPLEX,
                FONT_SCALE, TEXT_COLOR, TEXT_THICKNESS
    )
    # display the image counter on the feed
    cv2.putText(copy, f"Images captured: {image_count}", COUNTER_POS,
                cv2.FONT_HERSHEY_COMPLEX,
                FONT_SCALE, TEXT_COLOR, TEXT_THICKNESS
    )

In [8]:
## Stage indices, grouped by the action that is performed at each of them.
## Each label owns three consecutive indices, one per offset:
##   offset 0 -> standby: wait for 'ENTER' before capturing starts
##   offset 1 -> capture images for the train set
##   offset 2 -> capture images for the test set
standby_points, train_points, test_points = (
    np.arange(offset, total_steps, 3) for offset in range(3)
)

Once you run the code block below, OpenCV will open up 3 windows, and you should drag them and rearrange them side by side to see clearly what's going on.

The window showing the grayscale ROI displays the frame that will be saved as the image. You may skip the grayscale conversion if you think it is necessary, as RGB color can be an important feature in many cases. In this case, RGB color is not a distinguishing feature for recognizing alphabet letters from gestures.

Press the 'ENTER' key each time you want to proceed to the next stage, or press the 'ESC' key to exit at any time. If you exit early, an incomplete dataset will be left in your directories; you may want to remove it.

In [9]:
## CAPTURE IMAGES

# key codes returned by cv2.waitKey for the keys this loop reacts to
ENTER_KEY = 13
ESC_KEY = 27

# delay between two frames in milliseconds, which controls the approximate FPS;
# kept at 1 or more because cv2.waitKey(0) would wait indefinitely for a key
frame_delay_ms = max(1, int(1000 / FPS))

# open the camera
cap = cv2.VideoCapture(0)

# initialize the stage index from zero
i = 0
# initialize image counter
image_count = 0

try:
    # keep running until every stage, plus the final exit prompt, has been passed
    while i <= total_steps:
        # get the frame
        ret, frame = cap.read()
        if not ret:
            # if couldn't access the webcam, stop running
            break

        # flip the frame horizontally to make it easier to see
        frame = cv2.flip(frame, 1)

        # extract the ROI that we want to save as image
        roi = frame[startY:endY, startX:endX]
        # show the ROI side by side
        cv2.imshow('ROI', roi)

        # create a grayscale ROI that we want to save.
        # you may want to remove this conversion to grayscale if you think RGB color
        #  is a very important feature to distinguish between classes,
        #  which is not the case for alphabet classification
        roi = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)

        # resize the ROI to a specific size we want to save the images with
        roi = cv2.resize(roi, IMAGE_SIZE, interpolation=cv2.INTER_AREA)

        # show the scaled and grayscale ROI side by side
        cv2.imshow('ROI scaled and gray', roi)

        # draw a rectangle box on the frame to show where we should place our
        # object into, i.e. our hand with specific gesture in this case
        copy = frame.copy()
        cv2.rectangle(copy, (startX, startY), (endX, endY), (255, 0, 0), 5)

        if i in standby_points:
            # show the standby text on the frame
            show_standby()
        elif i in train_points:
            # run capturing for train set
            run_capture(data="Train")
        elif i in test_points:
            # run capturing for test set
            run_capture(data="Test")
        elif i == total_steps:
            # reached the end
            cv2.putText(copy, "Hit 'Enter' to exit", TEXT_POS, cv2.FONT_HERSHEY_COMPLEX,
                        FONT_SCALE, TEXT_COLOR, TEXT_THICKNESS)

        # show the frame that has texts and ROI box on it
        cv2.imshow('frame', copy)

        # wait between frames and pick up any key pressed in the meantime
        key = cv2.waitKey(frame_delay_ms)

        if key == ENTER_KEY:
            # press 'ENTER' to continue to the next stage:
            # reset the image count for the new stage and move the index forward
            image_count = 0
            i += 1
        elif key == ESC_KEY:
            # press 'ESC' to exit properly
            break
except KeyboardInterrupt:
    # exit quietly if the user chooses to interrupt the kernel
    pass
finally:
    # always release the camera and destroy the OpenCV windows, including when
    # an unexpected error is raised inside the loop, so the webcam is not left
    # locked by this kernel
    cap.release()
    cv2.destroyAllWindows()

In [10]:
# Run this if the camera or the OpenCV windows fail to close or stop responding.
# The windows are destroyed even when releasing the camera raises an error
# (for example when `cap` has not been created in this kernel session).
try:
    cap.release()
finally:
    cv2.destroyAllWindows()

# Or collect from Google Search using a quick method

1. Install the [Fatkun Batch Download Image extension](https://chrome.google.com/webstore/detail/fatkun-batch-download-ima/nnjjahlikiabnchcpehcpkdeckfgnohf?hl=en) for Google Chrome.
2. Open up a tab and search for the images you want, e.g. cloth face mask
3. Then use the extension to download all the images that are found in the Google Search tab. Beware that the more you scroll down, the more images will be downloaded.

NOTE: This requires you to perform more cleaning up and filtering to make sure the images that you've collected are of good quality! As the saying goes for machine learning, "Garbage in, Garbage Out". Good data is the core of a machine learning model with good performance.

# OPTIONAL - Compress them for Colab Training

In [14]:
import os
import tarfile

ARCHIVE_PATH = os.path.join(IMAGE_DIR, "archive.tar.gz")

TRAIN_PATH = os.path.join(IMAGE_DIR, "train")
TEST_PATH = os.path.join(IMAGE_DIR, "test")

# fail early, before any archive file is created, if a split was never captured
missing_dirs = [path for path in (TRAIN_PATH, TEST_PATH) if not os.path.isdir(path)]
if missing_dirs:
    raise FileNotFoundError(f"Dataset directory not found: {', '.join(missing_dirs)}")

# add the training and testing datasets to a gzip-compressed tar file.
# tarfile is used instead of the `tar` shell command so that the cell works
# without an external binary, handles paths containing spaces, and only prints
# the success message when the archive was really written.
with tarfile.open(ARCHIVE_PATH, "w:gz") as archive:
    for split_path in (TRAIN_PATH, TEST_PATH):
        # directories are added recursively, under the same relative path
        archive.add(split_path)
print(f"File saved at {ARCHIVE_PATH}")

File saved at dataset\archive.tar.gz


# Acknowledgement

Thanks to the YouTube video below for inspiring this notebook.
1. https://www.youtube.com/watch?v=YjnGou4skGU - 36 Building Your Own Gesture Recognition System with Your Own Data