In [29]:
#Template Project to build off of
import cv2 as cv2
import numpy as np
import tensorflow as tf
import pandas as pd
import imageio
from tensorflow import keras
import os

In [30]:
# Side length (pixels) used for the resize default and the feature-extractor input.
IMG_SIZE = 256
# NOTE(review): BATCH_SIZE and EPOCHS are not referenced by any code visible in
# this notebook — confirm whether they are still needed.
BATCH_SIZE = 64
EPOCHS = 20

# Number of time steps and per-frame feature width of the arrays fed to the sequence model.
MAX_SEQ_LENGTH = 20
NUM_FEATURES = 2048

# Class names: the letters A-Z, the digits 0-9, and a catch-all "unknown" class.
LABELS = (
    [chr(code) for code in range(ord("A"), ord("Z") + 1)]
    + [str(digit) for digit in range(10)]
    + ["unknown"]
)
# Sample listings; later cells read the "video_name" and "tag" columns.
train_df = pd.read_csv("training_data.csv")
test_df = pd.read_csv("test_data.csv")

In [31]:
def crop_center(frame):
    """Return the centered square crop of `frame`.

    The side of the square equals the smaller of the frame's first two
    dimensions (height, width); any trailing dimensions are kept as-is.
    """
    height, width = frame.shape[:2]
    side = min(height, width)
    # Offsets of the square's top-left corner relative to the frame.
    top = height // 2 - side // 2
    left = width // 2 - side // 2
    return frame[top:top + side, left:left + side]

In [32]:
#From a video file (.mp4) extract every n-th frame and return them as a numpy array
#This will allow us to extract frames from a video and use them to train our model
#Every frame would be way too much data but we can test and hone in exactly how many 
#frames will be needed for a good model

def load_video(video_file, max_frames, resize=(IMG_SIZE, IMG_SIZE), n=1, image_repeats=3):
    """Load frames from a video file (or a single ``.jpg`` image) as a numpy array.

    Args:
        video_file: Path to the file. Paths ending in ``.jpg`` are treated as a
            still image; anything else is opened with ``cv2.VideoCapture``.
        max_frames: Stop reading the video once this many frames are collected.
            Not applied to still images.
        resize: Size passed to ``cv2.resize`` for every frame.
        n: Keep every n-th decoded frame of the video.
        image_repeats: Number of times a still image is repeated so it can be
            consumed like a short clip.

    Returns:
        Array of the kept frames stacked along the first axis. Video frames
        have their channel axis reversed (OpenCV decodes BGR, so the result is
        RGB). If nothing could be read, the array is empty.
    """
    # A still image is turned into a tiny "video" by repeating it.
    if video_file.endswith(".jpg"):
        # Decode and resize once; the original redid this work per repetition.
        image = cv2.resize(imageio.imread(video_file), resize)
        return np.stack([image] * image_repeats)

    video = cv2.VideoCapture(video_file)
    frames = []

    try:
        current_frame = 0
        while True:
            ret, frame = video.read()
            if not ret:
                # End of stream (or the file could not be opened/decoded).
                break
            if current_frame % n == 0:
                # crop_center is not applied here: frames are resized directly,
                # so the aspect ratio is not preserved for non-square input.
                frame = cv2.resize(frame, resize)
                frame = frame[:, :, [2, 1, 0]]  # reverse channel order (BGR -> RGB)
                frames.append(frame)

            if len(frames) == max_frames:
                break
            current_frame += 1
    finally:
        # Always free the capture handle, even if decoding raised.
        video.release()
    return np.array(frames)

In [33]:
#A feature extractor lets us keep only the most informative parts
#of each frame and discard the rest. This allows us to train our model
#faster and more efficiently
def create_feature_extractor():
    """Build the per-frame feature extractor.

    Wraps an ImageNet-pretrained InceptionV3 (classification head removed,
    global average pooling on top) behind its matching ``preprocess_input``
    step, so callers can feed frames of shape (IMG_SIZE, IMG_SIZE, 3).

    Returns:
        A ``keras.Model`` named "feature_extractor" mapping an image batch to
        one pooled feature vector per image.
    """
    image_shape = (IMG_SIZE, IMG_SIZE, 3)

    backbone = keras.applications.InceptionV3(
        include_top=False,
        weights="imagenet",
        pooling="avg",
        input_shape=image_shape,
    )

    image_input = keras.Input(shape=image_shape)
    # Apply the preprocessing that ships with InceptionV3 before the backbone.
    scaled = keras.applications.inception_v3.preprocess_input(image_input)
    return keras.Model(image_input, backbone(scaled), name="feature_extractor")

# Shared instance used by the data-preparation cells below.
feature_extractor = create_feature_extractor()

In [34]:
# Maps each label string to an integer class index. The vocabulary is the
# sorted set of LABELS; with num_oov_indices=0 any tag outside the vocabulary
# raises an error instead of being mapped to an out-of-vocabulary slot.
label_processor = keras.layers.StringLookup(
    vocabulary=np.unique(LABELS),
    num_oov_indices=0,
)
print(label_processor.get_vocabulary())

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [35]:
# NEXT STEP: Create a function using the above helper functions to load all the 
# video data that we plan to use, splitting them into training and validation sets
# along with their labels
# Then we can make a simple model to train on that data, hopefully capable of 
# predicting the correct label for a given video (Fist or Hand)

def prepare_all_videos(df, root_dir):
    """Extract per-frame features, masks and labels for every usable file in `df`.

    Args:
        df: DataFrame with a "video_name" column (file name relative to
            `root_dir`) and a "tag" column (class label string).
        root_dir: Directory that the file names are joined onto.

    Returns:
        ``((frame_features, frame_masks), labels)`` where, for the K files that
        were actually processed:
          * frame_features is float32 of shape (K, MAX_SEQ_LENGTH, NUM_FEATURES),
          * frame_masks is bool of shape (K, MAX_SEQ_LENGTH); True marks a real
            frame, False marks padding,
          * labels holds the ``label_processor`` indices for those K files.

        Files that are skipped (name starts with a digit, fewer than 2 frames,
        or frames that are not a 4-D array) are dropped from all three outputs.
        Previously they were kept as all-zero, fully masked rows that were
        still paired with a real label.

    Raises:
        Whatever ``label_processor`` raises for a tag that is not in its
        vocabulary (it is built with ``num_oov_indices=0``).
    """
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # Processed samples are written compactly from row 0 upwards so the result
    # can be returned as a slice (a view) rather than a filtered copy of these
    # potentially very large arrays.
    kept_rows = []

    for idx, path in enumerate(video_paths):
        # Files whose name starts with a digit are skipped.
        if path[0].isdigit():
            continue

        if idx % 100 == 0:
            print(f"Processing video {idx} of {num_samples}.")

        frames = load_video(os.path.join(root_dir, path), MAX_SEQ_LENGTH)
        # Unreadable or too-short inputs cannot form a sequence.
        if frames.shape[0] < 2:
            continue
        # Expect (frames, height, width, channels); e.g. grayscale input is rejected.
        if frames.ndim != 4:
            print("Frames is not 4D")
            continue

        length = min(MAX_SEQ_LENGTH, frames.shape[0])
        row = len(kept_rows)
        # One batched forward pass per video instead of one call per frame.
        frame_features[row, :length] = feature_extractor.predict(
            frames[:length], verbose=0
        )
        frame_masks[row, :length] = True  # True = real frame, False = padding
        kept_rows.append(idx)

    kept = len(kept_rows)
    kept_labels = labels[np.asarray(kept_rows, dtype=int)]
    return (frame_features[:kept], frame_masks[:kept]), kept_labels


# Root folder containing the "Train" and "Test" sub-folders. Set the
# SIGN_DATA_DIR environment variable to point somewhere else instead of
# editing this cell; the default keeps the original location.
DATA_DIR = os.environ.get("SIGN_DATA_DIR", "C:/Users/bencl/Desktop/Data")

train_data, train_labels = prepare_all_videos(train_df, f"{DATA_DIR}/Train")
test_data, test_labels = prepare_all_videos(test_df, f"{DATA_DIR}/Test")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

Processing video 0 of 23974.


  return np.array([np.array(cv2.resize(imageio.imread(video_file), resize)) for i in range(3)])


Processing video 100 of 23974.
Processing video 200 of 23974.
Processing video 300 of 23974.
Processing video 400 of 23974.
Processing video 500 of 23974.
Processing video 600 of 23974.
Processing video 700 of 23974.
Processing video 800 of 23974.
Processing video 900 of 23974.
Processing video 1000 of 23974.
Processing video 1100 of 23974.
Processing video 1200 of 23974.
Processing video 1300 of 23974.
Processing video 1400 of 23974.
Processing video 1500 of 23974.
Processing video 1600 of 23974.
Processing video 1700 of 23974.
Processing video 1800 of 23974.
Processing video 1900 of 23974.
Processing video 2000 of 23974.
Processing video 2100 of 23974.
Processing video 2200 of 23974.
Processing video 2300 of 23974.
Processing video 2400 of 23974.
Processing video 2500 of 23974.
Processing video 2600 of 23974.
Processing video 2700 of 23974.
Processing video 2800 of 23974.
Processing video 2900 of 23974.
Processing video 3000 of 23974.
Processing video 3100 of 23974.
Processing video 

InvalidArgumentError: Exception encountered when calling layer "string_lookup_4" (type StringLookup).

Expected 'tf.Tensor(False, shape=(), dtype=bool)' to be true. Summarized data: b'When `num_oov_indices=0` all inputs should be in vocabulary, found OOV values ["unknown" "unknown" "unknown" "unknown" "unknown"], consider setting `num_oov_indices=1`.'

Call arguments received:
  • inputs=tf.Tensor(shape=(185, 1), dtype=string)

In [None]:
def get_sequence_model():
    """Build and compile the GRU classifier that consumes per-frame features.

    The model takes two inputs — a (MAX_SEQ_LENGTH, NUM_FEATURES) feature
    sequence and a (MAX_SEQ_LENGTH,) boolean mask — and outputs a softmax over
    the classes in ``label_processor``'s vocabulary.

    Returns:
        The compiled ``keras.Model`` (sparse categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    num_classes = len(label_processor.get_vocabulary())

    features_in = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_in = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    first_gru = keras.layers.GRU(16, return_sequences=True)
    hidden = first_gru(features_in, mask=mask_in)
    hidden = keras.layers.GRU(8)(hidden)
    hidden = keras.layers.Dropout(0.1)(hidden)
    hidden = keras.layers.Dense(8, activation="relu")(hidden)
    class_probs = keras.layers.Dense(num_classes, activation="softmax")(hidden)

    model = keras.Model(inputs=[features_in, mask_in], outputs=class_probs)
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

In [None]:
# Utility for running experiments.
def run_experiment(epochs=30, filepath="/tmp/video_classifier/"):
    """Train the sequence model, restore its best weights and report test accuracy.

    Reads the notebook-level ``train_data``/``train_labels`` and
    ``test_data``/``test_labels``.

    Args:
        epochs: Number of training epochs. Defaults to the previously
            hard-coded 30.
            NOTE(review): the notebook also defines EPOCHS = 20, which is not
            used here — confirm which value is intended.
        filepath: Where ``ModelCheckpoint`` stores the best weights; the same
            path is reloaded before evaluation.

    Returns:
        ``(history, seq_model)``: the Keras training history and the trained
        model with the checkpointed weights loaded.
    """
    # save_best_only=True keeps only the weights of the best epoch according to
    # the callback's monitored quantity.
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_split=0.15,
        epochs=epochs,
        callbacks=[checkpoint],
    )

    # Evaluate with the checkpointed weights rather than the last epoch's.
    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


# Bind the history to a real name: assigning to `_` at notebook top level
# interferes with IPython's "last output" variable.
history, sequence_model = run_experiment()

NameError: name 'train_data' is not defined

In [None]:
def prepare_single_video(frames):
    """Turn the frames of one clip into model-ready features and mask.

    Args:
        frames: Array of frames for a single clip, stacked along the first
            axis. Only the first MAX_SEQ_LENGTH frames are used.

    Returns:
        ``(frame_features, frame_mask)`` with shapes
        (1, MAX_SEQ_LENGTH, NUM_FEATURES) float32 and (1, MAX_SEQ_LENGTH) bool.
        Positions beyond the clip's length stay zero / False (padding).
    """
    frames = np.asarray(frames)
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    length = min(MAX_SEQ_LENGTH, frames.shape[0])
    if length > 0:
        # One batched forward pass instead of one predict() call per frame.
        frame_features[0, :length] = feature_extractor.predict(
            frames[:length], verbose=0
        )
        frame_mask[0, :length] = True  # True = real frame, False = padding

    return frame_features, frame_mask


def sequence_prediction(path, root_dir="test"):
    """Predict the class of one file and print all class probabilities.

    Args:
        path: File name, joined onto `root_dir`.
        root_dir: Directory containing the file. Defaults to the previously
            hard-coded "test"; pass the folder the dataset was loaded from when
            it lives elsewhere.

    Returns:
        The frames that were loaded for the file.

    Raises:
        ValueError: If no frames could be read. Without this check the model
            would silently be run on all-zero, fully masked input.
    """
    class_vocab = label_processor.get_vocabulary()

    video_path = os.path.join(root_dir, path)
    frames = load_video(video_path, MAX_SEQ_LENGTH)
    if len(frames) == 0:
        raise ValueError(f"No frames could be read from {video_path!r}")

    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    # Most probable class first.
    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames


# This utility is for visualization.
# Referenced from:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def to_gif(images):
    """Save `images` as "animation.gif" and return an object Jupyter shows inline.

    The original body called ``embed.embed_file``, but no ``embed`` name is
    imported or defined in this notebook, so calling it raised NameError. The
    GIF is now embedded with the standard library only.

    Args:
        images: Array of frames; cast to uint8 before writing.

    Returns:
        An object whose ``_repr_html_`` renders the GIF as an inline ``<img>``.
    """
    import base64

    converted_images = images.astype(np.uint8)
    # NOTE(review): some imageio releases reject `fps` for GIFs in favour of
    # `duration` — confirm against the installed version.
    imageio.mimsave("animation.gif", converted_images, fps=10)

    with open("animation.gif", "rb") as gif_file:
        encoded = base64.b64encode(gif_file.read()).decode("ascii")
    return _InlineGif(encoded)


class _InlineGif:
    """Rich-display wrapper: notebooks render objects that expose `_repr_html_`."""

    def __init__(self, b64_data):
        self._b64_data = b64_data

    def _repr_html_(self):
        return f'<img src="data:image/gif;base64,{self._b64_data}"/>'


Test video path: Hand10.webm
  HAND: 65.34%
  FIST: 27.81%
  THUMBS:  6.85%


In [None]:
# Pick one file from the test listing at random and print its ranked class
# probabilities via sequence_prediction.
test_video = np.random.choice(test_df["video_name"].tolist())
print(f"Test video path: {test_video}")
test_frames = sequence_prediction(test_video)
#to_gif(test_frames[:MAX_SEQ_LENGTH])

Test video path: Thumbs10.mp4
  THUMBS: 58.75%
  FIST: 21.56%
  HAND: 19.69%


In [None]:
#get live feed from web cam and predict the action given last 10 frames

# Sliding window of the most recent (resized, RGB) webcam frames.
current_frames = []

def predict_signs(window_size=10):
    """Classify a live webcam feed using a sliding window of recent frames.

    Each captured frame is shown in a window, resized to IMG_SIZE x IMG_SIZE
    and channel-reversed (the same steps ``load_video`` applies to video
    frames), then appended to ``current_frames``. Whenever the window holds
    `window_size` frames the top prediction is printed and the oldest frame is
    dropped. Press 'q' in the preview window to stop.

    The original version passed a ``cv2.VideoWriter`` object to
    ``sequence_prediction`` (which expects a file name), never checked whether
    a frame was actually read, and never opened a window for ``cv2.waitKey``
    to receive key presses from.

    Args:
        window_size: Number of most recent frames used for each prediction.

    Returns:
        The last frame read from the camera as delivered by OpenCV, or None if
        no frame could be read.
    """
    class_vocab = label_processor.get_vocabulary()
    current_frames.clear()  # do not mix frames from a previous session
    frame = None

    cap = cv2.VideoCapture(0)
    try:
        while True:
            ret, latest = cap.read()
            if not ret:
                # Camera unavailable or stream ended.
                break
            frame = latest

            # A HighGUI window must exist for waitKey to receive key presses.
            cv2.imshow("predict_signs", frame)

            rgb = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))[:, :, [2, 1, 0]]
            current_frames.append(rgb)

            if len(current_frames) == window_size:
                features, mask = prepare_single_video(np.array(current_frames))
                probabilities = sequence_model.predict([features, mask])[0]
                best = int(np.argmax(probabilities))
                print(f"  {class_vocab[best]}: {probabilities[best] * 100:5.2f}%")
                # Slide the window forward by one frame.
                current_frames.pop(0)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the camera and close the preview even if prediction raised.
        cap.release()
        cv2.destroyAllWindows()
    return frame
