# Video Classification with a CNN-RNN Architecture

**Model based in:** Sayak Paul<br>


In [1]:
!nvidia-smi


Mon Jul 18 04:37:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
| N/A   45C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Data collection


In [3]:
!pip install imutils



## Setup

In [4]:
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

  from cryptography import x509


## Define hyperparameters

In [5]:
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 10

MAX_SEQ_LENGTH = 20
NUM_FEATURES = 2048

## Data preparation

In [6]:
train_df = pd.read_csv("finish_data_train.csv")
test_df = pd.read_csv("finish_data_test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

train_df.sample(10)

Total videos for training: 15844
Total videos for testing: 196


Unnamed: 0,dir,movement_label,movement_value
8768,./../data/train/tt2293060/shot_0062.mp4,Static,4
3960,./../data/trailer_zoom/tt2527192_shot_0051.mp4,Push,2
15303,./../data/train/tt2066041/shot_0001.mp4,Motion,0
9564,./../data/train/tt5766118/shot_0036.mp4,Static,4
3113,./../data/trailer_zoom/tt2395421_shot_0030.mp4,Push,2
10475,./../data/train/tt2235515/shot_0024.mp4,Static,4
6692,./../data/new_pull/tt3201722_shot_0028.mp4,Pull,1
4379,./../data/new_pull/tt2624704_shot_0010.mp4,Pull,1
10867,./../data/train/tt4151098/shot_0037.mp4,Static,4
1008,./../data/trailer_zoom/tt2140577_shot_0019.mp4,Push,2


In [12]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]



def load_video(path, max_frames=MAX_SEQ_LENGTH):
    cap = cv2.VideoCapture(path)
    
    # Total frames
    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    n_frames = round(length/max_frames)
    
    total = 0
    frames = []
    i=0
    try:
        while True:
            ret, frame = cap.read()
            if not ret or total == max_frames:
                break
            else:
                pass
            
            if i%n_frames == 0:
                frame = crop_center_square(frame)
                frame = frame[:, :, [2, 1, 0]]
                frames.append(frame)
                total += 1
            
            i+=1
            
        if total < max_frames:
            cap = cv2.VideoCapture(path)
            for j in range(int(length)):
                ret, frame = cap.read()
                if j == (int(length)-1):
                    frame = crop_center_square(frame)
                    frame = frame[:, :, [2, 1, 0]]
                    frames.append(frame)
            
            
    finally:
        cap.release()
    return np.array(frames)

In [8]:
# Xception model

def build_feature_extractor():
    feature_extractor = keras.applications.Xception(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )

    preprocess_input = keras.applications.xception.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()

In [9]:
feature_extractor.summary()

Model: "feature_extractor"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
tf.math.truediv (TFOpLambda) (None, 224, 224, 3)       0         
_________________________________________________________________
tf.math.subtract (TFOpLambda (None, 224, 224, 3)       0         
_________________________________________________________________
xception (Functional)        (None, 2048)              20861480  
Total params: 20,861,480
Trainable params: 20,806,952
Non-trainable params: 54,528
_________________________________________________________________


In [10]:
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["movement_label"])
)
print(label_processor.get_vocabulary())

['Motion', 'Pull', 'Push', 'Static']


In [13]:

def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["dir"].values.tolist()
    labels = df["movement_label"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))
        frames = frames[None, ...]

        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                temp_frame_features[i, j, :] = feature_extractor.predict(
                    batch[None, j, :]
                )
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()

    return (frame_features, frame_masks), labels


train_data, train_labels = prepare_all_videos(train_df, "train")
test_data, test_labels = prepare_all_videos(test_df, "test")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

Frame features in train set: (15844, 20, 2048)
Frame masks in train set: (15844, 20)


## The sequence model


In [15]:
# Utility for our sequence model.
def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:

    x = keras.layers.LSTM(256, return_sequences=True)(
        frame_features_input, mask=mask_input)
    x = keras.layers.LSTM(128)(x)
    x = keras.layers.Dropout(0.4)(x) # ver si al quitar mejora el modelo
    x = keras.layers.Dense(64, activation="relu")(x)
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)
    rnn_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    
    return rnn_model


# Utility for running experiments.
def run_experiment():
    filepath = "/tmp/video_classifier000"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


_, sequence_model = run_experiment()


Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.42155, saving model to /tmp/video_classifier000
Epoch 2/10

Epoch 00002: val_loss did not improve from 1.42155
Epoch 3/10

Epoch 00003: val_loss did not improve from 1.42155
Epoch 4/10

Epoch 00004: val_loss did not improve from 1.42155
Epoch 5/10

Epoch 00005: val_loss did not improve from 1.42155
Epoch 6/10

Epoch 00006: val_loss did not improve from 1.42155
Epoch 7/10

Epoch 00007: val_loss did not improve from 1.42155
Epoch 8/10

Epoch 00008: val_loss did not improve from 1.42155
Epoch 9/10

Epoch 00009: val_loss did not improve from 1.42155
Epoch 10/10

Epoch 00010: val_loss did not improve from 1.42155
Test accuracy: 25.51%


## Inference

In [None]:

def prepare_single_video(frames):
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


def sequence_prediction(path):
    class_vocab = label_processor.get_vocabulary()

    frames = load_video(os.path.join("test", path))
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames


test_video = np.random.choice(test_df["dir"].values.tolist())
print(f"Test video path: {test_video}")
test_frames = sequence_prediction(test_video)