In [2]:
!wget -q https://git.io/JGc31 -O ucf101_top5.tar.gz
!tar xf ucf101_top5.tar.gz

In [85]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tqdm import tqdm

import os

import numpy as np
import cv2
import pandas as pd
%matplotlib inline
# display original
from matplotlib import pyplot as plt
from matplotlib import animation
from IPython.display import HTML

In [92]:
MAX_SEQ_LENGTH = 50
NUM_FEATURES = 1024
IMG_SIZE = 128

EPOCHS = 5

In [93]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)


def crop_center(frame):
    cropped = center_crop_layer(frame[None, ...])
    cropped = cropped.numpy().squeeze()
    return cropped


def load_video(path, max_frames=0):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center(frame)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

Total videos for training: 594
Total videos for testing: 224


In [94]:
# Label preprocessing with StringLookup.
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"]), mask_token=None
)
print(label_processor.get_vocabulary())
NUM_CLASSES = len(label_processor.get_vocabulary())

def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_features` are what we will feed to our sequence model.
    sequences = []

    # For each video.
    for idx, path in tqdm(enumerate(video_paths)):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))

        # Pad shorter videos.
        print(len(frames))
        if len(frames) < MAX_SEQ_LENGTH:
            diff = MAX_SEQ_LENGTH - len(frames)
            # print(diff)
            padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
            # print(frames.shape)
            # print(padding.shape)
            frames = np.concatenate(frames, padding)
            

        else:
            frames = frames[:MAX_SEQ_LENGTH]
        frames = np.expand_dims(frames, 0)
        sequences.append(frames)
    sequences = np.concatenate(sequences, axis=0)
    return sequences, tf.squeeze(tf.one_hot(labels, depth=NUM_CLASSES))

['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']


In [96]:
df_a = train_df.take([0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500])

In [97]:
sequences, labels = prepare_all_videos(df_a, './train/')

1it [00:00,  2.96it/s]

75


2it [00:00,  2.71it/s]

92


3it [00:01,  2.28it/s]

105


4it [00:02,  1.54it/s]

231


5it [00:03,  1.30it/s]

249


6it [00:04,  1.10it/s]

300


7it [00:05,  1.01it/s]

300


8it [00:06,  1.03s/it]

271


9it [00:07,  1.01s/it]

241


10it [00:08,  1.12s/it]

300


11it [00:10,  1.10it/s]

235





In [98]:

labels

<tf.Tensor: shape=(11, 5), dtype=float32, numpy=
array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]], dtype=float32)>

In [165]:
def sample_beta_distribution(size, concentration_0=0.2, concentration_1=0.2):
    # Taken from https://keras.io/examples/vision/cutmix/
    gamma_1_sample = tf.random.gamma(shape=[size], alpha=concentration_1)
    gamma_2_sample = tf.random.gamma(shape=[size], alpha=concentration_0)
    return gamma_1_sample / (gamma_1_sample + gamma_2_sample)


#@tf.function take part of the dataset from training, shuffle it and apply stackmix to the percentage of data
def stackmix(train_ds, ratio=0.5, num_frames=28):
    # Similar to Temporal Mix as referred in VideoMix Paper. Used from https://github.com/jayChung0302/videomix
    (videos, labels) = train_ds
    indices = tf.range(start=0, limit=tf.shape(videos)[0], dtype=tf.int32)
    shuffled_indices = tf.random.shuffle(indices)
    selected_count = int(len(indices) * ratio)
    selected_indices1 = indices[:selected_count]
    selected_indices2 = shuffled_indices[:selected_count]
    selected_videos1 = tf.gather(videos, selected_indices1)
    selected_labels1 = tf.gather(labels, selected_indices1)
    selected_videos2 = tf.gather(videos, selected_indices2)
    selected_labels2 = tf.gather(labels, selected_indices2)

    alpha = [0.25]
    beta = [0.25]

    # Get a sample from the Beta distribution
    lambda_value = sample_beta_distribution(1, alpha, beta)
    cut_idx = int(lambda_value * num_frames)
    print(cut_idx)
    videos_ds_one = selected_videos1[:, :cut_idx, :, :]
    videos_ds_two = selected_videos2[:, cut_idx:, :, :]
    stacked_videos = tf.concat([videos_ds_one, videos_ds_two], axis=1)
    
    # Combine the labels of both images
    stacked_labels = lambda_value * selected_labels1 + (1 - lambda_value) * selected_labels2
    all_videos = tf.concat([stacked_videos, videos[selected_count:]], 0)
    all_labels = tf.concat([stacked_labels, labels[selected_count:]], 0)
    indices = tf.range(start=0, limit=tf.shape(all_videos)[0], dtype=tf.int32)
    shuffled_indices = tf.random.shuffle(indices)

    shuffled_videos = tf.gather(all_videos, shuffled_indices)
    shuffled_labels = tf.gather(all_labels, shuffled_indices)
    return shuffled_videos, shuffled_labels

@tf.function
def get_box(lambda_value, frame_size=224):
    cut_rat = tf.math.sqrt(1.0 - lambda_value)

    cut_w = frame_size * cut_rat  # rw
    cut_w = tf.cast(cut_w, tf.int32)

    cut_h = frame_size * cut_rat  # rh
    cut_h = tf.cast(cut_h, tf.int32)

    cut_x = tf.random.uniform((1,), minval=0, maxval=frame_size, dtype=tf.int32)  # rx
    cut_y = tf.random.uniform((1,), minval=0, maxval=frame_size, dtype=tf.int32)  # ry

    boundaryx1 = tf.clip_by_value(cut_x[0] - cut_w // 2, 0, frame_size)
    boundaryy1 = tf.clip_by_value(cut_y[0] - cut_h // 2, 0, frame_size)
    bbx2 = tf.clip_by_value(cut_x[0] + cut_w // 2, 0, frame_size)
    bby2 = tf.clip_by_value(cut_y[0] + cut_h // 2, 0, frame_size)

    target_h = bby2 - boundaryy1
    if target_h == 0:
        target_h += 1

    target_w = bbx2 - boundaryx1
    if target_w == 0:
        target_w += 1

    return boundaryx1, boundaryy1, target_h, target_w

def tubemix(x, y, prob, alpha=[0.25], beta = [0.25]):
    # The videomix paper calls it spatial mix https://github.com/jayChung0302/videomix
    if prob < 0:
        raise ValueError('prob must be a positive value')

    if tf.random.uniform([], minval=0, maxval=1) < prob:
        print("Tube Mixing in progress...")
        indices = tf.range(start=0, limit=tf.shape(x)[0], dtype=tf.int32)
        shuffled_indices = tf.random.shuffle(indices)
        lam = sample_beta_distribution(1, alpha, beta)
        bbx1, bby1, target_h, target_w = get_box(lam, frame_size=128)
        bbx1 = bbx1[0][0].numpy().astype(int)
        bby1 = bby1[0][0].numpy().astype(int)
        bbx2 = bbx1 + target_h
        bbx2 = bbx2[0][0].numpy().astype(int)
        bby2 = bby1 + target_w
        bby2 = bby2[0][0].numpy().astype(int)
        selected_volume = tf.gather(x, shuffled_indices)
        selected_volume = selected_volume[:, :, bbx1:bbx2, bby1:bby2, :]
        x = tf.Variable(x)
        x[:, :, bbx1:bbx2, bby1:bby2].assign(selected_volume)
        lam = 1 - ((target_h) * (target_w) / (x.shape[-2] * x.shape[-3]))
        lam = tf.cast(lam, dtype=tf.float32) # Channels First
        tube_y = y * lam + tf.gather(y, shuffled_indices) * (1 - lam)
        return x, tube_y
    else:
        return x, y

In [109]:
stack_mixed = stackmix((sequences, labels), num_frames=sequences.shape[1])

37


In [110]:
stack_mixed[0].shape

TensorShape([11, 50, 128, 128, 3])

In [116]:


# np array with shape (frames, height, width, channels)
video = sequences[0].astype(int)

fig = plt.figure()
im = plt.imshow(video[0,:,:,:])
plt.axis('off')
plt.close() # this is required to not display the generated image

def init():
    im.set_data(video[0,:,:,:])

def animate(i):
    im.set_data(video[i,:,:,:])
    return im

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0],
                               interval=50)
HTML(anim.to_html5_video())

In [114]:
# np array with shape (frames, height, width, channels)
video = stack_mixed[0][0].numpy().astype(int)

fig = plt.figure()
im = plt.imshow(video[0,:,:,:])
plt.axis('off')
plt.close() # this is required to not display the generated image

def init():
    im.set_data(video[0,:,:,:])

def animate(i):
    im.set_data(video[i,:,:,:])
    return im

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0],
                               interval=50)
HTML(anim.to_html5_video())


In [169]:
tube_mixed = tubemix(sequences, labels, 0.5)

(11, 50, 128, 128, 3)
(11, 50, 128, 128, 3)
0
24
73
86
tf.Tensor([[0.98095703]], shape=(1, 1), dtype=float32)
tf.Tensor(
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]], shape=(11, 5), dtype=float32)


In [170]:
# np array with shape (frames, height, width, channels)
video = tube_mixed[0][0].numpy().astype(int)

fig = plt.figure()
im = plt.imshow(video[0,:,:,:])
plt.axis('off')
plt.close() # this is required to not display the generated image

def init():
    im.set_data(video[0,:,:,:])

def animate(i):
    im.set_data(video[i,:,:,:])
    return im

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0],
                               interval=50)
HTML(anim.to_html5_video())