This is notebook prepares a subset of the [UCF-101 dataset](https://www.tensorflow.org/datasets/catalog/ucf101) for experimenting with video classification models. It's almost a copy-paste of [this tutorial](https://www.tensorflow.org/tutorials/load_data/video).

## Initial setup

In [1]:
!pip install remotezip tqdm -q

In [2]:
import tqdm
import random
import pathlib
import collections

import os
import numpy as np
import remotezip as rz

random.seed(666)

## Utilities

All of these are taken from [this tutorial](https://www.tensorflow.org/tutorials/load_data/video).

In [3]:
URL = "https://storage.googleapis.com/thumos14_files/UCF101_videos.zip"

In [4]:
def list_files_from_zip_url(zip_url):
    """List the files in each class of the dataset given a URL with the zip file.

    Args:
      zip_url: A URL from which the files can be extracted from.

    Returns:
      List of files in each of the classes.
    """
    files = []
    with rz.RemoteZip(zip_url) as zip:
        for zip_info in zip.infolist():
            files.append(zip_info.filename)
    return files

In [5]:
files = list_files_from_zip_url(URL)
files = [f for f in files if f.endswith(".avi")]
files[:10]

['UCF101/v_ApplyEyeMakeup_g01_c01.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c02.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c03.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c04.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c05.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c06.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c01.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c02.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c03.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c04.avi']

In [6]:
def get_class(fname):
    """Retrieve the name of the class given a filename.

    Args:
      fname: Name of the file in the UCF101 dataset.

    Returns:
      Class that the file belongs to.
    """
    return fname.split("_")[-3]

In [7]:
def get_group(fname):
    """Retrieve the name of the scene/group given a filename.

    Args:
      fname: Name of the file in the UCF101 dataset.

    Returns:
      Scene that the file belongs to.
    """
    return fname.split("_")[-2]

In [8]:
def get_files_per_class(files):
    """Retrieve the files that belong to each class.

    Args:
      files: List of files in the dataset.

    Returns:
      Dictionary of class names (key) and files (values).
    """
    files_for_class = collections.defaultdict(list)
    for fname in files:
        class_name = get_class(fname)
        files_for_class[class_name].append(fname)
    return files_for_class

In [9]:
files_for_class = get_files_per_class(files)
classes = list(files_for_class.keys())

In [10]:
classes

['ApplyEyeMakeup',
 'ApplyLipstick',
 'Archery',
 'BabyCrawling',
 'BalanceBeam',
 'BandMarching',
 'BaseballPitch',
 'BasketballDunk',
 'Basketball',
 'BenchPress',
 'Biking',
 'Billiards',
 'BlowDryHair',
 'BlowingCandles',
 'BodyWeightSquats',
 'Bowling',
 'BoxingPunchingBag',
 'BoxingSpeedBag',
 'BreastStroke',
 'BrushingTeeth',
 'CleanAndJerk',
 'CliffDiving',
 'CricketBowling',
 'CricketShot',
 'CuttingInKitchen',
 'Diving',
 'Drumming',
 'Fencing',
 'FieldHockeyPenalty',
 'FloorGymnastics',
 'FrisbeeCatch',
 'FrontCrawl',
 'GolfSwing',
 'Haircut',
 'Hammering',
 'HammerThrow',
 'HandstandPushups',
 'HandstandWalking',
 'HeadMassage',
 'HighJump',
 'HorseRace',
 'HorseRiding',
 'HulaHoop',
 'IceDancing',
 'JavelinThrow',
 'JugglingBalls',
 'JumpingJack',
 'JumpRope',
 'Kayaking',
 'Knitting',
 'LongJump',
 'Lunges',
 'MilitaryParade',
 'Mixing',
 'MoppingFloor',
 'Nunchucks',
 'ParallelBars',
 'PizzaTossing',
 'PlayingCello',
 'PlayingDaf',
 'PlayingDhol',
 'PlayingFlute',
 'Play

In [12]:
def select_subset_of_classes(files_for_class, classes, files_per_class):
    """Create a dictionary with the class name and a subset of the files in that class.

    Args:
      files_for_class: Dictionary of class names (key) and files (values).
      classes: List of classes.
      files_per_class: Number of files per class of interest.

    Returns:
      Dictionary with class as key and list of specified number of video files in that class.
    """
    files_subset = dict()

    for class_name in classes:
        class_files = files_for_class[class_name]
        files_subset[class_name] = class_files[:files_per_class]

    return files_subset

In [13]:
NUM_CLASSES = len(classes)
FILES_PER_CLASS = 100

files_subset = select_subset_of_classes(
    files_for_class, classes[:NUM_CLASSES], FILES_PER_CLASS
)
list(files_subset.keys())

['ApplyEyeMakeup',
 'ApplyLipstick',
 'Archery',
 'BabyCrawling',
 'BalanceBeam',
 'BandMarching',
 'BaseballPitch',
 'BasketballDunk',
 'Basketball',
 'BenchPress',
 'Biking',
 'Billiards',
 'BlowDryHair',
 'BlowingCandles',
 'BodyWeightSquats',
 'Bowling',
 'BoxingPunchingBag',
 'BoxingSpeedBag',
 'BreastStroke',
 'BrushingTeeth',
 'CleanAndJerk',
 'CliffDiving',
 'CricketBowling',
 'CricketShot',
 'CuttingInKitchen',
 'Diving',
 'Drumming',
 'Fencing',
 'FieldHockeyPenalty',
 'FloorGymnastics',
 'FrisbeeCatch',
 'FrontCrawl',
 'GolfSwing',
 'Haircut',
 'Hammering',
 'HammerThrow',
 'HandstandPushups',
 'HandstandWalking',
 'HeadMassage',
 'HighJump',
 'HorseRace',
 'HorseRiding',
 'HulaHoop',
 'IceDancing',
 'JavelinThrow',
 'JugglingBalls',
 'JumpingJack',
 'JumpRope',
 'Kayaking',
 'Knitting',
 'LongJump',
 'Lunges',
 'MilitaryParade',
 'Mixing',
 'MoppingFloor',
 'Nunchucks',
 'ParallelBars',
 'PizzaTossing',
 'PlayingCello',
 'PlayingDaf',
 'PlayingDhol',
 'PlayingFlute',
 'Play

In [14]:
def download_from_zip(zip_url, to_dir, file_names):
    """Download the contents of the zip file from the zip URL.

    Args:
      zip_url: A URL with a zip file containing data.
      to_dir: A directory to download data to.
      file_names: Names of files to download.
    """
    with rz.RemoteZip(zip_url) as zip:
        for fn in tqdm.tqdm(file_names):
            class_name = get_class(fn)
            zip.extract(fn, str(to_dir / class_name))
            unzipped_file = to_dir / class_name / fn

            fn = pathlib.Path(fn).parts[-1]
            output_file = to_dir / class_name / fn
            unzipped_file.rename(output_file)

The `split_class_lists()` differs from the above-mentioned tutorial. Read the comments to know more. [This comment](https://github.com/huggingface/notebooks/pull/261#pullrequestreview-1174969133) explains why this had to be done.

In [17]:
def split_class_lists(files_for_class, count):
    """Returns the list of files belonging to a subset of data as well as the remainder of
    files that need to be downloaded.

    Args:
      files_for_class: Files belonging to a particular class of data.
      count: Number of files to download.

    Returns:
      Files belonging to the subset of data and dictionary of the remainder of files
      that need to be downloaded.
    """
    split_files = []
    remainder = {}
    for cls in files_for_class:
        split_files.extend(files_for_class[cls][:count])
        remainder[cls] = files_for_class[cls][count:]

    # Return the remaining files for a particular class in the following manner.
    # 1. For the selected video files for a given class, we derive all the groups / scenes.
    ##
    # 2. Now, for each remaining video file in the same class,
    # we check if its group is already present in the set we derived in 1.
    ##
    # 3. If the above condition is false, then only we consider the respective
    # video file.

    split_files_cls_wise = get_files_per_class(split_files)
    new_files_for_class = collections.defaultdict(list)
    for cls in remainder:
        remainder_files = remainder[cls]
        selected_files = split_files_cls_wise[cls]
        unique_groups = {get_group(filename) for filename in selected_files}
        for filename in remainder_files:
            if not get_group(filename) in unique_groups:
                new_files_for_class[cls].append(filename)
    return split_files, new_files_for_class

In [18]:
def download_ufc_101_subset(zip_url, num_classes, splits, download_dir):
    """Download a subset of the UFC101 dataset and split them into various parts, such as
    training, validation, and test.

    Args:
      zip_url: A URL with a ZIP file with the data.
      num_classes: Number of labels.
      splits: Dictionary specifying the training, validation, test, etc. (key) division of data
              (value is number of files per split).
      download_dir: Directory to download data to.

    Return:
      Mapping of the directories containing the subsections of data.
    """
    files = list_files_from_zip_url(zip_url)
    for f in files:
        path = os.path.normpath(f)
        tokens = path.split(os.sep)
        if len(tokens) <= 2:
            # Remove that item from the list if it does not have a filename
            files.remove(f)

    files_for_class = get_files_per_class(files)

    classes = list(files_for_class.keys())[:num_classes]

    for cls in classes:
        random.shuffle(files_for_class[cls])

    # Only use the number of classes you want in the dictionary
    files_for_class = {x: files_for_class[x] for x in classes}

    dirs = {}
    for split_name, split_count in splits.items():
        print(split_name, ":")
        split_dir = download_dir / split_name
        split_files, files_for_class = split_class_lists(files_for_class, split_count)
        download_from_zip(zip_url, split_dir, split_files)
        dirs[split_name] = split_dir

    return dirs

## Prepare the subset

In [19]:
download_dir = "./UCF101_subset/"
download_dir = pathlib.Path(download_dir)
subset_paths = download_ufc_101_subset(
    URL,
    num_classes=NUM_CLASSES,
    splits={"train": 95, "val":5},
    download_dir=download_dir,
)

train :


100%|██████████| 6660/6660 [19:45<00:00,  5.62it/s]


val :


0it [00:00, ?it/s]


## Verification

In [None]:
for cls in classes[:NUM_CLASSES]:
    video_train_cls = list(download_dir.glob(f"train/{cls}/*.avi"))
    video_test_cls = list(download_dir.glob(f"test/{cls}/*.avi"))
    video_val_cls = list(download_dir.glob(f"val/{cls}/*.avi"))

    unique_groups_train = {get_group(str(x)) for x in video_train_cls}
    unique_groups_test = {get_group(str(x)) for x in video_test_cls}
    unique_groups_val = {get_group(str(x)) for x in video_val_cls}

    print(
        f"Class: {cls}, Intersection found with val: {unique_groups_train.intersection(unique_groups_val)}"
    )
    print(
        f"Class: {cls}, Intersection found with test: {unique_groups_train.intersection(unique_groups_test)}"
    )
    print("\n")

Class: ApplyEyeMakeup, Intersection found with val: set()
Class: ApplyEyeMakeup, Intersection found with test: set()


Class: ApplyLipstick, Intersection found with val: set()
Class: ApplyLipstick, Intersection found with test: set()


Class: Archery, Intersection found with val: set()
Class: Archery, Intersection found with test: set()


Class: BabyCrawling, Intersection found with val: set()
Class: BabyCrawling, Intersection found with test: set()


Class: BalanceBeam, Intersection found with val: set()
Class: BalanceBeam, Intersection found with test: set()


Class: BandMarching, Intersection found with val: set()
Class: BandMarching, Intersection found with test: set()


Class: BaseballPitch, Intersection found with val: set()
Class: BaseballPitch, Intersection found with test: set()


Class: BasketballDunk, Intersection found with val: set()
Class: BasketballDunk, Intersection found with test: set()


Class: Basketball, Intersection found with val: set()
Class: Basketball, Interse

In [None]:
for split in ["train", "test", "val"]:
    print(len({get_class(str(x)) for x in list(download_dir.glob(f"{split}/*/*.avi"))}))

10
10
10


In [None]:
video_count_train = len(list(download_dir.glob("train/*/*.avi")))
video_count_val = len(list(download_dir.glob("val/*/*.avi")))
video_count_test = len(list(download_dir.glob("test/*/*.avi")))
video_total = video_count_train + video_count_val + video_count_test
print(f"Total videos: {video_total}")

Total videos: 405


## Misc

In [None]:
!tar cf UCF101_subset.tar.gz UCF101_subset

In [None]:
!rm -rf "./UCF101_subset/"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp UCF101_subset.tar.gz /content/drive/MyDrive