In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Start Monday Oct 7th 2024
import os
print(os.getcwd())

! git clone https://github.com/huggingface/transformers.git
! ls /kaggle/working/transformers/src

/kaggle/working
Cloning into 'transformers'...
remote: Enumerating objects: 233169, done.[K
remote: Counting objects: 100% (530/530), done.[K
remote: Compressing objects: 100% (271/271), done.[K
remote: Total 233169 (delta 318), reused 375 (delta 212), pack-reused 232639 (from 1)[K
Receiving objects: 100% (233169/233169), 246.94 MiB | 25.45 MiB/s, done.
Resolving deltas: 100% (169859/169859), done.
transformers


In [3]:
# Install AvPy
! pip3 install av


import random
import matplotlib.pyplot as plt
import torch.nn as nn
import numpy as np
import shutil # for moving files around

from sklearn.model_selection import train_test_split

# Import relevant packages
import av
from transformers import VivitConfig, VivitModel, VivitImageProcessor, VivitForVideoClassification
from huggingface_hub import hf_hub_download

Collecting av
  Downloading av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: av
Successfully installed av-13.1.0


In [4]:
# Import various libraries, take model and split into classes, move them into the working directory

data_dir = '../input/hand-wash-dataset/HandWashDataset/HandWashDataset'

classes = ['Step_1', 'Step_2_Left', 'Step_2_Right', 'Step_3', 'Step_4_Left', 'Step_4_Right', 'Step_5_Left',
           'Step_5_Right', 'Step_6_Left', 'Step_6_Right', 'Step_7_Left', 'Step_7_Right']

def split_dataset(data_dir, classes, test_size=0.2, random_state=20):
    video_lengths = []
    for class_name in classes:
        class_dir = os.path.join(data_dir, class_name)
        videos = os.listdir(class_dir)
        for video_file in videos:
            video_path = os.path.join(class_dir, video_file)
            video_lengths.append((class_name, video_path))
            
    train_vid, test_vid = train_test_split(video_lengths, test_size=test_size, random_state=random_state)
    return train_vid, test_vid

def copy_videos_to_folders(videos, output_dir, set_name):
    for class_name, video_path in videos:
        destination_dir = os.path.join(output_dir, set_name, class_name)
        if not os.path.exists(destination_dir):
            os.makedirs(destination_dir)
        filename = os.path.basename(video_path)
        dest_file = os.path.join(destination_dir, filename)
        shutil.copy(video_path, dest_file)

# Preparing the dataset and splititng into training and test datasets
train_videos, test_videos = split_dataset(data_dir, classes)

# output directory
output_dir = '/kaggle/working'
copy_videos_to_folders(train_videos, output_dir, 'train')
copy_videos_to_folders(test_videos, output_dir, 'test')

In [5]:
# Auxilliary functions from Hugging Face
def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.
    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`List[int]`): List of sampled frame indices
    '''
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

In [6]:
import sys
from torch.utils.data import Dataset, DataLoader

# Class of dataset (Needs a lot of preprocessing)
class HandwashingDataset(Dataset):
    """Handwashing Dataset."""
    def __init__(self, videos, labels, transform=None):
        assert(len(videos) == len(labels))
        self.videos = videos
        self.classes = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return {'video' : videos[idx], 'class' : labels[idx]}

def make_dataset(data_dir, classes, image_processor):
    # data_dir refers to the directory from 'working/kaggle' (typically 'test' or 'train')
    # assumes that at 'working/kaggle/data_dir' there is a directory for each class in classes
    videos = []
    labels = []
    for i in range(len(classes)):
        print(f"\rDownloading from {data_dir}/{classes[i]}")
        
        # Navigate to the class directory and list out the files in this directory
        class_dir = os.path.join(data_dir, classes[i])
        video_names = os.listdir(class_dir)
        for j in range(len(video_names)):
            print(f"\r Importing {i+1} out of {len(video_names)}", end="")
            sys.stdout.flush()
            
            vid_title = video_names[j] # Get title
            path = os.path.join(class_dir, vid_title) # Navigate to title
            vid_container = av.open(path) # Open video at title
            vid_indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=vid_container.streams.video[0].frames) # Get random indices
            vid_read = read_video_pyav(container=vid_container, indices=vid_indices) # read the video
            vid_input = image_processor(list(vid_read), return_tensors="pt") # preprocess the video
            videos.append(vid_input) # add the video to the list
            labels.append(j) # add the label
            
    print("\nDone")
    return (videos, labels)

image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
videos, labels = make_dataset('train', classes, image_processor)
data_loader = HandwashingDataset(videos, labels)

preprocessor_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading from train/Step_1
 Importing 1 out of 20

  return torch.tensor(value)


Downloading from train/Step_2_Left
Downloading from train/Step_2_Right
Downloading from train/Step_3
Downloading from train/Step_4_Left
Downloading from train/Step_4_Right
Downloading from train/Step_5_Left
Downloading from train/Step_5_Right
Downloading from train/Step_6_Left
Downloading from train/Step_6_Right
Downloading from train/Step_7_Left
Downloading from train/Step_7_Right
 Importing 12 out of 18
Done


In [7]:
type(videos[0])

transformers.image_processing_base.BatchFeature

In [13]:
# set configuration and initalize the desired model
import torch

config = VivitConfig()
config.num_labels = len(classes)

model = VivitForVideoClassification(config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available else nn.CrossEntropyLoss()

In [17]:
outputs = model(**videos[0])
logits = outputs.logits
predicted_label = logits.argmax(-1).item()
print(outputs)
print(logits)
print(predicted_label)

ImageClassifierOutput(loss=None, logits=tensor([[ 0.5762, -0.1448,  0.1088,  1.0504, -0.4075, -0.6358,  0.1499,  0.9902,
          0.5092,  0.1598,  0.0804,  1.0279]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[ 0.5762, -0.1448,  0.1088,  1.0504, -0.4075, -0.6358,  0.1499,  0.9902,
          0.5092,  0.1598,  0.0804,  1.0279]], grad_fn=<AddmmBackward0>)
3


In [26]:
desired = torch.from_numpy(np.array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]))

criterion(logits, desired)

tensor(2.3352, dtype=torch.float64, grad_fn=<DivBackward1>)

In [None]:
def train_net(n_epochs):
    
    model.train()
    for epoch in range(n_epochs):
        running_loss = 0.0
        
        for batch_i, data in enumerate(data_loader):
            video = data['video'].to(device)
            label = data['class'].to(device)
            
            outputs = model(video)
            loss = criterion(outputs.view(-1))