In [1]:
import os

import av
import cv2
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import albumentations as A

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoProcessor, AutoModel

In [2]:
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [3]:
batch_size = 8
root_dir = 'UCF-101/UCF-101/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [4]:
def apply_video_augmentations(video, transform):
    targets={'image': video[0]}
    for i in range(1, video.shape[0]):
        targets[f'image{i}'] = video[i]
    transformed = transform(**targets)
    transformed = np.concatenate(
        [np.expand_dims(transformed['image'], axis=0)] 
        + [np.expand_dims(transformed[f'image{i}'], axis=0) for i in range(1, video.shape[0])]
    )
    return transformed

In [5]:
def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = converted_len
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

# Dataset preparation

In [6]:
labels = [i for i in os.listdir(root_dir) if i[0] != '.']
labels2id = {label:i for i, label in enumerate(labels)}

In [7]:
train = []
for label in tqdm(labels):
    for video_name in os.listdir(f'{root_dir}/{label}'):
        container = av.open(f'{root_dir}/{label}/{video_name}')
        if container.streams.video[0].frames > 75:
            train.append({
                'label': label,
                'video_path': f'{root_dir}/{label}/{video_name}'
            })
train = pd.DataFrame(train)

  0%|          | 0/101 [00:00<?, ?it/s]

In [8]:
train.label.value_counts()

PlayingDhol          164
PlayingCello         164
HorseRiding          163
BoxingPunchingBag    162
Drumming             161
                    ... 
BodyWeightSquats      90
JavelinThrow          82
BlowingCandles        68
BasketballDunk        57
PushUps               54
Name: label, Length: 101, dtype: int64

In [9]:
train['label_id'] = train.label.map(labels2id)

In [10]:
X_train, X_val, _, _ = train_test_split(train, train['label'])

# Load model

In [11]:
processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
model.to(device)

XCLIPModel(
  (text_model): XCLIPTextTransformer(
    (embeddings): XCLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): XCLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x XCLIPEncoderLayer(
          (self_attn): XCLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): XCLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps

# Zero-shot example

In [12]:
file_path = 'UCF-101/UCF-101/ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01.avi'
container = av.open(file_path)
indices = sample_frame_indices(clip_len=8, frame_sample_rate=5, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)

In [13]:
inputs = processor(
    text=labels,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)
inputs.to(device)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video
probs = logits_per_video.softmax(dim=1)
print(labels[probs.argmax()])

  return torch.tensor(value)


ApplyEyeMakeup


# Validate zero-shot

In [14]:
model.eval()  

val_targets = []
val_preds = []
for line in tqdm(X_val.itertuples()):
    
    file_path = line.video_path
    container = av.open(file_path)
    indices = sample_frame_indices(clip_len=8, frame_sample_rate=5, seg_len=container.streams.video[0].frames)
    video = read_video_pyav(container, indices)
    
    inputs = processor(
        text=labels,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )
    
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)

    val_targets.append(line.label_id)
    val_preds.append(probs.argmax(axis=1).cpu().numpy()[0])

print('F1:', f1_score(val_targets, val_preds, average='macro'))

0it [00:00, ?it/s]

F1: 0.6009428122232213


# Train

In [15]:
transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.5, contrast_limit=0.5, p=0.5)
], additional_targets={
    f'image{i}': 'image'
    for i in range(1, 8)
})

In [16]:
class ActionDataset(Dataset):

    def __init__(self, meta, transform=None):
        self.meta = meta
        self.transform = transform

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()


        file_path = self.meta['video_path'].iloc[idx]
        container = av.open(file_path)
        indices = sample_frame_indices(clip_len=8, frame_sample_rate=5, seg_len=container.streams.video[0].frames)
        try:
            indices = sample_frame_indices(clip_len=8, frame_sample_rate=5, seg_len=container.streams.video[0].frames)
        except Exception:
            indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        if indices.shape[0] < 8:
            indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
            
        video = read_video_pyav(container, indices)
        while video.shape[0] < 8:
            video = np.vstack([video, video[-1:]])

        if self.transform:
            transformed = apply_video_augmentations(video, self.transform)
            

        inputs = processor(
            text=[self.meta['label'].iloc[idx]],
            videos=list(video),
            return_tensors="pt",
            padding='max_length',
            max_length=8
        )
        for i in inputs:
            inputs[i] = inputs[i][0]

        return inputs

In [17]:
train_dataset = ActionDataset(meta=X_train, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

In [18]:
epochs = 5
lr = 1e-5

optimizer = optim.AdamW(model.parameters(), lr)

In [19]:
for epoch in range(epochs):

    model.train()    

    train_loss = []
    for i, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch: {epoch}")):
        optimizer.zero_grad()

        batch = batch.to(device)

        outputs = model(**batch, return_loss=True)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

    print('Training loss:', np.mean(train_loss))
    
    model.eval()  

    val_targets = []
    val_preds = []
    for line in tqdm(X_val.itertuples()):

        file_path = line.video_path
        container = av.open(file_path)
        indices = sample_frame_indices(clip_len=8, frame_sample_rate=5, seg_len=container.streams.video[0].frames)
        video = read_video_pyav(container, indices)

        inputs = processor(
            text=labels,
            videos=list(video),
            return_tensors="pt",
            padding=True,
        )

        inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        logits_per_video = outputs.logits_per_video
        probs = logits_per_video.softmax(dim=1)

        val_targets.append(line.label_id)
        val_preds.append(probs.argmax(axis=1).cpu().numpy()[0])

    print('F1:', f1_score(val_targets, val_preds, average='macro'))

Epoch: 0:   0%|          | 0/1172 [00:00<?, ?it/s]

Training loss: 0.17138626353610628


0it [00:00, ?it/s]

F1: 0.7701438562803449


Epoch: 1:   0%|          | 0/1172 [00:00<?, ?it/s]

Training loss: 0.12089584347651765


0it [00:00, ?it/s]

F1: 0.7454664787634218


Epoch: 2:   0%|          | 0/1172 [00:00<?, ?it/s]

Training loss: 0.10255951884639385


0it [00:00, ?it/s]

F1: 0.8453154939832047


Epoch: 3:   0%|          | 0/1172 [00:00<?, ?it/s]

Training loss: 0.10020141764683199


0it [00:00, ?it/s]

F1: 0.8156328234738446


Epoch: 4:   0%|          | 0/1172 [00:00<?, ?it/s]

Training loss: 0.09650173033271515


0it [00:00, ?it/s]

F1: 0.8402585610691524
