In [1]:
import os

import av
import cv2
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import albumentations as A

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoProcessor, AutoModel

2024-03-18 21:27:18.005969: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-18 21:27:18.483681: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/vladimir/.virtualenvs/ml/lib/python3.10/site-packages/cv2/../../lib64:
2024-03-18 21:27:18.483729: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/vladimir/.virtualenvs/ml/lib/python3.10/site-pa

In [2]:
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
def apply_video_augmentations(video, transform):
    targets={'image': video[0]}
    for i in range(1, video.shape[0]):
        targets[f'image{i}'] = video[i]
    transformed = transform(**targets)
    transformed = np.concatenate(
        [np.expand_dims(transformed['image'], axis=0)] 
        + [np.expand_dims(transformed[f'image{i}'], axis=0) for i in range(1, video.shape[0])]
    )
    return transformed

In [4]:
def read_video_pyav(container, indices):
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    converted_len = int(clip_len * frame_sample_rate)
    while converted_len >= seg_len and clip_len > 1:
        clip_len -= 1
        converted_len = int(clip_len * frame_sample_rate)
    end_idx = converted_len
    start_idx = 0
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

In [5]:
batch_size = 32
root_dir = 'UCF-101/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Dataset preparation

In [6]:
labels = [i for i in os.listdir(root_dir) if i[0] != '.']
labels2id = {label:i for i, label in enumerate(labels)}

In [7]:
train = []
for label in tqdm(labels):
    for video_name in os.listdir(f'{root_dir}/{label}'):
        container = av.open(f'{root_dir}/{label}/{video_name}')
        if container.streams.video[0].frames > 75:
            train.append({
                'label': label,
                'video_path': f'{root_dir}/{label}/{video_name}'
            })
train = pd.DataFrame(train)

  0%|          | 0/101 [00:00<?, ?it/s]

In [8]:
train.label.value_counts()

PlayingCello         164
PlayingDhol          164
HorseRiding          163
BoxingPunchingBag    162
Drumming             161
                    ... 
BodyWeightSquats      90
JavelinThrow          82
BlowingCandles        68
BasketballDunk        57
PushUps               54
Name: label, Length: 101, dtype: int64

In [9]:
train['label_id'] = train.label.map(labels2id)

In [10]:
X_train, X_val, _, _ = train_test_split(train, train['label'])

In [11]:
transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.5, contrast_limit=0.5, p=0.5)
], additional_targets={
    f'image{i}': 'image'
    for i in range(1, 8)
})

In [12]:
class ActionDataset(Dataset):

    def __init__(self, meta, transform=None):
        self.meta = meta
        self.transform = transform

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        while True:
            try:
                file_path = self.meta['video_path'].iloc[idx]
                container = av.open(file_path)
                indices = sample_frame_indices(clip_len=8, frame_sample_rate=5, seg_len=container.streams.video[0].frames)
                video = read_video_pyav(container, indices)
                while video.shape[0] < 8:
                    video = np.vstack([video, video[-1:]])
            except Exception as e:
                continue
                
            break

        if self.transform:
            transformed = apply_video_augmentations(video, self.transform)
            

        inputs = processor(
            text=[''],
            videos=list(video),
            return_tensors="pt",
            padding=True,
        )
        for i in inputs:
            inputs[i] = inputs[i][0]

        return inputs, self.meta['label_id'].iloc[idx]

In [13]:
train_dataset = ActionDataset(meta=X_train, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=15)

val_dataset = ActionDataset(meta=X_val)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=15)

# Load model

In [14]:
processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
model.to(device)
classifier = nn.Linear(512, len(labels))
classifier.to(device)

Linear(in_features=512, out_features=101, bias=True)

# Frozen XClip training 

In [15]:
for param in model.parameters():
    param.requires_grad = False

In [16]:
epochs = 5
freeze_epochs = 5
model_lr = 1e-5
classifier_lr = 1e-3

model_optimizer = optim.AdamW(model.parameters(), model_lr)
classifier_optimizer = torch.optim.AdamW(classifier.parameters(), lr=classifier_lr)

criterion = nn.CrossEntropyLoss()

In [17]:
for epoch in range(epochs):

    model.eval()
    classifier.train()     

    train_loss = []
    for i, (batch, targets) in enumerate(tqdm(train_dataloader, desc=f"Epoch: {epoch}")):
        classifier_optimizer.zero_grad()

        batch = batch.to(device)
        targets = targets.to(device)

        with torch.no_grad():
            outputs = model(**batch)
        logits = classifier(outputs.video_embeds)

        loss = criterion(logits, targets) 
        loss.backward()
        classifier_optimizer.step()

        train_loss.append(loss.item())

    print('Training loss:', np.mean(train_loss))

    model.eval() 
    classifier.eval()    

    val_loss = []
    val_targets = []
    val_preds = []
    for i, (batch, targets) in enumerate(tqdm(val_dataloader, desc=f"Epoch: {epoch}")):
        with torch.no_grad():

            batch = batch.to(device)
            targets = targets.to(device)

            outputs = model(**batch)
            logits = classifier(outputs.video_embeds)

            loss = criterion(logits, targets) 

            val_loss.append(loss.item())
            val_targets.extend(targets.cpu().numpy())
            val_preds.extend(logits.argmax(axis=1).cpu().numpy())

    print('Val loss:', np.mean(val_loss))
    print('F1:', f1_score(val_targets, val_preds, average='macro'))

Epoch: 0:   0%|          | 0/293 [00:00<?, ?it/s]

Training loss: 4.018884596971115


Epoch: 0:   0%|          | 0/98 [00:00<?, ?it/s]

Val loss: 3.433330998128774
F1: 0.7946440619920973


Epoch: 1:   0%|          | 0/293 [00:00<?, ?it/s]

Training loss: 2.900474670397137


Epoch: 1:   0%|          | 0/98 [00:00<?, ?it/s]

Val loss: 2.4261778544406503
F1: 0.8477762722676296


Epoch: 2:   0%|          | 0/293 [00:00<?, ?it/s]

Training loss: 2.006405236777999


Epoch: 2:   0%|          | 0/98 [00:00<?, ?it/s]

Val loss: 1.679323940860982
F1: 0.8728186861949377


Epoch: 3:   0%|          | 0/293 [00:00<?, ?it/s]

Training loss: 1.3875153754351488


Epoch: 3:   0%|          | 0/98 [00:00<?, ?it/s]

Val loss: 1.1925898747784751
F1: 0.8929003187479272


Epoch: 4:   0%|          | 0/293 [00:00<?, ?it/s]

Training loss: 0.9996980937267733


Epoch: 4:   0%|          | 0/98 [00:00<?, ?it/s]

Val loss: 0.8944721252334361
F1: 0.9112310784445119


# Full XClip training

In [18]:
for param in model.parameters():
    param.requires_grad = True
for param in model.text_model.parameters():
    param.requires_grad = False

In [19]:
for epoch in range(epochs):

    model.train() 
    classifier.train()     

    train_loss = []
    for i, (batch, targets) in enumerate(tqdm(train_dataloader, desc=f"Epoch: {epoch}")):
        model_optimizer.zero_grad()
        classifier_optimizer.zero_grad()

        batch = batch.to(device)
        targets = targets.to(device)

        outputs = model(**batch)
        logits = classifier(outputs.video_embeds)

        loss = criterion(logits, targets) 
        loss.backward()
        model_optimizer.step()
        classifier_optimizer.step()

        train_loss.append(loss.item())

    print('Training loss:', np.mean(train_loss))

    model.eval()
    classifier.eval()   

    val_loss = []
    val_targets = []
    val_preds = []
    for i, (batch, targets) in enumerate(tqdm(val_dataloader, desc=f"Epoch: {epoch}")):
        with torch.no_grad():

            batch = batch.to(device)
            targets = targets.to(device)

            outputs = model(**batch)
            logits = classifier(outputs.video_embeds)

            loss = criterion(logits, targets) 

            val_loss.append(loss.item())
            val_targets.extend(targets.cpu().numpy())
            val_preds.extend(logits.argmax(axis=1).cpu().numpy())           

    print('Val loss:', np.mean(val_loss))
    print('F1:', f1_score(val_targets, val_preds, average='macro'))

Epoch: 0:   0%|          | 0/293 [00:00<?, ?it/s]

Training loss: 0.36282466362157373


Epoch: 0:   0%|          | 0/98 [00:00<?, ?it/s]

Val loss: 0.2345287130803478
F1: 0.9609115342441644


Epoch: 1:   0%|          | 0/293 [00:00<?, ?it/s]

Training loss: 0.101868323689333


Epoch: 1:   0%|          | 0/98 [00:00<?, ?it/s]

Val loss: 0.12485515451704969
F1: 0.9829743104295883


Epoch: 2:   0%|          | 0/293 [00:00<?, ?it/s]

Training loss: 0.04672228425436052


Epoch: 2:   0%|          | 0/98 [00:00<?, ?it/s]

Val loss: 0.09608748642613693
F1: 0.985137991482419


Epoch: 3:   0%|          | 0/293 [00:00<?, ?it/s]

Training loss: 0.03176195842592179


Epoch: 3:   0%|          | 0/98 [00:00<?, ?it/s]

Val loss: 0.07797025535635803
F1: 0.9858082602909529


Epoch: 4:   0%|          | 0/293 [00:00<?, ?it/s]

Training loss: 0.02439089220195195


Epoch: 4:   0%|          | 0/98 [00:00<?, ?it/s]

Val loss: 0.06936652772128582
F1: 0.9860634274981412
