In [None]:
# load dataset
!gdown 10G5-q3Eq52tYEncTF54douBsjCTUVoXb
!unrar x datasets.rar

In [None]:
!pip install av
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting av
  Downloading av-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.0/31.0 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-10.0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from tra

In [None]:
from datetime import datetime
from pathlib import Path

import av
import cv2
import pandas as pd
import numpy as np
import torch
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import TimesformerConfig, TimesformerForVideoClassification, AutoImageProcessor
from tqdm import tqdm

## Dataset

In [135]:
import glob
valid_paths = glob.glob("datasets/train/*")

data = pd.read_csv('datasets/kinetics700/train_dance.csv')
data = data.dropna(subset=['path'])
data['path'] = [i.replace('mp4', '3gpp') for i in data['path']]
data['path'] = [i.replace('\\', r'/') for i in data['path']]
data = data[[i in valid_paths for i in data.path]].copy()
data = data.reset_index(drop=True)

le = preprocessing.LabelEncoder()
data['target'] = le.fit_transform(data.label)
data

Unnamed: 0,label,youtube_id,time_start,time_end,split,path,target
0,belly dancing,-2JgjPsy4Eo,77,87,train,datasets/train/Anna Rubtsova - Zay El Asal ISA...,0
1,salsa dancing,-3FihEVl-R8,26,36,train,datasets/train/Salsa dancing Hassan and Kim.3gpp,10
2,country line dancing,-4HzfnOtVeI,150,160,train,datasets/train/run rabbit run Line Dance par J...,2
3,tap dancing,-4r6VLqGeK4,106,116,train,datasets/train/My Mad Tap Dancing Skillz c.3gpp,14
4,mosh pit dancing,-9N39otwJl8,22,32,train,datasets/train/mosh pitts.3gpp,8
...,...,...,...,...,...,...,...
2185,dancing ballet,zqsDdWv33Ho,102,112,train,datasets/train/ballet CascaNueces Juguetes.3gpp,3
2186,mosh pit dancing,zsY9bKeIW9o,48,58,train,datasets/train/Pierce The Veil Mosh Pit.3gpp,8
2187,belly dancing,zvni26d4ZoI,102,112,train,datasets/train/Asian girl belly dancing in a l...,0
2188,mosh pit dancing,zxhj8Mg2oNE,13,23,train,datasets/train/Foo Fighters Live at Wembley Mo...,8


In [155]:
def read_video_opencv(capture, indices):

    frames = []
    start_index = indices[0]
    end_index = indices[-1]
    idx = 0
    while True:
        ret, frame = capture.read()
        if ret:
            if idx > end_index:
                capture.release()
                break
            if idx >= start_index and idx in indices:
                frames.append(frame[:, :, ::-1])
            idx += 1
    capture.release()
    return np.array(frames)


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = converted_len
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

In [156]:
class VideoDataset(Dataset):
    def __init__(self, data):
        self.paths = data['path']
        self.targets = data['target']
        self.time_start = data['time_start']

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        file_path = self.paths.iloc[idx]
        target = self.targets.iloc[idx]

        container = av.open(file_path)
        cap = cv2.VideoCapture(file_path)

        start_idx = self.time_start.iloc[idx] * cap.get(cv2.CAP_PROP_FPS)
        if start_idx < cap.get(cv2.CAP_PROP_FPS):  # Некорректная разметка на некоторых примерах
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_idx)

        try:
            indices = sample_frame_indices(clip_len=8, frame_sample_rate=5, seg_len=cap.get(cv2.CAP_PROP_FRAME_COUNT))
        except Exception:
            indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if indices.shape[0] < 8:
            indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=cap.get(cv2.CAP_PROP_FRAME_COUNT))

        video = read_video_opencv(cap, indices)
        while video.shape[0] < 8:
            video = np.vstack([video, video[-1:]])

        video = processor(list(video), return_tensors="pt")
        return video, target

In [157]:
train_data, val_data = train_test_split(data, test_size=0.3, shuffle=True, stratify=data['target'])
train_dataset = VideoDataset(train_data)

In [158]:
batch_size = 4 #8
processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")

train_data, val_data = train_test_split(data, test_size=0.3, shuffle=True, stratify=data['target'])

train_dataset = VideoDataset(train_data)
val_dataset = VideoDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


## Training

In [159]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [160]:
# configuration = TimesformerConfig()
# model = TimesformerForVideoClassification(configuration)
model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")
model.classifier = nn.Linear(in_features=768, out_features=data.label.nunique(), bias=True)
model.to(device)

TimesformerForVideoClassification(
  (timesformer): TimesformerModel(
    (embeddings): TimesformerEmbeddings(
      (patch_embeddings): TimesformerPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
    )
    (encoder): TimesformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x TimesformerLayer(
          (drop_path): Identity()
          (attention): TimeSformerAttention(
            (attention): TimesformerSelfAttention(
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
            )
            (output): TimesformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): TimesformerIntermediate(
            (dense

In [161]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.CrossEntropyLoss()

In [162]:
for param in model.parameters():
    param.requires_grad = True

In [163]:
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    for i, data in tqdm(enumerate(train_loader)):
        inputs, labels = data
        inputs = inputs['pixel_values'].to(device)
        inputs = torch.squeeze(inputs, 1)

        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)

        logits = outputs.logits
        predicted_label = logits.softmax(dim=1)

        loss = loss_fn(logits, labels) # predicted_label
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    last_loss = running_loss / len(train_dataset)
    return last_loss

In [164]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 10

best_vloss = 1000000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    model.train()
    avg_loss = train_one_epoch(epoch_number, writer)

    running_vloss = 0.0
    val_targets = []
    val_preds = []
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(val_loader):
          inputs, labels = data
          inputs = inputs['pixel_values'].to(device)
          inputs = torch.squeeze(inputs, 1)

          labels = labels.to(device)

          outputs = model(inputs)
          logits = outputs.logits
          predicted_label = logits.softmax(dim=1)
          vloss = loss_fn(logits, labels)
          running_vloss += vloss.item()
          val_targets.extend(labels.cpu().numpy())
          val_preds.extend(predicted_label.argmax(axis=1).cpu().numpy())

    avg_vloss = running_vloss / len(val_dataset)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
    print('F1:', f1_score(val_targets, val_preds, average='macro'))
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:


88it [03:36,  2.46s/it]


LOSS train 0.7014194256918771 valid 0.610669777393341
F1: 0.23384612381537984
EPOCH 2:


88it [03:35,  2.45s/it]


LOSS train 0.3649964950765882 valid 0.5969261852900187
F1: 0.26460454835841835
EPOCH 3:


88it [03:36,  2.46s/it]


LOSS train 0.1284137983407293 valid 0.6627759162584941
F1: 0.21414853768040917
EPOCH 4:


88it [03:35,  2.45s/it]


LOSS train 0.06576877956145576 valid 0.7047584621111552
F1: 0.25903934842317383
EPOCH 5:


88it [03:39,  2.49s/it]


LOSS train 0.0298586087860167 valid 0.7592365447680155
F1: 0.22805174588485125
EPOCH 6:


88it [03:34,  2.44s/it]


LOSS train 0.01681288529586579 valid 0.670877077182134
F1: 0.28881621923727185
EPOCH 7:


88it [03:33,  2.43s/it]


LOSS train 0.015929679339086372 valid 0.7130652360121409
F1: 0.3367013963985293
EPOCH 8:


88it [03:34,  2.44s/it]


LOSS train 0.015703272059160684 valid 0.7141064500808716
F1: 0.24009956077765343
EPOCH 9:


88it [03:33,  2.43s/it]


LOSS train 0.01253942341277642 valid 0.7584423327445984
F1: 0.2489476949240995
EPOCH 10:


88it [03:33,  2.43s/it]


LOSS train 0.003513576736108267 valid 0.7884823719660441
F1: 0.2566415350686373
