# Imports

In [1]:
import mediapipe as mp

import os
from pathlib import Path

import torch
import numpy as np
import pandas as pd
import torchmetrics
import seaborn as sns
import torch.utils.data

  warn(


In [2]:
import pytorchvideo.data
import pytorch_lightning
import torch.nn.functional as F
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.metrics import confusion_matrix

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    ShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    CenterCrop,
    RandomHorizontalFlip,
)



# Prepare dataset

In [3]:
def move_videos_to_dir(labels: pd.DataFrame, out_dir: Path):
    in_dir = Path("data", "videos")
    for _, label in tqdm(labels.iterrows(), total=len(labels)):
        class_name = label["label"]
        class_dir = out_dir / class_name
        filename = label["youtube_id"] + ".mp4"
        file = in_dir / filename
        if file.is_file():
            if not class_dir.is_dir():
                class_dir.mkdir(parents=True, exist_ok=True)
            file.rename(class_dir / filename)

In [5]:
csv_filenames = {
     "train": "dancing-train.csv",
     "val": "dancing-validate.csv",
}
for phase, filename in csv_filenames.items():
    labels_df = pd.read_csv(f"data/{filename}")
    move_videos_to_dir(labels_df, Path("data", "videos", phase))

# Train models

## Init data module

In [4]:
class KineticsDataModule(pytorch_lightning.LightningDataModule):

  # Dataset configuration
  _DATA_PATH = "./data/videos"
  _CLIP_DURATION = 2  # Duration of sampled clip for each video
  _BATCH_SIZE = 4
  _NUM_WORKERS = 10  # Number of parallel processes fetching data

  def train_dataloader(self):
    """
    Create the Kinetics train partition from the list of video labels
    in {self._DATA_PATH}/train
    """
    train_transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose([
                UniformTemporalSubsample(16),
                Lambda(lambda x: x / 255.0),
                Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                RandomShortSideScale(min_size=256, max_size=320),
                RandomCrop(224),
                RandomHorizontalFlip(p=0.5),
            ]),
        ),
    ])
    train_dataset = pytorchvideo.data.Kinetics(
        data_path=os.path.join(self._DATA_PATH, "train"),
        clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
        transform=train_transform,
        decode_audio=False,
    )
    return torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self._BATCH_SIZE,
        num_workers=self._NUM_WORKERS,
    )

  def val_dataloader(self):
    """
    Create the Kinetics validation partition from the list of video labels
    in {self._DATA_PATH}/val
    """
    val_transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose([
                UniformTemporalSubsample(16),
                Lambda(lambda x: x / 255.0),
                Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                ShortSideScale(224),
                CenterCrop(224),
                RandomHorizontalFlip(p=0.5),
            ]),
        ),
    ])
    val_dataset = pytorchvideo.data.Kinetics(
        data_path=os.path.join(self._DATA_PATH, "val"),
        clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", self._CLIP_DURATION),
        transform=val_transform,
        decode_audio=False,
    )
    return torch.utils.data.DataLoader(
        val_dataset,
        batch_size=self._BATCH_SIZE,
        num_workers=self._NUM_WORKERS,
    )

  def get_classes(self):
      folder = os.path.join(self._DATA_PATH, "train")
      return [f for f in os.listdir(folder)]

## Init torch lightning trainer

In [58]:
import torch
import pytorch_lightning as pl
import torchmetrics
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix

class VideoClassificationLightningModule(pl.LightningModule):
    def __init__(self, classes, model_fn):
        super().__init__()
        self.model = model_fn()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=8)
        self.f1 = torchmetrics.F1Score(task="multiclass", average="weighted", num_classes=8)
        self.classes = classes
        self.validation_step_outputs = []
        self.validation_step_preds = []
        self.validation_step_targets = []

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        y_hat = self.model(batch["video"])
        loss = F.cross_entropy(y_hat, batch["label"])
        return loss

    def validation_step(self, batch, batch_idx):
        if batch is None or "video" not in batch or "label" not in batch:
            raise ValueError("Batch data is incomplete or None.")
        
        video_data = batch.get("video")
        label_data = batch.get("label")
        
        if video_data is None or label_data is None:
            raise ValueError("Video or Label data is None.")
        
        preds = self.model(video_data)
        loss = F.cross_entropy(preds, label_data)
        self.validation_step_outputs.append(loss)
        self.validation_step_preds.append(preds)
        self.validation_step_targets.append(label_data)
    
        return {"loss": loss, "preds": preds, "target": label_data}


    def on_validation_epoch_end(self):
        avg_loss = torch.stack(self.validation_step_outputs).mean()
        preds = torch.cat(self.validation_step_preds, dim=0)
        targets = torch.cat(self.validation_step_targets, dim=0)
        
        acc = self.accuracy(preds, targets).item()
        f1 = self.f1(preds, targets).item()
        
        preds_classes = torch.argmax(preds, dim=1).cpu().numpy()
        targets = targets.cpu().numpy()
        cm = confusion_matrix(targets, preds_classes)

        # Print the confusion matrix, average loss, accuracy, and F1 score
        print("Confusion Matrix:")
        print(cm)
        print(f"Average Loss: {avg_loss}")
        print(f"Accuracy: {acc}")
        print(f"F1 Score: {f1}")

        # Clear validation data
        self.validation_step_outputs.clear()
        self.validation_step_preds.clear()
        self.validation_step_targets.clear()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-4)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
        return {"optimizer": optimizer, "lr_scheduler": scheduler}


## П.0 - Предобученные модели

In [6]:
def make_kinetics_mvit():
    model = torch.hub.load('facebookresearch/pytorchvideo', 'mvit_base_16x4', pretrained=True)
    model.blocks[:-1].requires_grad_(False)
    model.head.proj = torch.nn.Linear(
        in_features=model.head.proj.in_features,
        out_features=8
    )
    return model

In [8]:
def train(model_fn):
    data_module = KineticsDataModule()
    classification_module = VideoClassificationLightningModule(
        classes=data_module.get_classes(),
        model_fn=model_fn,
    )
    trainer = pytorch_lightning.Trainer(limit_train_batches=50,  # Обрабатывать только первые 200 батчей обучения каждую эпоху
                                        limit_val_batches=50,     # Обрабатывать только первые 50 батчей валидации каждую эпоху
                                        max_epochs=1,             # Количество эпох
                                        accelerator='gpu',
                                        # devices=1
                                       ) #(gpus=-1, max_epochs=3)
    trainer.fit(classification_module, data_module)

In [38]:
train(model_fn=make_kinetics_mvit)

Using cache found in /home/user/.cache/torch/hub/facebookresearch_pytorchvideo_main
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/user/Desktop/sergey_action_recognition/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                         | Params
----------------------------------------------------------
0 | model    | MultiscaleVisionTransformers | 36.3 M
1 | accuracy | MulticlassAccuracy           | 0     
2 | f1       | MulticlassF1Score            | 0     
----------------------------------------------------------
7.4 M     Trainable params
28.9 M    Non-trainable params
36.3 M    Total params
145.237   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Confusion Matrix:
[[0 0 0 0 0 0]
 [0 0 0 1 2 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [1 0 1 0 0 3]
 [0 0 0 0 0 0]]
Average Loss: 2.2145962715148926
Accuracy: 0.0
F1 Score: 0.0


  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Confusion Matrix:
[[35  1  0  1  0  0  0  0]
 [ 0 12  1  2  0 11  0 10]
 [ 5  0  5  0  0  0  0  0]
 [ 0  0  0 16  1  0  1  0]
 [ 0  0  1 10  6  0  2  1]
 [ 4  0  1  2  9 14  4  1]
 [ 0  1  0  2  2  5  9  5]
 [ 0  1  0  0  0  4  0 15]]
Average Loss: 1.8468093872070312
Accuracy: 0.5600000023841858
F1 Score: 0.5431165099143982


`Trainer.fit` stopped: `max_epochs=1` reached.


## П.3 - Построена и обучена модель классификации видео на основе 3D свёрток - 3DCNN

In [47]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [9]:
import torch.nn as nn
class Simple3DCNN(nn.Module):
    def __init__(self, num_classes):
        super(Simple3DCNN, self).__init__()
        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=1)
        self.pool = nn.MaxPool3d((2, 2, 2))
        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=1)
        self.fc1 = nn.Linear(128 * 4 * 56 * 56, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(-1, 128 * 4 * 56 * 56)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [10]:
def make_3D_CNN(num_classes):
    model = Simple3DCNN(num_classes=num_classes)
    return model

In [11]:
def train(model_fn, num_classes):
    data_module = KineticsDataModule()
    classification_module = VideoClassificationLightningModule(
        classes=data_module.get_classes(),
        model_fn=lambda: model_fn(num_classes=len(data_module.get_classes())),
    )
    trainer = pytorch_lightning.Trainer(
        limit_train_batches=50,
        limit_val_batches=50,
        max_epochs=2,
        accelerator='gpu',
    )
    trainer.fit(classification_module, data_module)

In [12]:
train(model_fn=make_3D_CNN, num_classes=8)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type               | Params
------------------------------------------------
0 | model    | Simple3DCNN        | 822 M 
1 | accuracy | MulticlassAccuracy | 0     
2 | f1       | MulticlassF1Score  | 0     
------------------------------------------------
822 M     Trainable params
0         Non-trainable params
822 M     Total params
3,289.259 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Confusion Matrix:
[[0 0 0 4]
 [0 0 0 0]
 [0 2 1 1]
 [0 0 0 0]]
Average Loss: 2.0698554515838623
Accuracy: 0.125
F1 Score: 0.20000000298023224


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


## П.2 - Построена и обучена модель классификации видео на основе Pose Estimation

In [5]:
import cv2
import torch.nn as nn

In [59]:
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=False, model_complexity=1, smooth_landmarks=True)

I0000 00:00:1714243611.749220  759359 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1714243611.801537  762393 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.104.05), renderer: NVIDIA GeForce RTX 3090/PCIe/SSE2


In [60]:
%%script False
def extract_pose_features(video_path):
    cap = cv2.VideoCapture(video_path)
    pose_features = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = pose.process(frame_rgb)
        if results.pose_landmarks:
            # Сохраняем только координаты x, y для упрощения, можно добавить z и visibility, если нужно
            frame_landmarks = np.array([[lm.x, lm.y] for lm in results.pose_landmarks.landmark]).flatten()
            pose_features.append(frame_landmarks)
    cap.release()
    return np.array(pose_features)

Couldn't find program: 'False'


In [61]:
def extract_pose_features(video_path):
    cap = cv2.VideoCapture(video_path)
    pose_features = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = pose.process(frame_rgb)
        if results.pose_landmarks:
            frame_landmarks = np.array([[lm.x, lm.y] for lm in results.pose_landmarks.landmark]).flatten()
            pose_features.append(frame_landmarks)
        else:
            # Обеспечение размерности даже при отсутствии данных
            pose_features.append(np.zeros(66))  # Примерное число точек * 2 (x, y)
    cap.release()
    if not pose_features:
        return np.array([np.zeros(66)])  # Возвращает хотя бы один массив нулей
    return np.array(pose_features)


In [62]:
class PoseDataset(torch.utils.data.Dataset):
    def __init__(self, data_path, clip_duration, class_to_index):
        self.data_path = data_path
        self.clip_duration = clip_duration
        self.class_to_index = class_to_index
        self.videos = []
        self.labels = []

        for class_dir in os.listdir(data_path):
            class_path = os.path.join(data_path, class_dir)
            if os.path.isdir(class_path):
                for video_file in os.listdir(class_path):
                    self.videos.append(os.path.join(class_path, video_file))
                    self.labels.append(self.class_to_index[class_dir])

    def __getitem__(self, idx):
        video_path = self.videos[idx]
        label = self.labels[idx]
        pose_features = extract_pose_features(video_path)
        pose_tensor = torch.tensor(pose_features, dtype=torch.float32)
        return pose_tensor, label

    def __len__(self):
        return len(self.videos)


In [63]:
class PoseDataModule(pytorch_lightning.LightningDataModule):
    _DATA_PATH = "./data/videos"
    _CLIP_DURATION = 2  # Duration of sampled clip for each video
    _BATCH_SIZE = 4
    _NUM_WORKERS = 0

    def __init__(self):
        super().__init__()
        self.class_to_index = {class_name: index for index, class_name in enumerate(self.get_classes())}

    def get_classes(self):
        class_path = os.path.join(self._DATA_PATH, "train")
        return [d for d in os.listdir(class_path) if os.path.isdir(os.path.join(class_path, d))]

    def train_dataloader(self):
        # Создаём экземпляр PoseDataset, передавая class_to_index
        train_dataset = PoseDataset(
            data_path=os.path.join(self._DATA_PATH, "train"),
            clip_duration=self._CLIP_DURATION,
            class_to_index=self.class_to_index  # передаём сюда словарь с индексами классов
        )
        return torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self._BATCH_SIZE,
            num_workers=self._NUM_WORKERS,
            shuffle=True,
            collate_fn=self.collate_fn
        )

    

    def val_dataloader(self):
        val_dataset = PoseDataset(
            data_path=os.path.join(self._DATA_PATH, "val"),
            clip_duration=self._CLIP_DURATION,
            class_to_index=self.class_to_index
        )
        return torch.utils.data.DataLoader(
            val_dataset,
            batch_size=self._BATCH_SIZE,
            num_workers=self._NUM_WORKERS,
            shuffle=True,
            collate_fn=self.collate_fn
        )
    
    # def collate_fn(self, batch):
    #     videos, labels = zip(*batch)
    #     videos = torch.nn.utils.rnn.pad_sequence(videos, batch_first=True, padding_value=0)
    #     labels = torch.tensor(labels, dtype=torch.long)  # Обратите внимание на тип long для меток
    #     return {"video": videos, "label": labels}
    
    def collate_fn(self, batch):
        videos, labels = zip(*batch)
        
        if not videos:
            raise ValueError("Batch is empty! Check your dataset and DataLoader.")
    
        processed_videos = []
        for video in videos:
            if video is None or len(video) == 0:
                continue  # Пропускаем пустые видео
            video_tensor = torch.tensor(video, dtype=torch.float32)
            processed_videos.append(video_tensor)
    
        if not processed_videos:
            raise ValueError("All videos in the batch are empty or None.")
        
        videos_tensor = torch.nn.utils.rnn.pad_sequence(processed_videos, batch_first=True, padding_value=0)
        labels_tensor = torch.tensor(labels, dtype=torch.long)
    
        return {"video": videos_tensor, "label": labels_tensor}

            
    def get_classes(self):
        # Извлечение уникальных имен классов из папок в train
        class_path = os.path.join(self._DATA_PATH, "train")
        return [d for d in os.listdir(class_path) if os.path.isdir(os.path.join(class_path, d))]

In [64]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_num, output_dim, dropout_prob=0.5):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_num = layer_num
        
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_num, batch_first=True, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        h0 = torch.zeros(self.layer_num, x.size(0), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.layer_num, x.size(0), self.hidden_dim).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.dropout(out[:, -1, :])
        out = self.fc(out)
        return out

def make_LSTM_network(input_dim, hidden_dim, layer_num, num_classes, dropout_prob=0.5):
    return LSTMClassifier(input_dim, hidden_dim, layer_num, num_classes, dropout_prob)


In [65]:
def train(model_fn, input_dim, hidden_dim, layer_num, dropout_prob):
    data_module = PoseDataModule()
    num_classes = len(data_module.get_classes())
    classification_module = VideoClassificationLightningModule(
        classes=data_module.get_classes(),
        model_fn=lambda: model_fn(input_dim=input_dim, hidden_dim=hidden_dim, layer_num=layer_num, num_classes=num_classes, dropout_prob=dropout_prob)
    )
    trainer = pytorch_lightning.Trainer(
        limit_train_batches=10,
        limit_val_batches=10,
        log_every_n_steps=10,
        max_epochs=1,
        accelerator='gpu',
    )
    trainer.fit(classification_module, data_module)

In [66]:
def make_LSTM_network():
    input_dim = 66  # Для примера, количество координат (33 точки * 2 координаты)
    hidden_dim = 128
    layer_num = 2
    output_dim=8
    
    model = LSTMClassifier(input_dim, hidden_dim, layer_num, output_dim)
    print(model)
    
    return model

In [67]:
def make_LSTM_network(input_dim, hidden_dim, layer_num, num_classes, dropout_prob=0.5):
    return LSTMClassifier(input_dim, hidden_dim, layer_num, num_classes, dropout_prob)

In [68]:
train(model_fn=make_LSTM_network, input_dim=66, hidden_dim=128, layer_num=2, dropout_prob=0.5)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type               | Params
------------------------------------------------
0 | model    | LSTMClassifier     | 233 K 
1 | accuracy | MulticlassAccuracy | 0     
2 | f1       | MulticlassF1Score  | 0     
------------------------------------------------
233 K     Trainable params
0         Non-trainable params
233 K     Total params
0.934     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  video_tensor = torch.tensor(video, dtype=torch.float32)


Confusion Matrix:
[[0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 2 0 0 0 0]]
Average Loss: 2.0731964111328125
Accuracy: 0.125
F1 Score: 0.02777777798473835


  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  video_tensor = torch.tensor(video, dtype=torch.float32)


Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Confusion Matrix:
[[3 0 5 0 0 0 0 0]
 [1 0 5 0 0 0 0 0]
 [1 0 3 0 0 0 0 0]
 [1 0 3 0 0 0 0 0]
 [0 0 4 0 0 0 0 0]
 [0 0 3 0 0 0 0 0]
 [0 0 3 0 0 0 0 0]
 [5 0 3 0 0 0 0 0]]
Average Loss: 2.0664772987365723
Accuracy: 0.15000000596046448
F1 Score: 0.08133971691131592
