Построить с нуля и обучить модель классификации видео на основе 3D свёрток или трансформеров - 5 баллов

# Dataset init

In [3]:
dataset_dir = "../kinetics"

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from kinetics_dataset import KineticsDataset
from torch.utils.data import DataLoader, SubsetRandomSampler, random_split
from tqdm import tqdm
from video_classifier_model import VideoClassifier

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
dataset = KineticsDataset(dataset_dir)

dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
test_size = dataset_size - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, test_size])

In [6]:
batch_size = 8
sample_size = 100

indices = torch.randperm(len(train_dataset)).tolist()[:sample_size]
sampler = SubsetRandomSampler(indices)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, sampler=sampler)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Model init and training

In [7]:
model = VideoClassifier(num_classes=len(dataset.classes)).to(device)

criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0

    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1} / {num_epochs}", leave=False):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch + 1} / {num_epochs}, Loss: {epoch_loss:.4f}")

                                                             

Epoch 1 / 10, Loss: 0.9013


                                                             

Epoch 2 / 10, Loss: 0.5792


                                                             

Epoch 3 / 10, Loss: 0.6149


                                                             

Epoch 4 / 10, Loss: 0.5660


                                                             

Epoch 5 / 10, Loss: 0.4823


                                                             

Epoch 6 / 10, Loss: 0.4903


                                                             

Epoch 7 / 10, Loss: 0.3896


                                                             

Epoch 8 / 10, Loss: 0.4337


                                                             

Epoch 9 / 10, Loss: 0.4598


                                                              

Epoch 10 / 10, Loss: 0.4141




# Model saving

In [8]:
torch.save(model.state_dict(), "hw3_3.pt")

# Model inference

In [9]:
model_inference = VideoClassifier(num_classes=2).to(device)

model_inference.load_state_dict(torch.load("hw3_3.pt"))

<All keys matched successfully>

In [None]:
# model_inference.eval()
# correct = 0
# total = 0
# with torch.no_grad():
#     for inputs, labels in tqdm(val_loader):
#         inputs, labels = inputs.to(device), labels.to(device)

#         outputs = model_inference(inputs)
#         _, predicted = torch.max(outputs.data, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()

# accuracy = 100 * correct / total
# print(f"Validation Accuracy: {accuracy:.2f}%")

In [11]:
import numpy as np
from sklearn.metrics import f1_score

model_inference.eval()
all_labels = []
all_predictions = []

with torch.no_grad():
    for inputs, labels in tqdm(val_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model_inference(inputs)
        _, predicted = torch.max(outputs.data, 1)
        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

all_labels = np.array(all_labels)
all_predictions = np.array(all_predictions)

accuracy = 100 * (all_predictions == all_labels).sum() / len(all_labels)
f1 = f1_score(all_labels, all_predictions, average="macro")

print(f"Validation Accuracy: {accuracy:.2f}%")
print(f"Macro F1 Score: {f1:.2f}")

100%|██████████| 45/45 [03:23<00:00,  4.51s/it]

Validation Accuracy: 66.01%
Macro F1 Score: 0.59



