Построить с нуля и обучить модель классификации видео на основе 3D-свёрток или трансформеров — 5 баллов.

# 1. Data Preparation

In [1]:
# Location of the dataset on disk, relative to the notebook's working directory;
# it is the only argument passed to KineticsDataset further down.
dataset_dir: str = "../kinetics"

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from kinetics_dataset import KineticsDataset
from torch.utils.data import DataLoader, SubsetRandomSampler, random_split
from tqdm import tqdm
from video_classifier_model import VideoClassifier

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [11]:
# Build the dataset from dataset_dir and split it 80/20 into a training
# subset and a held-out validation subset.
dataset = KineticsDataset(dataset_dir)

dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
# Remainder after the training share; this is the size of the validation subset.
test_size = dataset_size - train_size

# A seeded generator fixes which items land in each subset, so the validation
# accuracy computed later is reproducible after Restart Kernel -> Run All.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

In [13]:
# Mini-batch size used by both loaders, and the cap on how many training
# items are drawn for this run.
batch_size = 8
sample_size = 100

# Draw a random permutation of training positions and keep at most
# sample_size of them; the sampler then visits only those positions,
# in a random order on each pass.
indices = torch.randperm(len(train_dataset))[:sample_size].tolist()
sampler = SubsetRandomSampler(indices)

# shuffle is left at its default (False): the training order comes from the
# sampler, and validation is read sequentially.
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [17]:
# Build the classifier with one output per dataset class and train it with
# Adam on cross-entropy loss.
model = VideoClassifier(num_classes=len(dataset.classes)).to(device)

criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

num_epochs = 1
for epoch in range(num_epochs):
    # Explicit training mode: a freshly built module normally starts in it,
    # but stating it keeps the loop correct if evaluation is ever interleaved.
    model.train()
    running_loss = 0.0
    seen_samples = 0

    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1} / {num_epochs}", leave=False):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # CrossEntropyLoss defaults to a per-batch mean, so weight each batch
        # by its size; otherwise a short final batch is over-represented in
        # the epoch average.
        batch_count = labels.size(0)
        running_loss += loss.item() * batch_count
        seen_samples += batch_count

    # Per-sample mean loss; max(..., 1) avoids dividing by zero on an empty loader.
    epoch_loss = running_loss / max(seen_samples, 1)
    print(f"Epoch {epoch + 1} / {num_epochs}, Loss: {epoch_loss:.4f}")

                                                            

Epoch 1 / 1, Loss: 0.9558




In [18]:
# Persist only the model's state dict (not the module object itself) to a
# file in the current working directory.
torch.save(obj=model.state_dict(), f="hw3_3.pt")

In [19]:
# Rebuild the architecture with the same class count used at training time so
# the checkpoint's tensor shapes match; a hard-coded count only loads when the
# dataset happens to have exactly that many classes.
model_inference = VideoClassifier(num_classes=len(dataset.classes)).to(device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a CUDA run still loads on a CPU-only machine.
# Kept as the cell's last expression so the key-matching report is displayed.
model_inference.load_state_dict(torch.load("hw3_3.pt", map_location=device))

<All keys matched successfully>

In [21]:
# Measure top-1 accuracy of the reloaded model over the validation loader.
model_inference.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in tqdm(val_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model_inference(inputs)
        # Predicted class = index of the largest score along dim 1
        # (assumes outputs is (batch, num_classes) -- TODO confirm against the model).
        # argmax avoids binding `_`, which would shadow IPython's last-output
        # variable, and needs no `.data` since autograd is already disabled here.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Report NaN instead of raising ZeroDivisionError when the loader yielded nothing.
accuracy = 100 * correct / total if total else float("nan")
print(f"Validation Accuracy: {accuracy:.2f}%")

100%|██████████| 45/45 [03:23<00:00,  4.53s/it]

Validation Accuracy: 57.79%



