In [1]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torch.optim as optim
from torchinfo import summary

import dataloader as dataloader
import utils

import time
import importlib

In [2]:
# Re-execute the local dataloader module so the names used below come from its current source.
importlib.reload(dataloader)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
class CNN3D(nn.Module):
    """Two 3D-convolution stages followed by a three-layer fully connected head.

    Each convolution stage is Conv3d -> BatchNorm3d -> ReLU -> Dropout3d. The
    resulting feature map is flattened and passed through
    fc1 -> ReLU -> fc2 -> ReLU -> dropout -> fc3, yielding one raw score per class.

    Args:
        t_dim: First entry of the input volume size used to dimension fc1
            (the temporal / frame axis).
        img_x: Second entry of the input volume size.
        img_y: Third entry of the input volume size.
        drop_p: Dropout probability, shared by the Dropout3d layers and the
            functional dropout applied before fc3.
        fc_hidden1: Width of the first fully connected layer.
        fc_hidden2: Width of the second fully connected layer.
        num_classes: Number of output scores produced by fc3.
    """

    def __init__(self, t_dim=16, img_x=112, img_y=112, drop_p=0.2, fc_hidden1=256, fc_hidden2=128, num_classes=10):
        super().__init__()

        # Size of the video volume the network is dimensioned for.
        self.t_dim, self.img_x, self.img_y = t_dim, img_x, img_y

        # Classifier head widths, dropout probability and number of outputs.
        self.fc_hidden1 = fc_hidden1
        self.fc_hidden2 = fc_hidden2
        self.drop_p = drop_p
        self.num_classes = num_classes

        # Fixed hyper-parameters of the two convolution stages.
        self.ch1 = 32
        self.ch2 = 48
        self.k1 = (5, 5, 5)   # 3D kernel sizes
        self.k2 = (3, 3, 3)
        self.s1 = (2, 2, 2)   # 3D strides
        self.s2 = (2, 2, 2)
        self.pd1 = (0, 0, 0)  # 3D paddings
        self.pd2 = (0, 0, 0)

        # Output size after each convolution, needed to size fc1.
        # NOTE(review): utils.conv3D_output_size is a project helper not shown here; it is
        # assumed to return an indexable 3-element size -- confirm against utils.py.
        input_volume = (self.t_dim, self.img_x, self.img_y)
        self.conv1_outshape = utils.conv3D_output_size(input_volume, self.pd1, self.k1, self.s1)
        self.conv2_outshape = utils.conv3D_output_size(self.conv1_outshape, self.pd2, self.k2, self.s2)

        # Submodules are registered in a fixed order; that order determines how the
        # model is printed and the order in which parameters are initialised.
        self.conv1 = nn.Conv3d(3, self.ch1, kernel_size=self.k1, stride=self.s1, padding=self.pd1)
        self.bn1 = nn.BatchNorm3d(self.ch1)
        self.conv2 = nn.Conv3d(self.ch1, self.ch2, kernel_size=self.k2, stride=self.s2, padding=self.pd2)
        self.bn2 = nn.BatchNorm3d(self.ch2)
        self.relu = nn.ReLU(inplace=True)
        self.drop = nn.Dropout3d(self.drop_p)
        self.pool = nn.MaxPool3d(2)  # registered but not used in forward()

        # Number of features once the second conv output is flattened.
        flat_features = (self.ch2
                         * self.conv2_outshape[0]
                         * self.conv2_outshape[1]
                         * self.conv2_outshape[2])
        self.fc1 = nn.Linear(flat_features, self.fc_hidden1)
        self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)
        self.fc3 = nn.Linear(self.fc_hidden2, self.num_classes)  # one output per class

    def forward(self, x_3d):
        """Compute class scores for a batch of 3-channel video volumes.

        Args:
            x_3d: 5-D tensor accepted by ``nn.Conv3d`` with 3 input channels,
                i.e. laid out as (batch, 3, depth, height, width).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised scores.
        """
        features = x_3d

        # Both convolution stages share the same conv -> norm -> ReLU -> dropout recipe.
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            features = self.drop(self.relu(norm(conv(features))))

        # Flatten everything except the batch axis for the fully connected head.
        flat = features.view(features.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        hidden = F.dropout(hidden, p=self.drop_p, training=self.training)

        return self.fc3(hidden)


In [4]:
# Sanity check: build the model, print its architecture, and push one random batch through it.
model = CNN3D(t_dim=16, img_x=112, img_y=112, drop_p=0.2, fc_hidden1=256, fc_hidden2=128, num_classes=5)
print(model)

# nn.Conv3d consumes (batch_size, channels, frames, height, width), which is also the
# (t_dim, img_x, img_y) order the model uses to size fc1. The frame axis therefore comes
# right after the channel axis -- not last.
dummy_input_shape = (32, 3, 16, 112, 112)  # Batch size 32, 3 input channels, 16 frames, 112x112 resolution

# Generate random dummy input data within the defined shape
dummy_input = torch.randn(*dummy_input_shape)

# Forward pass through the model. Run it in eval mode and without autograd so this
# shape check neither updates the BatchNorm running statistics with random noise
# nor builds a computation graph; the previous train/eval mode is restored afterwards.
was_training = model.training
model.eval()
with torch.no_grad():
    output = model(dummy_input)
model.train(was_training)

# Print the output shape (expected: batch_size x num_classes)
print("Output shape:", output.shape)


CNN3D(
  (conv1): Conv3d(3, 32, kernel_size=(5, 5, 5), stride=(2, 2, 2))
  (bn1): BatchNorm3d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv3d(32, 48, kernel_size=(3, 3, 3), stride=(2, 2, 2))
  (bn2): BatchNorm3d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (drop): Dropout3d(p=0.2, inplace=False)
  (pool): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=64896, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=5, bias=True)
)
Output shape: torch.Size([32, 5])


In [5]:
# Display torchinfo's summary of the model. No input size is passed here, so the
# table reports per-layer parameter counts and totals only.
summary(model)

Layer (type:depth-idx)                   Param #
CNN3D                                    --
├─Conv3d: 1-1                            12,032
├─BatchNorm3d: 1-2                       64
├─Conv3d: 1-3                            41,520
├─BatchNorm3d: 1-4                       96
├─ReLU: 1-5                              --
├─Dropout3d: 1-6                         --
├─MaxPool3d: 1-7                         --
├─Linear: 1-8                            16,613,632
├─Linear: 1-9                            32,896
├─Linear: 1-10                           645
Total params: 16,700,885
Trainable params: 16,700,885
Non-trainable params: 0

In [6]:
## Dataset and data loaders

# Pre-processing function applied to each sample, wrapped in a Compose pipeline.
# NOTE(review): myUCF5Preprocessing lives in the project's dataloader module; it
# presumably brings frames to the 112x112 size the model is built for -- confirm there.
clip_preprocessing = dataloader.myUCF5Preprocessing(output_size=(112, 112))
transform = transforms.Compose([clip_preprocessing])

# Dataset rooted at the 'UCF5' directory, with the transform attached.
dataset = dataloader.myUCF5Loader(root_dir='UCF5', transform=transform)

# Partition the dataset into train / validation / test subsets.
train_set, val_set, test_set = dataloader.split_dataset(dataset)

# Training batches are shuffled; validation and test are read one sample at a time, in order.
train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=2, shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=1, shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=1, shuffle=False)


In [7]:
# Training loop


def _evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on every batch of ``loader`` without computing gradients.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss returning the mean loss of a batch.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy_percent)`` averaged over the samples actually
        processed. Both are ``nan`` when the loader yields no samples, instead
        of raising ``ZeroDivisionError``.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device, dtype=torch.float32), targets.to(device)
            outputs = model(inputs)
            # criterion returns a batch mean; re-weight by batch size to average per sample.
            loss_sum += criterion(outputs, targets).item() * inputs.size(0)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    if total == 0:
        return float("nan"), float("nan")
    return loss_sum / total, 100 * correct / total


# hyper-params
num_epochs = 3
criterion = nn.CrossEntropyLoss()

# Send the model to the device BEFORE constructing the optimizer, so the optimizer
# is built from the parameters that live on the training device.
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    seen = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device, dtype=torch.float32), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # Accumulate the per-sample loss sum so the epoch average is exact even
        # when the last batch is smaller than the others.
        running_loss += loss.item() * inputs.size(0)
        seen += inputs.size(0)
    train_loss = running_loss / seen if seen else float("nan")

    # Validation loop
    val_loss, val_acc = _evaluate(model, val_loader, criterion, device)

    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {train_loss:.4f}, "
          f"Val Loss: {val_loss:.4f}, "
          f"Val Acc: {val_acc:.2f}%")

# Testing loop
test_loss, test_acc = _evaluate(model, test_loader, criterion, device)

print(f"Test Loss: {test_loss:.4f}, "
      f"Test Acc: {test_acc:.2f}%")




Epoch [1/3], Train Loss: 5.0512, Val Loss: 3.9023, Val Acc: 20.00%
Epoch [2/3], Train Loss: 1.1597, Val Loss: 1.0611, Val Acc: 50.00%
Epoch [3/3], Train Loss: 0.5132, Val Loss: 0.3002, Val Acc: 90.00%
Test Loss: 0.7352, Test Acc: 80.00%
