In [None]:
import glob
import os
import cv2
from PIL import Image
import numpy as np
import random
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
# Report whether PyTorch can see a CUDA device. The value is only shown as
# the cell output; it is not stored in a variable.
torch.cuda.is_available()

In [None]:
# --- Data location and loading ---
root = 'D:/Jester/Jester/20bn-jester-v1/*'  # glob pattern given to load_all_path(); each match is treated as one video directory
num_classes = 27  # passed to RegNet as num_class (width of the classifier output)
num_worker = 0  # forwarded to DataLoader(num_workers=...)
batch_size = 16  # used by the train and validation loaders (the test loader uses batch_size=1)

# --- Clip sampling and augmentation ---
scales = [1, 1/2**(1/4), 1/2**(1/2)]  # candidate crop scales for MultiScaleRandomCrop (train mode only)
sample_size = (96,160)  # (height, width) of every frame after cropping
sample_duration = 16  # number of frames per clip produced by TemporalCrop

# --- Network shape ---
num_channels = [144,288,576,1296]  # channel widths handed to RegNet
num_blocks = [2,7,14,2]  # number of blocks in each of the four RegNet stages
cardinality = 72  # `groups` of the 3x3x3 convolution inside every RegNetBlockY

# --- Input normalization (passed to transforms.Normalize) ---
# NOTE(review): these look like the usual ImageNet statistics - confirm.
rgb_mean = (0.485, 0.456, 0.406)
rgb_std = (0.229, 0.224, 0.225)

# --- SGD hyper-parameters ---
lr = 1e-3
momentum = 0.9
weight_decay = 1e-5

In [None]:
def sortKeyFunc(s):
    """Sort key for video directories: the final path component parsed as an integer."""
    folder_name = os.path.basename(s)
    return int(folder_name)

def load_all_path(root):
    """Collect the frame paths of every video matched by a glob pattern.

    Args:
        root (str): glob pattern; every match is treated as one video
            directory whose base name must parse as an integer.
    Returns:
        list: one alphabetically sorted list of frame paths per video, with
        the videos ordered by their numeric directory name.
    """
    video_dirs = sorted(glob.glob(root), key=sortKeyFunc)
    return [sorted(glob.glob(video_dir + '/*')) for video_dir in video_dirs]

In [None]:
# Index the frame paths of every video once; all_path[k] holds the frames of
# the k-th video directory in numeric order.
all_path = load_all_path(root)
# `np.str` was only an alias of the builtin `str`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
labels = np.genfromtxt('D:/Jester/jester-v1-labels.csv', delimiter=',', dtype=str)
# Shown as the cell output only; `labels` itself stays a NumPy array.
labels.tolist()

In [None]:
class TemporalCrop(object):
    """Pick a fixed-length run of frames out of a video's frame list.

    Args:
        size (int): number of frames to return.
        mode (str): 'train' selects a random window; any other value selects
            the centred window.
    """

    def __init__(self, size, mode):
        self.size = size
        self.mode = mode

    def __call__(self, path):
        """
        Args:
            path (list): frame paths of one video.
        Returns:
            list: exactly ``size`` entries taken from ``path``.
        """
        total = len(path)

        # Clip shorter than requested: handled the same way in every mode -
        # repeat the whole list as often as it fits, then top up from the start.
        if total < self.size:
            repeats, remainder = divmod(self.size, total)
            return path * repeats + path[:remainder]

        # Enough frames: choose where the window starts.
        slack = total - self.size
        if self.mode == 'train':
            start = random.randint(0, slack)
        else:
            start = slack // 2
        return path[start:start + self.size]

    
class MultiScaleRandomCrop(object):
    """Crop a randomly scaled, randomly placed window and resize it to ``size``.

    The random parameters are drawn by get_random_param() and kept on the
    instance, so every image passed to __call__ afterwards gets the same crop
    until get_random_param() is called again. get_random_param() has to be
    called at least once before the first __call__.

    Args:
        scales (list): candidate scale factors applied to the output size to
            obtain the crop window size.
        size (tuple): output (height, width).
        interpolation: resampling filter handed to ``img.resize``.
    """

    def __init__(self, scales, size, interpolation=Image.BILINEAR):
        self.scales = scales
        self.size = size
        self.interpolation = interpolation

    def get_random_param(self):
        """Draw a scale and a relative top-left position in [0, 1) for later crops."""
        chosen = random.randint(0, len(self.scales) - 1)
        self.scale = self.scales[chosen]
        self.topleft_x = random.random()
        self.topleft_y = random.random()

    def __call__(self, img):
        """Crop ``img`` with the stored parameters and resize it to the output size."""
        src_w, src_h = img.size
        dst_h, dst_w = self.size

        # Window size before resizing.
        win_h = dst_h*self.scale
        win_w = dst_w*self.scale

        # The stored fractions place the window inside the free margin.
        left = self.topleft_x * (src_w - win_w)
        top = self.topleft_y * (src_h - win_h)
        box = (left, top, left + win_w, top + win_h)

        return img.crop(box).resize((dst_w, dst_h), self.interpolation)

class RandomCrop(object):
    """Crop a window of fixed ``size`` (height, width) at a stored random position.

    get_random_param() draws the relative position and must be called before
    __call__; the same position is reused until it is called again.
    """

    def __init__(self, size):
        self.size = size

    def get_random_param(self):
        """Draw the relative top-left position, each coordinate in [0, 1)."""
        self.topleft_x = random.random()
        self.topleft_y = random.random()

    def __call__(self, img):
        """Return ``img`` cropped to the configured size at the stored position."""
        src_w, src_h = img.size
        win_h, win_w = self.size

        # Scale the stored fractions by the free margin in each direction.
        left = self.topleft_x * (src_w - win_w)
        top = self.topleft_y * (src_h - win_h)

        return img.crop((left, top, left + win_w, top + win_h))

In [None]:
def read_video(paths, mode):
    """Load one video as a cropped, normalized clip.

    Args:
        paths (list): frame image paths of one video.
        mode (str): 'train' applies one random multi-scale crop shared by all
            frames of the clip; any other value applies a centre crop.
    Returns:
        numpy.ndarray: the per-frame tensors stacked and transposed with
        (1, 0, 2, 3), i.e. (channels, frames, height, width).
    Raises:
        IOError: if a frame cannot be read by OpenCV.
    """
    target_width = 176  # narrower frames are zero-padded left and right up to this width

    temporal_transform = TemporalCrop(sample_duration, mode)
    if mode == 'train':
        # Draw the crop parameters once so every frame of the clip is cut
        # with the same window.
        crop = MultiScaleRandomCrop(scales, sample_size)
        crop.get_random_param()
    else:
        crop = transforms.CenterCrop(sample_size)

    spatial_transform = transforms.Compose([
        transforms.ToPILImage(),
        crop,
        transforms.ToTensor(),
        transforms.Normalize(rgb_mean, rgb_std)
    ])

    all_image = []
    for path in temporal_transform(paths):
        # NOTE(review): cv2.imread returns channels in BGR order while the
        # normalization constants are named rgb_*. The order is deliberately
        # left as is, because the checkpoint loaded further down was
        # presumably trained with exactly this preprocessing - confirm before
        # changing it.
        image = cv2.imread(path)
        if image is None:
            raise IOError('could not read frame: {}'.format(path))

        missing = target_width - image.shape[1]
        if missing > 0:
            # Split the padding between both sides; an odd remainder goes to
            # the right so the padded frame is always target_width wide.
            left = missing // 2
            right = missing - left
            height, _, depth = image.shape
            image = np.concatenate([
                np.zeros((height, left, depth), dtype=image.dtype),
                image,
                np.zeros((height, right, depth), dtype=image.dtype),
            ], axis=1)

        all_image.append(spatial_transform(image))

    video = np.stack(all_image).transpose(1,0,2,3)
    return video

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that serves clips (and labels) loaded with read_video.

    The base class is spelled out in full so that re-running this cell does
    not make the class inherit from its own previous definition.

    Args:
        all_path (list): per-video lists of frame paths; a video id ``k`` from
            ``x`` is looked up at ``all_path[k - 1]``.
        x: for 'train'/'valid', a 2-D array whose rows are
            (video id, label name); otherwise a 1-D array of video ids.
        y: array of label names; the position of a name is its class index.
        mode (str): 'train', 'valid', or anything else for unlabeled data.
    """

    def __init__(self, all_path, x, y, mode):
        self.length = len(x)
        self.all_path = all_path
        self.x = x
        self.y = y
        self.mode = mode

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        """Return (clip, class index) for labeled modes, or just the clip otherwise.

        Raises:
            ValueError: if the label name of a labeled sample is not in ``y``.
        """
        if self.mode == 'train' or self.mode == 'valid':
            video_id = self.x[index, 0]
            label_name = self.x[index, 1]
            clip = read_video(self.all_path[int(video_id) - 1], self.mode)

            # Position of the label name inside the label list is the class id.
            matches = np.argwhere(np.asarray(self.y) == label_name)
            if matches.size == 0:
                raise ValueError('unknown label {!r} for video {}'.format(str(label_name), video_id))
            class_index = int(matches[0, 0])
            return torch.from_numpy(clip), torch.tensor(class_index)

        clip = read_video(self.all_path[int(self.x[index]) - 1], self.mode)
        return torch.from_numpy(clip)

In [None]:
# Training split. Dataset reads column 0 of each row as the video id and
# column 1 as the label name. `dtype=str` replaces the `np.str` alias, which
# was removed in NumPy 1.24.
train = np.genfromtxt('D:/Jester/jester-v1-train.csv', delimiter=',', dtype=str)
train_data = Dataset(all_path, train, labels, 'train')
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_worker, pin_memory=True)

In [None]:
# Validation split, same row layout as the training split; served in file
# order. `dtype=str` replaces the `np.str` alias, which was removed in
# NumPy 1.24.
valid = np.genfromtxt('D:/Jester/jester-v1-validation.csv', delimiter=',', dtype=str)
valid_data = Dataset(all_path, valid, labels, 'valid')
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, num_workers=num_worker, pin_memory=True)

In [None]:
# Test split: Dataset indexes it as a 1-D list of video ids (no labels), and
# it is served one clip at a time in file order. `dtype=str` replaces the
# `np.str` alias, which was removed in NumPy 1.24.
test = np.genfromtxt('D:/Jester/jester-v1-test.csv', delimiter=',', dtype=str)
test_data = Dataset(all_path, test, labels, 'test')
test_loader = DataLoader(test_data, batch_size=1, shuffle=False, num_workers=num_worker, pin_memory=True)

In [None]:
class RegNetBlockY(nn.Module):
    """Residual 3-D bottleneck block with channel gating.

    Main path: 1x1x1 conv -> grouped 3x3x3 conv (carries the stride) ->
    1x1x1 conv, each followed by batch norm. The result is multiplied by a
    per-channel gate (`se`), added to the shortcut and passed through PReLU.

    Args:
        in_channels (int): channels of the input.
        out_channels (int): channels of the output.
        cardinality (int): `groups` of the 3x3x3 convolution.
        stride (int): stride of the 3x3x3 convolution and of the projection
            shortcut.
    """

    def __init__(self, in_channels, out_channels, cardinality=32, stride=1):
        super().__init__()

        # Submodule names and positions are kept stable so saved state dicts
        # keep loading.
        main_path = [
            nn.Conv3d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm3d(out_channels),
            nn.PReLU(),
            nn.Conv3d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False),
            nn.BatchNorm3d(out_channels),
            nn.PReLU(),
            nn.Conv3d(out_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm3d(out_channels),
        ]
        self.net = nn.Sequential(*main_path)

        # Gate: global average -> reduce to a quarter of the channels ->
        # expand back -> sigmoid.
        reduced = out_channels//4
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool3d(1),
            nn.Conv3d(out_channels, reduced, kernel_size=1),
            nn.PReLU(),
            nn.Conv3d(reduced, out_channels, kernel_size=1),
            nn.Sigmoid()
        )

        self.prelu = nn.PReLU()

        # A projection is only needed when the shortcut must change shape.
        same_shape = stride == 1 and in_channels == out_channels
        self.shortcut = nn.Identity() if same_shape else nn.Sequential(
            nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
            nn.BatchNorm3d(out_channels)
        )

    def forward(self, x):
        features = self.net(x)
        gated = features*self.se(features)
        gated += self.shortcut(x)
        return self.prelu(gated)
    
    
class RegNet(nn.Module):
    """3-D convolutional classifier: stem -> four block stages -> global pool -> linear.

    Args:
        block: callable ``block(in_channels, out_channels, cardinality, stride)``
            returning an ``nn.Module`` (e.g. RegNetBlockY).
        num_channels (list): four channel widths. Stage i maps
            ``num_channels[i-1]`` to ``num_channels[i]`` (the stem produces
            ``num_channels[0]``); the last stage doubles ``num_channels[3]``.
        num_blocks (list): number of blocks in each of the four stages.
        cardinality (int): forwarded to every block.
        num_class (int): number of output logits.
    """

    def __init__(self, block, num_channels, num_blocks, cardinality, num_class):
        super(RegNet, self).__init__()
        self.cardinality = cardinality

        # The positions inside this Sequential are part of the state-dict
        # keys, so the layer order must stay as it is.
        self.feature_detector = nn.Sequential(
            nn.Conv3d(3, num_channels[0], kernel_size=(3,7,7), stride=(1,2,2), padding=(1,3,3), bias=False),
            nn.MaxPool3d(kernel_size=3, stride=2, padding=1),
            self.stack_blocks(num_channels[0], num_channels[1], block, num_blocks[0], stride=1),
            self.stack_blocks(num_channels[1], num_channels[2], block, num_blocks[1], stride=2),
            self.stack_blocks(num_channels[2], num_channels[3], block, num_blocks[2], stride=2),
            self.stack_blocks(num_channels[3], num_channels[3]*2,block, num_blocks[3], stride=2),
            # Global average over whatever feature map is left. For the
            # 16x96x160 clips the last map is 1x3x5, so this averages the
            # same window as the former fixed AvgPool3d((1,3,5)); other input
            # sizes now work too. The layer has no parameters, so existing
            # checkpoints still load.
            nn.AdaptiveAvgPool3d(1),
            nn.Flatten()
        )

        self.classifier = nn.Linear(num_channels[3]*2,num_class)

        for layer in self.modules():
            if isinstance(layer, nn.Conv3d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(layer, nn.BatchNorm3d):
                nn.init.constant_(layer.weight, val=1.0)
                nn.init.constant_(layer.bias, val=0.0)
            elif isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(layer.bias, val=0.0)

    def stack_blocks(self, in_channels, out_channels, block, num_block, stride):
        """Build one stage: a first block that changes channels/stride, then
        ``num_block - 1`` stride-1 blocks at the output width."""
        layers = [block(in_channels, out_channels, self.cardinality, stride)]
        for _ in range(num_block - 1):
            layers.append(block(out_channels, out_channels, self.cardinality, 1))
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.feature_detector(x)
        out = self.classifier(out)
        return out

In [None]:
def train_epoch(model, train_loader, criterion, optimizer):
    """Run one optimization pass over ``train_loader``.

    Tensors are moved to the notebook-level ``device``. The batch index and
    loss are printed for every step, followed by the epoch summary.

    Returns:
        tuple: (mean loss per batch, accuracy in percent).
    """
    model.train()

    loss_sum = 0.0
    seen = 0.0
    hits = 0.0

    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)

        # Accuracy bookkeeping on the raw outputs (no gradient tracking).
        predicted = torch.max(outputs.data, 1)[1]
        seen += labels.size(0)
        hits += (predicted == labels).sum().item()

        loss = criterion(outputs, labels)
        loss_sum += loss.item()
        print(step, loss)

        loss.backward()
        optimizer.step()

    mean_loss = loss_sum / len(train_loader)
    acc = (hits/seen)*100.0
    print('Training Loss: ', mean_loss)
    print('Training Accuracy: ', acc, '%')
    return mean_loss, acc

In [None]:
def test_model(model, test_loader, criterion):
    with torch.no_grad():
        model.eval()

        running_loss = 0.0
        total_predictions = 0.0
        correct_predictions = 0.0
        
        for i, data in enumerate(test_loader, 0):
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)

            _, predicted = torch.max(outputs.data, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

            loss = criterion(outputs, labels).detach()
            running_loss += loss.item()
            
        running_loss /= len(test_loader)
        acc = (correct_predictions/total_predictions)*100.0
        print('Validation Loss: ', running_loss)
        print('Validation Accuracy: ', acc, '%')
        return running_loss, acc

In [None]:
# Use the GPU when there is one, otherwise fall back to the CPU so the cell
# does not fail on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RegNet(RegNetBlockY, num_channels, num_blocks, cardinality=cardinality, num_class=num_classes)
# map_location lets a checkpoint that was saved from a GPU be restored on
# whichever device is in use.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(torch.load('project_classifier_regnet9046.pth', map_location=device))
# Move the model before building the optimizer, as the torch.optim
# documentation recommends.
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=True)
# Cut the learning rate by 10x when the monitored value (mode='min') has not
# improved for more than `patience` epochs.
scheduler = ReduceLROnPlateau(optimizer, mode = 'min', factor=0.1, patience=1)

# Total number of parameters; shown as the cell output.
pytorch_total_params = sum(p.numel() for p in model.parameters())
pytorch_total_params

In [None]:
# n_epochs = 100
# Train_loss = []
# Train_acc = []
# Valid_loss = []
# Valid_acc = []
# num_no_improve = 0
# for i in range(n_epochs):
#     train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
#     valid_loss, valid_acc = test_model(model, valid_loader, criterion)
#     Train_acc.append(train_acc)
#     Train_loss.append(train_loss)
#     Valid_loss.append(valid_loss)
#     scheduler.step(valid_loss)
#     print('='*40)

#     if i == 0:
#         torch.save(model.state_dict(), 'project_classifier_regnet.pth')
#     else:
#         if valid_acc > max(Valid_acc):
#             torch.save(model.state_dict(), 'project_classifier_regnet.pth')
#             num_no_improve = 0
#         else:
#             num_no_improve += 1
#     Valid_acc.append(valid_acc)
    
#     training_loss = np.array(Train_loss).reshape(-1,1)
#     training_acc = np.array(Train_acc).reshape(-1,1)
#     validation_loss = np.array(Valid_loss).reshape(-1,1)
#     validation_acc = np.array(Valid_acc).reshape(-1,1)
#     result = np.concatenate([training_loss, training_acc, validation_loss, validation_acc], axis=1)
#     np.savetxt('result_project_classifier_regnet.csv', result, delimiter=',', fmt='%1.5f', header='training_loss,training_acc,validation_loss,validation_acc', comments='')
    
#     if num_no_improve >= 10:
#         break

In [None]:
def inference(model, test_loader, criterion):
    """Predict a label name for every clip of an unlabeled loader and time it.

    Writes the predicted label names, one per line and in loader order, to
    'predict.csv'. Prints the batch index and elapsed time per batch, then
    the average time per clip. Tensors are moved to the notebook-level
    ``device``.

    Args:
        model: network returning one row of class scores per clip.
        test_loader: DataLoader yielding input batches only (no labels).
        criterion: unused; kept so existing calls keep working.
    """
    # `dtype=str` replaces the `np.str` alias, which was removed in NumPy 1.24.
    labels = np.genfromtxt('D:/Jester/jester-v1-labels.csv', delimiter=',', dtype=str)
    output = []
    total_duration = 0

    model.eval()
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            start = time.time()
            inputs = data.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            if inputs.is_cuda:
                # CUDA kernels run asynchronously; wait for them so the
                # timer covers the actual computation.
                torch.cuda.synchronize()
            end = time.time()

            # Translate class indices to label names on the host.
            output.extend(labels[class_index] for class_index in predicted.tolist())

            duration = end - start
            total_duration += duration
            print(i, duration)

    # NOTE(review): an (id, label) table used to be assembled here from
    # jester-v1-test.csv but was never written anywhere. Only the labels are
    # saved, exactly as before - confirm whether the ids were meant to be
    # included in the file.
    Predicted = np.array(output, dtype=str).reshape(-1,1)
    np.savetxt('predict.csv', Predicted, delimiter=',', fmt='%s')

    num_clips = len(test_loader.dataset)
    if num_clips:  # nothing to average for an empty dataset
        average_duration = total_duration/num_clips
        print(average_duration)

In [None]:
# Run the checkpointed model over the test loader; inference() writes the
# predicted label names to 'predict.csv' and prints timing information.
inference(model, test_loader, criterion)