In [None]:
import glob
import os
import cv2
from PIL import Image
import numpy as np
import random
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
# Last expression of the cell: displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [None]:
# --- Dataset location and task size -------------------------------------
# Glob pattern: matches every entry directly inside the 20bn-jester-v1 folder.
root = 'D:/Jester/Jester/20bn-jester-v1/*'
num_classes = 27

# --- Data loading --------------------------------------------------------
batch_size = 128
num_worker = 0

# --- Clip sampling / augmentation ---------------------------------------
# TemporalCrop doubles this value, so clips contain 2 * sample_duration frames.
sample_duration = 16
# Unpacked as (height, width) by the crop transforms.
sample_size = (96, 160)
# Candidate crop scales: 1, 2^(-1/4), 2^(-1/2).
scales = [1, 1 / 2 ** 0.25, 1 / 2 ** 0.5]
# Per-channel normalisation statistics passed to transforms.Normalize.
rgb_mean = (0.485, 0.456, 0.406)
rgb_std = (0.229, 0.224, 0.225)

# --- Optimiser -----------------------------------------------------------
lr = 1e-2
momentum = 0.9
weight_decay = 1e-5

In [None]:
def sortKeyFunc(s):
    """Sort key for a path: its final component parsed as an integer.

    Args:
        s (str): path whose basename consists of digits only.

    Returns:
        int: the numeric value of the basename.

    Raises:
        ValueError: if the basename is not a valid integer literal.
    """
    name = os.path.basename(s)
    return int(name)

def load_all_path(root):
    video_dictionary = glob.glob(root)
    video_dictionary.sort(key=sortKeyFunc)
    all_path = []
    for video_path in video_dictionary:
        file_list = sorted(glob.glob(video_path + '/*'))
        all_path.append(file_list)
    return all_path

In [None]:
# Index the frame paths of every video directory matched by `root`.
all_path = load_all_path(root)
# Class names, read as strings. `np.str` was only an alias of the builtin `str`
# and was removed in NumPy 1.24, so the builtin is used directly.
labels = np.genfromtxt('D:/Jester/jester-v1-labels.csv', delimiter=',', dtype=str)
# Last expression of the cell: displays the label names as a plain list.
labels.tolist()

In [None]:
class TemporalCrop(object):
    """Pick a fixed-length, contiguous run of frames from a clip.

    The produced list always has ``2 * size`` entries: ``__init__`` stores
    twice the requested ``size`` and ``__call__`` crops/loops to that length.

    Args:
        size (int): half of the number of frames to output.
        mode (str): ``'train'`` chooses the window start at random; any other
            value centres the window.
    """

    def __init__(self, size, mode):
        self.mode = mode
        self.size = size*2

    def __call__(self, path):
        """
        Args:
            path (list): frame paths of one clip, in temporal order.
        Returns:
            list: exactly ``self.size`` paths.
        """
        target = self.size
        available = len(path)

        if available < target:
            # Clip is too short (in either mode): repeat it whole as often as
            # it fits, then top up with its leading frames.
            repeats, remainder = divmod(target, available)
            looped = path*repeats + path[0:remainder]
            return looped[0:target]

        if self.mode == 'train':
            start = random.randint(0, available - target)
        else:
            start = (available - target)//2

        window = path[start:start + target]
        return window[0:target]

class MultiScaleRandomCrop(object):
    """Crop a randomly placed, randomly scaled window and resize it to ``size``.

    The random parameters are drawn by ``get_random_param`` (not by
    ``__call__``), so the same crop can be applied to several images in a row.
    ``get_random_param`` must be called before the first ``__call__``.

    Args:
        scales (sequence): candidate multipliers for the crop size.
        size (tuple): output ``(height, width)``.
        interpolation: PIL resampling filter used for the final resize.
    """

    def __init__(self, scales, size, interpolation=Image.BILINEAR):
        self.scales = scales
        self.size = size
        self.interpolation = interpolation

    def get_random_param(self):
        """Draw a new scale and a new relative top-left position in [0, 1)."""
        last_index = len(self.scales) - 1
        self.scale = self.scales[random.randint(0, last_index)]
        self.topleft_x = random.random()
        self.topleft_y = random.random()

    def __call__(self, img):
        out_height, out_width = self.size
        image_width, image_height = img.size
        crop_height = out_height*self.scale
        crop_width = out_width*self.scale

        # The relative offsets are scaled by the free space left around the crop.
        left = self.topleft_x * (image_width - crop_width)
        top = self.topleft_y * (image_height - crop_height)
        box = (left, top, left + crop_width, top + crop_height)

        cropped = img.crop(box)
        return cropped.resize((out_width, out_height), self.interpolation)
    
class RandomCrop(object):
    """Crop a window of fixed ``size`` at a randomly chosen position.

    The position is drawn by ``get_random_param`` (not by ``__call__``), so the
    same crop can be applied to several images in a row. ``get_random_param``
    must be called before the first ``__call__``.

    Args:
        size (tuple): crop ``(height, width)``.
    """

    def __init__(self, size):
        self.size = size

    def get_random_param(self):
        """Draw a new relative top-left position in [0, 1) for each axis."""
        self.topleft_x = random.random()
        self.topleft_y = random.random()

    def __call__(self, img):
        out_height, out_width = self.size
        image_width, image_height = img.size

        # The relative offsets are scaled by the free space left around the crop.
        left = self.topleft_x * (image_width - out_width)
        top = self.topleft_y * (image_height - out_height)

        return img.crop((left, top, left + out_width, top + out_height))

In [None]:
def read_video(paths, mode):
    """Load one clip from disk as a normalised array.

    Frames are selected with ``TemporalCrop`` (``2 * sample_duration`` frames),
    zero-padded horizontally up to 176 pixels when narrower, spatially cropped
    to ``sample_size`` and normalised with ``rgb_mean`` / ``rgb_std``.

    Args:
        paths (list): frame image paths of one video, in temporal order.
        mode (str): ``'train'`` uses a random multi-scale crop (one set of
            random parameters shared by all frames of the clip); any other
            value uses a centre crop.

    Returns:
        numpy.ndarray: array laid out as (channels, frames, height, width).

    Raises:
        FileNotFoundError: if a frame cannot be read by OpenCV.
    """
    temporal_transform = TemporalCrop(sample_duration, mode)
    if mode == 'train':
        # Parameters are drawn once here so every frame gets the same crop.
        RandomCrops = MultiScaleRandomCrop(scales, sample_size)
        RandomCrops.get_random_param()
        spatial_transform = transforms.Compose([
            transforms.ToPILImage(),
            RandomCrops,
            transforms.ToTensor(),
            transforms.Normalize(rgb_mean, rgb_std)
        ])
    else:
        spatial_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.CenterCrop(sample_size),
            transforms.ToTensor(),
            transforms.Normalize(rgb_mean, rgb_std)
        ])

    new_paths = temporal_transform(paths)
    target_width = 176

    all_image = []
    for path in new_paths:
        # NOTE(review): cv2.imread returns channels in BGR order while the
        # normalisation constants are named rgb_*; left unchanged because the
        # checkpoint used by this notebook was presumably trained this way.
        image = cv2.imread(path)
        if image is None:
            raise FileNotFoundError('Could not read frame: ' + str(path))

        missing = target_width - image.shape[1]
        if missing > 0:
            # Centre the frame on a black canvas. The right border absorbs the
            # extra column when the gap is odd so the result is exactly
            # target_width wide, and the border height follows the frame.
            left = missing // 2
            right = missing - left
            height = image.shape[0]
            image = np.concatenate([
                np.zeros((height, left, 3), dtype=np.uint8),
                image,
                np.zeros((height, right, 3), dtype=np.uint8),
            ], axis=1)

        all_image.append(spatial_transform(image))

    # (frames, channels, H, W) -> (channels, frames, H, W)
    video = np.stack(all_image).transpose(1,0,2,3)
    return video

In [None]:
class Dataset(Dataset):
    """Jester clip dataset backed by the pre-indexed frame paths.

    NOTE: this class deliberately keeps the name ``Dataset`` (shadowing the
    imported ``torch.utils.data.Dataset``) because other cells construct it by
    that name.

    Args:
        all_path (list): ``all_path[k]`` holds the frame paths of video id ``k + 1``.
        x: for modes ``'train'`` / ``'valid'`` a 2-D array whose rows are
            ``(video id, label name)``; for any other mode a 1-D array of video ids.
        y: array of label names; a sample's class index is the position of its
            label name in this array.
        mode (str): ``'train'``, ``'valid'`` or anything else for unlabeled data.
    """

    def __init__(self, all_path, x, y, mode):
        self.length = len(x)
        self.all_path = all_path
        self.x = x
        self.y = y
        self.mode = mode

    def __len__(self):
        return self.length

    def _label_index(self, label):
        """Return the position of ``label`` in ``self.y`` as a plain ``int``.

        Raises:
            ValueError: if ``label`` does not occur in ``self.y``.
        """
        matches = np.argwhere(self.y == label)
        if matches.size == 0:
            raise ValueError('Unknown label: ' + repr(label))
        # Index the element explicitly: int() on a whole (1, 1) array is
        # deprecated in recent NumPy releases.
        return int(matches[0, 0])

    def __getitem__(self, index):
        if self.mode == 'train' or self.mode == 'valid':
            video_id = self.x[index,0]
            label = self.x[index,1]
            # Video ids are 1-based, all_path is 0-based.
            x = read_video(self.all_path[int(video_id)-1], self.mode)
            y = self._label_index(label)
            return torch.from_numpy(x), torch.tensor(y)

        x = read_video(self.all_path[int(self.x[index])-1], self.mode)
        return torch.from_numpy(x)

In [None]:
# Training split read as strings (`np.str` was removed in NumPy 1.24; the
# builtin `str` is the same dtype). Batches are reshuffled every epoch.
train = np.genfromtxt('D:/Jester/jester-v1-train.csv', delimiter=',', dtype=str)
train_data = Dataset(all_path, train, labels, 'train')
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_worker, pin_memory=True)

In [None]:
# Validation split read as strings (`np.str` was removed in NumPy 1.24; the
# builtin `str` is the same dtype). Order is kept fixed.
valid = np.genfromtxt('D:/Jester/jester-v1-validation.csv', delimiter=',', dtype=str)
valid_data = Dataset(all_path, valid, labels, 'valid')
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, num_workers=num_worker, pin_memory=True)

In [None]:
# Test split read as strings (`np.str` was removed in NumPy 1.24; the builtin
# `str` is the same dtype). One clip per batch, in file order.
test = np.genfromtxt('D:/Jester/jester-v1-test.csv', delimiter=',', dtype=str)
test_data = Dataset(all_path, test, labels, 'test')
test_loader = DataLoader(test_data, batch_size=1, shuffle=False, num_workers=num_worker, pin_memory=True)

In [None]:
class SlowFastFusion(nn.Module):
    """Lateral connection that merges fast-pathway features into the slow pathway.

    The fast features go through a temporal-only strided convolution followed
    by batch norm and ReLU, and the result is concatenated to the slow features
    along the channel dimension.

    Args:
        dim_in (int): channel count of the fast-pathway input.
        fusion_conv_channel_ratio (int): output channels = ``dim_in * ratio``.
        fusion_kernel (int): temporal kernel size of the fusing convolution.
        alpha (int): temporal stride of the fusing convolution.
    """

    def __init__(self, dim_in, fusion_conv_channel_ratio, fusion_kernel, alpha):
        super().__init__()
        fused_channels = dim_in * fusion_conv_channel_ratio
        temporal_conv = nn.Conv3d(
            dim_in,
            fused_channels,
            kernel_size=(fusion_kernel, 1, 1),
            stride=(alpha, 1, 1),
            padding=(fusion_kernel // 2, 0, 0),
            bias=False,
        )
        self.conv_f2s = nn.Sequential(
            temporal_conv,
            nn.BatchNorm3d(num_features=fused_channels),
            nn.ReLU(),
        )

    def forward(self, x1, x2):
        """Return ``(slow features with fused fast features appended, x2 unchanged)``.

        ``x1`` is the slow-pathway tensor and ``x2`` the fast-pathway tensor.
        """
        fused = self.conv_f2s(x2)
        return torch.cat([x1, fused], 1), x2
    
class SlowFastBlock1(nn.Module):
    """Grouped bottleneck residual block without temporal convolution.

    Layout: 1x1x1 conv -> grouped 1x3x3 conv (carries ``stride``) -> 1x1x1 conv,
    each followed by batch norm, with PReLU after the first two and after the
    residual sum. The shortcut is a strided 1x1x1 projection when the shape
    changes and the identity otherwise.

    Args:
        in_channels (int): input channel count.
        cardinality (int): number of groups of the middle convolution.
        width (int): channels per group; the bottleneck has
            ``cardinality * width`` channels and the output twice that.
        stride: stride of the middle convolution and of the projection shortcut.
    """

    def __init__(self, in_channels, cardinality=32, width=4, stride=1):
        super().__init__()
        mid_channels = cardinality*width
        out_channels = 2*mid_channels

        # Layer order (and hence the numeric names inside `net`) must stay
        # fixed so saved checkpoints keep loading.
        layers = [
            nn.Conv3d(in_channels, mid_channels, kernel_size=1, bias=False),
            nn.BatchNorm3d(mid_channels),
            nn.PReLU(),
            nn.Conv3d(mid_channels, mid_channels, kernel_size=(1,3,3), stride=stride,
                      padding=(0,1,1), groups=cardinality, bias=False),
            nn.BatchNorm3d(mid_channels),
            nn.PReLU(),
            nn.Conv3d(mid_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm3d(out_channels),
        ]
        self.net = nn.Sequential(*layers)
        self.prelu = nn.PReLU()

        shape_preserved = stride == 1 and in_channels == out_channels
        if shape_preserved:
            self.shortcut = nn.Identity()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm3d(out_channels)
            )

    def forward(self, x):
        return self.prelu(self.net(x) + self.shortcut(x))

class SlowFastBlock2(nn.Module):
    """Grouped bottleneck residual block with a temporal first convolution.

    Layout: 3x1x1 conv -> grouped 1x3x3 conv (carries ``stride``) -> 1x1x1 conv,
    each followed by batch norm, with PReLU after the first two and after the
    residual sum. The shortcut is a strided 1x1x1 projection when the shape
    changes and the identity otherwise.

    Args:
        in_channels (int): input channel count.
        cardinality (int): number of groups of the middle convolution.
        width (int): channels per group; the bottleneck has
            ``cardinality * width`` channels and the output twice that.
        stride: stride of the middle convolution and of the projection shortcut.
    """

    def __init__(self, in_channels, cardinality=32, width=4, stride=1):
        super().__init__()
        mid_channels = cardinality*width
        out_channels = 2*mid_channels

        # Layer order (and hence the numeric names inside `net`) must stay
        # fixed so saved checkpoints keep loading.
        layers = [
            nn.Conv3d(in_channels, mid_channels, kernel_size=(3,1,1), padding=(1,0,0), bias=False),
            nn.BatchNorm3d(mid_channels),
            nn.PReLU(),
            nn.Conv3d(mid_channels, mid_channels, kernel_size=(1,3,3), stride=stride,
                      padding=(0,1,1), groups=cardinality, bias=False),
            nn.BatchNorm3d(mid_channels),
            nn.PReLU(),
            nn.Conv3d(mid_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm3d(out_channels),
        ]
        self.net = nn.Sequential(*layers)
        self.prelu = nn.PReLU()

        shape_preserved = stride == 1 and in_channels == out_channels
        if shape_preserved:
            self.shortcut = nn.Identity()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm3d(out_channels)
            )

    def forward(self, x):
        return self.prelu(self.net(x) + self.shortcut(x))

In [None]:
class SlowFastNet(nn.Module):
    """Two-pathway (slow / fast) video classifier with lateral fusion.

    The fast pathway processes 32 consecutive frames with few channels; the
    slow pathway processes every 4th of those frames with 8x the cardinality.
    After the head and after each of the first three residual stages the fast
    features are fused into the slow pathway (``SlowFastFusion``). Both pathways
    are globally average-pooled, concatenated and classified by a linear layer.

    Attribute names and creation order are part of the checkpoint format and
    must not change.

    Args:
        block1: block class used for slow stages res2 / res3.
        block2: block class used for slow stages res4 / res5 and all fast stages.
        num_blocks (sequence of 4 ints): blocks per residual stage.
        cardinality (int): groups in the slow pathway (fast uses ``cardinality // 8``).
        width (int): channels per group in the first stage; doubled per stage.
        num_class (int): number of output classes.

    NOTE(review): the fusion layers are built with fixed fast-pathway channel
    counts (8, 32, 64, 128), which match ``cardinality=32, width=4``; other
    settings presumably need those constants adjusted.
    """

    def __init__(self, block1, block2, num_blocks, cardinality=32, width=4, num_class=27):
        super(SlowFastNet, self).__init__()
        self.cardinality_slow = cardinality
        self.cardinality_fast = cardinality//8
        self.width_slow = width
        self.width_fast = width
        # Running channel counts, advanced by the stack_blocks_* helpers.
        self.slow_in_channels = 64
        self.fast_in_channels = 8

        self.slownet_head = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=(1,7,7), stride=(1,2,2), padding=(0,3,3), bias=False),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)))
        self.slownet_res2 = self.stack_blocks_slow(block1, num_blocks[0], stride=1)
        self.slownet_res3 = self.stack_blocks_slow(block1, num_blocks[1], stride=2)
        self.slownet_res4 = self.stack_blocks_slow(block2, num_blocks[2], stride=2)
        self.slownet_res5 = self.stack_blocks_slow(block2, num_blocks[3], stride=2)

        self.fastnet_head = nn.Sequential(
            nn.Conv3d(3, 8, kernel_size=(5,7,7), stride=(1,2,2), padding=(2,3,3), bias=False),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)))

        self.fastnet_res2 = self.stack_blocks_fast(block2, num_blocks[0], stride=1)
        self.fastnet_res3 = self.stack_blocks_fast(block2, num_blocks[1], stride=2)
        self.fastnet_res4 = self.stack_blocks_fast(block2, num_blocks[2], stride=2)
        self.fastnet_res5 = self.stack_blocks_fast(block2, num_blocks[3], stride=2)

        self.fuse1 = SlowFastFusion(8,2,5,4)
        self.fuse2 = SlowFastFusion(32,2,5,4)
        self.fuse3 = SlowFastFusion(64,2,5,4)
        self.fuse4 = SlowFastFusion(128,2,5,4)

        self.pooling = nn.AdaptiveAvgPool3d(1)
        # Kept for parity with the original definition; forward() does not apply it.
        self.dropout = nn.Dropout(0.5)
        self.out = nn.Linear(self.slow_in_channels + self.fast_in_channels, num_class)

        for layer in self.modules():
            if isinstance(layer, nn.Conv3d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(layer, nn.BatchNorm3d):
                nn.init.constant_(layer.weight, val=1.0)
                nn.init.constant_(layer.bias, val=0.0)
            elif isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(layer.bias, val=0.0)

    def stack_blocks_slow(self, block, num_blocks, stride):
        """Build one slow-pathway stage and advance the slow channel/width state.

        The first block takes the slow features plus the fused fast features,
        which add 25% more channels (the fusion outputs 2x the fast channels and
        the fast pathway has 1/8 of the slow channels); it also carries
        ``stride``. The remaining ``num_blocks - 1`` blocks use stride 1.
        """
        stage_channels = 2*self.cardinality_slow*self.width_slow
        layers = [block(int(self.slow_in_channels*1.25), self.cardinality_slow, self.width_slow, stride)]
        # Update right after the first block so that a single-block stage also
        # leaves the correct channel count for the next stage / classifier.
        self.slow_in_channels = stage_channels
        for _ in range(num_blocks-1):
            layers.append(block(self.slow_in_channels, self.cardinality_slow, self.width_slow, 1))
        self.width_slow *= 2
        return nn.Sequential(*layers)

    def stack_blocks_fast(self, block, num_blocks, stride):
        """Build one fast-pathway stage and advance the fast channel/width state.

        The first block carries ``stride``; the remaining blocks use stride 1.
        """
        layers = []
        for block_stride in [stride] + [1]*(num_blocks-1):
            layers.append(block(self.fast_in_channels, self.cardinality_fast, self.width_fast, block_stride))
            self.fast_in_channels = 2*self.cardinality_fast*self.width_fast
        self.width_fast *= 2
        return nn.Sequential(*layers)

    def forward(self, x):
        """Classify a batch of clips.

        Args:
            x: tensor indexed as (batch, channels, frames, height, width);
                frames 0..31 are used — assumes at least 32 frames are present.

        Returns:
            Tensor of shape (batch, num_class) with unnormalised class scores.
        """
        # Fast pathway: all 32 frames. Slow pathway: frames 2, 6, ..., 30.
        x_fast = x[:,:,0:32,:,:]
        x_slow = x[:,:,2:34:4,:,:]

        x_fast = self.fastnet_head(x_fast)
        x_slow = self.slownet_head(x_slow)
        x_slow, x_fast = self.fuse1(x_slow, x_fast)

        x_fast = self.fastnet_res2(x_fast)
        x_slow = self.slownet_res2(x_slow)
        x_slow, x_fast = self.fuse2(x_slow, x_fast)

        x_fast = self.fastnet_res3(x_fast)
        x_slow = self.slownet_res3(x_slow)
        x_slow, x_fast = self.fuse3(x_slow, x_fast)

        x_fast = self.fastnet_res4(x_fast)
        x_slow = self.slownet_res4(x_slow)
        x_slow, x_fast = self.fuse4(x_slow, x_fast)

        x_fast = self.fastnet_res5(x_fast)
        x_slow = self.slownet_res5(x_slow)

        x_fast = self.pooling(x_fast)
        x_slow = self.pooling(x_slow)

        # Drop the pooled 1x1x1 dimensions and join both pathways channel-wise.
        x = torch.cat([x_slow.view(x_slow.size(0),x_slow.size(1)), x_fast.view(x_fast.size(0), x_fast.size(1))], dim=1)
        x = self.out(x)
        return x

In [None]:
def train_epoch(model, train_loader, criterion, optimizer):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: network to train; batches are moved to the device of its
            first parameter.
        train_loader: iterable of ``(inputs, labels)`` batches supporting ``len``.
        criterion: loss taking ``(outputs, labels)``.
        optimizer: optimiser stepping ``model``'s parameters.

    Returns:
        tuple: ``(mean loss per batch, accuracy in percent)``.
    """
    model.train()
    # Take the device from the model instead of relying on a notebook-global
    # `device` that is only defined in a later cell.
    device = next(model.parameters()).device

    running_loss = 0.0
    total_predictions = 0.0
    correct_predictions = 0.0

    for i, (inputs, labels) in enumerate(train_loader, 0):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Accuracy bookkeeping works on a detached view of the scores.
        _, predicted = torch.max(outputs.detach(), 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

        batch_loss = loss.item()
        running_loss += batch_loss
        # Print the scalar value rather than the tensor repr (device/grad_fn noise).
        print(i, batch_loss)

        loss.backward()
        optimizer.step()

    running_loss /= len(train_loader)
    acc = (correct_predictions/total_predictions)*100.0
    print('Training Loss: ', running_loss)
    print('Training Accuracy: ', acc, '%')
    return running_loss, acc

In [None]:
def test_model(model, test_loader, criterion):
    with torch.no_grad():
        model.eval()

        running_loss = 0.0
        total_predictions = 0.0
        correct_predictions = 0.0
        
        for i, data in enumerate(test_loader, 0):
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)

            _, predicted = torch.max(outputs.data, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

            loss = criterion(outputs, labels).detach()
            running_loss += loss.item()
            
        running_loss /= len(test_loader)
        acc = (correct_predictions/total_predictions)*100.0
        print('Validation Loss: ', running_loss)
        print('Validation Accuracy: ', acc, '%')
        return running_loss, acc

In [None]:
# Pick the GPU when one is available and fall back to the CPU otherwise, so
# the notebook also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SlowFastNet(SlowFastBlock1, SlowFastBlock2, [3,4,6,3], cardinality=32, width=4, num_class=num_classes)
# The model is wrapped before loading, so the checkpoint is expected to carry
# DataParallel's "module." key prefix.
model = nn.DataParallel(model)
# map_location lets a checkpoint saved on a GPU load on the selected device.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load('project_classifier_slowfast9295.pth', map_location=device))
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=True)
# Cut the learning rate 10x after 2 epochs without improvement of the monitored value.
scheduler = ReduceLROnPlateau(optimizer, mode = 'min', factor=0.1, patience=2)
model.to(device)
pytorch_total_params = sum(p.numel() for p in model.parameters())
# Last expression of the cell: displays the total parameter count.
pytorch_total_params

In [None]:
# Training loop, currently disabled (the notebook loads a saved checkpoint and
# only runs inference). When enabled it: trains and validates each epoch, steps
# the LR scheduler on the validation loss, saves the weights whenever validation
# accuracy improves, rewrites the metrics CSV after every epoch, and stops after
# 10 consecutive epochs without improvement.
# n_epochs = 100
# Train_loss = []
# Train_acc = []
# Valid_loss = []
# Valid_acc = []
# num_no_improve = 0
# for i in range(n_epochs):
#     train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
#     valid_loss, valid_acc = test_model(model, valid_loader, criterion)
#     Train_acc.append(train_acc)
#     Train_loss.append(train_loss)
#     Valid_loss.append(valid_loss)
#     scheduler.step(valid_loss)
#     print('='*40)

#     if i == 0:
#         torch.save(model.state_dict(), 'project_classifier_slowfast.pth')
#     else:
#         if valid_acc > max(Valid_acc):
#             torch.save(model.state_dict(), 'project_classifier_slowfast.pth')
#             num_no_improve = 0
#         else:
#             num_no_improve += 1
#     Valid_acc.append(valid_acc)
    
#     training_loss = np.array(Train_loss).reshape(-1,1)
#     training_acc = np.array(Train_acc).reshape(-1,1)
#     validation_loss = np.array(Valid_loss).reshape(-1,1)
#     validation_acc = np.array(Valid_acc).reshape(-1,1)
#     result = np.concatenate([training_loss, training_acc, validation_loss, validation_acc], axis=1)
#     np.savetxt('result_project_classifier_slowfast.csv', result, delimiter=',', fmt='%1.5f', header='training_loss,training_acc,validation_loss,validation_acc', comments='')
    
#     if num_no_improve >= 10:
#         break

In [None]:
def inference(model, test_loader, criterion):
    """Predict a label for every test clip, save the predictions and report timing.

    Writes ``predict.csv`` with one predicted label name per line, in loader
    order, and prints the per-batch duration followed by the average duration
    per sample.

    Args:
        model: trained network; batches are moved to the device of its first
            parameter.
        test_loader: DataLoader yielding input batches only (no labels).
        criterion: unused; kept so existing calls keep working.
    """
    # Take the device from the model instead of relying on a notebook-global.
    device = next(model.parameters()).device
    # `np.str` was removed in NumPy 1.24; the builtin `str` is the same dtype.
    labels = np.genfromtxt('D:/Jester/jester-v1-labels.csv', delimiter=',', dtype=str)

    output = []
    total_duration = 0

    model.eval()
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            start = time.time()
            inputs = data.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            if inputs.is_cuda:
                # CUDA kernels run asynchronously; wait for them so the
                # measured time covers the actual computation.
                torch.cuda.synchronize()
            end = time.time()

            # Map class indices to label names (tolist() copies them to the host).
            output.extend(labels[class_index] for class_index in predicted.tolist())

            duration = end - start
            total_duration += duration
            print(i, duration)

    # Only the predicted label column is written, one label per line.
    Predicted = np.array(output, dtype=str).reshape(-1,1)
    np.savetxt('predict.csv', Predicted, delimiter=',', fmt='%s')
    average_duration = total_duration/len(test_loader.dataset)
    print(average_duration)

In [None]:
# Run the model over the test split; this writes predict.csv and prints timings.
inference(model, test_loader, criterion)