In [None]:
import glob
import os
import cv2
from PIL import Image
import numpy as np
import random
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
torch.cuda.is_available()

In [None]:
# Glob pattern matching one directory per video; consumed by load_all_path().
root = 'D:/Jester/Jester/20bn-jester-v1/*'
# Number of gesture classes (matches the default of MobileFaceNet's num_classes).
num_classes = 27
# DataLoader settings: worker process count and batch size for train/valid loaders.
num_worker = 0
batch_size = 64
# Candidate crop scales for MultiScaleRandomCrop: 1, 2**-0.25 and 2**-0.5.
scales = [1, 1/2**(1/4), 1/2**(1/2)]
# Spatial output size as (height, width) and number of frames kept per clip.
sample_size = (96,160)
sample_duration = 16
# Per-channel mean / std handed to transforms.Normalize in read_video().
rgb_mean = (0.485, 0.456, 0.406)
rgb_std = (0.229, 0.224, 0.225)
# SGD hyper-parameters used when the optimizer is built.
lr = 1e-2
momentum = 0.9
weight_decay = 1e-5

In [None]:
def sortKeyFunc(s):
    """Sort key: the last path component of ``s`` interpreted as an integer.

    Raises ``ValueError`` when that component is not a valid integer literal.
    """
    folder_name = os.path.basename(s)
    return int(folder_name)

def load_all_path(root):
    """Collect the frame paths of every video matched by the glob ``root``.

    Video directories are ordered by ``sortKeyFunc`` (numeric directory name);
    the entries inside each directory are ordered lexicographically.

    Returns:
        list[list[str]]: one sorted list of entry paths per video directory.
    """
    video_dirs = sorted(glob.glob(root), key=sortKeyFunc)
    return [sorted(glob.glob(video_dir + '/*')) for video_dir in video_dirs]

In [None]:
# Frame paths for every video, indexed by (video id - 1) in Dataset.__getitem__.
all_path = load_all_path(root)
# `np.str` was only an alias of the builtin `str`; it was deprecated in NumPy 1.20
# and removed in 1.24, so the builtin is used to keep this cell runnable.
labels = np.genfromtxt('D:/Jester/jester-v1-labels.csv', delimiter=',', dtype=str)
# Last expression of the cell: display the class names.
labels.tolist()

In [None]:
class TemporalCrop(object):
    """Pick a fixed number of frames out of a video's ordered frame list.

    A window of ``2 * size`` consecutive items is selected -- at a random
    offset when ``mode == 'train'``, centred for any other mode -- and every
    second item of that window is kept, so ``size`` items are returned.
    Lists shorter than the window are repeated cyclically until they fill it.

    Args:
        size (int): Number of items to return.
        mode (str): ``'train'`` for a random window, anything else for the
            centre window.
    """

    def __init__(self, size, mode):
        # Stored doubled because __call__ keeps only every second frame.
        self.size = size*2
        self.mode = mode

    def __call__(self, path):
        """
        Args:
            path (list): ordered frame paths of one video.
        Returns:
            list: the temporally cropped and sub-sampled paths.
        Raises:
            ValueError: if ``path`` is empty (nothing to loop over).
        """
        num_frames = len(path)
        if num_frames == 0:
            raise ValueError('TemporalCrop received an empty list of frames')

        if num_frames < self.size:
            # Too short in either mode: tile the clip and top up with its head.
            num_loops = self.size//num_frames
            delta = self.size - num_frames*num_loops
            window = path*num_loops + path[0:delta]
        else:
            if self.mode == 'train':
                begin_index = random.randint(0, num_frames - self.size)
            else:
                begin_index = (num_frames - self.size)//2
            window = path[begin_index:begin_index + self.size]

        # Temporal stride of 2 over the selected window.
        return window[0:self.size:2]

    
class MultiScaleRandomCrop(object):
    """Crop a randomly placed, randomly scaled window and resize it to ``size``.

    The random scale and relative position are drawn by ``get_random_param``
    and then reused by every ``__call__`` until it is invoked again, so one
    draw can be shared by all frames of a clip.

    Args:
        scales (list): candidate multipliers applied to the output size to get
            the crop window size.
        size (tuple): output size as ``(height, width)``.
        interpolation: resampling filter passed to ``img.resize``.
    """

    def __init__(self, scales, size, interpolation=Image.BILINEAR):
        self.scales, self.size, self.interpolation = scales, size, interpolation

    def get_random_param(self):
        """Draw a new scale and a new relative top-left position in [0, 1)."""
        pick = random.randint(0, len(self.scales) - 1)
        self.scale = self.scales[pick]
        self.topleft_x = random.random()
        self.topleft_y = random.random()

    def __call__(self, img):
        """Crop ``img`` with the stored parameters and resize to the output size."""
        src_width, src_height = img.size
        dst_height, dst_width = self.size
        window_width = dst_width*self.scale
        window_height = dst_height*self.scale

        # The relative position is mapped onto the slack left around the window.
        left = self.topleft_x * (src_width - window_width)
        top = self.topleft_y * (src_height - window_height)
        box = (left, top, left + window_width, top + window_height)

        return img.crop(box).resize((dst_width, dst_height), self.interpolation)
    
class RandomCrop(object):
    """Crop a window of fixed ``size`` at a randomly drawn position.

    The relative position is drawn by ``get_random_param`` and reused by every
    ``__call__`` until it is drawn again.

    Args:
        size (tuple): crop size as ``(height, width)``.
    """

    def __init__(self, size):
        self.size = size

    def get_random_param(self):
        """Draw a new relative top-left position in [0, 1) for both axes."""
        self.topleft_x = random.random()
        self.topleft_y = random.random()

    def __call__(self, img):
        """Return ``img`` cropped to ``size`` at the stored relative position."""
        src_width, src_height = img.size
        crop_height, crop_width = self.size

        # The relative position is mapped onto the slack left around the window.
        left = self.topleft_x * (src_width - crop_width)
        top = self.topleft_y * (src_height - crop_height)
        box = (left, top, left + crop_width, top + crop_height)

        return img.crop(box)

In [None]:
def _read_frame(path):
    """Read one image with OpenCV, failing loudly when it cannot be decoded."""
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals a missing/corrupt file by returning None.
        raise IOError('Could not read frame: {}'.format(path))
    return image


def read_video(paths, mode):
    """Load, temporally sub-sample and spatially crop the frames of one video.

    Args:
        paths (list): ordered frame image paths of a single video.
        mode (str): ``'train'`` uses one random multi-scale crop shared by all
            frames of the clip; any other mode uses a centre crop.

    Returns:
        numpy.ndarray: the per-frame tensors stacked and transposed so that the
        channel axis comes first and the frame axis second.

    Raises:
        IOError: if one of the selected frames cannot be read.
    """
    # NOTE(review): cv2.imread yields BGR channel order while rgb_mean / rgb_std
    # are applied in that same order. Left untouched because changing it would
    # alter the input statistics of any already-trained checkpoint -- confirm.
    target_width = 176  # narrower frames are zero-padded left/right towards this width

    temporal_transform = TemporalCrop(sample_duration, mode)
    if mode == 'train':
        # Parameters are drawn once so every frame of the clip gets the same crop.
        crop = MultiScaleRandomCrop(scales, sample_size)
        crop.get_random_param()
    else:
        crop = transforms.CenterCrop(sample_size)
    spatial_transform = transforms.Compose([
        transforms.ToPILImage(),
        crop,
        transforms.ToTensor(),
        transforms.Normalize(rgb_mean, rgb_std)
    ])

    new_paths = temporal_transform(paths)

    all_image = []
    padding = None
    for position, path in enumerate(new_paths):
        image = _read_frame(path)
        if position == 0:
            # The padding is sized from the first frame and reused for the clip.
            height, width = image.shape[:2]
            pad_width = (target_width - width)//2
            if pad_width > 0:
                padding = np.zeros((height, pad_width, 3), dtype=np.uint8)
        if padding is not None:
            image = np.concatenate([padding, image, padding], axis=1)
        all_image.append(spatial_transform(image))

    video = np.stack(all_image).transpose(1,0,2,3)
    return video

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one video clip per index.

    The base class is spelled out in full so that re-running this cell does not
    make the class inherit from its own previous definition.

    Args:
        all_path (list): per-video lists of frame paths; video id ``k`` is
            looked up at position ``k - 1``.
        x: for ``'train'`` / ``'valid'`` a 2-D array whose columns are
            (video id, label name); for any other mode a 1-D array of video ids.
        y: 1-D collection of label names; a label's position is its class index.
        mode (str): ``'train'``, ``'valid'`` or anything else for unlabeled data.
    """

    def __init__(self, all_path, x, y, mode):
        self.length = len(x)
        self.all_path = all_path
        self.x = x
        self.y = y
        self.mode = mode
        # Built once instead of scanning the label array for every sample.
        self.label_to_index = {
            str(name): position for position, name in enumerate(np.asarray(y).ravel())
        }

    def __len__(self):
        return(self.length)

    def __getitem__(self, index):
        """Return ``(clip, class_index)`` for labeled modes, else only ``clip``.

        Raises:
            KeyError: if a sample's label name is not among the known labels.
        """
        if self.mode == 'train' or self.mode == 'valid':
            video_id, label_name = self.x[index,0], self.x[index,1]
            x = read_video(self.all_path[int(video_id)-1], self.mode)
            y = self.label_to_index[str(label_name)]
            return torch.from_numpy(x), torch.tensor(y)
        else:
            x = read_video(self.all_path[int(self.x[index])-1], self.mode)
            return torch.from_numpy(x)

In [None]:
# Training split. dtype=str replaces np.str, an alias of the builtin that was
# removed in NumPy 1.24.
train = np.genfromtxt('D:/Jester/jester-v1-train.csv', delimiter=',', dtype=str)
train_data = Dataset(all_path, train, labels, 'train')
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_worker, pin_memory=True)

In [None]:
# Validation split, served in file order. dtype=str replaces np.str, an alias of
# the builtin that was removed in NumPy 1.24.
valid = np.genfromtxt('D:/Jester/jester-v1-validation.csv', delimiter=',', dtype=str)
valid_data = Dataset(all_path, valid, labels, 'valid')
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, num_workers=num_worker, pin_memory=True)

In [None]:
# Unlabeled test split, one clip per batch. dtype=str replaces np.str, an alias
# of the builtin that was removed in NumPy 1.24.
test = np.genfromtxt('D:/Jester/jester-v1-test.csv', delimiter=',', dtype=str)
test_data = Dataset(all_path, test, labels, 'test')
test_loader = DataLoader(test_data, batch_size=1, shuffle=False, num_workers=num_worker, pin_memory=True)

In [None]:
class GlobalContextBlock(nn.Module):
    """Add a softmax-pooled global context vector to every position of a 5-D input.

    A 1x1x1 convolution scores every (time, height, width) position, the scores
    are softmax-normalised over all positions, and the input is averaged with
    those weights into one vector per sample. That vector passes through a
    bottleneck (conv -> LayerNorm -> ReLU -> conv) and is broadcast-added to
    the input.

    Args:
        in_channels (int): channel count of the input.
        ratio (int): reduction factor of the bottleneck width.
    """

    def __init__(self, in_channels, ratio=16):
        super().__init__()
        squeezed = in_channels//ratio

        self.conv_mask = nn.Conv3d(in_channels, 1, kernel_size=1)
        self.softmax = nn.Softmax(dim=2)

        self.channel_add_conv = nn.Sequential(
            nn.Conv3d(in_channels, squeezed, kernel_size=1),
            nn.LayerNorm([squeezed, 1, 1, 1]),
            nn.ReLU(),
            nn.Conv3d(squeezed, in_channels, kernel_size=1)
        )

    def spatial_pool(self, x):
        """Reduce ``x`` of shape [N, C, T, H, W] to a context of shape [N, C, 1, 1, 1]."""
        n, c, t, h, w = x.size()
        positions = t * h * w

        # Attention weights over all positions: [N, 1, P] -> softmax -> [N, P, 1].
        attention = self.softmax(self.conv_mask(x).view(n, 1, positions))
        attention = attention.view(n, positions, 1)

        # Weighted sum of the features: [N, C, P] x [N, P, 1] -> [N, C, 1].
        pooled = torch.bmm(x.view(n, c, positions), attention)
        return pooled.view(n, c, 1, 1, 1)

    def forward(self, x):
        return x + self.channel_add_conv(self.spatial_pool(x))

In [None]:
# One row per stage of the network, consumed by MobileFaceNet._make_layer as
# (t, c, n, s):
#   t - expansion factor handed to each BottleNeck of the stage
#   c - output channels of the stage
#   n - number of BottleNeck blocks in the stage
#   s - stride of the first block of the stage (the remaining blocks use 1)
MobileFaceNet_BottleNeck_Setting = [
    # t, c , n ,s
    [2, 128, 5, 1],
    [4, 128, 1, 1],
    [2, 128, 6, 2],
    [4, 128, 1, 2],
    [2, 128, 2, 2]
]

class BottleNeck(nn.Module):
    """3-D inverted-residual block: 1x1x1 expand -> 3x3x3 depthwise -> 1x1x1 project.

    The input is added back to the output only when the block keeps both the
    resolution (``stride == 1``) and the channel count (``inp == oup``).

    Args:
        inp (int): input channels.
        oup (int): output channels.
        stride: stride of the depthwise convolution.
        expansion (int): multiplier giving the hidden width ``inp * expansion``.
    """

    def __init__(self, inp, oup, stride, expansion):
        super().__init__()
        hidden = inp * expansion
        self.connect = stride == 1 and inp == oup

        self.conv = nn.Sequential(
            # pointwise expansion
            nn.Conv3d(inp, hidden, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm3d(hidden),
            nn.PReLU(hidden),

            # depthwise 3x3x3 (one group per channel)
            nn.Conv3d(hidden, hidden, kernel_size=3, stride=stride, padding=1, groups=hidden, bias=False),
            nn.BatchNorm3d(hidden),
            nn.PReLU(hidden),

            # pointwise projection, no activation
            nn.Conv3d(hidden, oup, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm3d(oup),
        )

    def forward(self, x):
        if not self.connect:
            return self.conv(x)
        return x + self.conv(x)


class ConvBlock(nn.Module):
    """Conv3d -> BatchNorm3d, followed by PReLU unless ``linear`` is set.

    Args:
        inp (int): input channels.
        oup (int): output channels.
        k, s, p: kernel size, stride and padding of the convolution.
        dw (bool): use one group per input channel when True.
        linear (bool): skip the PReLU (and do not create it) when True.
    """

    def __init__(self, inp, oup, k, s, p, dw=False, linear=False):
        super().__init__()
        self.linear = linear
        groups = inp if dw else 1
        self.conv = nn.Conv3d(inp, oup, k, s, p, groups=groups, bias=False)
        self.bn = nn.BatchNorm3d(oup)
        if not linear:
            self.prelu = nn.PReLU(oup)

    def forward(self, x):
        x = self.bn(self.conv(x))
        return x if self.linear else self.prelu(x)


class MobileFaceNet(nn.Module):
    """3-D MobileFaceNet-style classifier with a global-context block after every bottleneck.

    Layout: stem conv -> depthwise conv -> bottleneck stages -> 1x1x1 conv ->
    depthwise (1, 3, 5) conv -> 1x1x1 embedding conv -> BatchNorm -> Linear.

    The unpadded (1, 3, 5) depthwise kernel followed by flattening into
    ``nn.Linear(feature_dim, ...)`` requires the feature map to be exactly
    1 x 3 x 5 at that point. With the default stage settings the total stride is
    16 in time and 32 in space, i.e. inputs of 16 frames at 96 x 160.

    Args:
        feature_dim (int): width of the embedding fed to the classifier.
        num_classes (int): number of output logits.
        bottleneck_setting (list): rows of (t, c, n, s) -- expansion, output
            channels, block count and first-block stride of each stage.
    """

    def __init__(self, feature_dim=256, num_classes=27, bottleneck_setting=MobileFaceNet_BottleNeck_Setting):
        super(MobileFaceNet, self).__init__()
        self.conv1 = ConvBlock(3, 64, (3,7,7), (1,2,2), (1,3,3))
        self.dw_conv1 = ConvBlock(64, 128, 3, 2, 1, dw=True)

        self.cur_channel = 128
        block = BottleNeck
        self.blocks = self._make_layer(block, bottleneck_setting)

        # Follow the width of the last stage instead of assuming it is 128.
        self.conv2 = ConvBlock(self.cur_channel, 512, 1, 1, 0)
        self.linear7 = ConvBlock(512, 512, (1,3,5), 1, 0, dw=True, linear=True)
        self.linear1 = ConvBlock(512, feature_dim, 1, 1, 0, linear=True)
        self.bn = nn.BatchNorm3d(feature_dim)
        self.out = nn.Linear(feature_dim, num_classes)

        for layer in self.modules():
            if isinstance(layer, nn.Conv3d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(layer, nn.BatchNorm3d):
                nn.init.constant_(layer.weight, val=1.0)
                nn.init.constant_(layer.bias, val=0.0)
            elif isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(layer.bias, val=0.0)

    def _make_layer(self, block, setting):
        """Build the stages as one Sequential of alternating block / context layers."""
        layers = []
        for t, c, n, s in setting:
            for i in range(n):
                # Only the first block of a stage applies the stage stride.
                stride = s if i == 0 else 1
                layers.append(block(self.cur_channel, c, stride, t))
                # The context block sees the bottleneck's output, so it must be
                # sized with the output width c (not the bottleneck's input width).
                layers.append(GlobalContextBlock(c))
                self.cur_channel = c

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.dw_conv1(x)
        x = self.blocks(x)
        x = self.conv2(x)
        x = self.linear7(x)
        x = self.linear1(x)
        x = self.bn(x)
        # Flatten [N, feature_dim, 1, 1, 1] to [N, feature_dim] for the classifier.
        x = x.view(x.size(0), -1)
        output = self.out(x)
        return output
    
class FocalLoss(nn.Module):
    """Focal loss: cross-entropy down-weighted by ``(1 - p) ** gamma`` per sample.

    ``p`` is the probability the model assigns to the target class. The
    cross-entropy is computed unreduced so the weight is applied to each sample
    before averaging; using the batch-mean cross-entropy here would weight all
    samples by the same factor and lose the focusing effect.

    Args:
        gamma: focusing exponent; 0 reduces to plain mean cross-entropy.
    """

    def __init__(self, gamma=2):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.ce = torch.nn.CrossEntropyLoss(reduction='none')

    def forward(self, input, target):
        """Return the mean focal loss of logits ``input`` against class indices ``target``."""
        logp = self.ce(input, target)   # per-sample -log(p_target)
        p = torch.exp(-logp)            # per-sample p_target
        loss = (1 - p) ** self.gamma * logp
        return loss.mean()

In [None]:
def train_epoch(model, train_loader, criterion, optimizer, device=None):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: network to train; switched to training mode.
        train_loader: sized iterable of ``(inputs, labels)`` batches.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per batch.
        device: where batches are moved before the forward pass. Defaults to
            the device of the model's first parameter, so the function does
            not depend on a notebook-level ``device`` variable.

    Returns:
        tuple: (mean per-batch loss, accuracy in percent).
    """
    if device is None:
        device = next(model.parameters()).device

    model.train()

    running_loss = 0.0
    total_predictions = 0.0
    correct_predictions = 0.0

    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        # Accuracy bookkeeping uses the logits computed before the step.
        predicted = outputs.detach().argmax(dim=1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

        # Log the scalar value rather than the tensor (which carries grad_fn).
        print(i, batch_loss)

    running_loss /= len(train_loader)
    acc = (correct_predictions/total_predictions)*100.0
    print('Training Loss: ', running_loss)
    print('Training Accuracy: ', acc, '%')
    return running_loss, acc

In [None]:
def test_model(model, test_loader, criterion):
    with torch.no_grad():
        model.eval()

        running_loss = 0.0
        total_predictions = 0.0
        correct_predictions = 0.0
        wrong = 0
        for i, data in enumerate(test_loader, 0):
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)

            _, predicted = torch.max(outputs.data, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()
            
#             for n in range(labels.size(0)):
#                 if predicted[n] != labels[n]:
#                     print('Prediction: ',predicted[n], '////Groundtruth: ', labels[n])
#                     if int(labels[n].cpu()) in [0,1,2,3,6,7,8,9,14,15,16,17,18,19,23,25]:
#                         wrong +=1
#             print(i,wrong)
            
            loss = criterion(outputs, labels).detach()
            running_loss += loss.item()

        running_loss /= len(test_loader)
        acc = (correct_predictions/total_predictions)*100.0
        print('Validation Loss: ', running_loss)
        print('Validation Accuracy: ', acc, '%')
        return running_loss, acc

In [None]:
# Use the GPU when one is present instead of assuming CUDA unconditionally.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MobileFaceNet()
# Wrapped before loading so the parameter names carry the `module.` prefix.
# NOTE(review): presumably the checkpoint was saved from a DataParallel model -- confirm.
model = nn.DataParallel(model)
# map_location lets the checkpoint load on machines without the GPU it was saved
# from; the weights are moved to `device` right after.
model.load_state_dict(torch.load('project_classifier_mobile_gc9228.pth', map_location='cpu'))
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=True)
scheduler = ReduceLROnPlateau(optimizer, mode = 'min', factor=0.1, patience=2)

# Last expression of the cell: display the total number of parameters.
pytorch_total_params = sum(p.numel() for p in model.parameters())
pytorch_total_params

In [None]:
# n_epochs = 100
# Train_loss = []
# Train_acc = []
# Valid_loss = []
# Valid_acc = []
# num_no_improve = 0
# for i in range(n_epochs):
#     train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
#     valid_loss, valid_acc = test_model(model, valid_loader, criterion)
#     Train_acc.append(train_acc)
#     Train_loss.append(train_loss)
#     Valid_loss.append(valid_loss)
#     scheduler.step(valid_loss)
#     print('='*40)

#     if i == 0:
#         torch.save(model.state_dict(), 'project_classifier_mobile_gc.pth')
#     else:
#         if valid_acc > max(Valid_acc):
#             torch.save(model.state_dict(), 'project_classifier_mobile_gc.pth')
#             num_no_improve = 0
#         else:
#             num_no_improve += 1
#     Valid_acc.append(valid_acc)
    
#     training_loss = np.array(Train_loss).reshape(-1,1)
#     training_acc = np.array(Train_acc).reshape(-1,1)
#     validation_loss = np.array(Valid_loss).reshape(-1,1)
#     validation_acc = np.array(Valid_acc).reshape(-1,1)
#     result = np.concatenate([training_loss, training_acc, validation_loss, validation_acc], axis=1)
#     np.savetxt('result_project_classifier_mobile_gc.csv', result, delimiter=',', fmt='%1.5f', header='training_loss,training_acc,validation_loss,validation_acc', comments='')
    
#     if num_no_improve >= 10:
#         break

In [None]:
def inference(model, test_loader, criterion, device=None):
    """Predict a label name for every clip of an unlabeled loader and save them.

    Writes the predicted label names, one per line, to ``predict.csv`` and
    prints the per-batch forward time followed by the average time per sample.

    Args:
        model: network to run; switched to evaluation mode.
        test_loader: iterable of input batches exposing a sized ``.dataset``.
        criterion: accepted for signature compatibility; not used.
        device: where batches are moved before the forward pass. Defaults to
            the device of the model's first parameter, so the function does
            not depend on a notebook-level ``device`` variable.
    """
    if device is None:
        device = next(model.parameters()).device

    # dtype=str replaces np.str, an alias of the builtin removed in NumPy 1.24.
    labels = np.genfromtxt('D:/Jester/jester-v1-labels.csv', delimiter=',', dtype=str)
    output = []
    total_duration = 0

    model.eval()
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            start = time.time()
            inputs = data.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            if inputs.is_cuda:
                # CUDA kernels run asynchronously; wait for them so the elapsed
                # time covers the computation and not just the kernel launches.
                torch.cuda.synchronize()
            end = time.time()

            output.extend(labels[class_index] for class_index in predicted.tolist())

            duration = end - start
            total_duration += duration
            print(i, duration)

    index = np.genfromtxt('D:/Jester/jester-v1-test.csv', delimiter=',', dtype=str).reshape(-1,1)
    Predicted = np.array(output, dtype=str).reshape(-1,1)
    # NOTE(review): `submission` (video id + label) is assembled but the file
    # below receives only the labels. Output kept as-is -- confirm which of the
    # two is the intended content of predict.csv.
    submission = np.concatenate((index, Predicted), axis=1)

    np.savetxt('predict.csv', Predicted, delimiter=',', fmt='%s')
    average_duration = total_duration/len(test_loader.dataset)
    print(average_duration)

In [None]:
# Predict the test split: prints per-batch timings and writes predict.csv.
inference(model, test_loader, criterion)