In [1]:
import glob
# import os
import cv2
from PIL import Image
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
torch.cuda.is_available()

True

In [2]:
root = 'D:/Jester/jester/20bn-jester-v1/*'
num_classes = 28
num_worker = 0
batch_size = 4
scales = [1, 1/2**(1/4), 1/2**(1/2), 1/2**(3/4), 1/2]
sample_size = 224
sample_duration = 16
rgb_mean = (114.7748/255, 107.7354/255, 99.4750/255)
rgb_std = (38.7568578/255, 37.88248729/255, 40.02898126/255)
lr = 0.01
momentum = 0.9
weight_decay = 5e-4

In [3]:
def load_all_path(root):
    video_dictionary = sorted(glob.glob(root))
    all_path = []
    for video_path in video_dictionary:
        file_list = sorted(glob.glob(video_path + '/*'))
        all_path.append(file_list)
    return all_path

In [4]:
all_path = load_all_path(root)
labels = np.genfromtxt('D:/Jester/jester-v1-labels.csv', delimiter=',', dtype=np.str)
labels.tolist()

['Swiping Left',
 'Swiping Right',
 'Swiping Down',
 'Swiping Up',
 'Pushing Hand Away',
 'Pulling Hand In',
 'Sliding Two Fingers Left',
 'Sliding Two Fingers Right',
 'Sliding Two Fingers Down',
 'Sliding Two Fingers Up',
 'Pushing Two Fingers Away',
 'Pulling Two Fingers In',
 'Rolling Hand Forward',
 'Rolling Hand Backward',
 'Turning Hand Clockwise',
 'Turning Hand Counterclockwise',
 'Zooming In With Full Hand',
 'Zooming Out With Full Hand',
 'Zooming In With Two Fingers',
 'Zooming Out With Two Fingers',
 'Thumb Up',
 'Thumb Down',
 'Shaking Hand',
 'Stop Sign',
 'Drumming Fingers',
 'No gesture',
 'Doing other things']

In [5]:
class TemporalCrop(object):
    """Temporally crop the given frame indices at a random location or at the center location.
        size (int): Desired output size of the crop.
    """

    def __init__(self, size, mode):
        self.size = size
        self.mode = mode

    def __call__(self, path):
        """
        Args:
            paths (list): paths to be cropped.
        Returns:
            list: Cropped paths.
        """
        num_frames = len(path)
        
        if self.mode == 'random':
            if num_frames < self.size:
                pad_before = (self.size - num_frames)//2
                pad_after = self.size - num_frames - pad_before
                new_path = [path[0]]*pad_before + path + [path[-1]]*pad_after
            else:
                begin_index = random.randint(0, num_frames - self.size)
                end_index = begin_index + self.size
                new_path = path[begin_index:end_index]
        else:
            if num_frames < self.size:
                pad_before = (self.size - num_frames)//2
                pad_after = self.size - num_frames - pad_before
                new_path = [path[0]]*pad_before + path + [path[-1]]*pad_after
            else:
                begin_index = (num_frames - self.size)//2
                end_index = begin_index + self.size
                new_path = path[begin_index:end_index]
        
        return new_path

    
class MultiScaleRandomCrop(object):

    def __init__(self, scales, size, interpolation=Image.BILINEAR):
        self.scales = scales
        self.size = size
        self.interpolation = interpolation
        
    @staticmethod
    def get_random_param(self):
        self.scale = self.scales[random.randint(0, len(self.scales) - 1)]
        self.topleft_x = random.random()
        self.topleft_y = random.random()

    def __call__(self, img):
        image_width, image_height = _get_image_size(img)
        min_length = min(image_width, image_height)
        crop_size = int(min_length * self.scale)

        topleft_x = self.topleft_x * (image_width - crop_size)
        topleft_y = self.topleft_y * (image_height - crop_size)
        bottomright_x = topleft_x + crop_size
        bottomright_y = topleft_x + crop_size

        img = F.resized_crop(img, topleft_x, topleft_y, bottomright_x, bottomright_y, self.size, self.interpolation)

        return img

# class SpatialElasticDisplacement(object):

#     def __init__(self, sigma=2.0, alpha=1.0, order=0, cval=0, mode="constant"):
#         self.alpha = alpha
#         self.sigma = sigma
#         self.order = order
#         self.cval = cval
#         self.mode = mode
    
#         @staticmethod
#     def get_random_param(self):
#         self.p = random.random()
        
#     def __call__(self, img):
#         if self.p < 0.50:
#             is_L = False
#             is_PIL = isinstance(img, Image.Image)
            
#             if is_PIL:
#                 img = np.asarray(img, dtype=np.uint8)
#             if len(img.shape) == 2:
#                 is_L = True
#                 img = np.reshape(img, img.shape + (1,))  

#             image = img
#             image_first_channel = np.squeeze(image[..., 0])
#             indices_x, indices_y = self._generate_indices(image_first_channel.shape, alpha=self.alpha, sigma=self.sigma)
#             ret_image = (self._map_coordinates(
#                 image,
#                 indices_x,
#                 indices_y,
#                 order=self.order,
#                 cval=self.cval,
#                 mode=self.mode))

#             if  is_PIL:
#                 if is_L:
#                     return Image.fromarray(ret_image.reshape(ret_image.shape[:2]), mode= 'L')
#                 else:
#                     return Image.fromarray(ret_image)
#             else:
#                 return ret_image
#         else:
#             return img

#     def _generate_indices(self, shape, alpha, sigma):
#         assert (len(shape) == 2),"shape: Should be of size 2!"
#         dx = scipy.ndimage.gaussian_filter((np.random.rand(*shape) * 2 - 1), sigma, mode="constant", cval=0) * alpha
#         dy = scipy.ndimage.gaussian_filter((np.random.rand(*shape) * 2 - 1), sigma, mode="constant", cval=0) * alpha

#         x, y = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]), indexing='ij')
#         return np.reshape(x+dx, (-1, 1)), np.reshape(y+dy, (-1, 1))

#     def _map_coordinates(self, image, indices_x, indices_y, order=1, cval=0, mode="constant"):
#         assert (len(image.shape) == 3),"image.shape: Should be of size 3!"
#         result = np.copy(image)
#         height, width = image.shape[0:2]
#         for c in range(image.shape[2]):
#             remapped_flat = scipy.ndimage.interpolation.map_coordinates(
#                 image[..., c],
#                 (indices_x, indices_y),
#                 order=order,
#                 cval=cval,
#                 mode=mode
#             )
#             remapped = remapped_flat.reshape((height, width))
#             result[..., c] = remapped
#         return result

In [6]:
def read_video(paths, mode):
    all_image = []
    temporal_transform = TemporalCrop(sample_duration, mode)
    if mode == 'train':
        spatial_transform = transforms.Compose([
            transforms.ToPILImage(),
            MultiScaleRandomCrop(scales, sample_size),
    #         SpatialElasticDisplacement(),
            transforms.ToTensor(),
            transforms.Normalize(rgb_mean, rgb_std)
        ])
    else:
        spatial_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize(sample_size),
            transforms.CenterCrop(sample_size),
            transforms.ToTensor(),
            transforms.Normalize(rgb_mean, rgb_std)
        ])
                
    new_paths = temporal_transform(paths)
    for path in new_paths:
        image = cv2.imread(path)
        image = spatial_transform(image)
        all_image.append(image)
    video = np.stack(all_image).transpose(1,0,2,3)
    return video

In [7]:
class Dataset(Dataset):
    def __init__(self, all_path, x, y, mode):
        self.length = len(x)
        self.all_path = all_path
        self.x = x
        self.y = y
        self.mode = mode
    
    def __len__(self):
        return(self.length)
    
    def __getitem__(self, index):
        if self.mode == 'train':
            x = read_video(self.all_path[int(self.x[index,0])-1], 'random')
            y = int(np.argwhere(self.y == self.x[index,1]))
            return torch.from_numpy(x), torch.tensor(y)
        elif self.mode == 'valid':
            x = read_video(self.all_path[int(self.x[index,0])-1], 'center')
            y = int(np.argwhere(self.y == self.x[index,1]))
            return torch.from_numpy(x), torch.tensor(y)
        else:
            x = read_video(self.all_path[int(self.x[index])-1], 'center')
            return torch.from_numpy(x)

In [8]:
train = np.genfromtxt('D:/Jester/jester-v1-train.csv', delimiter=',', dtype=np.str)    
train_data = Dataset(all_path, train, labels, 'train')
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_worker, pin_memory=True)

In [9]:
valid = np.genfromtxt('D:/Jester/jester-v1-validation.csv', delimiter=',', dtype=np.str)
valid_data = Dataset(all_path, valid, labels, 'valid')
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, num_workers=num_worker, pin_memory=True)

In [10]:
test = np.genfromtxt('D:/Jester/jester-v1-test.csv', delimiter=',', dtype=np.str)
test_data = Dataset(all_path, test, labels, 'test')
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_worker, pin_memory=True)

In [11]:
# for i, (data, target) in enumerate(train_loader):
#     print(data.shape)

In [12]:
# def get_training_set(opt, spatial_transform, temporal_transform,
#                      target_transform):
#     assert opt.dataset in ['jester', 'egogesture', 'nv']

#     if opt.train_validate:
#         subset = ['training', 'validation']
#     else:
#         subset = 'training'
#     if opt.dataset == 'jester':
#         training_data = Jester(
#             opt.video_path,
#             opt.annotation_path,
#             subset,
#             spatial_transform=spatial_transform,
#             temporal_transform=temporal_transform,
#             target_transform=target_transform,
#             sample_duration=opt.sample_duration,
#             modality=opt.modality)
#     elif opt.dataset == 'egogesture':
#         training_data = EgoGesture(
#             opt.video_path,
#             opt.annotation_path,
#             subset,
#             spatial_transform=spatial_transform,
#             temporal_transform=temporal_transform,
#             target_transform=target_transform,
#             sample_duration=opt.sample_duration,
#             modality=opt.modality)
#     elif opt.dataset == 'nv':
#         training_data = NV(
#             opt.video_path,
#             opt.annotation_path,
#             subset,
#             spatial_transform=spatial_transform,
#             temporal_transform=temporal_transform,
#             target_transform=target_transform,
#             sample_duration=opt.sample_duration,
#             modality=opt.modality)
#     return training_data


# def get_validation_set(opt, spatial_transform, temporal_transform,
#                        target_transform):
#     assert opt.dataset in ['jester', 'egogesture', 'nv']

#     if opt.dataset == 'jester':
#         validation_data = Jester(
#             opt.video_path,
#             opt.annotation_path,
#             'validation',
#             opt.n_val_samples,
#             spatial_transform,
#             temporal_transform,
#             target_transform,
#             modality=opt.modality,
#             sample_duration=opt.sample_duration)
#     elif opt.dataset == 'egogesture':
#         validation_data = EgoGesture(
#             opt.video_path,
#             opt.annotation_path,
#             'validation',
#             opt.n_val_samples,
#             spatial_transform,
#             temporal_transform,
#             target_transform,
#             modality=opt.modality,
#             sample_duration=opt.sample_duration)
#     elif opt.dataset == 'nv':
#         validation_data = NV(
#             opt.video_path,
#             opt.annotation_path,
#             'validation',
#             spatial_transform=spatial_transform,
#             temporal_transform=temporal_transform,
#             target_transform=target_transform,
#             sample_duration=opt.sample_duration,
#             modality=opt.modality)
#     return validation_data

In [13]:
class ResNeXtBottleneck(nn.Module):
    def __init__(self, in_channels, cardinality=32, width=4, stride=1):
        super(ResNeXtBottleneck, self).__init__()
        channels = cardinality*width
        out_channels = 2*channels
        
        self.net = nn.Sequential(
            nn.BatchNorm3d(in_channels),
            nn.PReLU(),
            nn.Conv3d(in_channels, channels, kernel_size=1, bias=False),
            nn.BatchNorm3d(channels),
            nn.PReLU(),
            nn.Conv3d(channels, channels, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False),
            nn.BatchNorm3d(channels),
            nn.PReLU(),
            nn.Conv3d(channels, out_channels, kernel_size=1, bias=False)
        )

        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.BatchNorm3d(in_channels),
                nn.PReLU(),
                nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
            )
        else:
            self.shortcut = nn.Identity()
            
    def forward(self, x):
        out = self.net(x)
        out += self.shortcut(x)
        return out
    
    
class ResNeXt(nn.Module):
    def __init__(self, block, num_blocks, cardinality, width, num_class):
        super(ResNeXt, self).__init__()
        self.cardinality = cardinality
        self.width = width
        self.in_channels = 64

        self.feature_detector = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=(3,7,7), stride=(1,2,2), padding=(1,3,3), bias=False),
            nn.MaxPool3d(kernel_size=3, stride=2, padding=1),
            self.stack_blocks(block, num_blocks[0], stride=1),
            self.stack_blocks(block, num_blocks[1], stride=2),
            self.stack_blocks(block, num_blocks[2], stride=2),
            self.stack_blocks(block, num_blocks[3], stride=2),
            nn.AvgPool3d((1,7,7)),
            nn.Dropout3d(0.5),
            nn.Flatten()
        )
        
        self.classifier = nn.Linear(2048,num_class)
        
        for layer in self.modules():
            if isinstance(layer, nn.Conv3d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(layer, nn.BatchNorm3d):
                nn.init.constant_(layer.weight, val=1.0)
                nn.init.constant_(layer.bias, val=0.0)
            elif isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(layer.bias, val=0.0)
                
    def stack_blocks(self, block, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, self.cardinality, self.width, stride))
            self.in_channels = 2*self.cardinality*self.width
        self.width *= 2
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.feature_detector(x)
        out = self.classifier(out)
        return out

In [14]:
# model = torch.hub.load('pytorch/vision:v0.5.0', 'resnext50_32x4d', pretrained=True)
model = ResNeXt(ResNeXtBottleneck, [3,4,6,3], cardinality=32, width=4, num_class=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=True)
scheduler = ReduceLROnPlateau(optimizer, mode = 'min', factor=0.1, patience=2)
device = torch.device("cuda")
model.to(device)

ResNeXt(
  (feature_detector): Sequential(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): MaxPool3d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (2): Sequential(
      (0): ResNeXtBottleneck(
        (net): Sequential(
          (0): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (1): PReLU(num_parameters=1)
          (2): Conv3d(64, 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (3): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (4): PReLU(num_parameters=1)
          (5): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=32, bias=False)
          (6): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (7): PReLU(num_parameters=1)
          (8): Conv3d(128, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
        )
  

In [15]:
def train_epoch(model, train_loader, criterion, optimizer):
    model.train()

    running_loss = 0.0
    total_predictions = 0.0
    correct_predictions = 0.0
    
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()

        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total_predictions += labels.size(0)
        
        correct_predictions += (predicted == labels).sum().item()
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        print(i, loss)
        loss.backward()
        optimizer.step()

    running_loss /= len(train_loader)
    acc = (correct_predictions/total_predictions)*100.0
    print('Training Loss: ', running_loss)
    print('Training Accuracy: ', acc, '%')
    return running_loss, acc

In [16]:
def test_model(model, test_loader, criterion):
    with torch.no_grad():
        model.eval()

        running_loss = 0.0
        total_predictions = 0.0
        correct_predictions = 0.0
        
        for i, data in enumerate(test_loader, 0):
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)

            _, predicted = torch.max(outputs.data, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

            loss = criterion(outputs, labels).detach()
            running_loss += loss.item()
            
        running_loss /= len(test_loader)
        acc = (correct_predictions/total_predictions)*100.0
        print('Validation Loss: ', running_loss)
        print('Validation Accuracy: ', acc, '%')
        return running_loss, acc

In [None]:
n_epochs = 100
Train_loss = []
Train_acc = []
Valid_loss = []
Valid_acc = []
num_no_improve = 0
for i in range(n_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    valid_loss, valid_acc = test_model(model, valid_loader, criterion)
    Train_acc.append(train_acc)
    Train_loss.append(train_loss)
    Valid_loss.append(valid_loss)
    scheduler.step(valid_loss)
    print('='*40)

    if i == 0:
        torch.save(model.state_dict(), 'project.pth')
    else:
        if test_acc > max(Test_acc):
            torch.save(model.state_dict(), 'project.pth')
            num_no_improve = 0
        else:
            num_no_improve += 1
    Valid_acc.append(valid_acc)
    
    training_loss = np.array(Train_loss).reshape(-1,1)
    training_acc = np.array(Train_acc).reshape(-1,1)
    validation_loss = np.array(Valid_loss).reshape(-1,1)
    validation_acc = np.array(Valid_acc).reshape(-1,1)
    result = np.concatenate([training_loss, training_acc, validation_loss, validation_acc]).T
    np.savetxt('result_project.csv', result, delimiter=',', fmt='%1.5f', header='training_loss,training_acc,validation_loss,validation_acc', comments='')
    
    if num_no_improve >= 10:
        break

0 tensor(27.6849, device='cuda:0', grad_fn=<NllLossBackward>)
1 tensor(243.1723, device='cuda:0', grad_fn=<NllLossBackward>)
2 tensor(282.3637, device='cuda:0', grad_fn=<NllLossBackward>)
3 tensor(1376.7660, device='cuda:0', grad_fn=<NllLossBackward>)
4 tensor(1202.6620, device='cuda:0', grad_fn=<NllLossBackward>)
5 tensor(3354.1289, device='cuda:0', grad_fn=<NllLossBackward>)
6 tensor(26028.6914, device='cuda:0', grad_fn=<NllLossBackward>)
7 tensor(135842.5938, device='cuda:0', grad_fn=<NllLossBackward>)
8 tensor(4014499.7500, device='cuda:0', grad_fn=<NllLossBackward>)
9 tensor(2.1298e+10, device='cuda:0', grad_fn=<NllLossBackward>)
10 tensor(1.4256e+22, device='cuda:0', grad_fn=<NllLossBackward>)
11 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
12 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
13 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
14 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
15 tensor(nan, device='cuda:0', grad_fn=<NllLossBackwar

137 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
138 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
139 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
140 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
141 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
142 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
143 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
144 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
145 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
146 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
147 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
148 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
149 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
150 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
151 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
152 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
153 tensor(nan, device='cuda:0', grad_fn

273 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
274 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
275 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
276 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
277 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
278 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
279 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
280 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
281 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
282 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
283 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
284 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
285 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
286 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
287 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
288 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
289 tensor(nan, device='cuda:0', grad_fn

409 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
410 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
411 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
412 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
413 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
414 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
415 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
416 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
417 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
418 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
419 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
420 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
421 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
422 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
423 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
424 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
425 tensor(nan, device='cuda:0', grad_fn

545 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
546 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
547 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
548 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
549 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
550 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
551 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
552 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
553 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
554 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
555 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
556 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
557 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
558 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
559 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
560 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
561 tensor(nan, device='cuda:0', grad_fn

681 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
682 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
683 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
684 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
685 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
686 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
687 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
688 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
689 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
690 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
691 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
692 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
693 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
694 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
695 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
696 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
697 tensor(nan, device='cuda:0', grad_fn

817 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
818 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
819 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
820 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
821 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
822 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
823 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
824 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
825 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
826 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
827 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
828 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
829 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
830 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
831 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
832 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
833 tensor(nan, device='cuda:0', grad_fn

953 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
954 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
955 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
956 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
957 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
958 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
959 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
960 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
961 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
962 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
963 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
964 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
965 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
966 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
967 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
968 tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
969 tensor(nan, device='cuda:0', grad_fn