In [1]:
import glob
# import os
import cv2
from PIL import Image
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
torch.cuda.is_available()

True

In [2]:
root = 'D:/Jester/jester/20bn-jester-v1/*'
num_classes = 27
num_worker = 0
batch_size = 4
scales = [1, 1/2**(1/4), 1/2**(1/2), 1/2**(3/4), 1/2]
sample_size = 224
sample_duration = 16
rgb_mean = (114.7748/255, 107.7354/255, 99.4750/255)
rgb_std = (38.7568578/255, 37.88248729/255, 40.02898126/255)
lr = 0.0001
momentum = 0.9
weight_decay = 5e-4

In [3]:
def load_all_path(root):
    video_dictionary = sorted(glob.glob(root))
    all_path = []
    for video_path in video_dictionary:
        file_list = sorted(glob.glob(video_path + '/*'))
        all_path.append(file_list)
    return all_path

In [4]:
all_path = load_all_path(root)
labels = np.genfromtxt('D:/Jester/jester-v1-labels.csv', delimiter=',', dtype=np.str)
labels.tolist()

['Swiping Left',
 'Swiping Right',
 'Swiping Down',
 'Swiping Up',
 'Pushing Hand Away',
 'Pulling Hand In',
 'Sliding Two Fingers Left',
 'Sliding Two Fingers Right',
 'Sliding Two Fingers Down',
 'Sliding Two Fingers Up',
 'Pushing Two Fingers Away',
 'Pulling Two Fingers In',
 'Rolling Hand Forward',
 'Rolling Hand Backward',
 'Turning Hand Clockwise',
 'Turning Hand Counterclockwise',
 'Zooming In With Full Hand',
 'Zooming Out With Full Hand',
 'Zooming In With Two Fingers',
 'Zooming Out With Two Fingers',
 'Thumb Up',
 'Thumb Down',
 'Shaking Hand',
 'Stop Sign',
 'Drumming Fingers',
 'No gesture',
 'Doing other things']

In [5]:
class TemporalCrop(object):
    """Temporally crop the given frame indices at a random location or at the center location.
        size (int): Desired output size of the crop.
    """

    def __init__(self, size, mode):
        self.size = size
        self.mode = mode

    def __call__(self, path):
        """
        Args:
            paths (list): paths to be cropped.
        Returns:
            list: Cropped paths.
        """
        num_frames = len(path)
        
        if self.mode == 'random':
            if num_frames < self.size:
                pad_before = (self.size - num_frames)//2
                pad_after = self.size - num_frames - pad_before
                new_path = [path[0]]*pad_before + path + [path[-1]]*pad_after
            else:
                begin_index = random.randint(0, num_frames - self.size)
                end_index = begin_index + self.size
                new_path = path[begin_index:end_index]
        else:
            if num_frames < self.size:
                pad_before = (self.size - num_frames)//2
                pad_after = self.size - num_frames - pad_before
                new_path = [path[0]]*pad_before + path + [path[-1]]*pad_after
            else:
                begin_index = (num_frames - self.size)//2
                end_index = begin_index + self.size
                new_path = path[begin_index:end_index]
        
        return new_path

    
class MultiScaleRandomCrop(object):

    def __init__(self, scales, size, interpolation=Image.BILINEAR):
        self.scales = scales
        self.size = size
        self.interpolation = interpolation
        
    @staticmethod
    def get_random_param(self):
        self.scale = self.scales[random.randint(0, len(self.scales) - 1)]
        self.topleft_x = random.random()
        self.topleft_y = random.random()

    def __call__(self, img):
        image_width, image_height = _get_image_size(img)
        min_length = min(image_width, image_height)
        crop_size = int(min_length * self.scale)

        topleft_x = self.topleft_x * (image_width - crop_size)
        topleft_y = self.topleft_y * (image_height - crop_size)
        bottomright_x = topleft_x + crop_size
        bottomright_y = topleft_x + crop_size

        img = F.crop(img, topleft_x, topleft_y, bottomright_x, bottomright_y)
        img = F.resize(img, self.size, self.interpolation)

        return img

# class SpatialElasticDisplacement(object):

#     def __init__(self, sigma=2.0, alpha=1.0, order=0, cval=0, mode="constant"):
#         self.alpha = alpha
#         self.sigma = sigma
#         self.order = order
#         self.cval = cval
#         self.mode = mode
    
#         @staticmethod
#     def get_random_param(self):
#         self.p = random.random()
        
#     def __call__(self, img):
#         if self.p < 0.50:
#             is_L = False
#             is_PIL = isinstance(img, Image.Image)
            
#             if is_PIL:
#                 img = np.asarray(img, dtype=np.uint8)
#             if len(img.shape) == 2:
#                 is_L = True
#                 img = np.reshape(img, img.shape + (1,))  

#             image = img
#             image_first_channel = np.squeeze(image[..., 0])
#             indices_x, indices_y = self._generate_indices(image_first_channel.shape, alpha=self.alpha, sigma=self.sigma)
#             ret_image = (self._map_coordinates(
#                 image,
#                 indices_x,
#                 indices_y,
#                 order=self.order,
#                 cval=self.cval,
#                 mode=self.mode))

#             if  is_PIL:
#                 if is_L:
#                     return Image.fromarray(ret_image.reshape(ret_image.shape[:2]), mode= 'L')
#                 else:
#                     return Image.fromarray(ret_image)
#             else:
#                 return ret_image
#         else:
#             return img

#     def _generate_indices(self, shape, alpha, sigma):
#         assert (len(shape) == 2),"shape: Should be of size 2!"
#         dx = scipy.ndimage.gaussian_filter((np.random.rand(*shape) * 2 - 1), sigma, mode="constant", cval=0) * alpha
#         dy = scipy.ndimage.gaussian_filter((np.random.rand(*shape) * 2 - 1), sigma, mode="constant", cval=0) * alpha

#         x, y = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]), indexing='ij')
#         return np.reshape(x+dx, (-1, 1)), np.reshape(y+dy, (-1, 1))

#     def _map_coordinates(self, image, indices_x, indices_y, order=1, cval=0, mode="constant"):
#         assert (len(image.shape) == 3),"image.shape: Should be of size 3!"
#         result = np.copy(image)
#         height, width = image.shape[0:2]
#         for c in range(image.shape[2]):
#             remapped_flat = scipy.ndimage.interpolation.map_coordinates(
#                 image[..., c],
#                 (indices_x, indices_y),
#                 order=order,
#                 cval=cval,
#                 mode=mode
#             )
#             remapped = remapped_flat.reshape((height, width))
#             result[..., c] = remapped
#         return result

In [6]:
def read_video(paths, mode):
    all_image = []
    temporal_transform = TemporalCrop(sample_duration, mode)
    if mode == 'train':
        spatial_transform = transforms.Compose([
            transforms.ToPILImage(),
            MultiScaleRandomCrop(scales, sample_size),
    #         SpatialElasticDisplacement(),
            transforms.ToTensor(),
            transforms.Normalize(rgb_mean, rgb_std)
        ])
    else:
        spatial_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize(sample_size),
            transforms.CenterCrop(sample_size),
            transforms.ToTensor(),
            transforms.Normalize(rgb_mean, rgb_std)
        ])
                
    new_paths = temporal_transform(paths)
    for path in new_paths:
        image = cv2.imread(path)
        image = spatial_transform(image)
        all_image.append(image)
    video = np.stack(all_image).transpose(1,0,2,3)
    return video

In [7]:
class Dataset(Dataset):
    def __init__(self, all_path, x, y, mode):
        self.length = len(x)
        self.all_path = all_path
        self.x = x
        self.y = y
        self.mode = mode
    
    def __len__(self):
        return(self.length)
    
    def __getitem__(self, index):
        if self.mode == 'train':
            x = read_video(self.all_path[int(self.x[index,0])-1], 'random')
            y = int(np.argwhere(self.y == self.x[index,1]))
            return torch.from_numpy(x), torch.tensor(y)
        elif self.mode == 'valid':
            x = read_video(self.all_path[int(self.x[index,0])-1], 'center')
            y = int(np.argwhere(self.y == self.x[index,1]))
            return torch.from_numpy(x), torch.tensor(y)
        else:
            x = read_video(self.all_path[int(self.x[index])-1], 'center')
            return torch.from_numpy(x)

In [8]:
train = np.genfromtxt('D:/Jester/jester-v1-train.csv', delimiter=',', dtype=np.str)    
train_data = Dataset(all_path, train, labels, 'train')
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_worker, pin_memory=True)

In [9]:
valid = np.genfromtxt('D:/Jester/jester-v1-validation.csv', delimiter=',', dtype=np.str)
valid_data = Dataset(all_path, valid, labels, 'valid')
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, num_workers=num_worker, pin_memory=True)

In [10]:
test = np.genfromtxt('D:/Jester/jester-v1-test.csv', delimiter=',', dtype=np.str)
test_data = Dataset(all_path, test, labels, 'test')
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_worker, pin_memory=True)

In [11]:
# for i, (data, target) in enumerate(train_loader):
#     print(data.shape)

In [12]:
# def get_training_set(opt, spatial_transform, temporal_transform,
#                      target_transform):
#     assert opt.dataset in ['jester', 'egogesture', 'nv']

#     if opt.train_validate:
#         subset = ['training', 'validation']
#     else:
#         subset = 'training'
#     if opt.dataset == 'jester':
#         training_data = Jester(
#             opt.video_path,
#             opt.annotation_path,
#             subset,
#             spatial_transform=spatial_transform,
#             temporal_transform=temporal_transform,
#             target_transform=target_transform,
#             sample_duration=opt.sample_duration,
#             modality=opt.modality)
#     elif opt.dataset == 'egogesture':
#         training_data = EgoGesture(
#             opt.video_path,
#             opt.annotation_path,
#             subset,
#             spatial_transform=spatial_transform,
#             temporal_transform=temporal_transform,
#             target_transform=target_transform,
#             sample_duration=opt.sample_duration,
#             modality=opt.modality)
#     elif opt.dataset == 'nv':
#         training_data = NV(
#             opt.video_path,
#             opt.annotation_path,
#             subset,
#             spatial_transform=spatial_transform,
#             temporal_transform=temporal_transform,
#             target_transform=target_transform,
#             sample_duration=opt.sample_duration,
#             modality=opt.modality)
#     return training_data


# def get_validation_set(opt, spatial_transform, temporal_transform,
#                        target_transform):
#     assert opt.dataset in ['jester', 'egogesture', 'nv']

#     if opt.dataset == 'jester':
#         validation_data = Jester(
#             opt.video_path,
#             opt.annotation_path,
#             'validation',
#             opt.n_val_samples,
#             spatial_transform,
#             temporal_transform,
#             target_transform,
#             modality=opt.modality,
#             sample_duration=opt.sample_duration)
#     elif opt.dataset == 'egogesture':
#         validation_data = EgoGesture(
#             opt.video_path,
#             opt.annotation_path,
#             'validation',
#             opt.n_val_samples,
#             spatial_transform,
#             temporal_transform,
#             target_transform,
#             modality=opt.modality,
#             sample_duration=opt.sample_duration)
#     elif opt.dataset == 'nv':
#         validation_data = NV(
#             opt.video_path,
#             opt.annotation_path,
#             'validation',
#             spatial_transform=spatial_transform,
#             temporal_transform=temporal_transform,
#             target_transform=target_transform,
#             sample_duration=opt.sample_duration,
#             modality=opt.modality)
#     return validation_data

In [13]:
class ResNeXtBottleneck(nn.Module):
    def __init__(self, in_channels, cardinality=32, width=4, stride=1):
        super(ResNeXtBottleneck, self).__init__()
        channels = cardinality*width
        out_channels = 2*channels
        
        self.net = nn.Sequential(
            nn.BatchNorm3d(in_channels),
            nn.PReLU(),
            nn.Conv3d(in_channels, channels, kernel_size=1, bias=False),
            nn.BatchNorm3d(channels),
            nn.PReLU(),
            nn.Conv3d(channels, channels, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False),
            nn.BatchNorm3d(channels),
            nn.PReLU(),
            nn.Conv3d(channels, out_channels, kernel_size=1, bias=False)
        )

        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.BatchNorm3d(in_channels),
                nn.PReLU(),
                nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
            )
        else:
            self.shortcut = nn.Identity()
            
    def forward(self, x):
        out = self.net(x)
        out += self.shortcut(x)
        return out
    
    
class ResNeXt(nn.Module):
    def __init__(self, block, num_blocks, cardinality, width, num_class):
        super(ResNeXt, self).__init__()
        self.cardinality = cardinality
        self.width = width
        self.in_channels = 64

        self.feature_detector = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=(3,7,7), stride=(1,2,2), padding=(1,3,3), bias=False),
            nn.MaxPool3d(kernel_size=3, stride=2, padding=1),
            self.stack_blocks(block, num_blocks[0], stride=1),
            self.stack_blocks(block, num_blocks[1], stride=2),
            self.stack_blocks(block, num_blocks[2], stride=2),
            self.stack_blocks(block, num_blocks[3], stride=2),
            nn.AvgPool3d((1,7,7)),
            nn.Dropout3d(0.5),
            nn.Flatten()
        )
        
        self.classifier = nn.Linear(2048,num_class)
        
        for layer in self.modules():
            if isinstance(layer, nn.Conv3d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(layer, nn.BatchNorm3d):
                nn.init.constant_(layer.weight, val=1.0)
                nn.init.constant_(layer.bias, val=0.0)
            elif isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(layer.bias, val=0.0)
                
    def stack_blocks(self, block, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, self.cardinality, self.width, stride))
            self.in_channels = 2*self.cardinality*self.width
        self.width *= 2
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.feature_detector(x)
        out = self.classifier(out)
        return out

In [14]:
# model = torch.hub.load('pytorch/vision:v0.5.0', 'resnext50_32x4d', pretrained=True)
model = ResNeXt(ResNeXtBottleneck, [3,4,6,3], cardinality=32, width=4, num_class=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=True)
scheduler = ReduceLROnPlateau(optimizer, mode = 'min', factor=0.1, patience=2)
device = torch.device("cuda")
model.to(device)

ResNeXt(
  (feature_detector): Sequential(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): MaxPool3d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (2): Sequential(
      (0): ResNeXtBottleneck(
        (net): Sequential(
          (0): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (1): PReLU(num_parameters=1)
          (2): Conv3d(64, 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (3): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (4): PReLU(num_parameters=1)
          (5): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=32, bias=False)
          (6): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (7): PReLU(num_parameters=1)
          (8): Conv3d(128, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
        )
  

In [15]:
def train_epoch(model, train_loader, criterion, optimizer):
    model.train()

    running_loss = 0.0
    total_predictions = 0.0
    correct_predictions = 0.0
    
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()

        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total_predictions += labels.size(0)
        
        correct_predictions += (predicted == labels).sum().item()
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        print(i, loss)
        loss.backward()
        optimizer.step()

    running_loss /= len(train_loader)
    acc = (correct_predictions/total_predictions)*100.0
    print('Training Loss: ', running_loss)
    print('Training Accuracy: ', acc, '%')
    return running_loss, acc

In [16]:
def test_model(model, test_loader, criterion):
    with torch.no_grad():
        model.eval()

        running_loss = 0.0
        total_predictions = 0.0
        correct_predictions = 0.0
        
        for i, data in enumerate(test_loader, 0):
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)

            _, predicted = torch.max(outputs.data, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

            loss = criterion(outputs, labels).detach()
            running_loss += loss.item()
            
        running_loss /= len(test_loader)
        acc = (correct_predictions/total_predictions)*100.0
        print('Validation Loss: ', running_loss)
        print('Validation Accuracy: ', acc, '%')
        return running_loss, acc

In [None]:
n_epochs = 100
Train_loss = []
Train_acc = []
Valid_loss = []
Valid_acc = []
num_no_improve = 0
for i in range(n_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    valid_loss, valid_acc = test_model(model, valid_loader, criterion)
    Train_acc.append(train_acc)
    Train_loss.append(train_loss)
    Valid_loss.append(valid_loss)
    scheduler.step(valid_loss)
    print('='*40)

    if i == 0:
        torch.save(model.state_dict(), 'project.pth')
    else:
        if test_acc > max(Test_acc):
            torch.save(model.state_dict(), 'project.pth')
            num_no_improve = 0
        else:
            num_no_improve += 1
    Valid_acc.append(valid_acc)
    
    training_loss = np.array(Train_loss).reshape(-1,1)
    training_acc = np.array(Train_acc).reshape(-1,1)
    validation_loss = np.array(Valid_loss).reshape(-1,1)
    validation_acc = np.array(Valid_acc).reshape(-1,1)
    result = np.concatenate([training_loss, training_acc, validation_loss, validation_acc]).T
    np.savetxt('result_project.csv', result, delimiter=',', fmt='%1.5f', header='training_loss,training_acc,validation_loss,validation_acc', comments='')
    
    if num_no_improve >= 10:
        break

0 tensor(26.2450, device='cuda:0', grad_fn=<NllLossBackward>)
1 tensor(14.0352, device='cuda:0', grad_fn=<NllLossBackward>)
2 tensor(22.2732, device='cuda:0', grad_fn=<NllLossBackward>)
3 tensor(36.6595, device='cuda:0', grad_fn=<NllLossBackward>)
4 tensor(35.6278, device='cuda:0', grad_fn=<NllLossBackward>)
5 tensor(27.8554, device='cuda:0', grad_fn=<NllLossBackward>)
6 tensor(29.3224, device='cuda:0', grad_fn=<NllLossBackward>)
7 tensor(20.1308, device='cuda:0', grad_fn=<NllLossBackward>)
8 tensor(26.9728, device='cuda:0', grad_fn=<NllLossBackward>)
9 tensor(31.5669, device='cuda:0', grad_fn=<NllLossBackward>)
10 tensor(40.4861, device='cuda:0', grad_fn=<NllLossBackward>)
11 tensor(25.0049, device='cuda:0', grad_fn=<NllLossBackward>)
12 tensor(48.9517, device='cuda:0', grad_fn=<NllLossBackward>)
13 tensor(29.9631, device='cuda:0', grad_fn=<NllLossBackward>)
14 tensor(15.9813, device='cuda:0', grad_fn=<NllLossBackward>)
15 tensor(40.1864, device='cuda:0', grad_fn=<NllLossBackward>)
16

129 tensor(43.2640, device='cuda:0', grad_fn=<NllLossBackward>)
130 tensor(53.2994, device='cuda:0', grad_fn=<NllLossBackward>)
131 tensor(10.9381, device='cuda:0', grad_fn=<NllLossBackward>)
132 tensor(17.8515, device='cuda:0', grad_fn=<NllLossBackward>)
133 tensor(43.0378, device='cuda:0', grad_fn=<NllLossBackward>)
134 tensor(31.8881, device='cuda:0', grad_fn=<NllLossBackward>)
135 tensor(36.4610, device='cuda:0', grad_fn=<NllLossBackward>)
136 tensor(27.9708, device='cuda:0', grad_fn=<NllLossBackward>)
137 tensor(29.0191, device='cuda:0', grad_fn=<NllLossBackward>)
138 tensor(52.9249, device='cuda:0', grad_fn=<NllLossBackward>)
139 tensor(31.0584, device='cuda:0', grad_fn=<NllLossBackward>)
140 tensor(26.3935, device='cuda:0', grad_fn=<NllLossBackward>)
141 tensor(35.6059, device='cuda:0', grad_fn=<NllLossBackward>)
142 tensor(42.8763, device='cuda:0', grad_fn=<NllLossBackward>)
143 tensor(18.7121, device='cuda:0', grad_fn=<NllLossBackward>)
144 tensor(16.3149, device='cuda:0', gra

257 tensor(25.5543, device='cuda:0', grad_fn=<NllLossBackward>)
258 tensor(23.9483, device='cuda:0', grad_fn=<NllLossBackward>)
259 tensor(23.0159, device='cuda:0', grad_fn=<NllLossBackward>)
260 tensor(21.7728, device='cuda:0', grad_fn=<NllLossBackward>)
261 tensor(38.1292, device='cuda:0', grad_fn=<NllLossBackward>)
262 tensor(30.7508, device='cuda:0', grad_fn=<NllLossBackward>)
263 tensor(47.1976, device='cuda:0', grad_fn=<NllLossBackward>)
264 tensor(30.4002, device='cuda:0', grad_fn=<NllLossBackward>)
265 tensor(19.7337, device='cuda:0', grad_fn=<NllLossBackward>)
266 tensor(35.3089, device='cuda:0', grad_fn=<NllLossBackward>)
267 tensor(14.9784, device='cuda:0', grad_fn=<NllLossBackward>)
268 tensor(20.6438, device='cuda:0', grad_fn=<NllLossBackward>)
269 tensor(38.9138, device='cuda:0', grad_fn=<NllLossBackward>)
270 tensor(27.2270, device='cuda:0', grad_fn=<NllLossBackward>)
271 tensor(28.7225, device='cuda:0', grad_fn=<NllLossBackward>)
272 tensor(14.2143, device='cuda:0', gra

385 tensor(10.4727, device='cuda:0', grad_fn=<NllLossBackward>)
386 tensor(19.3604, device='cuda:0', grad_fn=<NllLossBackward>)
387 tensor(6.8200, device='cuda:0', grad_fn=<NllLossBackward>)
388 tensor(39.0475, device='cuda:0', grad_fn=<NllLossBackward>)
389 tensor(22.7005, device='cuda:0', grad_fn=<NllLossBackward>)
390 tensor(23.3514, device='cuda:0', grad_fn=<NllLossBackward>)
391 tensor(12.5896, device='cuda:0', grad_fn=<NllLossBackward>)
392 tensor(13.1317, device='cuda:0', grad_fn=<NllLossBackward>)
393 tensor(23.7712, device='cuda:0', grad_fn=<NllLossBackward>)
394 tensor(25.8054, device='cuda:0', grad_fn=<NllLossBackward>)
395 tensor(28.6204, device='cuda:0', grad_fn=<NllLossBackward>)
396 tensor(12.8685, device='cuda:0', grad_fn=<NllLossBackward>)
397 tensor(8.9002, device='cuda:0', grad_fn=<NllLossBackward>)
398 tensor(19.5119, device='cuda:0', grad_fn=<NllLossBackward>)
399 tensor(21.0649, device='cuda:0', grad_fn=<NllLossBackward>)
400 tensor(25.4174, device='cuda:0', grad_

513 tensor(19.0422, device='cuda:0', grad_fn=<NllLossBackward>)
514 tensor(26.8104, device='cuda:0', grad_fn=<NllLossBackward>)
515 tensor(17.6672, device='cuda:0', grad_fn=<NllLossBackward>)
516 tensor(18.4938, device='cuda:0', grad_fn=<NllLossBackward>)
517 tensor(11.9181, device='cuda:0', grad_fn=<NllLossBackward>)
518 tensor(17.0388, device='cuda:0', grad_fn=<NllLossBackward>)
519 tensor(20.6523, device='cuda:0', grad_fn=<NllLossBackward>)
520 tensor(17.0943, device='cuda:0', grad_fn=<NllLossBackward>)
521 tensor(20.9893, device='cuda:0', grad_fn=<NllLossBackward>)
522 tensor(21.1983, device='cuda:0', grad_fn=<NllLossBackward>)
523 tensor(19.3775, device='cuda:0', grad_fn=<NllLossBackward>)
524 tensor(9.4239, device='cuda:0', grad_fn=<NllLossBackward>)
525 tensor(12.4899, device='cuda:0', grad_fn=<NllLossBackward>)
526 tensor(9.3021, device='cuda:0', grad_fn=<NllLossBackward>)
527 tensor(22.7695, device='cuda:0', grad_fn=<NllLossBackward>)
528 tensor(14.7566, device='cuda:0', grad_

641 tensor(16.1706, device='cuda:0', grad_fn=<NllLossBackward>)
642 tensor(17.2539, device='cuda:0', grad_fn=<NllLossBackward>)
643 tensor(11.1969, device='cuda:0', grad_fn=<NllLossBackward>)
644 tensor(11.6564, device='cuda:0', grad_fn=<NllLossBackward>)
645 tensor(17.7987, device='cuda:0', grad_fn=<NllLossBackward>)
646 tensor(13.9630, device='cuda:0', grad_fn=<NllLossBackward>)
647 tensor(14.1354, device='cuda:0', grad_fn=<NllLossBackward>)
648 tensor(15.1440, device='cuda:0', grad_fn=<NllLossBackward>)
649 tensor(15.8340, device='cuda:0', grad_fn=<NllLossBackward>)
650 tensor(18.5162, device='cuda:0', grad_fn=<NllLossBackward>)
651 tensor(15.1341, device='cuda:0', grad_fn=<NllLossBackward>)
652 tensor(10.0657, device='cuda:0', grad_fn=<NllLossBackward>)
653 tensor(12.0652, device='cuda:0', grad_fn=<NllLossBackward>)
654 tensor(11.4299, device='cuda:0', grad_fn=<NllLossBackward>)
655 tensor(17.6425, device='cuda:0', grad_fn=<NllLossBackward>)
656 tensor(22.1065, device='cuda:0', gra

769 tensor(14.0370, device='cuda:0', grad_fn=<NllLossBackward>)
770 tensor(15.4013, device='cuda:0', grad_fn=<NllLossBackward>)
771 tensor(22.9438, device='cuda:0', grad_fn=<NllLossBackward>)
772 tensor(14.6376, device='cuda:0', grad_fn=<NllLossBackward>)
773 tensor(19.2032, device='cuda:0', grad_fn=<NllLossBackward>)
774 tensor(17.1549, device='cuda:0', grad_fn=<NllLossBackward>)
775 tensor(3.6650, device='cuda:0', grad_fn=<NllLossBackward>)
776 tensor(9.8867, device='cuda:0', grad_fn=<NllLossBackward>)
777 tensor(9.4064, device='cuda:0', grad_fn=<NllLossBackward>)
778 tensor(16.1751, device='cuda:0', grad_fn=<NllLossBackward>)
779 tensor(15.8186, device='cuda:0', grad_fn=<NllLossBackward>)
780 tensor(8.4631, device='cuda:0', grad_fn=<NllLossBackward>)
781 tensor(20.6889, device='cuda:0', grad_fn=<NllLossBackward>)
782 tensor(13.8811, device='cuda:0', grad_fn=<NllLossBackward>)
783 tensor(15.2090, device='cuda:0', grad_fn=<NllLossBackward>)
784 tensor(12.2977, device='cuda:0', grad_fn

897 tensor(15.7180, device='cuda:0', grad_fn=<NllLossBackward>)
898 tensor(14.6012, device='cuda:0', grad_fn=<NllLossBackward>)
899 tensor(9.3412, device='cuda:0', grad_fn=<NllLossBackward>)
900 tensor(12.3217, device='cuda:0', grad_fn=<NllLossBackward>)
901 tensor(17.2287, device='cuda:0', grad_fn=<NllLossBackward>)
902 tensor(7.5851, device='cuda:0', grad_fn=<NllLossBackward>)
903 tensor(8.5569, device='cuda:0', grad_fn=<NllLossBackward>)
904 tensor(17.9649, device='cuda:0', grad_fn=<NllLossBackward>)
905 tensor(15.6044, device='cuda:0', grad_fn=<NllLossBackward>)
906 tensor(9.6704, device='cuda:0', grad_fn=<NllLossBackward>)
907 tensor(5.5816, device='cuda:0', grad_fn=<NllLossBackward>)
908 tensor(10.1752, device='cuda:0', grad_fn=<NllLossBackward>)
909 tensor(12.7293, device='cuda:0', grad_fn=<NllLossBackward>)
910 tensor(13.2589, device='cuda:0', grad_fn=<NllLossBackward>)
911 tensor(0.7318, device='cuda:0', grad_fn=<NllLossBackward>)
912 tensor(9.1613, device='cuda:0', grad_fn=<N

1025 tensor(11.5657, device='cuda:0', grad_fn=<NllLossBackward>)
1026 tensor(10.8657, device='cuda:0', grad_fn=<NllLossBackward>)
1027 tensor(13.2761, device='cuda:0', grad_fn=<NllLossBackward>)
1028 tensor(11.7912, device='cuda:0', grad_fn=<NllLossBackward>)
1029 tensor(8.9917, device='cuda:0', grad_fn=<NllLossBackward>)
1030 tensor(7.7282, device='cuda:0', grad_fn=<NllLossBackward>)
1031 tensor(7.8355, device='cuda:0', grad_fn=<NllLossBackward>)
1032 tensor(8.9275, device='cuda:0', grad_fn=<NllLossBackward>)
1033 tensor(9.0232, device='cuda:0', grad_fn=<NllLossBackward>)
1034 tensor(5.0141, device='cuda:0', grad_fn=<NllLossBackward>)
1035 tensor(5.9491, device='cuda:0', grad_fn=<NllLossBackward>)
1036 tensor(5.6460, device='cuda:0', grad_fn=<NllLossBackward>)
1037 tensor(9.7914, device='cuda:0', grad_fn=<NllLossBackward>)
1038 tensor(13.8018, device='cuda:0', grad_fn=<NllLossBackward>)
1039 tensor(8.4598, device='cuda:0', grad_fn=<NllLossBackward>)
1040 tensor(8.3118, device='cuda:0'

1152 tensor(12.0027, device='cuda:0', grad_fn=<NllLossBackward>)
1153 tensor(9.7656, device='cuda:0', grad_fn=<NllLossBackward>)
1154 tensor(9.9121, device='cuda:0', grad_fn=<NllLossBackward>)
1155 tensor(6.1889, device='cuda:0', grad_fn=<NllLossBackward>)
1156 tensor(10.5131, device='cuda:0', grad_fn=<NllLossBackward>)
1157 tensor(9.9595, device='cuda:0', grad_fn=<NllLossBackward>)
1158 tensor(9.3557, device='cuda:0', grad_fn=<NllLossBackward>)
1159 tensor(14.6668, device='cuda:0', grad_fn=<NllLossBackward>)
1160 tensor(13.2218, device='cuda:0', grad_fn=<NllLossBackward>)
1161 tensor(7.4895, device='cuda:0', grad_fn=<NllLossBackward>)
1162 tensor(7.6111, device='cuda:0', grad_fn=<NllLossBackward>)
1163 tensor(11.8129, device='cuda:0', grad_fn=<NllLossBackward>)
1164 tensor(7.6940, device='cuda:0', grad_fn=<NllLossBackward>)
1165 tensor(8.5221, device='cuda:0', grad_fn=<NllLossBackward>)
1166 tensor(6.8325, device='cuda:0', grad_fn=<NllLossBackward>)
1167 tensor(6.2295, device='cuda:0'

1279 tensor(10.9028, device='cuda:0', grad_fn=<NllLossBackward>)
1280 tensor(10.5018, device='cuda:0', grad_fn=<NllLossBackward>)
1281 tensor(11.6966, device='cuda:0', grad_fn=<NllLossBackward>)
1282 tensor(12.9430, device='cuda:0', grad_fn=<NllLossBackward>)
1283 tensor(4.2422, device='cuda:0', grad_fn=<NllLossBackward>)
1284 tensor(8.5332, device='cuda:0', grad_fn=<NllLossBackward>)
1285 tensor(8.0534, device='cuda:0', grad_fn=<NllLossBackward>)
1286 tensor(9.1418, device='cuda:0', grad_fn=<NllLossBackward>)
1287 tensor(11.7661, device='cuda:0', grad_fn=<NllLossBackward>)
1288 tensor(6.5057, device='cuda:0', grad_fn=<NllLossBackward>)
1289 tensor(8.3897, device='cuda:0', grad_fn=<NllLossBackward>)
1290 tensor(9.9658, device='cuda:0', grad_fn=<NllLossBackward>)
1291 tensor(7.3777, device='cuda:0', grad_fn=<NllLossBackward>)
1292 tensor(9.1223, device='cuda:0', grad_fn=<NllLossBackward>)
1293 tensor(8.2337, device='cuda:0', grad_fn=<NllLossBackward>)
1294 tensor(3.3810, device='cuda:0'

1406 tensor(10.6746, device='cuda:0', grad_fn=<NllLossBackward>)
1407 tensor(5.4836, device='cuda:0', grad_fn=<NllLossBackward>)
1408 tensor(7.2466, device='cuda:0', grad_fn=<NllLossBackward>)
1409 tensor(13.9799, device='cuda:0', grad_fn=<NllLossBackward>)
1410 tensor(10.0350, device='cuda:0', grad_fn=<NllLossBackward>)
1411 tensor(9.9865, device='cuda:0', grad_fn=<NllLossBackward>)
1412 tensor(7.0897, device='cuda:0', grad_fn=<NllLossBackward>)
1413 tensor(6.3516, device='cuda:0', grad_fn=<NllLossBackward>)
1414 tensor(6.2308, device='cuda:0', grad_fn=<NllLossBackward>)
1415 tensor(7.9688, device='cuda:0', grad_fn=<NllLossBackward>)
1416 tensor(4.1588, device='cuda:0', grad_fn=<NllLossBackward>)
1417 tensor(11.8214, device='cuda:0', grad_fn=<NllLossBackward>)
1418 tensor(9.4087, device='cuda:0', grad_fn=<NllLossBackward>)
1419 tensor(8.4270, device='cuda:0', grad_fn=<NllLossBackward>)
1420 tensor(7.0038, device='cuda:0', grad_fn=<NllLossBackward>)
1421 tensor(6.1436, device='cuda:0',

1533 tensor(6.8520, device='cuda:0', grad_fn=<NllLossBackward>)
1534 tensor(6.3227, device='cuda:0', grad_fn=<NllLossBackward>)
1535 tensor(10.6074, device='cuda:0', grad_fn=<NllLossBackward>)
1536 tensor(7.8627, device='cuda:0', grad_fn=<NllLossBackward>)
1537 tensor(7.4514, device='cuda:0', grad_fn=<NllLossBackward>)
1538 tensor(7.2527, device='cuda:0', grad_fn=<NllLossBackward>)
1539 tensor(6.7439, device='cuda:0', grad_fn=<NllLossBackward>)
1540 tensor(4.9295, device='cuda:0', grad_fn=<NllLossBackward>)
1541 tensor(8.6635, device='cuda:0', grad_fn=<NllLossBackward>)
1542 tensor(11.6135, device='cuda:0', grad_fn=<NllLossBackward>)
1543 tensor(7.5447, device='cuda:0', grad_fn=<NllLossBackward>)
1544 tensor(6.4267, device='cuda:0', grad_fn=<NllLossBackward>)
1545 tensor(8.3900, device='cuda:0', grad_fn=<NllLossBackward>)
1546 tensor(6.3501, device='cuda:0', grad_fn=<NllLossBackward>)
1547 tensor(7.4907, device='cuda:0', grad_fn=<NllLossBackward>)
1548 tensor(12.4346, device='cuda:0', 

1660 tensor(7.4172, device='cuda:0', grad_fn=<NllLossBackward>)
1661 tensor(3.9038, device='cuda:0', grad_fn=<NllLossBackward>)
1662 tensor(6.9199, device='cuda:0', grad_fn=<NllLossBackward>)
1663 tensor(5.5468, device='cuda:0', grad_fn=<NllLossBackward>)
1664 tensor(8.6093, device='cuda:0', grad_fn=<NllLossBackward>)
1665 tensor(10.1417, device='cuda:0', grad_fn=<NllLossBackward>)
1666 tensor(6.4840, device='cuda:0', grad_fn=<NllLossBackward>)
1667 tensor(9.9716, device='cuda:0', grad_fn=<NllLossBackward>)
1668 tensor(10.0569, device='cuda:0', grad_fn=<NllLossBackward>)
1669 tensor(4.7634, device='cuda:0', grad_fn=<NllLossBackward>)
1670 tensor(4.3804, device='cuda:0', grad_fn=<NllLossBackward>)
1671 tensor(7.2121, device='cuda:0', grad_fn=<NllLossBackward>)
1672 tensor(7.3985, device='cuda:0', grad_fn=<NllLossBackward>)
1673 tensor(8.2373, device='cuda:0', grad_fn=<NllLossBackward>)
1674 tensor(5.0326, device='cuda:0', grad_fn=<NllLossBackward>)
1675 tensor(9.9534, device='cuda:0', g

1787 tensor(7.4379, device='cuda:0', grad_fn=<NllLossBackward>)
1788 tensor(6.7008, device='cuda:0', grad_fn=<NllLossBackward>)
1789 tensor(3.5368, device='cuda:0', grad_fn=<NllLossBackward>)
1790 tensor(3.9436, device='cuda:0', grad_fn=<NllLossBackward>)
1791 tensor(3.9722, device='cuda:0', grad_fn=<NllLossBackward>)
1792 tensor(6.8196, device='cuda:0', grad_fn=<NllLossBackward>)
1793 tensor(9.8705, device='cuda:0', grad_fn=<NllLossBackward>)
1794 tensor(6.1638, device='cuda:0', grad_fn=<NllLossBackward>)
1795 tensor(4.6579, device='cuda:0', grad_fn=<NllLossBackward>)
1796 tensor(6.0287, device='cuda:0', grad_fn=<NllLossBackward>)
1797 tensor(6.1943, device='cuda:0', grad_fn=<NllLossBackward>)
1798 tensor(8.1622, device='cuda:0', grad_fn=<NllLossBackward>)
1799 tensor(7.3357, device='cuda:0', grad_fn=<NllLossBackward>)
1800 tensor(10.2394, device='cuda:0', grad_fn=<NllLossBackward>)
1801 tensor(6.0867, device='cuda:0', grad_fn=<NllLossBackward>)
1802 tensor(7.6793, device='cuda:0', gr

1914 tensor(8.4159, device='cuda:0', grad_fn=<NllLossBackward>)
1915 tensor(4.9777, device='cuda:0', grad_fn=<NllLossBackward>)
1916 tensor(6.1867, device='cuda:0', grad_fn=<NllLossBackward>)
1917 tensor(8.6602, device='cuda:0', grad_fn=<NllLossBackward>)
1918 tensor(7.8311, device='cuda:0', grad_fn=<NllLossBackward>)
1919 tensor(7.2282, device='cuda:0', grad_fn=<NllLossBackward>)
1920 tensor(6.9126, device='cuda:0', grad_fn=<NllLossBackward>)
1921 tensor(6.7690, device='cuda:0', grad_fn=<NllLossBackward>)
1922 tensor(7.3510, device='cuda:0', grad_fn=<NllLossBackward>)
1923 tensor(6.6214, device='cuda:0', grad_fn=<NllLossBackward>)
1924 tensor(7.6136, device='cuda:0', grad_fn=<NllLossBackward>)
1925 tensor(7.7694, device='cuda:0', grad_fn=<NllLossBackward>)
1926 tensor(4.2838, device='cuda:0', grad_fn=<NllLossBackward>)
1927 tensor(5.5457, device='cuda:0', grad_fn=<NllLossBackward>)
1928 tensor(5.8982, device='cuda:0', grad_fn=<NllLossBackward>)
1929 tensor(6.3842, device='cuda:0', gra

2041 tensor(4.2071, device='cuda:0', grad_fn=<NllLossBackward>)
2042 tensor(5.8094, device='cuda:0', grad_fn=<NllLossBackward>)
2043 tensor(6.1996, device='cuda:0', grad_fn=<NllLossBackward>)
2044 tensor(7.1102, device='cuda:0', grad_fn=<NllLossBackward>)
2045 tensor(7.4503, device='cuda:0', grad_fn=<NllLossBackward>)
2046 tensor(7.2859, device='cuda:0', grad_fn=<NllLossBackward>)
2047 tensor(6.4409, device='cuda:0', grad_fn=<NllLossBackward>)
2048 tensor(4.7177, device='cuda:0', grad_fn=<NllLossBackward>)
2049 tensor(3.8750, device='cuda:0', grad_fn=<NllLossBackward>)
2050 tensor(7.0849, device='cuda:0', grad_fn=<NllLossBackward>)
2051 tensor(5.7791, device='cuda:0', grad_fn=<NllLossBackward>)
2052 tensor(7.5018, device='cuda:0', grad_fn=<NllLossBackward>)
2053 tensor(4.6903, device='cuda:0', grad_fn=<NllLossBackward>)
2054 tensor(8.3467, device='cuda:0', grad_fn=<NllLossBackward>)
2055 tensor(7.0584, device='cuda:0', grad_fn=<NllLossBackward>)
2056 tensor(8.6107, device='cuda:0', gra

2168 tensor(5.7873, device='cuda:0', grad_fn=<NllLossBackward>)
2169 tensor(7.7913, device='cuda:0', grad_fn=<NllLossBackward>)
2170 tensor(5.4881, device='cuda:0', grad_fn=<NllLossBackward>)
2171 tensor(6.3456, device='cuda:0', grad_fn=<NllLossBackward>)
2172 tensor(6.8347, device='cuda:0', grad_fn=<NllLossBackward>)
2173 tensor(4.2443, device='cuda:0', grad_fn=<NllLossBackward>)
2174 tensor(8.0587, device='cuda:0', grad_fn=<NllLossBackward>)
2175 tensor(7.8956, device='cuda:0', grad_fn=<NllLossBackward>)
2176 tensor(8.3679, device='cuda:0', grad_fn=<NllLossBackward>)
2177 tensor(7.1628, device='cuda:0', grad_fn=<NllLossBackward>)
2178 tensor(4.0559, device='cuda:0', grad_fn=<NllLossBackward>)
2179 tensor(2.9755, device='cuda:0', grad_fn=<NllLossBackward>)
2180 tensor(5.7686, device='cuda:0', grad_fn=<NllLossBackward>)
2181 tensor(2.6933, device='cuda:0', grad_fn=<NllLossBackward>)
2182 tensor(6.2173, device='cuda:0', grad_fn=<NllLossBackward>)
2183 tensor(6.0210, device='cuda:0', gra

2295 tensor(8.3331, device='cuda:0', grad_fn=<NllLossBackward>)
2296 tensor(5.3722, device='cuda:0', grad_fn=<NllLossBackward>)
2297 tensor(3.5621, device='cuda:0', grad_fn=<NllLossBackward>)
2298 tensor(5.1418, device='cuda:0', grad_fn=<NllLossBackward>)
2299 tensor(6.6388, device='cuda:0', grad_fn=<NllLossBackward>)
2300 tensor(9.4077, device='cuda:0', grad_fn=<NllLossBackward>)
2301 tensor(3.5310, device='cuda:0', grad_fn=<NllLossBackward>)
2302 tensor(7.1807, device='cuda:0', grad_fn=<NllLossBackward>)
2303 tensor(6.0250, device='cuda:0', grad_fn=<NllLossBackward>)
2304 tensor(6.9557, device='cuda:0', grad_fn=<NllLossBackward>)
2305 tensor(6.4758, device='cuda:0', grad_fn=<NllLossBackward>)
2306 tensor(4.7841, device='cuda:0', grad_fn=<NllLossBackward>)
2307 tensor(7.6286, device='cuda:0', grad_fn=<NllLossBackward>)
2308 tensor(3.3697, device='cuda:0', grad_fn=<NllLossBackward>)
2309 tensor(7.2240, device='cuda:0', grad_fn=<NllLossBackward>)
2310 tensor(2.8560, device='cuda:0', gra

2423 tensor(4.1129, device='cuda:0', grad_fn=<NllLossBackward>)
2424 tensor(7.8925, device='cuda:0', grad_fn=<NllLossBackward>)
2425 tensor(6.2000, device='cuda:0', grad_fn=<NllLossBackward>)
2426 tensor(4.6477, device='cuda:0', grad_fn=<NllLossBackward>)
2427 tensor(4.2372, device='cuda:0', grad_fn=<NllLossBackward>)
2428 tensor(4.8428, device='cuda:0', grad_fn=<NllLossBackward>)
2429 tensor(5.9068, device='cuda:0', grad_fn=<NllLossBackward>)
2430 tensor(5.3114, device='cuda:0', grad_fn=<NllLossBackward>)
2431 tensor(6.1806, device='cuda:0', grad_fn=<NllLossBackward>)
2432 tensor(6.3241, device='cuda:0', grad_fn=<NllLossBackward>)
2433 tensor(6.2970, device='cuda:0', grad_fn=<NllLossBackward>)
2434 tensor(6.9879, device='cuda:0', grad_fn=<NllLossBackward>)
2435 tensor(5.1960, device='cuda:0', grad_fn=<NllLossBackward>)
2436 tensor(6.5853, device='cuda:0', grad_fn=<NllLossBackward>)
2437 tensor(5.5541, device='cuda:0', grad_fn=<NllLossBackward>)
2438 tensor(6.6826, device='cuda:0', gra

2551 tensor(7.1620, device='cuda:0', grad_fn=<NllLossBackward>)
2552 tensor(6.2170, device='cuda:0', grad_fn=<NllLossBackward>)
2553 tensor(6.1518, device='cuda:0', grad_fn=<NllLossBackward>)
2554 tensor(4.7963, device='cuda:0', grad_fn=<NllLossBackward>)
2555 tensor(6.3302, device='cuda:0', grad_fn=<NllLossBackward>)
2556 tensor(5.1670, device='cuda:0', grad_fn=<NllLossBackward>)
2557 tensor(8.8424, device='cuda:0', grad_fn=<NllLossBackward>)
2558 tensor(5.5596, device='cuda:0', grad_fn=<NllLossBackward>)
2559 tensor(6.6605, device='cuda:0', grad_fn=<NllLossBackward>)
2560 tensor(5.9017, device='cuda:0', grad_fn=<NllLossBackward>)
2561 tensor(5.5682, device='cuda:0', grad_fn=<NllLossBackward>)
2562 tensor(6.0503, device='cuda:0', grad_fn=<NllLossBackward>)
2563 tensor(5.1083, device='cuda:0', grad_fn=<NllLossBackward>)
2564 tensor(6.2377, device='cuda:0', grad_fn=<NllLossBackward>)
2565 tensor(2.1796, device='cuda:0', grad_fn=<NllLossBackward>)
2566 tensor(6.5932, device='cuda:0', gra

2679 tensor(5.4236, device='cuda:0', grad_fn=<NllLossBackward>)
2680 tensor(5.0606, device='cuda:0', grad_fn=<NllLossBackward>)
2681 tensor(5.9105, device='cuda:0', grad_fn=<NllLossBackward>)
2682 tensor(4.6684, device='cuda:0', grad_fn=<NllLossBackward>)
2683 tensor(4.8954, device='cuda:0', grad_fn=<NllLossBackward>)
2684 tensor(4.7732, device='cuda:0', grad_fn=<NllLossBackward>)
2685 tensor(4.6974, device='cuda:0', grad_fn=<NllLossBackward>)
2686 tensor(5.4058, device='cuda:0', grad_fn=<NllLossBackward>)
2687 tensor(2.5472, device='cuda:0', grad_fn=<NllLossBackward>)
2688 tensor(2.6539, device='cuda:0', grad_fn=<NllLossBackward>)
2689 tensor(5.0119, device='cuda:0', grad_fn=<NllLossBackward>)
2690 tensor(6.3934, device='cuda:0', grad_fn=<NllLossBackward>)
2691 tensor(5.1551, device='cuda:0', grad_fn=<NllLossBackward>)
2692 tensor(5.6818, device='cuda:0', grad_fn=<NllLossBackward>)
2693 tensor(5.4079, device='cuda:0', grad_fn=<NllLossBackward>)
2694 tensor(3.8244, device='cuda:0', gra

2807 tensor(6.9225, device='cuda:0', grad_fn=<NllLossBackward>)
2808 tensor(7.7832, device='cuda:0', grad_fn=<NllLossBackward>)
2809 tensor(6.9323, device='cuda:0', grad_fn=<NllLossBackward>)
2810 tensor(4.6063, device='cuda:0', grad_fn=<NllLossBackward>)
2811 tensor(5.3575, device='cuda:0', grad_fn=<NllLossBackward>)
2812 tensor(3.7945, device='cuda:0', grad_fn=<NllLossBackward>)
2813 tensor(5.2216, device='cuda:0', grad_fn=<NllLossBackward>)
2814 tensor(4.7010, device='cuda:0', grad_fn=<NllLossBackward>)
2815 tensor(6.1504, device='cuda:0', grad_fn=<NllLossBackward>)
2816 tensor(5.7166, device='cuda:0', grad_fn=<NllLossBackward>)
2817 tensor(6.7134, device='cuda:0', grad_fn=<NllLossBackward>)
2818 tensor(6.3925, device='cuda:0', grad_fn=<NllLossBackward>)
2819 tensor(5.3394, device='cuda:0', grad_fn=<NllLossBackward>)
2820 tensor(5.4912, device='cuda:0', grad_fn=<NllLossBackward>)
2821 tensor(2.7923, device='cuda:0', grad_fn=<NllLossBackward>)
2822 tensor(5.6893, device='cuda:0', gra

2935 tensor(6.6243, device='cuda:0', grad_fn=<NllLossBackward>)
2936 tensor(6.8929, device='cuda:0', grad_fn=<NllLossBackward>)
2937 tensor(5.5471, device='cuda:0', grad_fn=<NllLossBackward>)
2938 tensor(5.0634, device='cuda:0', grad_fn=<NllLossBackward>)
2939 tensor(8.0435, device='cuda:0', grad_fn=<NllLossBackward>)
2940 tensor(6.8774, device='cuda:0', grad_fn=<NllLossBackward>)
2941 tensor(5.4219, device='cuda:0', grad_fn=<NllLossBackward>)
2942 tensor(5.9810, device='cuda:0', grad_fn=<NllLossBackward>)
2943 tensor(6.1196, device='cuda:0', grad_fn=<NllLossBackward>)
2944 tensor(5.9559, device='cuda:0', grad_fn=<NllLossBackward>)
2945 tensor(6.6726, device='cuda:0', grad_fn=<NllLossBackward>)
2946 tensor(4.3302, device='cuda:0', grad_fn=<NllLossBackward>)
2947 tensor(4.7027, device='cuda:0', grad_fn=<NllLossBackward>)
2948 tensor(4.2941, device='cuda:0', grad_fn=<NllLossBackward>)
2949 tensor(5.2039, device='cuda:0', grad_fn=<NllLossBackward>)
2950 tensor(4.5477, device='cuda:0', gra

3063 tensor(4.6526, device='cuda:0', grad_fn=<NllLossBackward>)
3064 tensor(6.2458, device='cuda:0', grad_fn=<NllLossBackward>)
3065 tensor(5.5396, device='cuda:0', grad_fn=<NllLossBackward>)
3066 tensor(5.3594, device='cuda:0', grad_fn=<NllLossBackward>)
3067 tensor(4.9599, device='cuda:0', grad_fn=<NllLossBackward>)
3068 tensor(5.9243, device='cuda:0', grad_fn=<NllLossBackward>)
3069 tensor(5.1978, device='cuda:0', grad_fn=<NllLossBackward>)
3070 tensor(5.4388, device='cuda:0', grad_fn=<NllLossBackward>)
3071 tensor(3.7672, device='cuda:0', grad_fn=<NllLossBackward>)
3072 tensor(4.3950, device='cuda:0', grad_fn=<NllLossBackward>)
3073 tensor(5.4937, device='cuda:0', grad_fn=<NllLossBackward>)
3074 tensor(3.8118, device='cuda:0', grad_fn=<NllLossBackward>)
3075 tensor(5.6057, device='cuda:0', grad_fn=<NllLossBackward>)
3076 tensor(5.1560, device='cuda:0', grad_fn=<NllLossBackward>)
3077 tensor(6.3957, device='cuda:0', grad_fn=<NllLossBackward>)
3078 tensor(8.9827, device='cuda:0', gra

3191 tensor(5.8110, device='cuda:0', grad_fn=<NllLossBackward>)
3192 tensor(5.8303, device='cuda:0', grad_fn=<NllLossBackward>)
3193 tensor(3.5970, device='cuda:0', grad_fn=<NllLossBackward>)
3194 tensor(6.8873, device='cuda:0', grad_fn=<NllLossBackward>)
3195 tensor(4.0340, device='cuda:0', grad_fn=<NllLossBackward>)
3196 tensor(5.2160, device='cuda:0', grad_fn=<NllLossBackward>)
3197 tensor(6.4946, device='cuda:0', grad_fn=<NllLossBackward>)
3198 tensor(5.5709, device='cuda:0', grad_fn=<NllLossBackward>)
3199 tensor(5.4911, device='cuda:0', grad_fn=<NllLossBackward>)
3200 tensor(6.2268, device='cuda:0', grad_fn=<NllLossBackward>)
3201 tensor(4.6789, device='cuda:0', grad_fn=<NllLossBackward>)
3202 tensor(5.7530, device='cuda:0', grad_fn=<NllLossBackward>)
3203 tensor(6.6016, device='cuda:0', grad_fn=<NllLossBackward>)
3204 tensor(3.8468, device='cuda:0', grad_fn=<NllLossBackward>)
3205 tensor(3.9664, device='cuda:0', grad_fn=<NllLossBackward>)
3206 tensor(7.0298, device='cuda:0', gra

3319 tensor(5.7945, device='cuda:0', grad_fn=<NllLossBackward>)
3320 tensor(5.4588, device='cuda:0', grad_fn=<NllLossBackward>)
3321 tensor(3.1660, device='cuda:0', grad_fn=<NllLossBackward>)
3322 tensor(5.2197, device='cuda:0', grad_fn=<NllLossBackward>)
3323 tensor(4.1018, device='cuda:0', grad_fn=<NllLossBackward>)
3324 tensor(4.9168, device='cuda:0', grad_fn=<NllLossBackward>)
3325 tensor(3.9028, device='cuda:0', grad_fn=<NllLossBackward>)
3326 tensor(5.8097, device='cuda:0', grad_fn=<NllLossBackward>)
3327 tensor(7.5400, device='cuda:0', grad_fn=<NllLossBackward>)
3328 tensor(7.3108, device='cuda:0', grad_fn=<NllLossBackward>)
3329 tensor(5.4004, device='cuda:0', grad_fn=<NllLossBackward>)
3330 tensor(5.7351, device='cuda:0', grad_fn=<NllLossBackward>)
3331 tensor(5.6628, device='cuda:0', grad_fn=<NllLossBackward>)
3332 tensor(8.3892, device='cuda:0', grad_fn=<NllLossBackward>)
3333 tensor(4.2619, device='cuda:0', grad_fn=<NllLossBackward>)
3334 tensor(6.8400, device='cuda:0', gra

3447 tensor(3.8281, device='cuda:0', grad_fn=<NllLossBackward>)
3448 tensor(3.4705, device='cuda:0', grad_fn=<NllLossBackward>)
3449 tensor(7.4299, device='cuda:0', grad_fn=<NllLossBackward>)
3450 tensor(4.8739, device='cuda:0', grad_fn=<NllLossBackward>)
3451 tensor(8.0739, device='cuda:0', grad_fn=<NllLossBackward>)
3452 tensor(2.7989, device='cuda:0', grad_fn=<NllLossBackward>)
3453 tensor(4.8369, device='cuda:0', grad_fn=<NllLossBackward>)
3454 tensor(4.8139, device='cuda:0', grad_fn=<NllLossBackward>)
3455 tensor(6.2805, device='cuda:0', grad_fn=<NllLossBackward>)
3456 tensor(5.0124, device='cuda:0', grad_fn=<NllLossBackward>)
3457 tensor(5.7923, device='cuda:0', grad_fn=<NllLossBackward>)
3458 tensor(3.2392, device='cuda:0', grad_fn=<NllLossBackward>)
3459 tensor(2.7759, device='cuda:0', grad_fn=<NllLossBackward>)
3460 tensor(5.1826, device='cuda:0', grad_fn=<NllLossBackward>)
3461 tensor(6.1498, device='cuda:0', grad_fn=<NllLossBackward>)
3462 tensor(4.2197, device='cuda:0', gra

3575 tensor(3.6044, device='cuda:0', grad_fn=<NllLossBackward>)
3576 tensor(5.5765, device='cuda:0', grad_fn=<NllLossBackward>)
3577 tensor(6.6667, device='cuda:0', grad_fn=<NllLossBackward>)
3578 tensor(5.5466, device='cuda:0', grad_fn=<NllLossBackward>)
3579 tensor(3.3391, device='cuda:0', grad_fn=<NllLossBackward>)
3580 tensor(4.0803, device='cuda:0', grad_fn=<NllLossBackward>)
3581 tensor(3.2221, device='cuda:0', grad_fn=<NllLossBackward>)
3582 tensor(3.0635, device='cuda:0', grad_fn=<NllLossBackward>)
3583 tensor(6.6427, device='cuda:0', grad_fn=<NllLossBackward>)
3584 tensor(6.3042, device='cuda:0', grad_fn=<NllLossBackward>)
3585 tensor(4.0284, device='cuda:0', grad_fn=<NllLossBackward>)
3586 tensor(5.0164, device='cuda:0', grad_fn=<NllLossBackward>)
3587 tensor(4.2167, device='cuda:0', grad_fn=<NllLossBackward>)
3588 tensor(5.7380, device='cuda:0', grad_fn=<NllLossBackward>)
3589 tensor(7.3265, device='cuda:0', grad_fn=<NllLossBackward>)
3590 tensor(4.3387, device='cuda:0', gra

3703 tensor(4.7312, device='cuda:0', grad_fn=<NllLossBackward>)
3704 tensor(5.1883, device='cuda:0', grad_fn=<NllLossBackward>)
3705 tensor(4.1163, device='cuda:0', grad_fn=<NllLossBackward>)
3706 tensor(3.9569, device='cuda:0', grad_fn=<NllLossBackward>)
3707 tensor(4.1874, device='cuda:0', grad_fn=<NllLossBackward>)
3708 tensor(6.9715, device='cuda:0', grad_fn=<NllLossBackward>)
3709 tensor(5.2798, device='cuda:0', grad_fn=<NllLossBackward>)
3710 tensor(3.8613, device='cuda:0', grad_fn=<NllLossBackward>)
3711 tensor(4.9465, device='cuda:0', grad_fn=<NllLossBackward>)
3712 tensor(4.9941, device='cuda:0', grad_fn=<NllLossBackward>)
3713 tensor(7.6868, device='cuda:0', grad_fn=<NllLossBackward>)
3714 tensor(6.4288, device='cuda:0', grad_fn=<NllLossBackward>)
3715 tensor(7.0341, device='cuda:0', grad_fn=<NllLossBackward>)
3716 tensor(5.4495, device='cuda:0', grad_fn=<NllLossBackward>)
3717 tensor(5.6602, device='cuda:0', grad_fn=<NllLossBackward>)
3718 tensor(5.0568, device='cuda:0', gra

3831 tensor(6.2410, device='cuda:0', grad_fn=<NllLossBackward>)
3832 tensor(4.7547, device='cuda:0', grad_fn=<NllLossBackward>)
3833 tensor(4.4697, device='cuda:0', grad_fn=<NllLossBackward>)
3834 tensor(5.2783, device='cuda:0', grad_fn=<NllLossBackward>)
3835 tensor(2.7992, device='cuda:0', grad_fn=<NllLossBackward>)
3836 tensor(3.6292, device='cuda:0', grad_fn=<NllLossBackward>)
3837 tensor(3.9111, device='cuda:0', grad_fn=<NllLossBackward>)
3838 tensor(6.4979, device='cuda:0', grad_fn=<NllLossBackward>)
3839 tensor(3.7159, device='cuda:0', grad_fn=<NllLossBackward>)
3840 tensor(3.7071, device='cuda:0', grad_fn=<NllLossBackward>)
3841 tensor(4.6445, device='cuda:0', grad_fn=<NllLossBackward>)
3842 tensor(3.1990, device='cuda:0', grad_fn=<NllLossBackward>)
3843 tensor(6.7293, device='cuda:0', grad_fn=<NllLossBackward>)
3844 tensor(4.3011, device='cuda:0', grad_fn=<NllLossBackward>)
3845 tensor(4.7920, device='cuda:0', grad_fn=<NllLossBackward>)
3846 tensor(2.8498, device='cuda:0', gra

3959 tensor(2.9500, device='cuda:0', grad_fn=<NllLossBackward>)
3960 tensor(4.7943, device='cuda:0', grad_fn=<NllLossBackward>)
3961 tensor(4.3055, device='cuda:0', grad_fn=<NllLossBackward>)
3962 tensor(3.8390, device='cuda:0', grad_fn=<NllLossBackward>)
3963 tensor(5.6767, device='cuda:0', grad_fn=<NllLossBackward>)
3964 tensor(6.4657, device='cuda:0', grad_fn=<NllLossBackward>)
3965 tensor(5.0064, device='cuda:0', grad_fn=<NllLossBackward>)
3966 tensor(6.1469, device='cuda:0', grad_fn=<NllLossBackward>)
3967 tensor(7.2594, device='cuda:0', grad_fn=<NllLossBackward>)
3968 tensor(4.9498, device='cuda:0', grad_fn=<NllLossBackward>)
3969 tensor(5.6754, device='cuda:0', grad_fn=<NllLossBackward>)
3970 tensor(5.3579, device='cuda:0', grad_fn=<NllLossBackward>)
3971 tensor(4.9287, device='cuda:0', grad_fn=<NllLossBackward>)
3972 tensor(7.3242, device='cuda:0', grad_fn=<NllLossBackward>)
3973 tensor(3.5668, device='cuda:0', grad_fn=<NllLossBackward>)
3974 tensor(4.3072, device='cuda:0', gra

4087 tensor(4.1635, device='cuda:0', grad_fn=<NllLossBackward>)
4088 tensor(4.3544, device='cuda:0', grad_fn=<NllLossBackward>)
4089 tensor(4.6796, device='cuda:0', grad_fn=<NllLossBackward>)
4090 tensor(3.3732, device='cuda:0', grad_fn=<NllLossBackward>)
4091 tensor(3.4401, device='cuda:0', grad_fn=<NllLossBackward>)
4092 tensor(3.4215, device='cuda:0', grad_fn=<NllLossBackward>)
4093 tensor(5.5325, device='cuda:0', grad_fn=<NllLossBackward>)
4094 tensor(5.8530, device='cuda:0', grad_fn=<NllLossBackward>)
4095 tensor(5.1052, device='cuda:0', grad_fn=<NllLossBackward>)
4096 tensor(3.2894, device='cuda:0', grad_fn=<NllLossBackward>)
4097 tensor(4.9255, device='cuda:0', grad_fn=<NllLossBackward>)
4098 tensor(3.8688, device='cuda:0', grad_fn=<NllLossBackward>)
4099 tensor(3.9215, device='cuda:0', grad_fn=<NllLossBackward>)
4100 tensor(5.0675, device='cuda:0', grad_fn=<NllLossBackward>)
4101 tensor(3.2500, device='cuda:0', grad_fn=<NllLossBackward>)
4102 tensor(5.2332, device='cuda:0', gra

4215 tensor(4.5534, device='cuda:0', grad_fn=<NllLossBackward>)
4216 tensor(3.0683, device='cuda:0', grad_fn=<NllLossBackward>)
4217 tensor(4.1627, device='cuda:0', grad_fn=<NllLossBackward>)
4218 tensor(4.4475, device='cuda:0', grad_fn=<NllLossBackward>)
4219 tensor(3.4683, device='cuda:0', grad_fn=<NllLossBackward>)
4220 tensor(5.5907, device='cuda:0', grad_fn=<NllLossBackward>)
4221 tensor(4.4918, device='cuda:0', grad_fn=<NllLossBackward>)
4222 tensor(5.7488, device='cuda:0', grad_fn=<NllLossBackward>)
4223 tensor(5.2174, device='cuda:0', grad_fn=<NllLossBackward>)
4224 tensor(5.3656, device='cuda:0', grad_fn=<NllLossBackward>)
4225 tensor(3.4861, device='cuda:0', grad_fn=<NllLossBackward>)
4226 tensor(6.3765, device='cuda:0', grad_fn=<NllLossBackward>)
4227 tensor(5.4289, device='cuda:0', grad_fn=<NllLossBackward>)
4228 tensor(3.9896, device='cuda:0', grad_fn=<NllLossBackward>)
4229 tensor(5.4979, device='cuda:0', grad_fn=<NllLossBackward>)
4230 tensor(3.1096, device='cuda:0', gra

4343 tensor(4.5287, device='cuda:0', grad_fn=<NllLossBackward>)
4344 tensor(4.1597, device='cuda:0', grad_fn=<NllLossBackward>)
4345 tensor(5.2014, device='cuda:0', grad_fn=<NllLossBackward>)
4346 tensor(5.9400, device='cuda:0', grad_fn=<NllLossBackward>)
4347 tensor(3.8314, device='cuda:0', grad_fn=<NllLossBackward>)
4348 tensor(5.7996, device='cuda:0', grad_fn=<NllLossBackward>)
4349 tensor(5.0719, device='cuda:0', grad_fn=<NllLossBackward>)
4350 tensor(4.6384, device='cuda:0', grad_fn=<NllLossBackward>)
4351 tensor(5.2444, device='cuda:0', grad_fn=<NllLossBackward>)
4352 tensor(4.6005, device='cuda:0', grad_fn=<NllLossBackward>)
4353 tensor(4.5051, device='cuda:0', grad_fn=<NllLossBackward>)
4354 tensor(4.5063, device='cuda:0', grad_fn=<NllLossBackward>)
4355 tensor(4.9534, device='cuda:0', grad_fn=<NllLossBackward>)
4356 tensor(4.2974, device='cuda:0', grad_fn=<NllLossBackward>)
4357 tensor(4.1992, device='cuda:0', grad_fn=<NllLossBackward>)
4358 tensor(6.9082, device='cuda:0', gra

4471 tensor(3.0896, device='cuda:0', grad_fn=<NllLossBackward>)
4472 tensor(5.8969, device='cuda:0', grad_fn=<NllLossBackward>)
4473 tensor(4.7601, device='cuda:0', grad_fn=<NllLossBackward>)
4474 tensor(2.2987, device='cuda:0', grad_fn=<NllLossBackward>)
4475 tensor(4.0175, device='cuda:0', grad_fn=<NllLossBackward>)
4476 tensor(5.1548, device='cuda:0', grad_fn=<NllLossBackward>)
4477 tensor(4.1864, device='cuda:0', grad_fn=<NllLossBackward>)
4478 tensor(3.9522, device='cuda:0', grad_fn=<NllLossBackward>)
4479 tensor(3.9031, device='cuda:0', grad_fn=<NllLossBackward>)
4480 tensor(4.7186, device='cuda:0', grad_fn=<NllLossBackward>)
4481 tensor(4.2947, device='cuda:0', grad_fn=<NllLossBackward>)
4482 tensor(4.7243, device='cuda:0', grad_fn=<NllLossBackward>)
4483 tensor(5.8264, device='cuda:0', grad_fn=<NllLossBackward>)
4484 tensor(5.7868, device='cuda:0', grad_fn=<NllLossBackward>)
4485 tensor(4.5703, device='cuda:0', grad_fn=<NllLossBackward>)
4486 tensor(3.5854, device='cuda:0', gra

4599 tensor(5.4929, device='cuda:0', grad_fn=<NllLossBackward>)
4600 tensor(4.7645, device='cuda:0', grad_fn=<NllLossBackward>)
4601 tensor(4.3789, device='cuda:0', grad_fn=<NllLossBackward>)
4602 tensor(4.7471, device='cuda:0', grad_fn=<NllLossBackward>)
4603 tensor(5.8173, device='cuda:0', grad_fn=<NllLossBackward>)
4604 tensor(3.3355, device='cuda:0', grad_fn=<NllLossBackward>)
4605 tensor(3.8247, device='cuda:0', grad_fn=<NllLossBackward>)
4606 tensor(4.7518, device='cuda:0', grad_fn=<NllLossBackward>)
4607 tensor(6.2605, device='cuda:0', grad_fn=<NllLossBackward>)
4608 tensor(3.7426, device='cuda:0', grad_fn=<NllLossBackward>)
4609 tensor(5.3681, device='cuda:0', grad_fn=<NllLossBackward>)
4610 tensor(2.6311, device='cuda:0', grad_fn=<NllLossBackward>)
4611 tensor(4.7139, device='cuda:0', grad_fn=<NllLossBackward>)
4612 tensor(5.2895, device='cuda:0', grad_fn=<NllLossBackward>)
4613 tensor(4.5619, device='cuda:0', grad_fn=<NllLossBackward>)
4614 tensor(4.9638, device='cuda:0', gra

4727 tensor(5.9776, device='cuda:0', grad_fn=<NllLossBackward>)
4728 tensor(4.0562, device='cuda:0', grad_fn=<NllLossBackward>)
4729 tensor(3.1353, device='cuda:0', grad_fn=<NllLossBackward>)
4730 tensor(5.2838, device='cuda:0', grad_fn=<NllLossBackward>)
4731 tensor(6.6810, device='cuda:0', grad_fn=<NllLossBackward>)
4732 tensor(5.5172, device='cuda:0', grad_fn=<NllLossBackward>)
4733 tensor(3.7404, device='cuda:0', grad_fn=<NllLossBackward>)
4734 tensor(4.6672, device='cuda:0', grad_fn=<NllLossBackward>)
4735 tensor(5.7207, device='cuda:0', grad_fn=<NllLossBackward>)
4736 tensor(6.1450, device='cuda:0', grad_fn=<NllLossBackward>)
4737 tensor(11.3786, device='cuda:0', grad_fn=<NllLossBackward>)
4738 tensor(4.2091, device='cuda:0', grad_fn=<NllLossBackward>)
4739 tensor(7.4033, device='cuda:0', grad_fn=<NllLossBackward>)
4740 tensor(4.0037, device='cuda:0', grad_fn=<NllLossBackward>)
4741 tensor(5.9060, device='cuda:0', grad_fn=<NllLossBackward>)
4742 tensor(5.6809, device='cuda:0', gr

4855 tensor(3.8149, device='cuda:0', grad_fn=<NllLossBackward>)
4856 tensor(3.3137, device='cuda:0', grad_fn=<NllLossBackward>)
4857 tensor(4.8798, device='cuda:0', grad_fn=<NllLossBackward>)
4858 tensor(5.0249, device='cuda:0', grad_fn=<NllLossBackward>)
4859 tensor(3.5037, device='cuda:0', grad_fn=<NllLossBackward>)
4860 tensor(4.3554, device='cuda:0', grad_fn=<NllLossBackward>)
4861 tensor(3.5106, device='cuda:0', grad_fn=<NllLossBackward>)
4862 tensor(5.3298, device='cuda:0', grad_fn=<NllLossBackward>)
4863 tensor(3.8572, device='cuda:0', grad_fn=<NllLossBackward>)
4864 tensor(4.1469, device='cuda:0', grad_fn=<NllLossBackward>)
4865 tensor(4.3941, device='cuda:0', grad_fn=<NllLossBackward>)
4866 tensor(5.9269, device='cuda:0', grad_fn=<NllLossBackward>)
4867 tensor(3.4524, device='cuda:0', grad_fn=<NllLossBackward>)
4868 tensor(4.1274, device='cuda:0', grad_fn=<NllLossBackward>)
4869 tensor(4.4166, device='cuda:0', grad_fn=<NllLossBackward>)
4870 tensor(6.7864, device='cuda:0', gra

4983 tensor(4.3310, device='cuda:0', grad_fn=<NllLossBackward>)
4984 tensor(4.5593, device='cuda:0', grad_fn=<NllLossBackward>)
4985 tensor(6.1753, device='cuda:0', grad_fn=<NllLossBackward>)
4986 tensor(4.6411, device='cuda:0', grad_fn=<NllLossBackward>)
4987 tensor(4.4332, device='cuda:0', grad_fn=<NllLossBackward>)
4988 tensor(4.4775, device='cuda:0', grad_fn=<NllLossBackward>)
4989 tensor(4.6131, device='cuda:0', grad_fn=<NllLossBackward>)
4990 tensor(5.7663, device='cuda:0', grad_fn=<NllLossBackward>)
4991 tensor(2.6347, device='cuda:0', grad_fn=<NllLossBackward>)
4992 tensor(4.9417, device='cuda:0', grad_fn=<NllLossBackward>)
4993 tensor(3.1544, device='cuda:0', grad_fn=<NllLossBackward>)
4994 tensor(4.7425, device='cuda:0', grad_fn=<NllLossBackward>)
4995 tensor(4.0552, device='cuda:0', grad_fn=<NllLossBackward>)
4996 tensor(4.9570, device='cuda:0', grad_fn=<NllLossBackward>)
4997 tensor(6.4717, device='cuda:0', grad_fn=<NllLossBackward>)
4998 tensor(4.5590, device='cuda:0', gra