In [91]:
import pandas as pd
import torchvision
import torch
from PIL import Image
import numpy as np

In [92]:
data_csv = pd.read_csv('train.csv')

In [95]:
train_mask = np.random.rand(len(data_csv)) < 0.9

data_csv_train = data_csv[train_mask]
data_csv_eval = data_csv[~train_mask]

6616

In [76]:
batch_size = 8
lr = 1e-4
num_epochs = 2

In [53]:
data_transforms = torchvision.transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

In [102]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, transform, data):
        self.files = ['data/train/' + x + '.jpg' for x in data['image_name']]
        self.labels = [0 if x == 'benign' else 1 for x in data['benign_malignant']]
        self.transform = transform
    
    def __len__(self):
        return len(self.files)
    
    def __getitem__(self, idx):
        return (self.transform(Image.open(self.files[idx])), self.labels[idx])
    
    def get_untransformed(self, idx):
        return Image.open(self.files[idx])


dataset = MyDataset(data_transforms, data_csv_train)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)

dataset_eval = MyDataset(data_transforms, data_csv_eval)
dataloader_eval = torch.utils.data.DataLoader(dataset_eval, batch_size=128, num_workers=4)

In [72]:
model = torchvision.models.resnet18(pretrained=True)
# First go for feature extraction mode
for param in model.parameters():
    param.requires_grad = False

# Reinitialize the last layer of the model
model.fc = torch.nn.Linear(512, 2)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [87]:
# The optimizer only needs to see the layers which have grads
params_to_update = []
for name,param in model.named_parameters():
    if param.requires_grad:
        print(name)
        params_to_update.append(param)

optimizer = torch.optim.Adam(params_to_update, lr=lr)
criterion = torch.nn.CrossEntropyLoss()

fc.weight
fc.bias


In [103]:
def test():
    with torch.no_grad():
        total_loss = 0
        epochs = 0
        for images, labels in dataloader_eval:
            images = images.to(device)
            labels = labels.to(device)
            total_loss += criterion(model(images), labels)
            epochs += 1
        print("Eval loss: " + str(total_loss / epochs))

In [88]:
for epoch in range(num_epochs):
    print('Epoch ' + str(epoch) + ' of ' + str(num_epochs))
    model.train()
    for images, labels in dataloader:
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()
        print(loss)
    test()

Epoch 0 of 2
tensor(0.3567, grad_fn=<NllLossBackward>)
tensor(0.3149, grad_fn=<NllLossBackward>)
tensor(0.2992, grad_fn=<NllLossBackward>)
tensor(0.2718, grad_fn=<NllLossBackward>)
tensor(0.2449, grad_fn=<NllLossBackward>)
tensor(0.5384, grad_fn=<NllLossBackward>)
tensor(0.3363, grad_fn=<NllLossBackward>)
tensor(0.2118, grad_fn=<NllLossBackward>)
tensor(0.2043, grad_fn=<NllLossBackward>)
tensor(0.1785, grad_fn=<NllLossBackward>)
tensor(0.1668, grad_fn=<NllLossBackward>)
tensor(0.1572, grad_fn=<NllLossBackward>)
tensor(0.1422, grad_fn=<NllLossBackward>)
tensor(0.3681, grad_fn=<NllLossBackward>)
tensor(0.1308, grad_fn=<NllLossBackward>)
tensor(0.1185, grad_fn=<NllLossBackward>)
tensor(0.5120, grad_fn=<NllLossBackward>)
tensor(0.1238, grad_fn=<NllLossBackward>)
tensor(0.0934, grad_fn=<NllLossBackward>)
tensor(0.0962, grad_fn=<NllLossBackward>)
tensor(0.3640, grad_fn=<NllLossBackward>)
tensor(0.0877, grad_fn=<NllLossBackward>)
tensor(0.0927, grad_fn=<NllLossBackward>)
tensor(0.0825, grad_f

tensor(0.0130, grad_fn=<NllLossBackward>)
tensor(0.0155, grad_fn=<NllLossBackward>)
tensor(0.0119, grad_fn=<NllLossBackward>)
tensor(0.0137, grad_fn=<NllLossBackward>)
tensor(0.5131, grad_fn=<NllLossBackward>)
tensor(0.0115, grad_fn=<NllLossBackward>)
tensor(0.0124, grad_fn=<NllLossBackward>)
tensor(0.0123, grad_fn=<NllLossBackward>)
tensor(0.0131, grad_fn=<NllLossBackward>)
tensor(0.5931, grad_fn=<NllLossBackward>)
tensor(0.0140, grad_fn=<NllLossBackward>)
tensor(0.0134, grad_fn=<NllLossBackward>)
tensor(0.0144, grad_fn=<NllLossBackward>)
tensor(0.0153, grad_fn=<NllLossBackward>)
tensor(0.0210, grad_fn=<NllLossBackward>)
tensor(0.0152, grad_fn=<NllLossBackward>)
tensor(0.6075, grad_fn=<NllLossBackward>)
tensor(0.0128, grad_fn=<NllLossBackward>)
tensor(0.0140, grad_fn=<NllLossBackward>)
tensor(0.0155, grad_fn=<NllLossBackward>)
tensor(0.0152, grad_fn=<NllLossBackward>)
tensor(0.0201, grad_fn=<NllLossBackward>)
tensor(0.0191, grad_fn=<NllLossBackward>)
tensor(0.0130, grad_fn=<NllLossBac

tensor(0.0245, grad_fn=<NllLossBackward>)
tensor(0.0255, grad_fn=<NllLossBackward>)
tensor(0.0237, grad_fn=<NllLossBackward>)
tensor(0.0244, grad_fn=<NllLossBackward>)
tensor(0.0292, grad_fn=<NllLossBackward>)
tensor(0.0256, grad_fn=<NllLossBackward>)
tensor(0.5928, grad_fn=<NllLossBackward>)
tensor(0.0237, grad_fn=<NllLossBackward>)
tensor(0.0212, grad_fn=<NllLossBackward>)
tensor(0.0228, grad_fn=<NllLossBackward>)
tensor(0.0273, grad_fn=<NllLossBackward>)
tensor(0.0239, grad_fn=<NllLossBackward>)
tensor(0.5838, grad_fn=<NllLossBackward>)
tensor(0.0226, grad_fn=<NllLossBackward>)
tensor(0.0237, grad_fn=<NllLossBackward>)
tensor(0.0272, grad_fn=<NllLossBackward>)
tensor(0.0274, grad_fn=<NllLossBackward>)
tensor(0.0242, grad_fn=<NllLossBackward>)
tensor(0.0242, grad_fn=<NllLossBackward>)
tensor(0.0247, grad_fn=<NllLossBackward>)
tensor(0.0226, grad_fn=<NllLossBackward>)
tensor(0.6095, grad_fn=<NllLossBackward>)
tensor(0.0215, grad_fn=<NllLossBackward>)
tensor(0.0250, grad_fn=<NllLossBac

tensor(0.5224, grad_fn=<NllLossBackward>)
tensor(0.0215, grad_fn=<NllLossBackward>)
tensor(0.0211, grad_fn=<NllLossBackward>)
tensor(0.0247, grad_fn=<NllLossBackward>)
tensor(0.0213, grad_fn=<NllLossBackward>)
tensor(0.0252, grad_fn=<NllLossBackward>)
tensor(0.0242, grad_fn=<NllLossBackward>)
tensor(0.0299, grad_fn=<NllLossBackward>)
tensor(0.4739, grad_fn=<NllLossBackward>)
tensor(0.4870, grad_fn=<NllLossBackward>)
tensor(0.0236, grad_fn=<NllLossBackward>)
tensor(0.0233, grad_fn=<NllLossBackward>)
tensor(0.0222, grad_fn=<NllLossBackward>)
tensor(0.0254, grad_fn=<NllLossBackward>)
tensor(0.0280, grad_fn=<NllLossBackward>)
tensor(0.4164, grad_fn=<NllLossBackward>)
tensor(0.0245, grad_fn=<NllLossBackward>)
tensor(0.0244, grad_fn=<NllLossBackward>)
tensor(0.0317, grad_fn=<NllLossBackward>)
tensor(0.3553, grad_fn=<NllLossBackward>)
tensor(0.0270, grad_fn=<NllLossBackward>)
tensor(0.0264, grad_fn=<NllLossBackward>)
tensor(0.0223, grad_fn=<NllLossBackward>)
tensor(0.0271, grad_fn=<NllLossBac

tensor(0.0308, grad_fn=<NllLossBackward>)
tensor(0.0325, grad_fn=<NllLossBackward>)
tensor(0.4317, grad_fn=<NllLossBackward>)
tensor(0.0283, grad_fn=<NllLossBackward>)
tensor(0.0295, grad_fn=<NllLossBackward>)
tensor(0.0401, grad_fn=<NllLossBackward>)
tensor(0.0265, grad_fn=<NllLossBackward>)
tensor(0.0284, grad_fn=<NllLossBackward>)
tensor(0.4846, grad_fn=<NllLossBackward>)
tensor(0.0284, grad_fn=<NllLossBackward>)
tensor(0.0324, grad_fn=<NllLossBackward>)
tensor(0.0272, grad_fn=<NllLossBackward>)
tensor(0.0271, grad_fn=<NllLossBackward>)
tensor(0.0257, grad_fn=<NllLossBackward>)
tensor(0.0314, grad_fn=<NllLossBackward>)
tensor(0.3968, grad_fn=<NllLossBackward>)
tensor(0.0273, grad_fn=<NllLossBackward>)
tensor(0.0290, grad_fn=<NllLossBackward>)
tensor(0.0300, grad_fn=<NllLossBackward>)
tensor(0.0311, grad_fn=<NllLossBackward>)
tensor(0.0263, grad_fn=<NllLossBackward>)
tensor(0.0261, grad_fn=<NllLossBackward>)
tensor(0.0229, grad_fn=<NllLossBackward>)
tensor(0.0245, grad_fn=<NllLossBac

tensor(0.0121, grad_fn=<NllLossBackward>)
tensor(0.0153, grad_fn=<NllLossBackward>)
tensor(0.0138, grad_fn=<NllLossBackward>)
tensor(0.0117, grad_fn=<NllLossBackward>)
tensor(0.0130, grad_fn=<NllLossBackward>)
tensor(0.0125, grad_fn=<NllLossBackward>)
tensor(0.5129, grad_fn=<NllLossBackward>)
tensor(0.0143, grad_fn=<NllLossBackward>)
tensor(0.0149, grad_fn=<NllLossBackward>)
tensor(0.0145, grad_fn=<NllLossBackward>)
tensor(0.0123, grad_fn=<NllLossBackward>)
tensor(0.4471, grad_fn=<NllLossBackward>)
tensor(0.0168, grad_fn=<NllLossBackward>)
tensor(0.5596, grad_fn=<NllLossBackward>)
tensor(0.0137, grad_fn=<NllLossBackward>)
tensor(0.0189, grad_fn=<NllLossBackward>)
tensor(0.0154, grad_fn=<NllLossBackward>)
tensor(0.0154, grad_fn=<NllLossBackward>)
tensor(0.0145, grad_fn=<NllLossBackward>)
tensor(0.0183, grad_fn=<NllLossBackward>)
tensor(0.5672, grad_fn=<NllLossBackward>)
tensor(0.0140, grad_fn=<NllLossBackward>)
tensor(0.0190, grad_fn=<NllLossBackward>)
tensor(0.0162, grad_fn=<NllLossBac

tensor(0.0228, grad_fn=<NllLossBackward>)
tensor(0.0239, grad_fn=<NllLossBackward>)
tensor(0.0227, grad_fn=<NllLossBackward>)
tensor(0.0261, grad_fn=<NllLossBackward>)
tensor(0.0213, grad_fn=<NllLossBackward>)
tensor(0.0237, grad_fn=<NllLossBackward>)
tensor(0.0295, grad_fn=<NllLossBackward>)
tensor(0.0186, grad_fn=<NllLossBackward>)
tensor(0.0193, grad_fn=<NllLossBackward>)
tensor(0.0175, grad_fn=<NllLossBackward>)
tensor(0.0208, grad_fn=<NllLossBackward>)
tensor(0.0211, grad_fn=<NllLossBackward>)
tensor(0.0189, grad_fn=<NllLossBackward>)
tensor(0.4847, grad_fn=<NllLossBackward>)
tensor(1.0079, grad_fn=<NllLossBackward>)
tensor(0.0249, grad_fn=<NllLossBackward>)
tensor(0.0176, grad_fn=<NllLossBackward>)
tensor(0.0182, grad_fn=<NllLossBackward>)
tensor(0.0174, grad_fn=<NllLossBackward>)
tensor(0.0182, grad_fn=<NllLossBackward>)
tensor(0.0189, grad_fn=<NllLossBackward>)
tensor(0.0240, grad_fn=<NllLossBackward>)
tensor(0.0175, grad_fn=<NllLossBackward>)
tensor(0.0169, grad_fn=<NllLossBac

tensor(0.0301, grad_fn=<NllLossBackward>)
tensor(0.3495, grad_fn=<NllLossBackward>)
tensor(0.0240, grad_fn=<NllLossBackward>)
tensor(0.0286, grad_fn=<NllLossBackward>)
tensor(0.3887, grad_fn=<NllLossBackward>)
tensor(0.0288, grad_fn=<NllLossBackward>)
tensor(0.0255, grad_fn=<NllLossBackward>)
tensor(0.0249, grad_fn=<NllLossBackward>)
tensor(0.0274, grad_fn=<NllLossBackward>)
tensor(0.0268, grad_fn=<NllLossBackward>)
tensor(0.0292, grad_fn=<NllLossBackward>)
tensor(0.0299, grad_fn=<NllLossBackward>)
tensor(0.0268, grad_fn=<NllLossBackward>)
tensor(0.4815, grad_fn=<NllLossBackward>)
tensor(0.0247, grad_fn=<NllLossBackward>)
tensor(0.0233, grad_fn=<NllLossBackward>)
tensor(0.0320, grad_fn=<NllLossBackward>)
tensor(0.0279, grad_fn=<NllLossBackward>)
tensor(0.0263, grad_fn=<NllLossBackward>)
tensor(0.0217, grad_fn=<NllLossBackward>)
tensor(0.5615, grad_fn=<NllLossBackward>)
tensor(0.0267, grad_fn=<NllLossBackward>)
tensor(0.0304, grad_fn=<NllLossBackward>)
tensor(0.0257, grad_fn=<NllLossBac

tensor(0.0163, grad_fn=<NllLossBackward>)
tensor(0.0159, grad_fn=<NllLossBackward>)
tensor(0.0141, grad_fn=<NllLossBackward>)
tensor(0.0116, grad_fn=<NllLossBackward>)
tensor(0.0134, grad_fn=<NllLossBackward>)
tensor(0.0116, grad_fn=<NllLossBackward>)
tensor(0.0129, grad_fn=<NllLossBackward>)
tensor(0.0133, grad_fn=<NllLossBackward>)
tensor(0.0100, grad_fn=<NllLossBackward>)
tensor(0.0131, grad_fn=<NllLossBackward>)
tensor(0.0103, grad_fn=<NllLossBackward>)
tensor(0.0103, grad_fn=<NllLossBackward>)
tensor(0.0091, grad_fn=<NllLossBackward>)
tensor(0.0098, grad_fn=<NllLossBackward>)
tensor(0.0133, grad_fn=<NllLossBackward>)
tensor(0.0128, grad_fn=<NllLossBackward>)
tensor(0.0146, grad_fn=<NllLossBackward>)
tensor(0.0089, grad_fn=<NllLossBackward>)
tensor(0.0100, grad_fn=<NllLossBackward>)
tensor(0.0094, grad_fn=<NllLossBackward>)
tensor(0.0104, grad_fn=<NllLossBackward>)
tensor(0.0090, grad_fn=<NllLossBackward>)
tensor(0.0076, grad_fn=<NllLossBackward>)
tensor(0.0075, grad_fn=<NllLossBac

tensor(0.0210, grad_fn=<NllLossBackward>)
tensor(0.0250, grad_fn=<NllLossBackward>)
tensor(0.0162, grad_fn=<NllLossBackward>)
tensor(0.0145, grad_fn=<NllLossBackward>)
tensor(0.0174, grad_fn=<NllLossBackward>)
tensor(0.5083, grad_fn=<NllLossBackward>)
tensor(0.4135, grad_fn=<NllLossBackward>)
tensor(0.0200, grad_fn=<NllLossBackward>)
tensor(0.4716, grad_fn=<NllLossBackward>)
tensor(0.0257, grad_fn=<NllLossBackward>)
tensor(0.0172, grad_fn=<NllLossBackward>)
tensor(0.0178, grad_fn=<NllLossBackward>)
tensor(0.0180, grad_fn=<NllLossBackward>)
tensor(0.0214, grad_fn=<NllLossBackward>)
tensor(0.0215, grad_fn=<NllLossBackward>)
tensor(0.0189, grad_fn=<NllLossBackward>)
tensor(0.0250, grad_fn=<NllLossBackward>)
tensor(0.0214, grad_fn=<NllLossBackward>)
tensor(0.0248, grad_fn=<NllLossBackward>)
tensor(0.4751, grad_fn=<NllLossBackward>)
tensor(0.0220, grad_fn=<NllLossBackward>)
tensor(0.0241, grad_fn=<NllLossBackward>)
tensor(0.0313, grad_fn=<NllLossBackward>)
tensor(0.0216, grad_fn=<NllLossBac

tensor(0.0157, grad_fn=<NllLossBackward>)
tensor(0.0190, grad_fn=<NllLossBackward>)
tensor(0.0164, grad_fn=<NllLossBackward>)
tensor(0.0200, grad_fn=<NllLossBackward>)
tensor(0.0290, grad_fn=<NllLossBackward>)
tensor(0.0206, grad_fn=<NllLossBackward>)
tensor(0.0250, grad_fn=<NllLossBackward>)
tensor(0.0234, grad_fn=<NllLossBackward>)
tensor(0.0249, grad_fn=<NllLossBackward>)
tensor(0.0231, grad_fn=<NllLossBackward>)
tensor(0.0394, grad_fn=<NllLossBackward>)
tensor(0.0269, grad_fn=<NllLossBackward>)
tensor(0.0304, grad_fn=<NllLossBackward>)
tensor(0.0366, grad_fn=<NllLossBackward>)
tensor(0.0271, grad_fn=<NllLossBackward>)
tensor(0.0263, grad_fn=<NllLossBackward>)
tensor(0.0300, grad_fn=<NllLossBackward>)
tensor(0.0284, grad_fn=<NllLossBackward>)
tensor(0.3936, grad_fn=<NllLossBackward>)
tensor(0.5452, grad_fn=<NllLossBackward>)
tensor(0.0200, grad_fn=<NllLossBackward>)
tensor(0.0263, grad_fn=<NllLossBackward>)
tensor(0.0319, grad_fn=<NllLossBackward>)
tensor(0.0251, grad_fn=<NllLossBac

tensor(0.3067, grad_fn=<NllLossBackward>)
tensor(0.0261, grad_fn=<NllLossBackward>)
tensor(0.0267, grad_fn=<NllLossBackward>)
tensor(0.0347, grad_fn=<NllLossBackward>)
tensor(0.0409, grad_fn=<NllLossBackward>)
tensor(0.0252, grad_fn=<NllLossBackward>)
tensor(0.0303, grad_fn=<NllLossBackward>)
tensor(0.0231, grad_fn=<NllLossBackward>)
tensor(0.0313, grad_fn=<NllLossBackward>)
tensor(0.0291, grad_fn=<NllLossBackward>)
tensor(0.0309, grad_fn=<NllLossBackward>)
tensor(0.0251, grad_fn=<NllLossBackward>)
tensor(0.3600, grad_fn=<NllLossBackward>)
tensor(0.5185, grad_fn=<NllLossBackward>)
tensor(0.0252, grad_fn=<NllLossBackward>)
tensor(0.0252, grad_fn=<NllLossBackward>)
tensor(0.0230, grad_fn=<NllLossBackward>)
tensor(0.0196, grad_fn=<NllLossBackward>)
tensor(0.0232, grad_fn=<NllLossBackward>)
tensor(0.0231, grad_fn=<NllLossBackward>)
tensor(0.4361, grad_fn=<NllLossBackward>)
tensor(0.4528, grad_fn=<NllLossBackward>)
tensor(0.0195, grad_fn=<NllLossBackward>)
tensor(0.0270, grad_fn=<NllLossBac

tensor(0.4766, grad_fn=<NllLossBackward>)
tensor(0.5377, grad_fn=<NllLossBackward>)
tensor(0.0206, grad_fn=<NllLossBackward>)
tensor(0.4896, grad_fn=<NllLossBackward>)
tensor(0.0281, grad_fn=<NllLossBackward>)
tensor(0.0295, grad_fn=<NllLossBackward>)
tensor(0.0254, grad_fn=<NllLossBackward>)
tensor(0.4617, grad_fn=<NllLossBackward>)
tensor(0.0287, grad_fn=<NllLossBackward>)
tensor(0.4266, grad_fn=<NllLossBackward>)
tensor(0.0337, grad_fn=<NllLossBackward>)
tensor(0.0283, grad_fn=<NllLossBackward>)
tensor(0.4008, grad_fn=<NllLossBackward>)
tensor(0.3360, grad_fn=<NllLossBackward>)
tensor(0.0349, grad_fn=<NllLossBackward>)
tensor(0.0272, grad_fn=<NllLossBackward>)
tensor(0.0365, grad_fn=<NllLossBackward>)
tensor(0.0496, grad_fn=<NllLossBackward>)
tensor(0.0304, grad_fn=<NllLossBackward>)
tensor(0.0331, grad_fn=<NllLossBackward>)
tensor(0.0277, grad_fn=<NllLossBackward>)
tensor(0.0279, grad_fn=<NllLossBackward>)
tensor(0.0340, grad_fn=<NllLossBackward>)
tensor(0.0339, grad_fn=<NllLossBac

tensor(0.0278, grad_fn=<NllLossBackward>)
tensor(0.0224, grad_fn=<NllLossBackward>)
tensor(0.4929, grad_fn=<NllLossBackward>)
tensor(0.0185, grad_fn=<NllLossBackward>)
tensor(0.0228, grad_fn=<NllLossBackward>)
tensor(0.0229, grad_fn=<NllLossBackward>)
tensor(0.0212, grad_fn=<NllLossBackward>)
tensor(0.0314, grad_fn=<NllLossBackward>)
tensor(0.0240, grad_fn=<NllLossBackward>)
tensor(0.0193, grad_fn=<NllLossBackward>)
tensor(0.0182, grad_fn=<NllLossBackward>)
tensor(0.3051, grad_fn=<NllLossBackward>)
tensor(0.0166, grad_fn=<NllLossBackward>)
tensor(0.6247, grad_fn=<NllLossBackward>)
tensor(0.4626, grad_fn=<NllLossBackward>)
tensor(0.3116, grad_fn=<NllLossBackward>)
tensor(0.0225, grad_fn=<NllLossBackward>)
tensor(0.0190, grad_fn=<NllLossBackward>)
tensor(0.6885, grad_fn=<NllLossBackward>)
tensor(0.0256, grad_fn=<NllLossBackward>)
tensor(0.0252, grad_fn=<NllLossBackward>)
tensor(0.2454, grad_fn=<NllLossBackward>)
tensor(0.0246, grad_fn=<NllLossBackward>)
tensor(0.0337, grad_fn=<NllLossBac

tensor(0.0131, grad_fn=<NllLossBackward>)
tensor(0.0154, grad_fn=<NllLossBackward>)
tensor(0.0098, grad_fn=<NllLossBackward>)
tensor(0.0099, grad_fn=<NllLossBackward>)
tensor(0.0098, grad_fn=<NllLossBackward>)
tensor(0.5707, grad_fn=<NllLossBackward>)
tensor(0.0094, grad_fn=<NllLossBackward>)
tensor(0.0113, grad_fn=<NllLossBackward>)
tensor(0.0117, grad_fn=<NllLossBackward>)
tensor(0.0099, grad_fn=<NllLossBackward>)
tensor(0.0095, grad_fn=<NllLossBackward>)
tensor(0.0096, grad_fn=<NllLossBackward>)
tensor(0.0122, grad_fn=<NllLossBackward>)
tensor(0.0092, grad_fn=<NllLossBackward>)
tensor(0.0092, grad_fn=<NllLossBackward>)
tensor(0.0100, grad_fn=<NllLossBackward>)
tensor(0.0102, grad_fn=<NllLossBackward>)
tensor(0.0095, grad_fn=<NllLossBackward>)
tensor(0.0101, grad_fn=<NllLossBackward>)
tensor(0.7294, grad_fn=<NllLossBackward>)
tensor(0.0106, grad_fn=<NllLossBackward>)
tensor(0.0112, grad_fn=<NllLossBackward>)
tensor(0.0117, grad_fn=<NllLossBackward>)
tensor(0.0144, grad_fn=<NllLossBac

tensor(0.0139, grad_fn=<NllLossBackward>)
tensor(0.0233, grad_fn=<NllLossBackward>)
tensor(0.0139, grad_fn=<NllLossBackward>)
tensor(0.0163, grad_fn=<NllLossBackward>)
tensor(0.0160, grad_fn=<NllLossBackward>)
tensor(0.0168, grad_fn=<NllLossBackward>)
tensor(0.0125, grad_fn=<NllLossBackward>)
tensor(0.0108, grad_fn=<NllLossBackward>)
tensor(0.0126, grad_fn=<NllLossBackward>)
tensor(0.3859, grad_fn=<NllLossBackward>)
tensor(0.0136, grad_fn=<NllLossBackward>)
tensor(0.5859, grad_fn=<NllLossBackward>)
tensor(0.0168, grad_fn=<NllLossBackward>)
tensor(0.0169, grad_fn=<NllLossBackward>)
tensor(0.5739, grad_fn=<NllLossBackward>)
tensor(0.0137, grad_fn=<NllLossBackward>)
tensor(0.0129, grad_fn=<NllLossBackward>)
tensor(0.0144, grad_fn=<NllLossBackward>)
tensor(0.0146, grad_fn=<NllLossBackward>)
tensor(0.0172, grad_fn=<NllLossBackward>)
tensor(0.2347, grad_fn=<NllLossBackward>)
tensor(0.0193, grad_fn=<NllLossBackward>)
tensor(0.0214, grad_fn=<NllLossBackward>)
tensor(0.0157, grad_fn=<NllLossBac

tensor(0.0119, grad_fn=<NllLossBackward>)
tensor(0.0128, grad_fn=<NllLossBackward>)
tensor(0.0103, grad_fn=<NllLossBackward>)
tensor(0.5576, grad_fn=<NllLossBackward>)
tensor(0.0106, grad_fn=<NllLossBackward>)
tensor(0.0206, grad_fn=<NllLossBackward>)
tensor(0.0100, grad_fn=<NllLossBackward>)
tensor(0.0116, grad_fn=<NllLossBackward>)
tensor(0.0096, grad_fn=<NllLossBackward>)
tensor(0.0133, grad_fn=<NllLossBackward>)
tensor(0.0136, grad_fn=<NllLossBackward>)
tensor(0.0141, grad_fn=<NllLossBackward>)
tensor(0.0145, grad_fn=<NllLossBackward>)
tensor(0.0086, grad_fn=<NllLossBackward>)
tensor(0.0104, grad_fn=<NllLossBackward>)
tensor(0.3316, grad_fn=<NllLossBackward>)
tensor(0.0131, grad_fn=<NllLossBackward>)
tensor(0.0099, grad_fn=<NllLossBackward>)
tensor(0.0123, grad_fn=<NllLossBackward>)
tensor(0.0124, grad_fn=<NllLossBackward>)
tensor(0.0104, grad_fn=<NllLossBackward>)
tensor(0.0143, grad_fn=<NllLossBackward>)
tensor(0.0133, grad_fn=<NllLossBackward>)
tensor(0.0124, grad_fn=<NllLossBac

tensor(0.0154, grad_fn=<NllLossBackward>)
tensor(0.0117, grad_fn=<NllLossBackward>)
tensor(0.0108, grad_fn=<NllLossBackward>)
tensor(0.0108, grad_fn=<NllLossBackward>)
tensor(0.0105, grad_fn=<NllLossBackward>)
tensor(0.0138, grad_fn=<NllLossBackward>)
tensor(0.7117, grad_fn=<NllLossBackward>)
tensor(0.0094, grad_fn=<NllLossBackward>)
tensor(0.3855, grad_fn=<NllLossBackward>)
tensor(0.0092, grad_fn=<NllLossBackward>)
tensor(0.0108, grad_fn=<NllLossBackward>)
tensor(0.0226, grad_fn=<NllLossBackward>)
tensor(0.0117, grad_fn=<NllLossBackward>)
tensor(0.0220, grad_fn=<NllLossBackward>)
tensor(0.0147, grad_fn=<NllLossBackward>)
tensor(0.0137, grad_fn=<NllLossBackward>)
tensor(0.5996, grad_fn=<NllLossBackward>)
tensor(0.5854, grad_fn=<NllLossBackward>)
tensor(0.0095, grad_fn=<NllLossBackward>)
tensor(0.0191, grad_fn=<NllLossBackward>)
tensor(0.0114, grad_fn=<NllLossBackward>)
tensor(0.0102, grad_fn=<NllLossBackward>)
tensor(0.0131, grad_fn=<NllLossBackward>)
tensor(0.0171, grad_fn=<NllLossBac

tensor(0.0147, grad_fn=<NllLossBackward>)
tensor(0.0200, grad_fn=<NllLossBackward>)
tensor(0.0141, grad_fn=<NllLossBackward>)
tensor(0.0141, grad_fn=<NllLossBackward>)
tensor(0.0194, grad_fn=<NllLossBackward>)
tensor(0.4676, grad_fn=<NllLossBackward>)
tensor(0.0121, grad_fn=<NllLossBackward>)
tensor(0.0148, grad_fn=<NllLossBackward>)
tensor(0.0235, grad_fn=<NllLossBackward>)
tensor(0.0155, grad_fn=<NllLossBackward>)
tensor(0.3917, grad_fn=<NllLossBackward>)
tensor(0.0190, grad_fn=<NllLossBackward>)
tensor(0.0172, grad_fn=<NllLossBackward>)
tensor(0.0144, grad_fn=<NllLossBackward>)
tensor(0.0186, grad_fn=<NllLossBackward>)
tensor(0.0152, grad_fn=<NllLossBackward>)
tensor(0.0213, grad_fn=<NllLossBackward>)
tensor(0.0223, grad_fn=<NllLossBackward>)
tensor(0.0164, grad_fn=<NllLossBackward>)
tensor(0.0160, grad_fn=<NllLossBackward>)
tensor(0.0169, grad_fn=<NllLossBackward>)
tensor(0.3464, grad_fn=<NllLossBackward>)
tensor(0.5891, grad_fn=<NllLossBackward>)
tensor(0.0242, grad_fn=<NllLossBac

tensor(0.3611, grad_fn=<NllLossBackward>)
tensor(0.0146, grad_fn=<NllLossBackward>)
tensor(0.0125, grad_fn=<NllLossBackward>)
tensor(0.0170, grad_fn=<NllLossBackward>)
tensor(0.0124, grad_fn=<NllLossBackward>)
tensor(0.0125, grad_fn=<NllLossBackward>)
tensor(0.0134, grad_fn=<NllLossBackward>)
tensor(0.0115, grad_fn=<NllLossBackward>)
tensor(0.4720, grad_fn=<NllLossBackward>)
tensor(0.0151, grad_fn=<NllLossBackward>)
tensor(0.0166, grad_fn=<NllLossBackward>)
tensor(0.0163, grad_fn=<NllLossBackward>)
tensor(0.4285, grad_fn=<NllLossBackward>)
tensor(0.0132, grad_fn=<NllLossBackward>)
tensor(0.0168, grad_fn=<NllLossBackward>)
tensor(0.2823, grad_fn=<NllLossBackward>)
tensor(0.0144, grad_fn=<NllLossBackward>)
tensor(0.0222, grad_fn=<NllLossBackward>)
tensor(0.5385, grad_fn=<NllLossBackward>)
tensor(0.0135, grad_fn=<NllLossBackward>)
tensor(0.0152, grad_fn=<NllLossBackward>)
tensor(0.0136, grad_fn=<NllLossBackward>)
tensor(0.0173, grad_fn=<NllLossBackward>)
tensor(0.0245, grad_fn=<NllLossBac

tensor(0.0156, grad_fn=<NllLossBackward>)
tensor(0.4340, grad_fn=<NllLossBackward>)
tensor(0.0116, grad_fn=<NllLossBackward>)
tensor(0.0120, grad_fn=<NllLossBackward>)
tensor(0.0102, grad_fn=<NllLossBackward>)
tensor(0.0254, grad_fn=<NllLossBackward>)
tensor(0.3743, grad_fn=<NllLossBackward>)
tensor(0.0200, grad_fn=<NllLossBackward>)
tensor(0.0116, grad_fn=<NllLossBackward>)
tensor(0.0105, grad_fn=<NllLossBackward>)
tensor(0.4986, grad_fn=<NllLossBackward>)
tensor(0.0132, grad_fn=<NllLossBackward>)
tensor(0.0232, grad_fn=<NllLossBackward>)
tensor(0.0152, grad_fn=<NllLossBackward>)
tensor(0.3724, grad_fn=<NllLossBackward>)
tensor(0.0139, grad_fn=<NllLossBackward>)
tensor(0.9574, grad_fn=<NllLossBackward>)
tensor(0.0146, grad_fn=<NllLossBackward>)
tensor(0.0175, grad_fn=<NllLossBackward>)
tensor(0.0134, grad_fn=<NllLossBackward>)
tensor(0.0208, grad_fn=<NllLossBackward>)
tensor(0.0178, grad_fn=<NllLossBackward>)
tensor(0.3819, grad_fn=<NllLossBackward>)
tensor(0.0212, grad_fn=<NllLossBac

tensor(0.0066, grad_fn=<NllLossBackward>)
tensor(0.0076, grad_fn=<NllLossBackward>)
tensor(0.5621, grad_fn=<NllLossBackward>)
tensor(0.0063, grad_fn=<NllLossBackward>)
tensor(0.0157, grad_fn=<NllLossBackward>)
tensor(0.3593, grad_fn=<NllLossBackward>)
tensor(0.0074, grad_fn=<NllLossBackward>)
tensor(0.5436, grad_fn=<NllLossBackward>)
tensor(0.0079, grad_fn=<NllLossBackward>)
tensor(0.0077, grad_fn=<NllLossBackward>)
tensor(0.0100, grad_fn=<NllLossBackward>)
tensor(0.0118, grad_fn=<NllLossBackward>)
tensor(0.7523, grad_fn=<NllLossBackward>)
tensor(0.0104, grad_fn=<NllLossBackward>)
tensor(0.0091, grad_fn=<NllLossBackward>)
tensor(0.0093, grad_fn=<NllLossBackward>)
tensor(0.0116, grad_fn=<NllLossBackward>)
tensor(0.0156, grad_fn=<NllLossBackward>)
tensor(0.0124, grad_fn=<NllLossBackward>)
tensor(0.0101, grad_fn=<NllLossBackward>)
tensor(0.2894, grad_fn=<NllLossBackward>)
tensor(0.0121, grad_fn=<NllLossBackward>)
tensor(0.2911, grad_fn=<NllLossBackward>)
tensor(0.0108, grad_fn=<NllLossBac

tensor(0.6374, grad_fn=<NllLossBackward>)
tensor(0.5672, grad_fn=<NllLossBackward>)
tensor(0.0148, grad_fn=<NllLossBackward>)
tensor(0.0159, grad_fn=<NllLossBackward>)
tensor(0.0149, grad_fn=<NllLossBackward>)
tensor(0.3947, grad_fn=<NllLossBackward>)
tensor(0.0232, grad_fn=<NllLossBackward>)
tensor(0.0144, grad_fn=<NllLossBackward>)
tensor(0.6365, grad_fn=<NllLossBackward>)
tensor(0.3473, grad_fn=<NllLossBackward>)
tensor(0.0213, grad_fn=<NllLossBackward>)
tensor(0.0170, grad_fn=<NllLossBackward>)
tensor(0.0457, grad_fn=<NllLossBackward>)
tensor(0.0323, grad_fn=<NllLossBackward>)
tensor(0.0230, grad_fn=<NllLossBackward>)
tensor(0.0257, grad_fn=<NllLossBackward>)
tensor(0.0204, grad_fn=<NllLossBackward>)
tensor(0.0189, grad_fn=<NllLossBackward>)
tensor(0.0267, grad_fn=<NllLossBackward>)
tensor(0.0204, grad_fn=<NllLossBackward>)
tensor(0.0244, grad_fn=<NllLossBackward>)
tensor(0.0250, grad_fn=<NllLossBackward>)
tensor(0.0270, grad_fn=<NllLossBackward>)
tensor(0.0296, grad_fn=<NllLossBac

tensor(0.0138, grad_fn=<NllLossBackward>)
tensor(0.0145, grad_fn=<NllLossBackward>)
tensor(0.4158, grad_fn=<NllLossBackward>)
tensor(0.0102, grad_fn=<NllLossBackward>)
tensor(0.0106, grad_fn=<NllLossBackward>)
tensor(0.6350, grad_fn=<NllLossBackward>)
tensor(0.0138, grad_fn=<NllLossBackward>)
tensor(0.0142, grad_fn=<NllLossBackward>)
tensor(0.0178, grad_fn=<NllLossBackward>)
tensor(0.2652, grad_fn=<NllLossBackward>)
tensor(0.0215, grad_fn=<NllLossBackward>)
tensor(0.0215, grad_fn=<NllLossBackward>)
tensor(0.0109, grad_fn=<NllLossBackward>)
tensor(0.0136, grad_fn=<NllLossBackward>)
tensor(0.4852, grad_fn=<NllLossBackward>)
tensor(0.0163, grad_fn=<NllLossBackward>)
tensor(0.0210, grad_fn=<NllLossBackward>)
tensor(0.6039, grad_fn=<NllLossBackward>)
tensor(0.0128, grad_fn=<NllLossBackward>)
tensor(0.0193, grad_fn=<NllLossBackward>)
tensor(0.0271, grad_fn=<NllLossBackward>)
tensor(0.0157, grad_fn=<NllLossBackward>)
tensor(0.1922, grad_fn=<NllLossBackward>)
tensor(0.0207, grad_fn=<NllLossBac

tensor(0.0100, grad_fn=<NllLossBackward>)
tensor(0.0126, grad_fn=<NllLossBackward>)
tensor(0.0138, grad_fn=<NllLossBackward>)
tensor(0.6861, grad_fn=<NllLossBackward>)
tensor(0.2836, grad_fn=<NllLossBackward>)
tensor(0.0093, grad_fn=<NllLossBackward>)
tensor(0.0101, grad_fn=<NllLossBackward>)
tensor(0.0128, grad_fn=<NllLossBackward>)
tensor(0.4356, grad_fn=<NllLossBackward>)
tensor(0.0100, grad_fn=<NllLossBackward>)
tensor(0.0126, grad_fn=<NllLossBackward>)
tensor(0.6108, grad_fn=<NllLossBackward>)
tensor(0.0158, grad_fn=<NllLossBackward>)
tensor(0.4183, grad_fn=<NllLossBackward>)
tensor(0.0144, grad_fn=<NllLossBackward>)
tensor(0.0264, grad_fn=<NllLossBackward>)
tensor(1.0919, grad_fn=<NllLossBackward>)
tensor(0.0233, grad_fn=<NllLossBackward>)
tensor(0.0220, grad_fn=<NllLossBackward>)
tensor(0.0248, grad_fn=<NllLossBackward>)
tensor(0.0213, grad_fn=<NllLossBackward>)
tensor(0.0464, grad_fn=<NllLossBackward>)
tensor(0.0181, grad_fn=<NllLossBackward>)
tensor(0.4083, grad_fn=<NllLossBac

tensor(0.0085, grad_fn=<NllLossBackward>)
tensor(0.2669, grad_fn=<NllLossBackward>)
tensor(0.0141, grad_fn=<NllLossBackward>)
tensor(0.0074, grad_fn=<NllLossBackward>)
tensor(0.0108, grad_fn=<NllLossBackward>)
tensor(0.0063, grad_fn=<NllLossBackward>)
tensor(0.0087, grad_fn=<NllLossBackward>)
tensor(0.0063, grad_fn=<NllLossBackward>)
tensor(0.0116, grad_fn=<NllLossBackward>)
tensor(0.0068, grad_fn=<NllLossBackward>)
tensor(0.0139, grad_fn=<NllLossBackward>)
tensor(0.0086, grad_fn=<NllLossBackward>)
tensor(0.0062, grad_fn=<NllLossBackward>)
tensor(0.0100, grad_fn=<NllLossBackward>)
tensor(0.0080, grad_fn=<NllLossBackward>)
tensor(0.0164, grad_fn=<NllLossBackward>)
tensor(0.0095, grad_fn=<NllLossBackward>)
tensor(0.0070, grad_fn=<NllLossBackward>)
tensor(0.0095, grad_fn=<NllLossBackward>)
tensor(0.0114, grad_fn=<NllLossBackward>)
tensor(0.0110, grad_fn=<NllLossBackward>)
tensor(0.0138, grad_fn=<NllLossBackward>)
tensor(0.0060, grad_fn=<NllLossBackward>)
tensor(0.0068, grad_fn=<NllLossBac

tensor(0.0326, grad_fn=<NllLossBackward>)
tensor(0.0118, grad_fn=<NllLossBackward>)
tensor(0.0371, grad_fn=<NllLossBackward>)
tensor(0.0196, grad_fn=<NllLossBackward>)
tensor(0.0110, grad_fn=<NllLossBackward>)
tensor(0.0155, grad_fn=<NllLossBackward>)
tensor(0.0166, grad_fn=<NllLossBackward>)
tensor(0.0160, grad_fn=<NllLossBackward>)
tensor(0.0325, grad_fn=<NllLossBackward>)
tensor(0.0113, grad_fn=<NllLossBackward>)
tensor(0.0198, grad_fn=<NllLossBackward>)
tensor(0.0107, grad_fn=<NllLossBackward>)
tensor(0.0141, grad_fn=<NllLossBackward>)
tensor(0.0112, grad_fn=<NllLossBackward>)
tensor(0.0114, grad_fn=<NllLossBackward>)
tensor(0.5262, grad_fn=<NllLossBackward>)
tensor(0.0231, grad_fn=<NllLossBackward>)
tensor(0.0110, grad_fn=<NllLossBackward>)
tensor(0.0102, grad_fn=<NllLossBackward>)
tensor(0.3579, grad_fn=<NllLossBackward>)
tensor(0.0149, grad_fn=<NllLossBackward>)
tensor(0.0191, grad_fn=<NllLossBackward>)
tensor(0.0112, grad_fn=<NllLossBackward>)
tensor(0.5379, grad_fn=<NllLossBac

tensor(0.3461, grad_fn=<NllLossBackward>)
tensor(0.0195, grad_fn=<NllLossBackward>)
tensor(0.0146, grad_fn=<NllLossBackward>)
tensor(0.0195, grad_fn=<NllLossBackward>)
tensor(0.0195, grad_fn=<NllLossBackward>)
tensor(0.0364, grad_fn=<NllLossBackward>)
tensor(0.0173, grad_fn=<NllLossBackward>)
tensor(0.0176, grad_fn=<NllLossBackward>)
tensor(0.0216, grad_fn=<NllLossBackward>)
tensor(0.2765, grad_fn=<NllLossBackward>)
tensor(0.0164, grad_fn=<NllLossBackward>)
tensor(0.4202, grad_fn=<NllLossBackward>)
tensor(0.0165, grad_fn=<NllLossBackward>)
tensor(0.0359, grad_fn=<NllLossBackward>)
tensor(0.5295, grad_fn=<NllLossBackward>)
tensor(0.0199, grad_fn=<NllLossBackward>)
tensor(0.0196, grad_fn=<NllLossBackward>)
tensor(0.0178, grad_fn=<NllLossBackward>)
tensor(0.0382, grad_fn=<NllLossBackward>)
tensor(0.0165, grad_fn=<NllLossBackward>)
tensor(0.0158, grad_fn=<NllLossBackward>)
tensor(0.0160, grad_fn=<NllLossBackward>)
tensor(0.0152, grad_fn=<NllLossBackward>)
tensor(0.0159, grad_fn=<NllLossBac

tensor(0.0168, grad_fn=<NllLossBackward>)
tensor(0.0224, grad_fn=<NllLossBackward>)
tensor(0.0147, grad_fn=<NllLossBackward>)
tensor(0.0196, grad_fn=<NllLossBackward>)
tensor(0.1602, grad_fn=<NllLossBackward>)
tensor(0.0156, grad_fn=<NllLossBackward>)
tensor(0.0190, grad_fn=<NllLossBackward>)
tensor(0.0177, grad_fn=<NllLossBackward>)
tensor(0.0227, grad_fn=<NllLossBackward>)
tensor(0.0151, grad_fn=<NllLossBackward>)
tensor(0.0124, grad_fn=<NllLossBackward>)
tensor(0.0119, grad_fn=<NllLossBackward>)
tensor(0.0226, grad_fn=<NllLossBackward>)
tensor(0.0174, grad_fn=<NllLossBackward>)
tensor(0.0400, grad_fn=<NllLossBackward>)
tensor(0.0117, grad_fn=<NllLossBackward>)
tensor(0.4597, grad_fn=<NllLossBackward>)
tensor(0.5752, grad_fn=<NllLossBackward>)
tensor(0.0270, grad_fn=<NllLossBackward>)
tensor(0.0156, grad_fn=<NllLossBackward>)
tensor(0.0256, grad_fn=<NllLossBackward>)
tensor(0.0316, grad_fn=<NllLossBackward>)
tensor(0.0214, grad_fn=<NllLossBackward>)
tensor(0.0170, grad_fn=<NllLossBac

tensor(0.0400, grad_fn=<NllLossBackward>)
tensor(0.4021, grad_fn=<NllLossBackward>)
tensor(0.0193, grad_fn=<NllLossBackward>)
tensor(0.0238, grad_fn=<NllLossBackward>)
tensor(0.0181, grad_fn=<NllLossBackward>)
tensor(0.0189, grad_fn=<NllLossBackward>)
tensor(0.0173, grad_fn=<NllLossBackward>)
tensor(0.0182, grad_fn=<NllLossBackward>)
tensor(0.0153, grad_fn=<NllLossBackward>)
tensor(0.0276, grad_fn=<NllLossBackward>)
tensor(0.0361, grad_fn=<NllLossBackward>)
tensor(0.0200, grad_fn=<NllLossBackward>)
tensor(0.0276, grad_fn=<NllLossBackward>)
tensor(0.0143, grad_fn=<NllLossBackward>)
tensor(0.0237, grad_fn=<NllLossBackward>)
tensor(0.0161, grad_fn=<NllLossBackward>)
tensor(0.5533, grad_fn=<NllLossBackward>)
tensor(0.0170, grad_fn=<NllLossBackward>)
tensor(0.0133, grad_fn=<NllLossBackward>)
tensor(0.0186, grad_fn=<NllLossBackward>)
tensor(0.0145, grad_fn=<NllLossBackward>)
tensor(0.0141, grad_fn=<NllLossBackward>)
tensor(0.0163, grad_fn=<NllLossBackward>)
tensor(0.4231, grad_fn=<NllLossBac

tensor(0.0153, grad_fn=<NllLossBackward>)
tensor(0.0150, grad_fn=<NllLossBackward>)
tensor(0.0273, grad_fn=<NllLossBackward>)
tensor(0.0178, grad_fn=<NllLossBackward>)
tensor(0.0136, grad_fn=<NllLossBackward>)
tensor(0.0209, grad_fn=<NllLossBackward>)
tensor(0.0269, grad_fn=<NllLossBackward>)
tensor(0.0153, grad_fn=<NllLossBackward>)
tensor(0.0082, grad_fn=<NllLossBackward>)
tensor(0.0125, grad_fn=<NllLossBackward>)
tensor(0.0149, grad_fn=<NllLossBackward>)
tensor(0.0105, grad_fn=<NllLossBackward>)
tensor(0.0129, grad_fn=<NllLossBackward>)
tensor(0.8169, grad_fn=<NllLossBackward>)
tensor(0.0113, grad_fn=<NllLossBackward>)
tensor(0.0131, grad_fn=<NllLossBackward>)
tensor(0.0111, grad_fn=<NllLossBackward>)
tensor(0.0155, grad_fn=<NllLossBackward>)
tensor(0.0097, grad_fn=<NllLossBackward>)
tensor(0.3437, grad_fn=<NllLossBackward>)
tensor(0.0117, grad_fn=<NllLossBackward>)
tensor(0.0077, grad_fn=<NllLossBackward>)
tensor(0.0100, grad_fn=<NllLossBackward>)
tensor(0.4000, grad_fn=<NllLossBac

KeyboardInterrupt: 

huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
huhu
Eval loss: tensor(0.0723)
