<a href="https://colab.research.google.com/github/ahmad-PH/nag-notebooks/blob/master/NAG_tripletLossExperiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import subprocess
def run_shell_command(cmd):
  """Run ``cmd`` through the shell and print what it wrote to stdout.

  Only stdout is captured; it is decoded as UTF-8 and printed after the
  command has finished.
  """
  process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
  captured_stdout, _ = process.communicate()
  print(captured_stdout.decode('utf-8'))


def detect_env():
    """Return "colab" when the filesystem root has a `content` entry, else "IBM"."""
    import os
    root_entries = os.listdir('/')
    return "colab" if 'content' in root_entries else "IBM"
  
def create_env():
  """Instantiate the environment helper matching the detected platform.

  Returns an `IBMEnv` or a `ColabEnv`; `None` if `detect_env()` reports
  anything else.
  """
  # Probe the filesystem once so both comparisons see the same answer.
  env_name = detect_env()
  if env_name == "IBM":
    return IBMEnv()
  if env_name == "colab":
    return ColabEnv()
  return None


class Env:
  """Behaviour shared by the execution environments.

  Subclasses/instances are expected to provide `root_folder`,
  `python_files_path`, `save_filename`, `get_csv_dir()` and
  `get_models_dir()`.
  """

  def get_nag_util_files(self):
      """Update the helper repository with `git pull`, or clone it if absent.

      The pull runs with the working directory temporarily switched to
      `python_files_path`; afterwards the working directory is set to
      `root_folder`.
      """
      import os

      print("\ngetting git files ...")
      if os.path.isdir(self.python_files_path):
        os.chdir(self.python_files_path)
        try:
          run_shell_command('git pull')
        finally:
          # Always leave the repo directory, even if the pull step raises.
          os.chdir(self.root_folder)
      else:
        run_shell_command('git clone https://github.com/ahmad-PH/nag-public.git')
      print("done.")

  def get_csv_path(self):
    """Return the CSV directory joined with `save_filename` (no extension)."""
    return self.get_csv_dir() + self.save_filename

  def get_models_path(self):
    """Return the models directory joined with `save_filename`."""
    return self.get_models_dir() + self.save_filename
  

class IBMEnv(Env):
    """Environment settings for the IBM machine; everything lives under one root folder."""

    def __init__(self):
        import sys

        root = "/root/Derakhshani/adversarial"
        self.root_folder = root
        self.temp_csv_path = root + "/temp"
        self.python_files_path = root + "/nag-public"
        self.python_files_dir = "NAG-11May-beforeDenoiser"

        sys.path.append('./nag/nag_util')

    def get_csv_dir(self):
        """Directory (with trailing slash) where result CSVs are kept."""
        return self.root_folder + "/textual_notes/CSVs/"

    def get_models_dir(self):
        """Directory (with trailing slash) where model checkpoints are kept."""
        return self.root_folder + "/models/"

    def setup(self):
        """Fetch the helper repository, then restrict CUDA to device index 0."""
        self.get_nag_util_files()

        import os
        import torch

        cuda_index = 0
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cuda_index)
        # Earlier experiments, kept for reference:
        #   defaults.device = torch.device('cuda:' + str(cuda_index))
        #   print('cuda:' + str(cuda_index))
        #   torch.cuda.set_device('cuda:1')

    def load_dataset(self, compressed_name, unpacked_name):
        """No-op in this environment."""

    def load_test_dataset(self, root_folder):
        """No-op in this environment."""

    def set_data_path(self, path):
        """Point `data_path` at `<root_folder>/datasets/<path>`."""
        self.data_path = Path(self.root_folder + '/datasets/' + path)
    
        
# Environment settings for Google Colab: working files live under /content and
# persistent artefacts (CSVs, models, datasets) under the mounted Google Drive.
# NOTE: this class uses IPython `!` shell magics, so it only runs inside a notebook.
class ColabEnv(Env):
    def __init__(self):
      self.root_folder = '/content'
      self.temp_csv_path = self.root_folder
      self.python_files_path = self.root_folder + '/nag-public'
      self.python_files_dir = "NAG-11May-beforeDenoiser"
      # NOTE(review): setup() tracks the upgrade through a module-level global of the
      # same name, not through this attribute; this attribute is not read in this class.
      self.torchvision_upgraded = False
      
    # Directory (with trailing slash) on Drive where result CSVs are kept.
    def get_csv_dir(self):
      return self.root_folder + '/gdrive/My Drive/DL/textual_notes/CSVs/'
    
    # Directory (with trailing slash) on Drive where model checkpoints are kept.
    def get_models_dir(self):
      return self.root_folder + '/gdrive/My Drive/DL/models/'
        
    # One-time setup: reinstall torchvision 0.3 (only if the global flag is not yet
    # defined), mount Google Drive, then fetch/update the helper repository.
    def setup(self):
        # ######################################################
        # # TODO remove this once torchvision 0.3 is present by
        # # default in Colab
        # ######################################################
        global torchvision_upgraded
        try:
            # Bare name lookup: raises NameError until the flag has been set once.
            torchvision_upgraded
        except NameError:
          !pip uninstall -y torchvision
          !pip install https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
          torchvision_upgraded = True
        else:
          print("torchvision already upgraded")
          
        from google.colab import drive
        drive.mount('/content/gdrive')
        
        self.get_nag_util_files()
        
    # Ensure a dataset folder named `compressed_name` exists in the current directory.
    # If missing: copy `<compressed_name>.tar.gz` from Drive, gunzip and untar it,
    # rename the extracted `unpacked_name` folder to `compressed_name`, delete the tar.
    # NOTE: relies on `os` and `shutil` being imported at module level (they are
    # imported in a later cell, not in this one).
    def load_dataset(self, compressed_name, unpacked_name):
      if compressed_name not in os.listdir('.'):
        print(compressed_name + ' not found, getting it from drive')
        shutil.copyfile("/content/gdrive/My Drive/DL/{}.tar.gz".format(compressed_name), "./{}.tar.gz".format(compressed_name))

        # `$name` is IPython's interpolation of a Python variable into the shell command.
        gunzip_arg = "./{}.tar.gz".format(compressed_name)
        !gunzip -f $gunzip_arg

        tar_arg = "./{}.tar".format(compressed_name)
        !tar -xvf $tar_arg > /dev/null

        os.rename(unpacked_name, compressed_name)

    #     ls_arg = "./{}/train/n01440764".format(compressed_name)
    #     !ls $ls_arg

        !rm $tar_arg

        print("done") 
      else:
        print(compressed_name + " found")
        
    # Ensure `<root_folder>/test/` exists; if missing, create it and fill it from the
    # ten archives 1.zip .. 10.zip on Drive (copy, unpack in place, delete the zip).
    def load_test_dataset(self, root_folder):
      test_folder = root_folder + '/test/'
      if 'test' not in os.listdir(root_folder):
        print('getting test dataset from drive')
        os.mkdir(test_folder)
        for i in range(1,11):
          shutil.copy("/content/gdrive/My Drive/DL/full_test_folder/{}.zip".format(i), test_folder)
          shutil.unpack_archive(test_folder + "/{}.zip".format(i), test_folder)
          os.remove(test_folder + "/{}.zip".format(i))
          print("done with the {}th fragment".format(i))
      else:
        print('test dataset found.')
        
    # Point `data_path` at `./<path>` (relative to the current working directory).
    def set_data_path(self, path):
      self.data_path = Path('./' + path)
        

In [3]:
# Build the environment helper for the detected platform and run its one-time setup.
env = create_env()
env.setup()


getting git files ...
Already up-to-date.

done.


In [4]:
from fastai.vision import *
from fastai.imports import *
from fastai.callbacks import *
from fastai.utils.mem import *
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.models as models
import sys; import os; import shutil

In [5]:
# Add the chosen sub-folder of the cloned helper repository to the import path
# so that the `nag_util` imports below resolve.
sys.path.append(env.python_files_path + '/' + env.python_files_dir)

from nag_util import *
import nag_util

In [6]:
from functools import partial
from torch.nn import init
from typing import Iterable
# Patch `ndim` onto tensors as the length of `shape`
# (presumably for torch versions that lack the attribute — TODO confirm).
torch.Tensor.ndim = property(lambda x: len(x.shape))

def listify(o):
    """Coerce ``o`` to a list.

    ``None`` becomes ``[]``; a list is returned as is (same object); a string
    is wrapped whole; any other iterable is materialised with ``list``;
    anything else becomes ``[o]``.
    """
    if o is None:
        return []
    if isinstance(o, list):
        return o
    # A str is itself Iterable; wrap it instead of splitting it into characters.
    if isinstance(o, str):
        return [o]
    if isinstance(o, Iterable):
        return list(o)
    return [o]

class ListContainer():
  """Thin list wrapper supporting int/slice, boolean-mask and index-list access."""

  def __init__(self, items): self.items = listify(items)

  def __getitem__(self, idx):
    """Return one item (int), a sub-list (slice), or a list of selected items.

    A sequence of bools is treated as a mask and must have one entry per
    item; any other sequence is treated as a list of indices. An empty
    sequence selects nothing.
    """
    if isinstance(idx, (int, slice)): return self.items[idx]
    # Nothing to select; also avoids peeking at idx[0] on an empty sequence.
    if len(idx) == 0: return []
    if isinstance(idx[0], bool):
      assert len(idx) == len(self)
      return [o for m,o in zip(idx, self.items) if m]
    return [self.items[i] for i in idx]

  def __len__(self): return len(self.items)
  def __iter__(self): return iter(self.items)
  def __setitem__(self, i, o): self.items[i] = o
  def __delitem__(self, i): del(self.items[i])

  def __repr__(self):
    """Class name, item count and the first ten items (with an ellipsis if truncated)."""
    res = f"{self.__class__.__name__} ({len(self)} items)\n{self.items[:10]}"
    if len(self)>10: res = res[:-1] + "...]"
    return res

def children(m):
  """Return the direct children of module ``m`` as a list."""
  return [*m.children()]

def append_stats_non_normal(hook, mod, inp, outp):
  """Hook callback: log mean, std and a 40-bin histogram over [0, 10] of ``outp``.

  The three running lists are created on ``hook.stats`` on first use.
  """
  if not hasattr(hook, 'stats'):
    hook.stats = ([], [], [])
  mean_log, std_log, hist_log = hook.stats
  activations = outp.data
  mean_log.append(activations.mean().cpu())
  std_log.append(activations.std().cpu())
  # Histogram is computed on a CPU copy (histc isn't implemented on the GPU).
  hist_log.append(activations.cpu().histc(40, 0, 10))

def append_stats_normal(hook, mod, inp, outp):
    """Hook callback: log mean, std and a 40-bin histogram over [-7, 7] of ``outp``.

    The three running lists are created on ``hook.stats`` on first use.
    """
    if not hasattr(hook, 'stats'):
        hook.stats = ([], [], [])
    mean_log, std_log, hist_log = hook.stats
    activations = outp.data
    mean_log.append(activations.mean().cpu())
    std_log.append(activations.std().cpu())
    hist_log.append(activations.cpu().histc(40, -7, 7))

def get_hist(h):
  """Stack the recorded histograms (``h.stats[2]``) as columns and return log1p of the counts."""
  stacked = torch.stack(h.stats[2])
  bins_by_step = stacked.t().float()
  return bins_by_step.log1p()

def get_min(h):
  """For each recorded histogram, return the share of counts in its first two bins."""
  bins_by_step = torch.stack(h.stats[2]).t().float()
  lowest_two = bins_by_step[:2].sum(0)
  total = bins_by_step.sum(0)
  return lowest_two / total

class Hook():
  """Registers ``f`` as a forward hook on ``m``; ``f`` receives this Hook as its first argument."""

  def __init__(self, m, f):
    callback = partial(f, self)
    self.hook = m.register_forward_hook(callback)

  def __del__(self):
    self.remove()

  def remove(self):
    """Detach the registered forward hook."""
    self.hook.remove()
    
class Hooks(ListContainer):
  """A container with one `Hook` per direct child of ``ms``; usable as a context manager."""

  def __init__(self, ms, f):
    per_child_hooks = [Hook(child, f) for child in ms.children()]
    super().__init__(per_child_hooks)

  def __enter__(self, *args):
    return self

  def __exit__(self, *args):
    self.remove()

  def __del__(self):
    self.remove()

  def __delitem__(self, i):
    # Detach the hook before dropping it from the container.
    self[i].remove()
    super().__delitem__(i)

  def remove(self):
    """Detach every hook held by this container."""
    for hook in self:
      hook.remove()

def init_cnn_(m, f):
    """Recursively initialise ``m`` and all of its descendants in place.

    * ``nn.ConvTranspose2d``: weights via ``f(weight, a=0.1)``, bias zeroed.
    * ``nn.Linear``: orthogonal weights, bias zeroed.
    * anything else: untouched, but its children are still visited.

    ``f`` is an in-place initialiser accepting ``(tensor, a=...)``.
    """
    if isinstance(m, nn.ConvTranspose2d):
      f(m.weight, a=0.1)
      if getattr(m, 'bias', None) is not None: m.bias.data.zero_()
    # Linear layers use orthogonal init (the non-orthogonal alternative was
    # `f(m.weight, a=0.)`).
    if isinstance(m, nn.Linear):
      init.orthogonal_(m.weight)
      # bias is None for Linear(..., bias=False); guard like the branch above.
      if getattr(m, 'bias', None) is not None: m.bias.data.zero_()
    for l in m.children(): init_cnn_(l, f)
      
def init_cnn(m, uniform=False):
    """Initialise ``m`` with `init_cnn_`, using Kaiming-uniform if ``uniform`` else Kaiming-normal."""
    if uniform:
        weight_init = init.kaiming_uniform_
    else:
        weight_init = init.kaiming_normal_
    init_cnn_(m, weight_init)

class GeneralRelu(nn.Module):
  """ReLU with an optional leak slope, an optional constant shift and an optional upper clamp."""

  def __init__(self, leak=None, sub=None, maxv=None):
    super().__init__()
    self.leak = leak
    self.sub = sub
    self.maxv = maxv

  def forward(self, x):
    if self.leak is None:
      x = F.relu(x)
    else:
      x = F.leaky_relu(x, self.leak)
    # The shift and the clamp modify the activation tensor in place.
    if self.sub is not None:
      x.sub_(self.sub)
    if self.maxv is not None:
      x.clamp_max_(self.maxv)
    return x
  
class deconv_layer(nn.Module):
    """ConvTranspose2d -> BatchNorm2d -> optional GeneralRelu(0, 0.2, 5).

    Args:
        in_ch, out_ch: input / output channel counts.
        k_size, s, pad: kernel size, stride and padding of the transposed conv.
        b: whether the transposed conv has a bias term.
        activation: when False the block ends after the batch norm.
    """
    def __init__(self, in_ch, out_ch, k_size = (4,4), s = (2,2), pad = (1,1), b = True, activation = True):
        super(deconv_layer, self).__init__()

        self.CT2d = nn.ConvTranspose2d(in_channels = in_ch,
                                  out_channels = out_ch,
                                  kernel_size = k_size,
                                  stride = s, 
                                  padding = pad,
                                  bias = b)
        self.BN2d = nn.BatchNorm2d(out_ch)
        
        self.activation = activation
        if self.activation:
            self.relu = GeneralRelu(0, 0.2, 5)
        
        self.weight_init()
    
    def forward(self, input):
        if self.activation:
            return self.relu(self.BN2d(self.CT2d(input)))
        else:
            return self.BN2d(self.CT2d(input))

    def weight_init(self):
        """Draw conv weights from N(0, 0.02) and zero the bias when there is one."""
        self.CT2d.weight.data.normal_(mean = 0, std = 0.02)
        # With b=False the conv has no bias tensor to fill.
        if self.CT2d.bias is not None:
            self.CT2d.bias.data.fill_(0)


In [7]:
# Run mode, consumed by the dataset-loading cell below:
#   "normal"          - full dataset
#   "sanity_check"    - the small sanity-check dataset
#   "div_metric_calc" - full dataset plus the test set (used as the validation split)
# mode = "sanity_check"
mode = "normal"
# mode = "div_metric_calc"

In [8]:
# Constructor of the network under attack; it is instantiated with pretrained
# weights further down. Alternatives are kept commented out.
model = models.resnet50
# model = models.resnet152
# model = models.vgg16_bn
# model = torchvision.models.googlenet

In [9]:
# Fetch the dataset required by the selected mode and point env.data_path at it.
if mode in ("normal", "div_metric_calc"):
  env.load_dataset('dataset','data')
  env.set_data_path('dataset')
  if mode == "div_metric_calc":
    # The diversity-metric run additionally needs the test split.
    env.load_test_dataset(str(env.data_path))
elif mode == "sanity_check":
  env.load_dataset('dataset_sanity_check_small', 'dataset_sanity_check_small')
  env.set_data_path('dataset_sanity_check_small')

In [10]:
batch_size = 16
gpu_flag = True
# Mirror both settings into the helper module.
nag_util.batch_size = batch_size
nag_util.gpu_flag = gpu_flag
# nag_util.set_globals(gpu_flag, batch_size)
tfms = get_transforms(do_flip=False, max_rotate=0)

# The validation split is the 'test' folder only for the diversity-metric run.
valid_folder = 'test' if mode == 'div_metric_calc' else 'valid'
labelled_items = (ImageList.from_folder(env.data_path)
                  .split_by_folder(valid=valid_folder)
                  .label_from_folder())
data = (labelled_items
        .transform(tfms, size=224)
        .databunch(bs=batch_size, num_workers=1)
        .normalize(imagenet_stats))

# data.show_batch(rows=2, figsize=(5,5))

In [11]:
model_name = model.__name__
# Length of the latent vector handed to the generator (see Gen(z_dim=z_dim) below).
z_dim = 1000
# Network under attack: pretrained weights, moved to the GPU, in eval mode,
# wrapped in SoftmaxWrapper (not defined in this notebook — presumably from nag_util).
arch = SoftmaxWrapper(model(pretrained=True).cuda().eval())
nag_util.arch = arch
# presumably freezes the attacked network's parameters (fastai helper) — TODO confirm
requires_grad(arch, False)

# vgg:
# layers = []
# blocks = [i-1 for i,o in enumerate(children(arch.features)) if isinstance(o, nn.MaxPool2d)]
# layers = [arch.features[i] for i in blocks]
# layer_weights = [1] * len(layers)

# Modules whose outputs FeatureLoss hooks for the diversity loss.
layers = [
    arch.softmax
]

# One weight per hooked layer.
layer_weights = [1.] * len(layers)

# inception:
# layers = [
#     arch.Conv2d_1a_3x3,
#     arch.Mixed_6e,
#     arch.Mixed_7a,
#     arch.fc    
# ]
# layer_weights = [1.0/4.0] * len(layers)

In [12]:
# class Gen(nn.Module):
#   def __init__(self, z_dim, gf_dim=64, y_dim = None, df_dim = 64, image_shape = [3,128,128]):
#     super(Gen, self).__init__()

#     self.bs = None
#     self.z_dim = z_dim
#     self.gf_dim = gf_dim
#     self.y_dim = y_dim
#     self.df_dim = df_dim
#     self.image_shape = image_shape

#     self.z_ = nn.Linear(self.z_dim, self.gf_dim * 7 * 4 * 4, bias=True)
#     self.z_.bias.data.fill_(0)
#     self.BN_ = nn.BatchNorm2d(self.gf_dim * 7)

#     self.half = max(self.gf_dim // 2, 1) 
#     self.quarter = max(self.gf_dim // 4, 1)
#     self.eighth = max(self.gf_dim // 8, 1)
#     # sixteenth = max(self.gf_dim // 16, 1)
    
#     self.CT2d_1 = deconv_layer(self.gf_dim * 8, self.gf_dim * 4, k_size = (5,5), pad = (2,2))
#     self.CT2d_2 = deconv_layer(self.gf_dim * 5, self.gf_dim * 2)    
#     self.CT2d_3 = deconv_layer(self.gf_dim * 2 + self.half, self.gf_dim * 1)
#     self.CT2d_4 = deconv_layer(self.gf_dim * 1 + self.quarter, self.gf_dim * 1)
#     self.CT2d_5 = deconv_layer(self.gf_dim * 1 + self.eighth, self.gf_dim * 1)
#     self.CT2d_6 = deconv_layer(self.gf_dim * 1 + self.eighth, self.gf_dim * 1)
#     self.CT2d_7 = deconv_layer(self.gf_dim * 1 + self.eighth, 3, k_size = (5,5), s = (1,1), pad = (2,2), activation = False)

#   def randomized_deconv_layer(self, h_input, z_size_0, z_size_1, deconv_layer, expected_output_size):
#     h_input_z = self.make_z([self.bs, z_size_0, z_size_1, z_size_1])
#     h_input = torch.cat([h_input, h_input_z], dim = 1)
#     output = deconv_layer(h_input)
#     assert output.shape[2:] == (expected_output_size, expected_output_size), \
#             "Unexpected output shape at randomized_deconv_layer. expected" + \
#             "({0},{0}), got {1}".format(expected_output_size, output.shape[2:])
#     return output
  
#   def forward_z(self, z):
#     self.bs = z.shape[0]
    
#     h0 = F.relu(self.BN_(self.z_(z).contiguous().view(self.bs, -1, 4, 4)))
#     assert h0.shape[2:] == (4, 4), "Unexpected shape, it shoud be (4,4)"

#     h1 = self.randomized_deconv_layer(h0, self.gf_dim, 4, self.CT2d_1, 7)
#     h2 = self.randomized_deconv_layer(h1, self.gf_dim, 7, self.CT2d_2, 14)
#     h3 = self.randomized_deconv_layer(h2, self.half, 14, self.CT2d_3, 28)
#     h4 = self.randomized_deconv_layer(h3, self.quarter, 28, self.CT2d_4, 56)
#     h5 = self.randomized_deconv_layer(h4, self.eighth, 56, self.CT2d_5, 112)
#     h6 = self.randomized_deconv_layer(h5, self.eighth, 112, self.CT2d_6, 224)
#     h7 = self.randomized_deconv_layer(h6, self.eighth, 224, self.CT2d_7, 224)

#     ksi = 10.0
#     output_coeff = ksi / (255.0 * np.mean(imagenet_stats[1])) 
#     # this coeff scales the output to be appropriate for images that are 
#     # normalized using imagenet_stats (and are hence in the approximate [-2.5, 2.5]
#     # interval)
#     return output_coeff * torch.tanh(h7)
  
# #   # blind-selection
#   def forward(self, inputs):
#     self.bs = inputs.shape[0]

#     benign_preds_onehot = arch(inputs)
#     benign_preds = torch.argmax(benign_preds_onehot, dim = 1)
    
#     z = torch.zeros([self.bs, 1000]).cuda()
#     for i in range(self.bs):
#       random_label = self.randint(0,1000, exclude = benign_preds[i].item())
#       z[i][random_label] = 1.
    
#     z_out = self.forward_z(z)
    
#     return z_out, None, None, inputs, benign_preds_onehot, z

# #   #second-best selection: made validation so much worse
# #   def forward(self, inputs):
# #     self.bs = inputs.shape[0]

# #     benign_preds_onehot = arch(inputs)
# #     target_preds = torch.topk(benign_preds_onehot, 2, dim = 1).indices[:, 1:]
    
# #     z = torch.zeros([self.bs, 1000]).cuda()
# #     for i in range(self.bs):
# #       z[i][target_preds[i]] = 1.
    
# #     z_out = self.forward_z(z)
    
# #     return z_out, None, None, inputs, benign_preds_onehot, z
  
# #    def forward(self, inputs):
# #     self.bs = inputs.shape[0]

# #     benign_preds_onehot = arch(inputs)
# #     benign_preds = torch.argmax(benign_preds_onehot, dim = 1)
    
# #     z = torch.zeros([self.bs, 1000]).cuda()
# #     random_label = self.randint(0,1000, exclude = benign_preds.tolist())
# #     for i in range(self.bs):
# #       z[i][random_label] = 1.
    
# #     z_out = self.forward_z(z)
    
# #     return z_out, None, None, inputs, benign_preds_onehot, z
  
#   @staticmethod
#   def randint(low, high, exclude):
#     temp = np.random.randint(low, high - 1)
#     if temp == exclude:
#       temp = temp + 1
#     return temp
  
#   def forward_single_z(self, z):
#     return self.forward_z(z[None]).squeeze()
           
  
#   def make_triplet_samples(self, z, margin, r2, r3):
#     positive_sample = z + self.random_vector_volume(z.shape, 0, margin).cuda() 
#     negative_sample = z + self.random_vector_volume(z.shape, r2, r3).cuda()
#     return positive_sample, negative_sample

#   def random_vector_surface(self, shape, r = 1.):
#     mat = torch.randn(size=shape).cuda()
#     norm = torch.norm(mat, p=2, dim=1, keepdim = True).cuda()
#     return (mat/norm) * r

  
#   def random_vector_volume(self, shape, inner_r, outer_r):
#     fraction = torch.empty(shape[0]).uniform_(inner_r, outer_r).cuda()
#     fraction = ((fraction / outer_r) ** (1 / shape[1])) * outer_r # volume-normalize the fraction
#     fraction.unsqueeze_(-1)
#     return self.random_vector_surface(shape, 1) * fraction

#   def make_z(self, in_shape):
#     return torch.empty(in_shape).cuda().uniform_(-1,1)


In [13]:
# non-targeted Gen
class Gen(nn.Module):
  """Generator mapping a latent vector to an image-sized perturbation.

  A linear layer reshapes the latent vector to a 4x4 feature map, then seven
  deconv stages bring it to 224x224. Before every stage, freshly drawn uniform
  noise channels are concatenated to the feature map. `forward` draws the latent
  vectors itself and additionally generates a "positive" and a "negative"
  perturbation for the triplet loss.

  NOTE: tensors are created with hard-coded `.cuda()` calls, so a GPU is required.
  """
  def __init__(self, z_dim, gf_dim=64, y_dim = None, df_dim = 64, image_shape = [3,128,128]):
    super(Gen, self).__init__()

    # Batch size of the current call; set in forward()/forward_z() and read by
    # randomized_deconv_layer().
    self.bs = None
    self.z_dim = z_dim
    self.gf_dim = gf_dim
    # y_dim, df_dim and image_shape are stored but not read anywhere in this class.
    self.y_dim = y_dim
    self.df_dim = df_dim
    self.image_shape = image_shape

    # Projects z to gf_dim*7 channels on a 4x4 grid (reshaped in forward_z).
    self.z_ = nn.Linear(self.z_dim, self.gf_dim * 7 * 4 * 4, bias=True)
    self.z_.bias.data.fill_(0)
    self.BN_ = nn.BatchNorm2d(self.gf_dim * 7)

    # Numbers of noise channels concatenated before the later stages.
    self.half = max(self.gf_dim // 2, 1) 
    self.quarter = max(self.gf_dim // 4, 1)
    self.eighth = max(self.gf_dim // 8, 1)
    # sixteenth = max(self.gf_dim // 16, 1)

    # Each stage's input channel count = previous stage's output channels + the
    # noise channels concatenated in forward_z (e.g. stage 1: gf_dim*7 + gf_dim).
    self.CT2d_1 = deconv_layer(self.gf_dim * 8, self.gf_dim * 4, k_size = (5,5), pad = (2,2))
    self.CT2d_2 = deconv_layer(self.gf_dim * 5, self.gf_dim * 2)    
    self.CT2d_3 = deconv_layer(self.gf_dim * 2 + self.half, self.gf_dim * 1)
    self.CT2d_4 = deconv_layer(self.gf_dim * 1 + self.quarter, self.gf_dim * 1)
    self.CT2d_5 = deconv_layer(self.gf_dim * 1 + self.eighth, self.gf_dim * 1)
    self.CT2d_6 = deconv_layer(self.gf_dim * 1 + self.eighth, self.gf_dim * 1)
    self.CT2d_7 = deconv_layer(self.gf_dim * 1 + self.eighth, 3, k_size = (5,5), s = (1,1), pad = (2,2), activation = False)

  def randomized_deconv_layer(self, h_input, z_size_0, z_size_1, deconv_layer, expected_output_size):
    """Concatenate `z_size_0` uniform-noise channels of spatial size `z_size_1`
    to `h_input`, apply `deconv_layer`, and assert the output is a square of
    side `expected_output_size`."""
    h_input_z = self.make_z([self.bs, z_size_0, z_size_1, z_size_1])
    h_input = torch.cat([h_input, h_input_z], dim = 1)
    output = deconv_layer(h_input)
    assert output.shape[2:] == (expected_output_size, expected_output_size), \
            "Unexpected output shape at randomized_deconv_layer. expected" + \
            "({0},{0}), got {1}".format(expected_output_size, output.shape[2:])
    return output
  
  def forward_z(self, z):
    """Generate the perturbation for latent batch `z` (first dimension = batch)."""
    self.bs = z.shape[0]
    
    h0 = F.relu(self.BN_(self.z_(z).contiguous().view(self.bs, -1, 4, 4)))
    assert h0.shape[2:] == (4, 4), "Non-expected shape, it shoud be (4,4)"

    # Spatial sizes asserted by the stages: 4 -> 7 -> 14 -> 28 -> 56 -> 112 -> 224 -> 224.
    h1 = self.randomized_deconv_layer(h0, self.gf_dim, 4, self.CT2d_1, 7)
    h2 = self.randomized_deconv_layer(h1, self.gf_dim, 7, self.CT2d_2, 14)
    h3 = self.randomized_deconv_layer(h2, self.half, 14, self.CT2d_3, 28)
    h4 = self.randomized_deconv_layer(h3, self.quarter, 28, self.CT2d_4, 56)
    h5 = self.randomized_deconv_layer(h4, self.eighth, 56, self.CT2d_5, 112)
    h6 = self.randomized_deconv_layer(h5, self.eighth, 112, self.CT2d_6, 224)
    h7 = self.randomized_deconv_layer(h6, self.eighth, 224, self.CT2d_7, 224)

    ksi = 10.0
    output_coeff = ksi / (255.0 * np.mean(imagenet_stats[1])) 
    # this coeff scales the output to be appropriate for images that are 
    # normalized using imagenet_stats (and are hence in the approximate [-2.5, 2.5]
    # interval)
    return output_coeff * torch.tanh(h7)

  def forward(self, inputs):
    """Draw one latent vector per input image (uniform in [-1, 1]) plus a positive
    and a negative neighbour of it, and generate a perturbation for each.

    Returns `(z_out, p_out, n_out, inputs)`; `inputs` is passed through unchanged.
    """
    self.bs = inputs.shape[0]
    z = inputs.new_empty([self.bs, self.z_dim]).uniform_(-1,1).cuda()
    # margin = 0.1 for the positive sample; (r2, r3) = (0.1, 2.) for the negative one.
    p, n = self.make_triplet_samples(z, 0.1, 0.1, 2.)
    
    z_out = self.forward_z(z)
    p_out = self.forward_z(p)
    n_out = self.forward_z(n)
    
    return z_out, p_out, n_out, inputs
#     return z_out, None, None, inputs
  
  def forward_single_z(self, z):
    """Run forward_z on a single latent vector (a batch dimension is added, then squeezed away)."""
    return self.forward_z(z[None]).squeeze()
           
  
  def make_triplet_samples(self, z, margin, r2, r3):
    """Return `(positive, negative)`: `z` plus a random offset from
    random_vector_volume(z.shape, 0, margin) and (z.shape, r2, r3) respectively."""
    positive_sample = z + self.random_vector_volume(z.shape, 0, margin).cuda() 
    negative_sample = z + self.random_vector_volume(z.shape, r2, r3).cuda()
    return positive_sample, negative_sample

  def random_vector_surface(self, shape, r = 1.):
    """Gaussian rows normalised to L2 norm `r` (norm taken along dim 1)."""
    mat = torch.randn(size=shape).cuda()
    norm = torch.norm(mat, p=2, dim=1, keepdim = True).cuda()
    return (mat/norm) * r

  
  def random_vector_volume(self, shape, inner_r, outer_r):
    """Unit-norm random rows scaled by a per-row length.

    The length is u ~ U(inner_r, outer_r) transformed to
    (u / outer_r) ** (1 / shape[1]) * outer_r.
    """
    fraction = torch.empty(shape[0]).uniform_(inner_r, outer_r).cuda()
    fraction = ((fraction / outer_r) ** (1 / shape[1])) * outer_r # volume-normalize the fraction
    fraction.unsqueeze_(-1)
    return self.random_vector_surface(shape, 1) * fraction

  def make_z(self, in_shape):
    """Return a GPU tensor of shape `in_shape` filled uniformly from [-1, 1]."""
    return torch.empty(in_shape).cuda().uniform_(-1,1)

In [14]:
def js_distance(x1, x2):
  """Jensen-Shannon divergence between the probability tensors ``x1`` and ``x2``.

  With ``m = (x1 + x2) / 2`` this returns ``0.5 * (KL(x1 || m) + KL(x2 || m))``,
  summed over the distribution and averaged over the batch dimension
  (``reduction='batchmean'``, matching `kl_distance`).
  """
  m = 0.5 * (x1 + x2)
  # F.kl_div(input, target) expects `input` as log-probabilities and computes
  # KL(target || exp(input)), so the mixture goes in as log(m).
  log_m = torch.log(m)
  kl_x1_m = F.kl_div(log_m, x1, reduction='batchmean')
  kl_x2_m = F.kl_div(log_m, x2, reduction='batchmean')
  return 0.5 * (kl_x1_m + kl_x2_m)

def kl_distance(x1, x2):
  """``F.kl_div`` with ``log(x1)`` as input and ``x2`` as target, 'batchmean' reduction."""
  return F.kl_div(x1.log(), x2, reduction='batchmean')

def distrib_distance(x1, x2):
  """Normalise both inputs to sum to 1 and return their `kl_distance`.

  NOTE: a second function with this same name is defined right after this
  one in the file and replaces it.
  """
  p = tensorify(x1)
  q = tensorify(x2)
  p = p / torch.sum(p)
  q = q / torch.sum(q)
  # kl_distance gets a leading batch dimension of size 1.
  return kl_distance(p[None], q[None])

def distrib_distance(x1, x2):
  """L2 distance between the two inputs after rescaling each to sum to 100.

  Non-tensor inputs are converted with ``torch.tensor`` first.
  """
  def as_percentages(values):
    if not isinstance(values, torch.Tensor):
      values = torch.tensor(values)
    return values * 100. / torch.sum(values)

  return torch.norm(as_percentages(x1) - as_percentages(x2), 2)

def distance_from_uniform(x):
  """`distrib_distance` between ``x`` and an all-ones vector of the same length."""
  uniform_weights = [1.] * len(x)
  return distrib_distance(x, uniform_weights)

def wasserstein_distance(x1, x2):
  """Mean of the element-wise difference ``x1 - x2``."""
  difference = x1 - x2
  return difference.mean()

def l1_distance(x1, x2):
  """Mean absolute error between ``x1`` and ``x2`` (``F.l1_loss`` defaults)."""
  return F.l1_loss(input=x1, target=x2)

def l2_distance(x1, x2):
  """Mean squared error between the two inputs after scaling both by 10."""
  scaled_x1 = x1 * 10
  scaled_x2 = x2 * 10
  return F.mse_loss(scaled_x1, scaled_x2)

def mse_loss(x1, x2):
  """Mean squared error between ``x1`` and ``x2`` (``F.mse_loss`` defaults)."""
  return F.mse_loss(input=x1, target=x2)

def cos_distance(x1, x2, dim = 1):
  """Negated mean cosine similarity of ``x1`` and ``x2`` along ``dim``."""
  similarity = F.cosine_similarity(x1, x2, dim=dim)
  return -1 * torch.mean(similarity)

# Number of triplet_loss calls so far; drives the periodic debug print below.
triplet_call_cnt = 0

def triplet_loss(anchor, positive, negative, distance_func, margin):
  """Hinge-style triplet loss: mean(relu(d(a, p) - d(a, n) + margin)).

  ``distance_func`` is applied to (anchor, positive) and (anchor, negative).
  On calls whose running count is 0 or 1 modulo 10, the first row of each
  input and both distances are printed for debugging.
  """
  # max distance when using l1_distance is 2
  # max distance when using l2-distance is sqrt(2)
  global triplet_call_cnt

  ap_dist = distance_func(anchor, positive)
  an_dist = distance_func(anchor, negative)

  triplet_call_cnt += 1
  should_log = triplet_call_cnt % 10 in [0,1]
  if should_log:
    for tag, batch in (("a: ", anchor), ("p: ", positive), ("n: ", negative)):
      print(tag, end="")
      print_big_vector(batch[0])
    print("func:{}, ap_dist: {}, an_dist: {}".format(distance_func.__name__, ap_dist, an_dist))

  hinge = F.relu(ap_dist - an_dist + margin)
  return torch.mean(hinge)

In [15]:
def diversity_loss(input, target):
  """Mean cosine similarity between per-sample flattened ``input`` and ``target``.

  Both tensors are flattened to ``(N, -1)`` where ``N`` is the size of
  ``input``'s first dimension, so batches of any size are handled.
  """
#   return -1 * torch.mean(torch.pow(f_x_a-f_x_s,2))
  # Take the batch size from the tensor itself: a batch that is smaller than
  # the configured batch size cannot be reshaped with the global value.
  n_samples = input.shape[0]
  return torch.mean(F.cosine_similarity(
    input.view([n_samples, -1]),
    target.view([n_samples, -1]),
  ))

# Number of fool_loss calls so far; fool_loss prints debug output every 20th call.
fool_loss_count = 0

# def fool_loss(input, target):
#   true_class = torch.argmax(target, dim=1).view(-1,1).long().cuda()
#   target_probabilities = input.gather(1, true_class)
#   epsilon = 1e-10
#   result =  torch.mean(-1 * torch.log(1 - target_probabilities + epsilon))
  
#   global fool_loss_count
#   fool_loss_count += 1
#   if fool_loss_count % 40 == 0:
#     print("target probs {}, loss: {}: ".format(target_probabilities, result))
    
#   return result


def fool_loss(model_output, target_labels):
  """Mean of -log(p + 1e-10), where p is the model's output at each target label.

  ``target_labels`` is reshaped to one column of integer indices and used to
  gather from ``model_output`` along dim 1. Every 20th call prints the
  gathered probabilities and the loss.
  """
  global fool_loss_count

  epsilon = 1e-10
  label_column = target_labels.view(-1, 1).long().cuda()
  target_probabilities = model_output.gather(1, label_column)
  # highest possible fool_loss is - log(1e-10) == 23
  negative_log_probs = -1 * torch.log(target_probabilities + epsilon)
  result = torch.mean(negative_log_probs)

  fool_loss_count += 1
  if fool_loss_count % 20 == 0:
    print("target probs {}, loss: {}: ".format(target_probabilities, result))

  return result

def validation(gen_output, target):
  """Fooling rate: fraction of images whose predicted class changes once the
  generated perturbation is added.

  ``gen_output`` is the tuple returned by the generator's ``forward``; the
  perturbations are read from position 0 and the clean images from position 3.
  ``target`` is unused.
  """
  # Index instead of unpacking a fixed number of values: the generator variants
  # in this file return tuples of different lengths (4 or 6) but agree on
  # these two positions.
  perturbations, clean_images = gen_output[0], gen_output[3]
  perturbed_images = clean_images + perturbations
  benign_preds = torch.argmax(arch(clean_images), 1)
  adversary_preds = torch.argmax(arch(perturbed_images), 1)
  return (benign_preds != adversary_preds).float().mean()

# def validation(gen_output, target):
#   perturbations, _, _, clean_images = gen_output
#   perturbed_images = clean_images + perturbations
#   benign_preds = torch.argmax(arch(clean_images), 1)
#   adversary_preds = torch.argmax(arch(perturbed_images), 1)
#   return (benign_preds != adversary_preds).float().mean()


In [16]:
# class FeatureLoss(nn.Module):
#     def __name__(self):
#       return "feature_loss"
  
#     def __init__(self, dis, layers, layer_weights):
#         super().__init__()
        
#         # define generator here 
#         self.dis = dis
#         self.diversity_layers = layers
#         self.hooks = hook_outputs(self.diversity_layers, detach=False)
#         self.weights = layer_weights
#         self.metric_names = ["fool_loss"] #+ [f"div_loss_{i}" for i in range(len(layers))] #maybe Gram
# #         self.triplet_hooks = hook_outputs([arch.m.features[4]], detach=False)
    
#     def make_features(self, x, clone=False):
#         y = self.dis(x)
#         return y, [(o.clone() if clone else o) for o in self.hooks.stored]
  
#     def forward(self, inp, target):
#       sigma_B, _, _, X_B, B_Y, z = inp

#       X_A = X_B + sigma_B
# #       X_S = self.add_perturbation_shuffled(X_B, sigma_B) # Shuffled Addversarial Examples

#       A_Y, A_feat = self.make_features(X_A)
# #       _, S_feat = self.make_features(X_S)

#       chosen_labels = z.argmax(dim=1)
#       fooling_loss =  fool_loss(A_Y, chosen_labels)

# #       raw_diversity_losses = [diversity_loss(a_f, s_f) for a_f, s_f in zip(A_feat, S_feat)]
# #       weighted_diversity_losses = [diversity_loss(a_f, s_f) * weight for a_f, s_f, weight in zip(A_feat, S_feat, self.weights)]

# #       self.losses = [fooling_loss] + weighted_diversity_losses
# #       self.metrics = dict(zip(self.metric_names, [fooling_loss] + raw_diversity_losses))

#       self.losses = [fooling_loss]
#       self.metrics = dict(zip(self.metric_names, [fooling_loss]))

#       return sum(self.losses)
  
#     def add_perturbation_shuffled(self, inp, perturbation):
# #         j = torch.randperm(inp.shape[0])
#         j = derangement(inp.shape[0])
#         return inp.add(perturbation[j])

In [17]:
#non-targeted FeatureLoss
class FeatureLoss(nn.Module):
    """Training loss for the generator: fooling loss + weighted diversity losses
    + weighted triplet loss.

    `dis` is the network under attack; `layers` are the modules of it whose
    outputs are captured by hooks and compared in the diversity loss;
    `layer_weights` holds one weight per hooked layer.
    """
    # NOTE(review): defined as a method, so `instance.__name__` is a bound method
    # rather than the string "feature_loss" — confirm this is what callers expect.
    def __name__(self):
      return "feature_loss"
  
    def __init__(self, dis, layers, layer_weights):
        super().__init__()
        
        # define generator here 
        self.dis = dis
        self.diversity_layers = layers
        # Hooks keep the latest output of each diversity layer (read via `.stored`).
        self.hooks = hook_outputs(self.diversity_layers, detach=False)
        self.weights = layer_weights
        self.metric_names = ["fool_loss"] + [f"div_loss_{i}" for i in range(len(layers))] + ['triplet_loss']# Maybe Gram
        self.triplet_weight = 10.
        # The next two weights are only referenced by the commented-out forward variants below.
        self.triplet_weight_noise = 5.
        self.triplet_weight_sm = 5.
    
    def make_features(self, x, clone=False):
        """Run `dis` on `x`; return its output and the hooked layer outputs of that
        pass (cloned if `clone`). The hooks hold only the most recent pass, so the
        returned features must be taken before the next call."""
        y = self.dis(x)
        return y, [(o.clone() if clone else o) for o in self.hooks.stored]
    
    def forward(self, inp, target):
        """Compute the total loss from the generator output `inp`, a 4-tuple
        (perturbation, positive perturbation, negative perturbation, clean images).
        `target` is unused. Also sets `self.losses` and `self.metrics`."""
        sigma_B, sigma_pos, sigma_neg, X_B = inp

        X_A = self.add_perturbation(X_B, sigma_B) 
        X_A_pos = self.add_perturbation(X_B, sigma_pos)
        X_A_neg = self.add_perturbation(X_B, sigma_neg) 
        
        X_S = self.add_perturbation_shuffled(X_B, sigma_B) # Shuffled Addversarial Examples
        
        # One pass of `dis` per image variant; each pass overwrites the hooked features.
        B_Y, _ = self.make_features(X_B)
        A_Y, A_feat = self.make_features(X_A)
        _, S_feat = self.make_features(X_S)
        pos_softmax, _ = self.make_features(X_A_pos)
        neg_softmax, _ = self.make_features(X_A_neg)
        
        # NOTE(review): the active `fool_loss` in this file treats its second argument
        # as class indices (`.view(-1, 1)` + `gather`), whereas B_Y is the full output
        # of `dis` on the clean images — confirm which fool_loss variant this forward
        # is meant to be paired with.
        fooling_loss =  fool_loss(A_Y, B_Y)
      
        raw_diversity_losses = [diversity_loss(a_f, s_f) for a_f, s_f in zip(A_feat, S_feat)]
        weighted_diversity_losses = [diversity_loss(a_f, s_f) * weight for a_f, s_f, weight in zip(A_feat, S_feat, self.weights)]
        raw_triplet_loss = triplet_loss(A_Y, pos_softmax, neg_softmax, cos_distance, 1.4)
        weighted_triplet_loss = raw_triplet_loss * self.triplet_weight
    
        self.losses = [fooling_loss] + weighted_diversity_losses + [weighted_triplet_loss]
        # NOTE(review): the metrics use the raw diversity losses but the *weighted*
        # triplet loss — confirm whether the raw triplet loss was intended here.
        self.metrics = dict(zip(self.metric_names, [fooling_loss] + raw_diversity_losses + [weighted_triplet_loss]))

        return sum(self.losses)

#     #use two types of triplet losses
#     def forward(self, inp, target):
#       sigma_B, sigma_pos, sigma_neg, X_B = inp

#       X_A = self.add_perturbation(X_B, sigma_B) 
#       X_A_pos = self.add_perturbation(X_B, sigma_pos)
#       X_A_neg = self.add_perturbation(X_B, sigma_neg) 

#       X_S = self.add_perturbation_shuffled(X_B, sigma_B) # Shuffled Addversarial Examples

#       B_Y, _ = self.make_features(X_B)
#       A_Y, A_feat = self.make_features(X_A)
#       _, S_feat = self.make_features(X_S)
#       pos_softmax, _ = self.make_features(X_A_pos)
#       neg_softmax, _ = self.make_features(X_A_neg)

#       fooling_loss =  fool_loss(A_Y, B_Y)

#       raw_diversity_losses = [diversity_loss(a_f, s_f) for a_f, s_f in zip(A_feat, S_feat)]
#       weighted_diversity_losses = [diversity_loss(a_f, s_f) * weight for a_f, s_f, weight in zip(A_feat, S_feat, self.weights)]
      
#       raw_triplet_loss_sm = triplet_loss(A_Y, pos_softmax, neg_softmax, cos_distance, 1.4)
#       weighted_triplet_loss_sm = raw_triplet_loss_sm * self.triplet_weight_sm
      
#       raw_triplet_loss_noise = triplet_loss(sigma_B, sigma_pos, sigma_neg, l2_distance, 5.)
#       weighted_triplet_loss_noise = raw_triplet_loss_noise * self.triplet_weight_noise

#       self.losses = [fooling_loss] + weighted_diversity_losses + [weighted_triplet_loss_sm, weighted_triplet_loss_noise] 
#       self.metrics = dict(zip(self.metric_names, [fooling_loss] + raw_diversity_losses + [weighted_triplet_loss_sm, weighted_triplet_loss_noise]))

#       return sum(self.losses)

#     # just fooling and diversity
#     def forward(self, inp, target):
#       sigma_B, sigma_pos, sigma_neg, X_B = inp

#       X_A = self.add_perturbation(X_B, sigma_B) 

#       X_S = self.add_perturbation_shuffled(X_B, sigma_B) # Shuffled Addversarial Examples

#       B_Y, _ = self.make_features(X_B)
#       A_Y, A_feat = self.make_features(X_A)
#       _, S_feat = self.make_features(X_S)

#       fooling_loss =  fool_loss(A_Y, B_Y)

#       raw_diversity_losses = [diversity_loss(a_f, s_f) for a_f, s_f in zip(A_feat, S_feat)]
#       weighted_diversity_losses = [diversity_loss(a_f, s_f) * weight for a_f, s_f, weight in zip(A_feat, S_feat, self.weights)]

#       self.losses = [fooling_loss] + weighted_diversity_losses
#       self.metrics = dict(zip(self.metric_names, [fooling_loss] + raw_diversity_losses))

#       return sum(self.losses)
  
  
    def add_perturbation(self, inp, perturbation):
        """Return `inp + perturbation` (out of place)."""
        return inp.add(perturbation)
  
    def add_perturbation_shuffled(self, inp, perturbation):
        """Return `inp` plus the perturbations re-ordered along the batch dimension by
        `derangement` (defined outside this notebook; presumably a permutation with no
        fixed points — TODO confirm)."""
        j = derangement(inp.shape[0])
        return inp.add(perturbation[j])

In [18]:
# Loss module used by every Learner below: attacks `arch`, with diversity measured on `layers`.
feat_loss = FeatureLoss(arch, layers, layer_weights)

In [19]:
env.save_filename = 'resnet50_x'
# env.save_filename = 'resnet50_17'
# env.save_filename = 'vgg16_32'

# Refuse to continue if results from an earlier run with this name already exist.
if Path(env.get_csv_path() + '.csv').exists():
  raise FileExistsError("csv_path already exists")

if Path(env.get_models_path()).exists():
  raise FileExistsError("models_path already exists")

In [20]:
# Drop references to any previous learner/generator and collect garbage before
# building new ones.
learn = None; gen = None; gc.collect()
# CSV logging callback writing to <temp_csv_path>/<save_filename>.
csv_logger = partial(ImmediateCSVLogger, filename= env.temp_csv_path + '/' + env.save_filename)
# learn = Learner(data, Gen(z_dim=10), loss_func = feat_loss, metrics=[validation], callback_fns=LossMetrics, opt_func = optim.SGD)
# learn = Learner(data, Gen(z_dim=z_dim), loss_func = feat_loss, metrics=[validation], callback_fns=[LossMetrics, DiversityWeightsScheduler])
gen = Gen(z_dim=z_dim)
# Second argument True selects Kaiming-uniform initialisation in init_cnn.
init_cnn(gen, True)

learn = Learner(data, gen, loss_func = feat_loss, metrics=[validation], callback_fns=[LossMetrics, csv_logger])

# load_starting_point(learn, model_name, z_dim)
# random_seed(42, True)

In [21]:
# !cp "/content/gdrive/My Drive/DL/models/vgg16_12-last.pth"  "/content/"
# learn.load('/content/vgg16_12-last')

load_filename = 'resnet50_30/resnet50_30_99'
# load_filename = 'investigate_resnet50_2/3/resnet50_5'
# load_filename = 'vgg16_30/vgg16_30_69'
# load_filename = 'vgg16_12-last'

# Resolve the checkpoint against the active environment's models directory
# instead of a hard-coded machine-specific path, so the cell works in every env.
# NOTE: the checkpoint must come from a Gen with the same z_dim as `gen`; the
# recorded run of this cell failed because the checkpoint's `z_.weight` was
# [7168, 10] while the current model's is [7168, 1000].
learn.load(env.get_models_dir() + load_filename)

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


RuntimeError: Error(s) in loading state_dict for Gen:
	size mismatch for z_.weight: copying a param with shape torch.Size([7168, 10]) from checkpoint, the shape in current model is torch.Size([7168, 1000]).

In [None]:
# list(gen.children())

In [None]:
# `load_filename` only exists if the checkpoint-loading cell was run.
try:
  load_filename
except NameError:
  load_filename = None

settings_template = "\tmode: {} \n\tnetw-under-attack: {} \n\tload filename: {} \n\tsave filename: {}\n"
print("the selected settings are : ")
print(settings_template.format(mode, model.__name__, load_filename, env.save_filename))
print("please MAKE SURE that the config is correct.")

In [None]:
# learn.lr_find(1e-6, 100)
# learn.recorder.plot()

In [77]:
from distutils import dir_util 

def investigate_initial_settings(n_settings, n_epochs, lr, wd, results_dir):
  """Train several freshly initialised generators and archive every run.

  For each setting index a new ``Gen(z_dim=z_dim)`` is built, passed through
  ``init_cnn`` and trained for ``n_epochs``. Afterwards the run's CSV log and
  its saved models are stored under ``results_dir``:

    * CSV log -> ``env.get_csv_dir() + results_dir + '/<index>.csv'``
    * models  -> ``env.get_models_dir() + results_dir + '/<index>'``

  Relies on notebook globals: ``env``, ``data``, ``z_dim``, ``model``,
  ``feat_loss``, ``validation``, ``LossMetrics`` and ``ImmediateCSVLogger``.

  Args:
    n_settings: number of independent initialisations to train.
    n_epochs: number of epochs passed to ``learn.fit`` for each run.
    lr: learning rate passed to ``learn.fit``.
    wd: weight decay passed to ``learn.fit``.
    results_dir: sub-directory name created below both the CSV directory and
      the models directory.

  Raises:
    OSError: from ``os.mkdir`` (e.g. ``FileExistsError``) when either results
      directory already exists or cannot be created.
  """
  csv_results_dir = env.get_csv_dir() + results_dir
  models_results_dir = env.get_models_dir() + results_dir

  # os.mkdir refuses to reuse an existing directory, so results of an earlier
  # investigation with the same name are never mixed with this one.
  os.mkdir(csv_results_dir)
  os.mkdir(models_results_dir)

  # Directory that is archived and then cleared after every run.
  # NOTE(review): assumes the SaveModelCallbacks below write into
  # env.data_path/"models" -- confirm against the Learner's path settings.
  saved_models_dir = env.data_path/"models"

  for setting_ind in range(n_settings):
    # Release the previous run's learner and generator before building new ones.
    learn = None
    gen = None
    gc.collect()

    gen = Gen(z_dim=z_dim)
    init_cnn(gen, True)

    tmp_csv_filename = env.temp_csv_path + '/' + env.save_filename + '_' + str(setting_ind)
    csv_logger = partial(ImmediateCSVLogger, filename=tmp_csv_filename)

    learn = Learner(data, gen, loss_func=feat_loss, metrics=[validation],
                    callback_fns=[LossMetrics, csv_logger])
    saver_best = SaveModelCallback(learn, every='improvement', monitor='validation',
                                   name=model.__name__ + "-best")
    saver_every_epoch = SaveModelCallback(learn, every='epoch', name=model.__name__)

    learn.fit(n_epochs, lr=lr, wd=wd, callbacks=[saver_best, saver_every_epoch])

    run_tag = str(setting_ind)
    shutil.copyfile(tmp_csv_filename + ".csv", csv_results_dir + '/' + run_tag + '.csv')

    # shutil.copytree replaces distutils.dir_util.copy_tree: distutils is
    # deprecated, and copy_tree remembers directories it created earlier in the
    # same process, so it breaks when a results directory is deleted and this
    # function is re-run in the same kernel. The destination cannot exist yet
    # because its parent was freshly created by os.mkdir above.
    shutil.copytree(saved_models_dir, models_results_dir + '/' + run_tag)
    shutil.rmtree(saved_models_dir)

In [None]:
# Run the initial-settings investigation: 10 independent initialisations,
# 6 epochs each, archived under `results_dir`.
results_dir = 'investigate_resnet50_2'
investigate_initial_settings(
  n_settings=10,
  n_epochs=6,
  lr=1e-2,
  wd=0.001,
  results_dir=results_dir,
)
# Cleanup of a previous investigation with the same name (run manually):
# shutil.rmtree(env.get_models_dir() + results_dir)
# shutil.rmtree(env.get_csv_dir() + results_dir)

epoch,train_loss,valid_loss,validation,fool_loss,time
0,10.793944,10.825519,0.617,10.825521,03:48
1,10.349816,10.614554,0.648,10.614556,03:37
2,10.095075,10.368769,0.649,10.368768,03:46
3,9.996818,10.319197,0.665,10.319198,03:36
4,9.89287,10.020784,0.658,10.020785,03:36
5,9.833074,10.202142,0.665,10.202142,03:37


target probs tensor([[1.2855e-04],
        [1.2112e-07],
        [1.1345e-03],
        [1.6796e-06],
        [9.2400e-06],
        [4.9741e-07],
        [1.2049e-07],
        [1.3657e-09],
        [4.7160e-08],
        [9.5592e-08],
        [4.5527e-06],
        [3.2703e-06],
        [2.3492e-05],
        [1.6425e-10],
        [2.2253e-08],
        [9.2964e-10]], device='cuda:0', grad_fn=<GatherBackward>), loss: 14.770257949829102: 
target probs tensor([[9.2478e-06],
        [2.3340e-05],
        [3.8071e-08],
        [1.8071e-06],
        [8.5537e-06],
        [1.1916e-07],
        [1.6467e-08],
        [1.0541e-06],
        [7.2065e-07],
        [2.0422e-05],
        [6.5832e-06],
        [3.2541e-06],
        [6.8237e-07],
        [5.3469e-03],
        [2.9525e-08],
        [1.5642e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 13.074273109436035: 
target probs tensor([[8.4510e-05],
        [3.6403e-07],
        [9.7964e-05],
        [1.1909e-04],
        [2.1398e-07],
   

target probs tensor([[4.1005e-05],
        [7.3194e-03],
        [6.9808e-05],
        [2.1025e-04],
        [4.8964e-08],
        [8.1811e-08],
        [1.0359e-04],
        [4.6106e-07],
        [1.4983e-07],
        [8.7189e-06],
        [3.4331e-04],
        [2.2320e-04],
        [1.2646e-06],
        [9.5488e-06],
        [5.9168e-04],
        [1.0273e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.11087703704834: 
target probs tensor([[1.5247e-04],
        [2.4714e-05],
        [1.5733e-06],
        [9.8212e-04],
        [2.6841e-04],
        [1.6090e-04],
        [4.7182e-06],
        [2.0080e-06],
        [2.2955e-06],
        [1.0813e-04],
        [1.1709e-07],
        [1.5239e-05],
        [3.9004e-05],
        [7.9169e-05],
        [2.6495e-03],
        [5.2466e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.554981231689453: 
target probs tensor([[1.7593e-04],
        [2.6753e-05],
        [4.4022e-07],
        [5.6932e-07],
        [1.4897e-04],
    

target probs tensor([[4.3542e-08],
        [5.1874e-06],
        [1.1801e-04],
        [1.4660e-08],
        [3.9213e-04],
        [1.8936e-04],
        [9.5016e-06],
        [7.4704e-05],
        [4.5251e-05],
        [5.3679e-07],
        [1.3092e-05],
        [3.5962e-03],
        [3.4335e-10],
        [6.5711e-07],
        [2.9358e-04],
        [9.3952e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.760281562805176: 
target probs tensor([[1.4656e-05],
        [5.2965e-05],
        [1.2552e-06],
        [1.2852e-04],
        [1.9920e-06],
        [2.9481e-04],
        [3.1884e-05],
        [3.9909e-04],
        [1.8249e-04],
        [2.8220e-07],
        [1.0026e-06],
        [2.5938e-05],
        [9.7143e-06],
        [6.3875e-05],
        [2.6857e-05],
        [1.7731e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.56760025024414: 
target probs tensor([[1.3044e-04],
        [9.6338e-06],
        [7.7744e-05],
        [1.8911e-05],
        [8.9873e-04],
    

target probs tensor([[6.0990e-06],
        [9.8656e-05],
        [1.1241e-05],
        [3.7892e-07],
        [5.4188e-06],
        [2.1195e-05],
        [4.8880e-12],
        [1.5445e-04],
        [2.9978e-05],
        [7.4426e-05],
        [8.3568e-07],
        [8.2709e-05],
        [4.4115e-06],
        [2.3137e-04],
        [1.3072e-04],
        [2.7049e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.595804214477539: 
target probs tensor([[9.3554e-03],
        [1.1732e-04],
        [3.6298e-04],
        [6.2567e-04],
        [1.6381e-04],
        [5.5444e-05],
        [2.8929e-04],
        [5.1219e-05],
        [9.7680e-06],
        [1.5276e-06],
        [6.0493e-04],
        [9.8497e-09],
        [9.1357e-07],
        [1.0811e-07],
        [2.8373e-07],
        [2.3442e-10]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.447968482971191: 
target probs tensor([[7.7555e-04],
        [9.0044e-06],
        [4.3509e-04],
        [1.3830e-05],
        [2.3908e-03],
   

target probs tensor([[8.5830e-05],
        [1.3659e-04],
        [2.1056e-04],
        [8.4633e-05],
        [7.3485e-05],
        [5.7756e-04],
        [4.4721e-04],
        [5.0899e-06],
        [2.7778e-06],
        [3.6774e-06],
        [2.1731e-04],
        [9.6555e-06],
        [1.7449e-03],
        [2.0815e-06],
        [6.8661e-04],
        [7.3508e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.944377899169922: 
target probs tensor([[1.1869e-03],
        [2.8617e-04],
        [4.3002e-05],
        [3.5895e-05],
        [2.3878e-04],
        [6.2316e-04],
        [1.5095e-02],
        [6.8660e-04],
        [9.1592e-05],
        [1.2948e-06],
        [8.0726e-05],
        [4.9080e-04],
        [1.7219e-05],
        [4.5578e-04],
        [7.2366e-09],
        [5.7285e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.627959251403809: 
target probs tensor([[1.7033e-05],
        [1.8558e-05],
        [5.6470e-06],
        [5.0483e-10],
        [7.9134e-09],
     

target probs tensor([[1.4860e-04],
        [2.2325e-07],
        [1.8182e-03],
        [1.4352e-04],
        [3.3642e-05],
        [1.0134e-05],
        [7.8676e-05],
        [2.4500e-04],
        [5.0334e-07],
        [1.0620e-05],
        [2.1667e-08],
        [2.0977e-05],
        [7.2042e-05],
        [7.0657e-05],
        [2.9851e-04],
        [6.5776e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.485014915466309: 
target probs tensor([[5.1024e-03],
        [3.2500e-04],
        [3.5926e-04],
        [1.8016e-04],
        [2.1033e-05],
        [1.8654e-05],
        [5.3111e-05],
        [5.7983e-05],
        [2.0308e-03],
        [2.8896e-03],
        [4.4889e-06],
        [9.3033e-06],
        [1.2101e-05],
        [1.0084e-02],
        [1.5156e-06],
        [1.8466e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.061301231384277: 
target probs tensor([[3.3084e-07],
        [3.1218e-04],
        [1.3110e-03],
        [1.5415e-04],
        [2.4435e-06],
    

target probs tensor([[2.2231e-04],
        [5.6926e-06],
        [1.6510e-04],
        [1.2627e-03],
        [1.8482e-05],
        [4.0622e-04],
        [1.5812e-06],
        [1.0419e-03],
        [4.3279e-07],
        [1.6037e-03],
        [1.1575e-05],
        [5.0267e-04],
        [6.8063e-05],
        [9.2300e-09],
        [1.2544e-06],
        [5.9226e-08]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.822943687438965: 
target probs tensor([[9.4230e-07],
        [8.3018e-04],
        [3.9167e-04],
        [6.9870e-05],
        [3.2572e-06],
        [1.3192e-04],
        [1.6375e-04],
        [1.3360e-04],
        [1.2856e-04],
        [4.4152e-04],
        [2.6083e-05],
        [2.1045e-06],
        [9.1283e-04],
        [5.1556e-07],
        [2.7703e-06],
        [5.1465e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.12789249420166: 
target probs tensor([[3.1197e-04],
        [4.5562e-06],
        [1.7763e-03],
        [1.2684e-03],
        [7.7723e-07],
    

target probs tensor([[3.1980e-05],
        [1.3517e-02],
        [1.8280e-04],
        [3.2290e-05],
        [1.4770e-03],
        [4.4815e-03],
        [3.3920e-04],
        [2.1426e-04],
        [3.2094e-05],
        [1.1544e-06],
        [2.9966e-05],
        [2.4030e-05],
        [1.9319e-04],
        [1.4724e-05],
        [2.3441e-04],
        [5.3330e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.056903839111328: 
target probs tensor([[2.4222e-04],
        [8.0270e-03],
        [8.6748e-04],
        [9.0927e-05],
        [6.9234e-05],
        [7.9653e-07],
        [8.0869e-06],
        [2.5937e-03],
        [4.1303e-03],
        [1.4022e-03],
        [1.4891e-03],
        [5.0348e-05],
        [8.8356e-05],
        [1.0795e-04],
        [4.0809e-05],
        [5.4777e-08]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.035301208496094: 
target probs tensor([[2.6447e-04],
        [1.1069e-07],
        [1.8000e-04],
        [8.0452e-07],
        [1.6456e-03],
     

target probs tensor([[1.1129e-05],
        [8.0863e-06],
        [3.5414e-05],
        [1.8562e-03],
        [4.4340e-04],
        [7.9037e-04],
        [3.5732e-05],
        [6.1270e-07],
        [8.1503e-02],
        [1.8548e-07],
        [7.7665e-06],
        [1.5755e-05],
        [1.3166e-02],
        [1.9413e-06],
        [3.1578e-04],
        [1.4048e-05]], device='cuda:0'), loss: 9.78901481628418: 
target probs tensor([[1.0483e-04],
        [7.3617e-06],
        [8.3966e-05],
        [1.1633e-04],
        [7.2338e-05],
        [2.4537e-10],
        [3.4805e-03],
        [5.1031e-04],
        [4.7750e-05],
        [1.3310e-03],
        [1.0464e-03],
        [5.8923e-05],
        [2.1924e-03],
        [1.6525e-03],
        [6.8629e-04],
        [2.8112e-07]], device='cuda:0'), loss: 9.503606796264648: 
target probs tensor([[6.5100e-05],
        [1.6324e-05],
        [2.7511e-10],
        [9.0027e-05],
        [7.5449e-04],
        [6.8177e-10],
        [6.3938e-05],
        [4.159

target probs tensor([[7.5576e-04],
        [3.1257e-04],
        [4.7686e-04],
        [1.5001e-04],
        [3.2578e-06],
        [1.7678e-03],
        [1.2723e-05],
        [5.9189e-03],
        [2.7797e-06],
        [5.5028e-04],
        [5.4292e-04],
        [1.0683e-04],
        [8.9343e-06],
        [1.3411e-04],
        [1.7813e-03],
        [4.2247e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.668060302734375: 
target probs tensor([[7.1334e-08],
        [4.0499e-05],
        [9.8010e-06],
        [1.0601e-05],
        [2.2207e-04],
        [3.5968e-03],
        [1.1483e-03],
        [1.9866e-07],
        [3.6511e-05],
        [2.9195e-03],
        [1.2840e-05],
        [7.7526e-05],
        [2.8056e-05],
        [3.4032e-06],
        [2.2900e-06],
        [2.6886e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.428717613220215: 
target probs tensor([[4.9354e-04],
        [6.8215e-05],
        [3.6911e-05],
        [2.1330e-06],
        [6.6716e-04],
    

epoch,train_loss,valid_loss,validation,fool_loss,time
0,10.171727,10.380875,0.739,10.380874,03:46
1,9.825987,10.074086,0.752,10.074086,03:36
2,9.522401,9.910895,0.751,9.910895,03:34
3,9.414354,9.696198,0.751,9.696198,03:35
4,9.439554,9.719126,0.754,9.719124,03:33
5,9.359967,9.481337,0.75,9.481337,03:36


target probs tensor([[7.9367e-09],
        [8.2795e-09],
        [5.1505e-07],
        [4.6460e-08],
        [9.7214e-07],
        [3.0473e-10],
        [9.0743e-09],
        [2.2316e-05],
        [3.3595e-07],
        [4.2971e-07],
        [1.2451e-08],
        [5.8665e-07],
        [6.5788e-09],
        [1.1540e-08],
        [2.4172e-08],
        [1.7710e-11]], device='cuda:0', grad_fn=<GatherBackward>), loss: 17.055286407470703: 
target probs tensor([[2.9768e-08],
        [3.3996e-07],
        [1.9537e-09],
        [5.7514e-09],
        [3.6521e-07],
        [7.6856e-09],
        [1.2607e-05],
        [1.7247e-09],
        [4.5535e-08],
        [1.5838e-07],
        [1.7379e-07],
        [2.2259e-07],
        [8.1698e-12],
        [4.9916e-06],
        [2.4710e-10],
        [1.4362e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 16.86936378479004: 
target probs tensor([[1.6907e-06],
        [1.2702e-06],
        [9.2704e-07],
        [1.3059e-06],
        [2.0283e-04],
    

target probs tensor([[2.0140e-04],
        [6.4859e-04],
        [3.2345e-03],
        [1.5259e-05],
        [3.7936e-07],
        [3.6809e-08],
        [4.6743e-06],
        [2.8187e-04],
        [4.4463e-05],
        [3.3089e-05],
        [6.4965e-06],
        [1.4623e-05],
        [2.0953e-04],
        [9.6313e-07],
        [2.4426e-05],
        [1.8941e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.478044509887695: 
target probs tensor([[8.6034e-05],
        [5.2414e-06],
        [2.0625e-04],
        [3.7351e-08],
        [2.0113e-05],
        [3.5806e-04],
        [1.1918e-04],
        [1.9319e-04],
        [1.0040e-05],
        [1.2166e-06],
        [5.7512e-07],
        [2.2626e-05],
        [2.1869e-05],
        [1.2368e-05],
        [5.1158e-05],
        [1.6450e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.179020881652832: 
target probs tensor([[4.2559e-05],
        [1.9321e-06],
        [8.4932e-06],
        [1.0493e-04],
        [3.3138e-05],
   

target probs tensor([[7.2074e-06],
        [7.0832e-07],
        [9.0473e-05],
        [1.5289e-04],
        [6.6373e-05],
        [5.5522e-04],
        [2.3537e-09],
        [8.9973e-06],
        [3.5591e-05],
        [6.5195e-04],
        [4.9912e-04],
        [6.3122e-05],
        [2.2985e-05],
        [1.5112e-07],
        [1.6310e-04],
        [2.7239e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.964330673217773: 
target probs tensor([[1.3844e-04],
        [9.0658e-04],
        [1.5758e-05],
        [1.8922e-05],
        [7.7177e-07],
        [2.7741e-05],
        [1.5215e-04],
        [4.9175e-05],
        [1.3246e-04],
        [1.2622e-05],
        [3.4069e-06],
        [1.6110e-04],
        [2.4826e-03],
        [2.1055e-05],
        [2.2406e-03],
        [1.5843e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.928604125976562: 
target probs tensor([[9.5593e-04],
        [1.4536e-06],
        [1.6458e-03],
        [8.9846e-05],
        [9.1281e-05],
    

target probs tensor([[2.2806e-03],
        [9.2613e-09],
        [2.1701e-04],
        [8.6658e-05],
        [7.3150e-05],
        [5.1701e-04],
        [1.3536e-03],
        [2.1150e-05],
        [1.0056e-03],
        [2.4220e-03],
        [2.9502e-05],
        [4.0312e-05],
        [2.1361e-06],
        [3.7974e-04],
        [5.9200e-04],
        [4.1091e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.153457641601562: 
target probs tensor([[3.6280e-05],
        [1.0401e-07],
        [1.6406e-04],
        [1.4716e-06],
        [2.4336e-04],
        [2.7138e-04],
        [2.3185e-05],
        [2.7928e-04],
        [4.4108e-06],
        [1.1881e-07],
        [4.3158e-04],
        [3.5056e-05],
        [1.1532e-04],
        [3.7573e-04],
        [4.9376e-05],
        [8.5285e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.397350311279297: 
target probs tensor([[7.2880e-05],
        [1.5423e-05],
        [1.7289e-04],
        [7.2148e-04],
        [1.6923e-03],
    

target probs tensor([[7.7810e-06],
        [4.5290e-06],
        [2.4656e-04],
        [9.7688e-04],
        [4.1657e-04],
        [9.0745e-06],
        [1.3776e-04],
        [2.8085e-04],
        [8.1853e-06],
        [6.6965e-05],
        [1.8046e-05],
        [2.8229e-05],
        [1.7503e-03],
        [3.4200e-05],
        [2.1209e-04],
        [2.5775e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.346368789672852: 
target probs tensor([[3.3405e-10],
        [1.3746e-04],
        [3.8424e-03],
        [3.2058e-05],
        [7.4972e-05],
        [4.8528e-05],
        [1.0337e-06],
        [9.6772e-04],
        [3.6112e-05],
        [2.9953e-04],
        [1.8226e-04],
        [5.9660e-04],
        [5.9064e-04],
        [5.6588e-04],
        [1.2057e-04],
        [8.5383e-08]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.068775177001953: 
target probs tensor([[1.3692e-04],
        [2.4339e-08],
        [8.6732e-05],
        [4.9109e-05],
        [7.5933e-05],
    

target probs tensor([[4.8273e-05],
        [1.4103e-03],
        [4.9671e-04],
        [7.4235e-06],
        [2.8589e-04],
        [6.3612e-04],
        [2.4932e-05],
        [7.1703e-08],
        [2.1489e-04],
        [6.4739e-05],
        [8.0247e-04],
        [8.9962e-05],
        [2.9824e-04],
        [2.4314e-04],
        [6.5597e-04],
        [8.6210e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.422300338745117: 
target probs tensor([[6.0658e-05],
        [5.8500e-07],
        [1.2606e-06],
        [4.1298e-05],
        [2.5337e-04],
        [1.7202e-03],
        [1.7498e-03],
        [1.1780e-04],
        [5.8019e-04],
        [3.2859e-04],
        [1.4458e-04],
        [4.6517e-04],
        [9.3026e-04],
        [7.5134e-04],
        [2.9140e-04],
        [1.9023e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.79065227508545: 
target probs tensor([[1.2046e-04],
        [2.4528e-04],
        [1.7413e-08],
        [2.2891e-05],
        [1.4833e-04],
      

target probs tensor([[8.0147e-04],
        [9.5891e-05],
        [1.0717e-03],
        [1.8878e-05],
        [2.3676e-05],
        [5.0416e-04],
        [7.6747e-05],
        [1.1908e-07],
        [5.0678e-05],
        [6.5649e-06],
        [4.9709e-04],
        [9.5017e-06],
        [4.5949e-05],
        [1.6428e-04],
        [1.0176e-05],
        [2.2471e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.978363037109375: 
target probs tensor([[1.3032e-04],
        [9.8008e-04],
        [5.3585e-05],
        [7.0326e-05],
        [5.4244e-04],
        [1.2017e-03],
        [8.8717e-04],
        [9.2902e-05],
        [1.0930e-03],
        [5.6699e-04],
        [2.7602e-04],
        [4.1813e-04],
        [8.6768e-08],
        [1.5593e-05],
        [4.1518e-06],
        [4.2201e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.973981857299805: 
target probs tensor([[8.3645e-04],
        [1.0959e-02],
        [4.1006e-04],
        [2.1187e-05],
        [3.8125e-04],
     

target probs tensor([[4.0279e-04],
        [6.9880e-05],
        [9.2768e-04],
        [3.4337e-05],
        [1.5309e-04],
        [1.7576e-04],
        [1.3475e-06],
        [3.9274e-06],
        [2.8920e-08],
        [6.8668e-04],
        [1.3909e-05],
        [1.7679e-05],
        [1.1675e-03],
        [3.1830e-06],
        [6.2057e-06],
        [7.1406e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.50367546081543: 
target probs tensor([[3.3529e-05],
        [9.7929e-04],
        [5.9427e-05],
        [3.8286e-02],
        [7.3975e-05],
        [8.1420e-04],
        [1.4715e-04],
        [3.8659e-04],
        [5.4037e-04],
        [4.5122e-05],
        [3.2042e-03],
        [4.2227e-05],
        [3.7545e-06],
        [1.0290e-04],
        [6.7691e-04],
        [9.3050e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.733582496643066: 
target probs tensor([[4.7528e-05],
        [3.0227e-05],
        [3.2229e-04],
        [4.7280e-05],
        [2.9671e-07],
     

target probs tensor([[5.7239e-03],
        [1.3390e-03],
        [2.1440e-05],
        [3.8091e-05],
        [6.5466e-02],
        [3.1872e-06],
        [2.2388e-05],
        [1.0744e-06],
        [5.0004e-06],
        [9.1787e-07],
        [5.7351e-06],
        [8.6213e-05],
        [2.6544e-03],
        [2.0729e-06],
        [8.4626e-06],
        [1.7767e-06]], device='cuda:0'), loss: 10.250701904296875: 
target probs tensor([[1.3177e-04],
        [3.8048e-05],
        [2.2170e-05],
        [6.3652e-04],
        [6.9687e-04],
        [7.3517e-04],
        [1.9421e-09],
        [1.4233e-04],
        [7.4819e-07],
        [1.2026e-03],
        [2.6490e-03],
        [1.4693e-03],
        [5.6609e-05],
        [3.2435e-07],
        [3.7548e-05],
        [3.9367e-05]], device='cuda:0'), loss: 9.929784774780273: 
target probs tensor([[2.2514e-03],
        [2.2264e-06],
        [4.4413e-06],
        [7.4956e-07],
        [5.3143e-04],
        [4.0690e-05],
        [3.8796e-04],
        [2.7

target probs tensor([[4.0688e-04],
        [1.7188e-04],
        [2.6050e-05],
        [6.2486e-05],
        [5.3686e-04],
        [7.4005e-05],
        [1.7119e-03],
        [6.8525e-05],
        [4.0748e-04],
        [1.8079e-05],
        [1.6767e-03],
        [1.4631e-04],
        [3.9689e-05],
        [4.9438e-05],
        [3.4446e-09],
        [6.6595e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.404983520507812: 
target probs tensor([[1.9899e-07],
        [4.5509e-04],
        [7.6030e-05],
        [3.4683e-04],
        [1.3367e-03],
        [8.0509e-04],
        [8.2253e-04],
        [6.2067e-05],
        [1.1990e-04],
        [1.1640e-06],
        [2.5568e-04],
        [1.5829e-05],
        [6.8740e-06],
        [5.7409e-02],
        [2.2199e-08],
        [1.6053e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.783084869384766: 
target probs tensor([[5.5688e-04],
        [8.7178e-04],
        [3.9865e-04],
        [1.5348e-05],
        [1.4778e-03],
     

epoch,train_loss,valid_loss,validation,fool_loss,time
0,10.906361,11.080358,0.527,11.080359,03:35
1,10.398422,10.606828,0.652,10.606828,03:35
2,10.170149,10.308915,0.652,10.308915,03:35
3,9.922343,10.158274,0.668,10.158275,03:34
4,9.796849,10.004011,0.694,10.004011,03:35
5,9.727354,9.83879,0.697,9.838789,03:34


target probs tensor([[2.6149e-08],
        [2.1165e-07],
        [3.2954e-11],
        [4.4934e-07],
        [3.5264e-09],
        [3.4903e-08],
        [3.4673e-07],
        [2.2161e-04],
        [6.0153e-04],
        [5.2279e-08],
        [3.3983e-05],
        [4.3868e-09],
        [4.5711e-07],
        [1.5975e-06],
        [1.1262e-06],
        [2.0454e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 14.762704849243164: 
target probs tensor([[2.2845e-05],
        [2.5587e-09],
        [2.3886e-10],
        [2.3741e-06],
        [3.1521e-07],
        [8.4213e-05],
        [1.9009e-05],
        [1.7621e-08],
        [3.6988e-08],
        [1.8571e-07],
        [4.0412e-03],
        [1.1060e-08],
        [1.5904e-06],
        [1.6449e-05],
        [1.0821e-06],
        [2.3875e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 14.108662605285645: 
target probs tensor([[3.9897e-05],
        [1.5266e-04],
        [2.6010e-06],
        [8.5626e-06],
        [4.4311e-04],
   

target probs tensor([[9.6738e-06],
        [1.1189e-04],
        [1.8245e-03],
        [2.1467e-05],
        [5.3898e-12],
        [5.0227e-04],
        [1.1032e-04],
        [2.5178e-04],
        [3.7582e-07],
        [9.4230e-06],
        [4.5846e-05],
        [1.5274e-06],
        [1.6357e-03],
        [1.3284e-10],
        [1.0937e-07],
        [3.3392e-08]], device='cuda:0', grad_fn=<GatherBackward>), loss: 12.328285217285156: 
target probs tensor([[5.0594e-04],
        [1.0791e-04],
        [3.2980e-09],
        [9.8563e-05],
        [1.7638e-09],
        [3.8977e-06],
        [6.3666e-06],
        [4.4211e-04],
        [4.5856e-06],
        [2.7773e-05],
        [1.4836e-07],
        [1.1275e-05],
        [1.1844e-07],
        [6.6020e-03],
        [4.5599e-05],
        [4.9714e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.635177612304688: 
target probs tensor([[1.4076e-05],
        [6.1558e-05],
        [1.7398e-06],
        [3.2746e-05],
        [2.1505e-05],
   

target probs tensor([[1.3758e-08],
        [6.8515e-07],
        [1.0154e-07],
        [2.5904e-05],
        [8.6430e-06],
        [3.1631e-03],
        [1.7283e-05],
        [6.9663e-04],
        [2.3829e-05],
        [1.0992e-05],
        [1.8687e-03],
        [1.7276e-07],
        [3.0803e-05],
        [5.0726e-05],
        [3.5088e-04],
        [1.0772e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.992815971374512: 
target probs tensor([[4.8650e-04],
        [1.2691e-04],
        [5.7706e-06],
        [2.5835e-04],
        [1.6052e-05],
        [1.7844e-07],
        [5.6732e-05],
        [6.9852e-07],
        [3.4470e-05],
        [2.1079e-04],
        [2.1074e-04],
        [6.4273e-04],
        [8.0020e-06],
        [4.5831e-03],
        [3.6514e-05],
        [1.9838e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.15485954284668: 
target probs tensor([[3.1405e-04],
        [6.2468e-05],
        [3.2163e-07],
        [9.3860e-06],
        [2.6562e-04],
    

target probs tensor([[1.2418e-04],
        [8.8391e-04],
        [3.1959e-03],
        [9.9282e-10],
        [3.1449e-05],
        [1.6359e-04],
        [2.3660e-06],
        [5.2504e-06],
        [3.2011e-05],
        [4.9027e-03],
        [8.3295e-04],
        [5.2249e-05],
        [9.0832e-05],
        [7.2855e-05],
        [2.9505e-05],
        [2.1656e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.951513290405273: 
target probs tensor([[1.4452e-10],
        [6.3153e-07],
        [2.9986e-06],
        [4.3396e-04],
        [2.1058e-08],
        [1.5202e-04],
        [1.4641e-03],
        [1.2750e-07],
        [2.5796e-04],
        [5.3450e-06],
        [5.2172e-06],
        [1.8371e-05],
        [1.3737e-04],
        [5.9392e-04],
        [9.3638e-05],
        [3.5363e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.709407806396484: 
target probs tensor([[4.8675e-03],
        [1.2651e-05],
        [3.9191e-05],
        [3.3399e-06],
        [1.6587e-03],
    

target probs tensor([[1.9282e-05],
        [2.0438e-08],
        [2.3533e-04],
        [2.9104e-04],
        [9.8974e-05],
        [2.3355e-05],
        [5.5872e-05],
        [2.8517e-06],
        [6.7249e-06],
        [3.3290e-04],
        [2.8685e-04],
        [2.0935e-05],
        [3.1312e-03],
        [4.3057e-07],
        [1.3797e-06],
        [8.9277e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.599306106567383: 
target probs tensor([[1.3193e-03],
        [8.4676e-06],
        [3.2089e-05],
        [1.8175e-04],
        [8.0292e-04],
        [5.8473e-05],
        [9.5151e-06],
        [4.3259e-05],
        [1.2131e-03],
        [9.8985e-04],
        [1.1948e-05],
        [3.6899e-05],
        [1.4138e-04],
        [2.6896e-06],
        [7.9476e-06],
        [9.0329e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.604658126831055: 
target probs tensor([[2.6663e-05],
        [6.3647e-06],
        [3.2989e-05],
        [2.3372e-03],
        [1.3018e-05],
    

target probs tensor([[1.1296e-03],
        [2.3311e-03],
        [3.5484e-05],
        [3.8563e-06],
        [3.2528e-04],
        [3.5248e-08],
        [9.7000e-07],
        [9.9757e-06],
        [6.0669e-04],
        [9.0144e-05],
        [9.8895e-05],
        [1.4886e-06],
        [3.1403e-06],
        [6.6415e-04],
        [1.4963e-05],
        [4.0159e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.705989837646484: 
target probs tensor([[3.4458e-05],
        [5.8650e-05],
        [3.6051e-05],
        [4.3600e-06],
        [1.6505e-05],
        [8.8455e-05],
        [9.1132e-05],
        [5.7127e-04],
        [1.1718e-04],
        [3.4579e-05],
        [5.7510e-05],
        [1.2363e-04],
        [3.3408e-06],
        [2.1710e-08],
        [2.9333e-05],
        [6.0606e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.655961990356445: 
target probs tensor([[4.6002e-06],
        [5.5877e-04],
        [1.8442e-04],
        [3.6533e-05],
        [4.2471e-03],
   

target probs tensor([[6.2759e-07],
        [5.6333e-05],
        [9.8912e-03],
        [2.5886e-05],
        [2.3854e-04],
        [3.6057e-02],
        [1.0117e-04],
        [7.7202e-07],
        [1.4721e-04],
        [6.1120e-05],
        [4.4888e-05],
        [2.8409e-04],
        [1.2171e-07],
        [4.7256e-04],
        [5.2587e-06],
        [8.1607e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.608028411865234: 
target probs tensor([[2.0202e-03],
        [1.4985e-04],
        [3.0258e-05],
        [2.6513e-03],
        [3.7617e-07],
        [3.7985e-04],
        [3.4094e-05],
        [7.8385e-05],
        [6.3400e-06],
        [1.1087e-05],
        [1.0438e-06],
        [6.4335e-04],
        [2.7623e-04],
        [1.3653e-05],
        [8.7202e-06],
        [2.6130e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.847061157226562: 
target probs tensor([[1.0592e-07],
        [7.0020e-05],
        [7.9017e-05],
        [5.9951e-07],
        [2.1236e-03],
     

target probs tensor([[2.7071e-05],
        [9.1929e-03],
        [4.0047e-03],
        [2.9712e-05],
        [1.2794e-07],
        [1.0350e-04],
        [3.6284e-06],
        [5.3089e-04],
        [7.6017e-06],
        [3.0630e-06],
        [4.0646e-07],
        [4.9987e-04],
        [9.6705e-04],
        [9.1115e-05],
        [2.2988e-04],
        [1.6464e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.77496337890625: 
target probs tensor([[5.0855e-06],
        [4.3911e-04],
        [1.8454e-08],
        [5.6403e-06],
        [2.1492e-06],
        [1.2371e-04],
        [6.8816e-04],
        [4.1534e-04],
        [2.8622e-03],
        [3.1532e-04],
        [8.2582e-04],
        [2.2954e-05],
        [8.6719e-06],
        [2.0789e-04],
        [9.8109e-09],
        [1.3429e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.237476348876953: 
target probs tensor([[3.2583e-05],
        [1.2261e-04],
        [1.0710e-04],
        [5.2271e-03],
        [1.4882e-05],
     

target probs tensor([[2.3051e-03],
        [2.2585e-06],
        [2.1643e-04],
        [3.6328e-04],
        [1.5251e-03],
        [5.2963e-03],
        [8.0921e-04],
        [5.3576e-07],
        [9.8104e-07],
        [1.2528e-07],
        [4.6362e-04],
        [2.8785e-05],
        [5.6372e-04],
        [1.4382e-04],
        [4.6112e-05],
        [5.0448e-05]], device='cuda:0'), loss: 9.548921585083008: 
target probs tensor([[7.2868e-05],
        [1.7995e-04],
        [5.9720e-06],
        [5.6167e-04],
        [9.8558e-04],
        [1.8376e-07],
        [4.5319e-05],
        [6.1867e-06],
        [1.0366e-05],
        [2.7757e-05],
        [2.2426e-04],
        [6.4998e-05],
        [1.5636e-04],
        [1.9168e-03],
        [1.8134e-04],
        [5.8372e-06]], device='cuda:0'), loss: 9.861785888671875: 
target probs tensor([[2.4419e-04],
        [5.8840e-06],
        [4.4104e-09],
        [6.9491e-05],
        [3.6232e-04],
        [1.5846e-11],
        [1.0536e-05],
        [4.76

target probs tensor([[6.6717e-04],
        [1.3769e-03],
        [9.7041e-05],
        [8.1227e-07],
        [2.6712e-05],
        [7.8282e-06],
        [1.0784e-04],
        [1.4138e-03],
        [6.8009e-06],
        [1.9262e-04],
        [9.8060e-04],
        [2.1462e-03],
        [3.1945e-04],
        [4.6401e-04],
        [9.7660e-06],
        [7.7717e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.087299346923828: 
target probs tensor([[3.0133e-04],
        [4.6697e-04],
        [2.0955e-06],
        [1.0784e-06],
        [4.6295e-03],
        [6.1167e-05],
        [2.2389e-05],
        [4.0487e-05],
        [1.6811e-04],
        [3.0783e-04],
        [2.2087e-05],
        [7.5029e-09],
        [9.3792e-04],
        [1.5774e-04],
        [1.1025e-02],
        [7.7684e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.504827499389648: 
target probs tensor([[2.5193e-04],
        [9.6019e-08],
        [1.8798e-05],
        [1.0165e-02],
        [3.8954e-04],
     

epoch,train_loss,valid_loss,validation,fool_loss,time
0,10.436586,10.726801,0.727,10.7268,03:35
1,10.017644,10.274555,0.75,10.274554,03:48
2,9.774177,10.072034,0.73,10.072035,03:34
3,9.68011,9.843613,0.746,9.843615,03:34
4,9.458936,9.666237,0.75,9.666237,03:48
5,9.290872,9.474617,0.743,9.474617,03:35


target probs tensor([[9.4078e-06],
        [3.7281e-05],
        [4.1066e-01],
        [3.6553e-05],
        [2.9783e-05],
        [8.2255e-06],
        [8.7049e-07],
        [1.0584e-08],
        [7.9017e-06],
        [6.3826e-06],
        [1.2817e-06],
        [2.6611e-08],
        [9.3018e-07],
        [6.0408e-07],
        [2.6050e-05],
        [4.0867e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 12.075145721435547: 
target probs tensor([[1.8507e-11],
        [4.4834e-08],
        [2.5256e-05],
        [9.7407e-05],
        [2.3327e-06],
        [6.6317e-07],
        [8.4341e-09],
        [5.9777e-10],
        [6.2235e-10],
        [9.1480e-05],
        [9.1178e-05],
        [2.3051e-06],
        [9.7226e-08],
        [1.7954e-10],
        [2.7310e-06],
        [1.7839e-10]], device='cuda:0', grad_fn=<GatherBackward>), loss: 15.752555847167969: 
target probs tensor([[2.9499e-07],
        [3.5808e-09],
        [6.0580e-07],
        [3.2828e-07],
        [7.4952e-06],
   

target probs tensor([[4.8363e-04],
        [6.0216e-05],
        [1.8011e-05],
        [5.1059e-08],
        [2.5367e-06],
        [3.3529e-06],
        [7.1701e-05],
        [2.1492e-09],
        [1.2315e-05],
        [9.1650e-07],
        [5.5468e-09],
        [4.7318e-06],
        [7.2352e-07],
        [7.0582e-06],
        [4.0556e-05],
        [1.9976e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 12.856733322143555: 
target probs tensor([[1.9410e-05],
        [2.1215e-03],
        [1.1887e-09],
        [1.4085e-05],
        [2.6712e-05],
        [7.3091e-03],
        [2.4238e-05],
        [1.8042e-04],
        [5.1132e-06],
        [4.3592e-04],
        [5.5774e-03],
        [2.9576e-06],
        [1.0473e-07],
        [6.4521e-04],
        [1.8903e-05],
        [5.7779e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.183337211608887: 
target probs tensor([[6.0235e-05],
        [9.7327e-06],
        [4.3191e-05],
        [2.3952e-05],
        [1.0991e-05],
   

target probs tensor([[5.8767e-05],
        [5.8656e-08],
        [6.1316e-06],
        [3.3449e-05],
        [1.4580e-04],
        [2.1875e-05],
        [1.6635e-03],
        [3.3163e-06],
        [1.9699e-08],
        [5.4396e-07],
        [1.9200e-04],
        [2.3947e-05],
        [1.1055e-04],
        [3.3148e-07],
        [3.4023e-06],
        [2.5201e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.903223037719727: 
target probs tensor([[2.1830e-04],
        [1.3434e-05],
        [7.7038e-05],
        [6.6575e-05],
        [8.6679e-07],
        [1.9730e-04],
        [2.6596e-08],
        [1.4799e-04],
        [4.2340e-06],
        [8.7546e-06],
        [4.4098e-05],
        [1.4570e-05],
        [5.7331e-06],
        [1.2383e-08],
        [8.8870e-06],
        [7.5100e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.647762298583984: 
target probs tensor([[1.0556e-04],
        [3.3188e-05],
        [4.0865e-05],
        [1.1067e-06],
        [9.5914e-04],
   

target probs tensor([[2.6435e-05],
        [2.7593e-04],
        [3.6654e-03],
        [3.1385e-07],
        [1.2945e-04],
        [1.3254e-04],
        [2.5464e-09],
        [1.5461e-04],
        [1.2268e-04],
        [1.0384e-05],
        [2.7704e-09],
        [6.0089e-05],
        [4.9383e-06],
        [1.0625e-04],
        [1.0249e-07],
        [1.5709e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.507268905639648: 
target probs tensor([[7.6876e-04],
        [1.7366e-03],
        [3.0059e-05],
        [1.1756e-04],
        [2.3722e-04],
        [2.8629e-08],
        [3.5790e-05],
        [1.2510e-04],
        [4.0887e-06],
        [4.6007e-04],
        [1.7518e-04],
        [1.1468e-04],
        [2.7223e-04],
        [1.3767e-04],
        [1.5601e-03],
        [5.6962e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.317099571228027: 
target probs tensor([[1.6695e-04],
        [4.2183e-07],
        [6.0950e-05],
        [6.7378e-04],
        [1.2283e-04],
    

target probs tensor([[3.9701e-08],
        [6.5548e-03],
        [5.9695e-06],
        [3.6470e-04],
        [5.0697e-04],
        [8.2249e-04],
        [3.5887e-03],
        [3.8205e-06],
        [1.4197e-05],
        [7.9596e-05],
        [4.0138e-05],
        [1.4498e-06],
        [6.6717e-05],
        [2.1649e-04],
        [1.3192e-04],
        [2.6336e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.637754440307617: 
target probs tensor([[8.3125e-04],
        [3.2793e-05],
        [5.9932e-05],
        [2.8062e-06],
        [1.1525e-03],
        [1.4554e-03],
        [4.3090e-05],
        [1.0640e-05],
        [1.0185e-04],
        [6.0050e-05],
        [2.4763e-05],
        [6.6253e-04],
        [2.4189e-04],
        [5.3643e-04],
        [3.4785e-05],
        [1.9557e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.42698860168457: 
target probs tensor([[5.4578e-05],
        [1.1054e-04],
        [1.5156e-05],
        [1.4275e-05],
        [2.0970e-05],
      

target probs tensor([[7.0285e-05],
        [1.4618e-05],
        [9.1106e-04],
        [8.7163e-07],
        [2.0078e-03],
        [4.6520e-08],
        [1.0502e-04],
        [1.1770e-05],
        [3.4163e-06],
        [1.8942e-04],
        [2.8277e-05],
        [6.4056e-05],
        [2.7146e-04],
        [6.5571e-04],
        [1.1432e-05],
        [3.3561e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.235218048095703: 
target probs tensor([[2.6289e-03],
        [8.9119e-06],
        [1.9202e-04],
        [1.0032e-05],
        [1.2555e-05],
        [4.0957e-04],
        [3.1235e-05],
        [1.0482e-04],
        [2.1212e-04],
        [1.6971e-06],
        [2.0907e-04],
        [6.7993e-06],
        [4.3501e-04],
        [3.0075e-03],
        [8.9813e-04],
        [8.9051e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.12263298034668: 
target probs tensor([[3.5607e-06],
        [1.3830e-05],
        [2.1895e-03],
        [9.0556e-06],
        [2.4746e-04],
     

target probs tensor([[1.7025e-05],
        [1.7903e-05],
        [5.8698e-06],
        [2.2249e-05],
        [4.7904e-04],
        [7.1436e-06],
        [1.5061e-04],
        [5.3942e-06],
        [3.7547e-05],
        [6.3585e-04],
        [1.1463e-04],
        [1.5847e-04],
        [6.7356e-06],
        [9.2218e-04],
        [5.9635e-07],
        [2.9981e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.113119125366211: 
target probs tensor([[5.3244e-05],
        [2.6247e-05],
        [3.1875e-05],
        [9.2250e-06],
        [1.2142e-05],
        [1.5832e-03],
        [1.9257e-04],
        [9.1084e-06],
        [1.7748e-08],
        [2.5059e-04],
        [6.6224e-05],
        [1.6541e-05],
        [1.0418e-03],
        [1.6381e-06],
        [9.6223e-05],
        [1.4482e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.475613594055176: 
target probs tensor([[2.4405e-04],
        [1.4209e-04],
        [1.8361e-04],
        [8.2710e-06],
        [2.9131e-03],
   

target probs tensor([[5.5152e-08],
        [5.2145e-05],
        [1.2465e-05],
        [1.1951e-03],
        [2.6341e-04],
        [8.7916e-07],
        [3.9266e-05],
        [2.3747e-04],
        [3.8170e-04],
        [1.1227e-03],
        [1.8471e-04],
        [3.2508e-04],
        [1.7745e-04],
        [2.0759e-03],
        [1.1227e-04],
        [6.3758e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.239325523376465: 
target probs tensor([[4.4147e-05],
        [1.2135e-04],
        [5.2597e-05],
        [1.4845e-04],
        [7.8063e-06],
        [1.9705e-03],
        [1.0725e-05],
        [1.8752e-05],
        [1.9463e-06],
        [2.0450e-03],
        [1.2620e-04],
        [1.1108e-02],
        [2.9542e-07],
        [2.5162e-03],
        [1.7860e-09],
        [3.1444e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.857511520385742: 
target probs tensor([[6.1962e-09],
        [1.0897e-06],
        [7.0675e-04],
        [2.0598e-08],
        [5.0432e-05],
     

target probs tensor([[1.0494e-04],
        [2.4569e-04],
        [7.8890e-04],
        [1.8734e-05],
        [2.0086e-04],
        [3.1237e-07],
        [1.6081e-05],
        [5.7847e-05],
        [1.5849e-05],
        [5.9534e-06],
        [6.1586e-06],
        [4.2178e-04],
        [1.5944e-05],
        [4.5250e-04],
        [4.9334e-05],
        [8.3404e-06]], device='cuda:0'), loss: 10.187618255615234: 
target probs tensor([[1.0896e-04],
        [9.3402e-06],
        [1.3480e-03],
        [2.1874e-05],
        [1.0159e-03],
        [1.5034e-03],
        [4.9713e-08],
        [4.9060e-05],
        [8.2651e-08],
        [3.2632e-05],
        [1.8769e-03],
        [7.7482e-05],
        [3.5780e-07],
        [6.2658e-05],
        [8.5529e-06],
        [6.6016e-03]], device='cuda:0'), loss: 10.110391616821289: 
target probs tensor([[1.9209e-05],
        [8.2789e-05],
        [8.1893e-04],
        [1.5890e-03],
        [9.7795e-05],
        [1.5409e-04],
        [1.8262e-04],
        [1.

target probs tensor([[7.1360e-05],
        [1.1238e-03],
        [7.7269e-05],
        [3.7204e-05],
        [7.1061e-06],
        [6.6775e-04],
        [2.2408e-04],
        [2.0625e-04],
        [7.9762e-07],
        [4.2925e-04],
        [3.6616e-05],
        [3.2899e-07],
        [2.1046e-04],
        [7.7960e-04],
        [6.4455e-04],
        [8.7308e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.457189559936523: 
target probs tensor([[7.0705e-04],
        [5.9895e-04],
        [1.8549e-05],
        [7.1492e-05],
        [1.0664e-04],
        [9.4311e-05],
        [1.5580e-03],
        [1.9225e-06],
        [3.8653e-04],
        [1.1386e-04],
        [2.2919e-04],
        [2.9035e-04],
        [3.2422e-03],
        [9.2856e-06],
        [7.6158e-04],
        [1.0630e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.766814231872559: 
target probs tensor([[2.4857e-05],
        [4.2969e-03],
        [3.9430e-03],
        [1.6485e-04],
        [3.0264e-03],
     

epoch,train_loss,valid_loss,validation,fool_loss,time
0,11.172297,11.260968,0.595,11.260967,03:33
1,10.557683,10.790855,0.637,10.790857,03:34
2,10.255194,10.388114,0.673,10.388114,03:33
3,10.002081,10.163577,0.701,10.163579,03:34
4,9.78168,10.133568,0.7,10.133569,03:33
5,9.829958,10.062037,0.709,10.062035,03:33


target probs tensor([[4.8517e-04],
        [2.9476e-05],
        [1.3147e-06],
        [1.5484e-07],
        [1.3236e-04],
        [4.0903e-07],
        [2.5830e-08],
        [2.1441e-07],
        [1.8868e-07],
        [7.5205e-04],
        [1.9437e-04],
        [2.1981e-06],
        [4.5103e-04],
        [6.7722e-08],
        [5.3539e-05],
        [9.1936e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.958494186401367: 
target probs tensor([[3.5557e-07],
        [9.6648e-04],
        [4.5082e-07],
        [4.4042e-10],
        [1.9768e-04],
        [8.2551e-05],
        [2.5407e-05],
        [8.7153e-06],
        [2.0071e-06],
        [3.7617e-07],
        [6.2077e-12],
        [3.5259e-05],
        [2.8585e-05],
        [1.7837e-07],
        [1.6073e-05],
        [1.8267e-10]], device='cuda:0', grad_fn=<GatherBackward>), loss: 13.628772735595703: 
target probs tensor([[5.2849e-07],
        [1.5915e-06],
        [1.0960e-05],
        [3.2052e-03],
        [8.2064e-06],
   

target probs tensor([[1.4200e-04],
        [4.4504e-05],
        [1.2826e-04],
        [2.4048e-05],
        [8.0712e-06],
        [8.3836e-05],
        [1.3312e-05],
        [5.9215e-06],
        [2.4854e-04],
        [4.3622e-06],
        [3.3118e-08],
        [9.9221e-05],
        [3.8289e-04],
        [8.8249e-04],
        [5.6290e-06],
        [6.9227e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.406316757202148: 
target probs tensor([[1.1774e-06],
        [1.8450e-03],
        [1.9145e-04],
        [2.3330e-07],
        [1.7866e-06],
        [7.2867e-06],
        [4.4839e-07],
        [3.4245e-12],
        [3.6387e-05],
        [1.7107e-06],
        [1.4411e-04],
        [1.9482e-06],
        [1.6118e-08],
        [1.7788e-04],
        [3.2434e-04],
        [2.0779e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 12.477224349975586: 
target probs tensor([[4.5618e-06],
        [1.1395e-06],
        [2.1963e-04],
        [1.9592e-05],
        [4.6849e-05],
   

target probs tensor([[2.1135e-05],
        [1.8280e-04],
        [5.6661e-03],
        [2.4437e-05],
        [1.6125e-06],
        [9.9507e-07],
        [3.7367e-05],
        [1.9826e-03],
        [6.9180e-04],
        [5.7939e-05],
        [6.4378e-08],
        [2.4964e-06],
        [1.0155e-05],
        [2.9752e-05],
        [2.1822e-06],
        [1.5733e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.8467435836792: 
target probs tensor([[5.8929e-06],
        [3.0910e-04],
        [8.5869e-05],
        [1.8481e-03],
        [9.1827e-08],
        [6.1556e-05],
        [1.1096e-07],
        [2.7113e-08],
        [6.9103e-05],
        [1.6232e-07],
        [6.7710e-05],
        [4.4561e-05],
        [4.1004e-06],
        [2.1452e-04],
        [5.7021e-06],
        [4.9518e-10]], device='cuda:0', grad_fn=<GatherBackward>), loss: 12.131922721862793: 
target probs tensor([[7.3452e-04],
        [2.0474e-05],
        [1.3406e-04],
        [8.0948e-06],
        [1.3333e-04],
     

target probs tensor([[1.9833e-05],
        [1.6930e-05],
        [5.9244e-07],
        [1.0906e-04],
        [5.3866e-06],
        [2.1397e-07],
        [2.5295e-09],
        [1.4539e-06],
        [1.0350e-06],
        [3.4333e-04],
        [2.9425e-06],
        [5.5502e-05],
        [2.1601e-03],
        [6.1321e-06],
        [7.8915e-05],
        [1.1069e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.684491157531738: 
target probs tensor([[9.9158e-09],
        [4.5622e-05],
        [1.0616e-05],
        [8.8253e-05],
        [1.1439e-04],
        [6.8823e-03],
        [6.9401e-06],
        [3.7158e-05],
        [2.4797e-05],
        [3.3931e-05],
        [2.9308e-04],
        [7.6583e-05],
        [1.9339e-07],
        [7.4681e-05],
        [1.0611e-04],
        [1.2072e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.86777400970459: 
target probs tensor([[2.0661e-04],
        [3.2760e-06],
        [4.6147e-07],
        [1.6157e-03],
        [5.7332e-04],
    

target probs tensor([[4.0375e-06],
        [1.9115e-04],
        [1.0873e-04],
        [4.9307e-05],
        [1.1543e-06],
        [8.4038e-06],
        [3.4426e-04],
        [1.6201e-03],
        [1.0519e-05],
        [5.7972e-07],
        [2.8199e-03],
        [9.2603e-04],
        [4.3794e-06],
        [9.9867e-03],
        [6.8986e-05],
        [7.2830e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.945135116577148: 
target probs tensor([[3.6208e-04],
        [4.5057e-05],
        [1.3249e-07],
        [1.1809e-04],
        [2.7832e-05],
        [1.9726e-06],
        [4.9342e-05],
        [3.8135e-05],
        [9.3974e-05],
        [9.1678e-03],
        [1.5379e-04],
        [3.2536e-09],
        [7.2800e-03],
        [4.0116e-05],
        [3.0377e-07],
        [8.2870e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.802616119384766: 
target probs tensor([[4.1942e-05],
        [3.7642e-04],
        [6.8125e-05],
        [3.9131e-06],
        [6.7244e-03],
    

target probs tensor([[8.4056e-06],
        [1.1449e-03],
        [5.9427e-05],
        [1.2069e-03],
        [6.3343e-03],
        [9.9924e-04],
        [2.3652e-05],
        [1.1533e-04],
        [1.0859e-05],
        [1.5247e-04],
        [6.2487e-06],
        [1.0408e-05],
        [4.5645e-04],
        [7.5837e-06],
        [2.5882e-04],
        [1.7763e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.1657075881958: 
target probs tensor([[2.9332e-06],
        [3.5035e-05],
        [5.2880e-05],
        [2.5669e-06],
        [1.4243e-04],
        [2.4358e-02],
        [3.3166e-05],
        [4.2954e-04],
        [3.3343e-04],
        [5.0424e-07],
        [1.0361e-08],
        [1.6730e-04],
        [5.7549e-05],
        [1.0830e-05],
        [9.8077e-07],
        [1.0031e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.491878509521484: 
target probs tensor([[4.5629e-05],
        [7.4326e-06],
        [4.3218e-06],
        [1.7960e-04],
        [5.3500e-03],
      

target probs tensor([[2.6161e-08],
        [2.3981e-03],
        [4.6017e-06],
        [7.6667e-06],
        [8.5259e-06],
        [2.3503e-05],
        [1.6256e-04],
        [7.4897e-07],
        [3.8463e-04],
        [7.3720e-05],
        [1.8518e-05],
        [1.4932e-05],
        [3.9769e-09],
        [2.4052e-05],
        [2.6611e-04],
        [5.6663e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.254127502441406: 
target probs tensor([[1.5957e-04],
        [1.2983e-04],
        [9.1796e-05],
        [1.0994e-02],
        [1.4776e-03],
        [1.2198e-03],
        [4.1105e-05],
        [2.6970e-04],
        [2.7706e-04],
        [4.2154e-04],
        [1.0616e-03],
        [1.9139e-04],
        [2.1658e-04],
        [3.4829e-04],
        [2.5006e-06],
        [1.3364e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.433539390563965: 
target probs tensor([[2.0389e-04],
        [3.3577e-06],
        [1.7264e-05],
        [8.8530e-05],
        [8.1522e-05],
    

target probs tensor([[4.6933e-03],
        [4.8156e-04],
        [8.1772e-06],
        [3.7399e-05],
        [2.3116e-04],
        [6.2662e-06],
        [5.4190e-04],
        [6.0262e-04],
        [1.4829e-03],
        [6.9236e-05],
        [2.6339e-04],
        [8.0294e-05],
        [6.9572e-05],
        [1.2935e-04],
        [1.0204e-05],
        [8.7853e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.81344223022461: 
target probs tensor([[5.2122e-06],
        [2.7304e-06],
        [8.2945e-04],
        [7.4030e-04],
        [5.0625e-06],
        [2.1134e-09],
        [9.8699e-06],
        [6.4125e-05],
        [2.1367e-04],
        [2.1494e-05],
        [3.1323e-04],
        [3.6559e-06],
        [3.3117e-04],
        [3.7509e-05],
        [6.2058e-04],
        [6.1603e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.334325790405273: 
target probs tensor([[1.4065e-05],
        [8.0758e-09],
        [8.6900e-05],
        [1.6456e-05],
        [7.6894e-04],
     

target probs tensor([[5.8081e-05],
        [3.9647e-07],
        [2.6571e-04],
        [8.1711e-04],
        [1.1114e-03],
        [1.1973e-04],
        [8.2242e-04],
        [4.0125e-04],
        [3.9475e-05],
        [4.0765e-07],
        [6.6523e-05],
        [1.1615e-05],
        [8.7808e-04],
        [1.1298e-04],
        [1.1724e-05],
        [2.5271e-05]], device='cuda:0'), loss: 9.655801773071289: 
target probs tensor([[2.8761e-05],
        [1.4243e-03],
        [5.0473e-05],
        [8.3740e-06],
        [2.0419e-05],
        [4.2292e-07],
        [4.0652e-06],
        [9.0378e-03],
        [3.2877e-05],
        [1.9053e-03],
        [2.7270e-04],
        [9.8444e-07],
        [3.6590e-03],
        [7.1231e-04],
        [1.8825e-04],
        [3.1998e-06]], device='cuda:0'), loss: 9.618818283081055: 
target probs tensor([[5.2337e-05],
        [1.6242e-02],
        [3.2996e-09],
        [8.2791e-05],
        [1.5625e-05],
        [4.7923e-11],
        [1.2366e-05],
        [1.00

target probs tensor([[2.1694e-03],
        [8.6698e-05],
        [3.9382e-06],
        [1.4937e-08],
        [1.7668e-05],
        [2.4797e-05],
        [1.0685e-03],
        [5.9972e-04],
        [1.1581e-04],
        [1.1690e-04],
        [5.2048e-06],
        [5.3210e-05],
        [6.1018e-04],
        [3.4812e-02],
        [1.7828e-04],
        [6.8177e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.71674919128418: 
target probs tensor([[5.3347e-05],
        [4.1491e-05],
        [3.0077e-04],
        [2.0912e-05],
        [3.3615e-05],
        [6.7721e-04],
        [1.3238e-03],
        [2.2006e-04],
        [7.6699e-05],
        [3.4213e-03],
        [1.8482e-04],
        [9.2264e-06],
        [8.5708e-07],
        [8.1961e-07],
        [1.3312e-04],
        [3.3267e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.338553428649902: 
target probs tensor([[9.5261e-06],
        [2.0720e-05],
        [1.5937e-04],
        [8.3545e-04],
        [2.6186e-06],
      

epoch,train_loss,valid_loss,validation,fool_loss,time
0,10.669527,10.793911,0.654,10.793911,03:45
1,10.27352,10.456312,0.677,10.456312,03:34
2,10.110948,10.329145,0.662,10.329147,03:34
3,10.023446,10.158373,0.679,10.158374,03:33
4,9.852835,10.053376,0.683,10.053376,03:35
5,9.732811,9.804704,0.685,9.804706,03:34


target probs tensor([[3.5431e-04],
        [2.9174e-05],
        [7.8664e-07],
        [5.7277e-07],
        [2.4868e-07],
        [9.7474e-09],
        [2.4284e-04],
        [2.0433e-08],
        [3.1270e-05],
        [1.1510e-08],
        [1.5470e-07],
        [1.7702e-07],
        [5.4424e-08],
        [1.0022e-06],
        [1.0685e-05],
        [4.4725e-08]], device='cuda:0', grad_fn=<GatherBackward>), loss: 14.078720092773438: 
target probs tensor([[2.5547e-06],
        [1.3485e-08],
        [2.3925e-04],
        [5.5881e-07],
        [3.5160e-05],
        [2.1011e-05],
        [1.2966e-11],
        [4.3324e-07],
        [5.9670e-05],
        [8.5135e-05],
        [7.0443e-06],
        [3.5392e-04],
        [1.0974e-08],
        [1.6121e-04],
        [8.7625e-08],
        [2.2159e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 12.68323040008545: 
target probs tensor([[1.6092e-06],
        [1.1357e-05],
        [7.3426e-09],
        [6.9133e-11],
        [1.1397e-06],
    

target probs tensor([[7.7741e-04],
        [1.3753e-04],
        [1.8259e-05],
        [1.7641e-05],
        [1.8986e-05],
        [3.9862e-05],
        [1.6975e-03],
        [4.8537e-07],
        [9.6153e-05],
        [4.3663e-06],
        [5.5707e-06],
        [1.8734e-05],
        [5.7567e-06],
        [7.9634e-04],
        [5.3913e-05],
        [1.9175e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.268184661865234: 
target probs tensor([[9.9506e-05],
        [9.8480e-05],
        [1.4633e-03],
        [5.3286e-05],
        [2.2291e-04],
        [1.9179e-07],
        [1.4187e-05],
        [6.1488e-05],
        [4.3029e-04],
        [1.4912e-04],
        [1.2475e-06],
        [8.4758e-06],
        [7.1745e-06],
        [1.0337e-03],
        [5.1069e-06],
        [7.8995e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.107992172241211: 
target probs tensor([[3.6668e-05],
        [5.8045e-04],
        [4.3564e-07],
        [9.4757e-04],
        [5.3104e-06],
   

target probs tensor([[1.2329e-05],
        [5.3611e-05],
        [1.5780e-04],
        [4.6019e-04],
        [3.7254e-06],
        [6.6259e-08],
        [2.0722e-03],
        [1.8607e-05],
        [4.4739e-05],
        [3.0690e-06],
        [1.1500e-06],
        [8.6269e-05],
        [3.9697e-06],
        [7.7110e-04],
        [4.7757e-08],
        [1.2540e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.072723388671875: 
target probs tensor([[1.4738e-05],
        [3.5410e-05],
        [1.9648e-03],
        [3.9339e-05],
        [3.1645e-08],
        [4.2307e-06],
        [4.2969e-06],
        [1.7709e-06],
        [1.0129e-04],
        [2.3176e-03],
        [8.0355e-06],
        [1.4511e-05],
        [2.3310e-04],
        [8.5313e-05],
        [1.7868e-07],
        [3.1227e-08]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.354717254638672: 
target probs tensor([[2.9638e-04],
        [1.4361e-04],
        [6.0297e-04],
        [2.5162e-05],
        [7.8627e-06],
   

target probs tensor([[4.3156e-06],
        [1.5288e-05],
        [1.0019e-04],
        [6.6872e-07],
        [3.0385e-04],
        [4.0007e-05],
        [5.1866e-06],
        [2.5841e-06],
        [9.8112e-05],
        [1.0116e-06],
        [1.1381e-05],
        [1.3136e-03],
        [1.2765e-04],
        [2.9683e-05],
        [4.2595e-11],
        [2.3207e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.782415390014648: 
target probs tensor([[2.2408e-04],
        [4.4564e-03],
        [1.6436e-05],
        [5.1063e-08],
        [7.5074e-06],
        [2.8375e-05],
        [3.0545e-04],
        [3.4205e-05],
        [6.0838e-05],
        [4.7265e-05],
        [7.7691e-06],
        [3.3324e-05],
        [2.4372e-03],
        [9.7464e-05],
        [3.0243e-05],
        [5.4189e-08]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.399836540222168: 
target probs tensor([[1.3115e-05],
        [7.6680e-07],
        [2.9000e-04],
        [1.9892e-03],
        [4.7565e-04],
   

target probs tensor([[2.6244e-04],
        [7.5496e-04],
        [6.3161e-05],
        [2.9019e-07],
        [7.1198e-05],
        [1.4223e-04],
        [1.3687e-07],
        [2.6014e-03],
        [4.7508e-07],
        [1.8383e-08],
        [1.6998e-05],
        [1.1350e-04],
        [1.7097e-04],
        [1.6395e-02],
        [1.6001e-02],
        [3.2647e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.000104904174805: 
target probs tensor([[4.3474e-05],
        [1.3496e-03],
        [2.4979e-05],
        [6.2752e-05],
        [1.2485e-03],
        [2.8277e-04],
        [4.8429e-10],
        [1.7961e-02],
        [6.2664e-04],
        [8.6979e-05],
        [9.6505e-05],
        [4.2180e-04],
        [2.3870e-04],
        [2.7814e-04],
        [1.9381e-05],
        [7.8531e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.227169036865234: 
target probs tensor([[4.2676e-05],
        [4.7450e-05],
        [9.3874e-04],
        [7.8962e-03],
        [2.1462e-04],
    

target probs tensor([[3.6578e-05],
        [3.0520e-04],
        [1.0226e-02],
        [3.2195e-03],
        [1.0089e-05],
        [1.6861e-04],
        [2.8743e-03],
        [1.1143e-03],
        [4.2101e-05],
        [3.2458e-04],
        [2.0392e-05],
        [5.2143e-07],
        [4.6189e-05],
        [9.9140e-04],
        [4.6166e-05],
        [9.3302e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.957206726074219: 
target probs tensor([[7.2707e-06],
        [4.2986e-09],
        [9.7319e-04],
        [1.0068e-03],
        [1.1194e-03],
        [2.2604e-05],
        [4.6137e-04],
        [5.8351e-04],
        [2.3941e-08],
        [2.2232e-05],
        [7.3770e-04],
        [1.5695e-05],
        [1.6894e-06],
        [3.4304e-05],
        [2.1426e-04],
        [7.0598e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.496374130249023: 
target probs tensor([[1.4586e-04],
        [4.3269e-05],
        [5.4391e-04],
        [9.0818e-05],
        [1.4956e-04],
    

target probs tensor([[9.4636e-05],
        [3.5141e-04],
        [3.9025e-04],
        [2.6338e-05],
        [8.3374e-05],
        [3.9438e-05],
        [3.5616e-08],
        [4.6445e-07],
        [2.9455e-05],
        [2.4233e-04],
        [9.9702e-04],
        [3.7860e-07],
        [5.5417e-06],
        [1.0958e-04],
        [9.4347e-05],
        [2.7240e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.520722389221191: 
target probs tensor([[4.6080e-07],
        [1.0834e-04],
        [5.5856e-03],
        [1.6629e-04],
        [3.6675e-05],
        [2.5037e-03],
        [4.3562e-05],
        [8.5200e-06],
        [1.5411e-06],
        [3.3665e-05],
        [7.0186e-05],
        [1.5875e-04],
        [3.2169e-05],
        [1.1378e-03],
        [2.9061e-05],
        [2.9096e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.864921569824219: 
target probs tensor([[4.5237e-05],
        [1.3258e-03],
        [1.7916e-05],
        [1.4711e-05],
        [2.4599e-04],
    

target probs tensor([[9.3366e-08],
        [4.8790e-04],
        [1.4869e-05],
        [2.3066e-04],
        [1.1927e-06],
        [2.6619e-07],
        [2.5976e-04],
        [1.0044e-04],
        [1.7121e-06],
        [4.9327e-06],
        [7.2021e-06],
        [4.7761e-06],
        [7.4983e-04],
        [5.1983e-03],
        [1.7370e-04],
        [9.4898e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.450321197509766: 
target probs tensor([[2.8385e-05],
        [7.7238e-05],
        [1.0029e-06],
        [2.1387e-06],
        [8.5194e-05],
        [3.6223e-08],
        [2.3693e-04],
        [9.7991e-05],
        [2.8205e-04],
        [8.7380e-04],
        [5.7596e-07],
        [1.3524e-06],
        [6.4011e-05],
        [2.7026e-04],
        [2.6613e-03],
        [1.1145e-02]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.142547607421875: 
target probs tensor([[8.7310e-05],
        [3.4187e-07],
        [6.3048e-04],
        [2.2076e-05],
        [3.2882e-06],
   

target probs tensor([[1.0574e-04],
        [3.8396e-04],
        [1.1869e-04],
        [2.7576e-04],
        [4.3230e-04],
        [3.3417e-07],
        [8.0936e-04],
        [2.2916e-05],
        [3.4225e-06],
        [8.1846e-04],
        [2.0861e-04],
        [5.3375e-05],
        [4.1223e-04],
        [4.3157e-07],
        [1.2343e-05],
        [5.5719e-09]], device='cuda:0'), loss: 10.341326713562012: 
target probs tensor([[3.5610e-06],
        [2.7711e-03],
        [2.3487e-05],
        [7.4572e-05],
        [2.0467e-04],
        [4.0217e-06],
        [9.3017e-06],
        [3.0695e-03],
        [2.6922e-05],
        [3.1757e-05],
        [1.9166e-04],
        [2.9257e-04],
        [3.7661e-07],
        [2.4235e-06],
        [6.1828e-07],
        [7.4800e-04]], device='cuda:0'), loss: 10.229939460754395: 
target probs tensor([[5.8835e-05],
        [8.4562e-07],
        [1.1380e-03],
        [2.3127e-04],
        [3.0878e-05],
        [4.4531e-06],
        [1.7439e-04],
        [3.

target probs tensor([[6.9635e-03],
        [8.4840e-05],
        [7.8313e-05],
        [1.7624e-05],
        [4.3068e-07],
        [2.8977e-05],
        [3.2042e-05],
        [2.3663e-05],
        [9.0976e-05],
        [8.7593e-04],
        [7.0184e-04],
        [1.5035e-04],
        [3.7218e-04],
        [1.9664e-05],
        [4.5321e-06],
        [1.4352e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.859392166137695: 
target probs tensor([[1.8567e-04],
        [1.4901e-04],
        [2.9855e-05],
        [4.8311e-04],
        [1.1516e-04],
        [1.9297e-05],
        [7.9724e-07],
        [2.3587e-05],
        [3.1551e-03],
        [2.2809e-07],
        [9.2935e-05],
        [1.0632e-04],
        [3.4431e-05],
        [2.8427e-05],
        [1.3783e-04],
        [1.5678e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.01630973815918: 
target probs tensor([[2.8589e-04],
        [2.1157e-04],
        [2.3157e-04],
        [3.7968e-05],
        [1.6946e-04],
     

epoch,train_loss,valid_loss,validation,fool_loss,time
0,10.903863,10.978745,0.568,10.978745,03:49
1,10.197931,10.439893,0.613,10.439892,03:44
2,10.008776,10.254466,0.64,10.254465,03:33
3,9.91163,10.024603,0.663,10.024603,03:34
4,9.899489,10.04149,0.669,10.041491,03:34
5,9.611986,9.859069,0.669,9.859069,03:34


target probs tensor([[1.4580e-05],
        [1.2053e-07],
        [1.7353e-06],
        [7.6781e-06],
        [4.8328e-07],
        [1.0435e-09],
        [6.8065e-05],
        [6.6123e-05],
        [3.9016e-11],
        [2.2392e-06],
        [4.2646e-06],
        [5.2126e-06],
        [1.9068e-06],
        [2.6172e-04],
        [3.6578e-06],
        [8.7884e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 13.410950660705566: 
target probs tensor([[1.7970e-05],
        [7.4170e-08],
        [8.8848e-09],
        [6.8026e-05],
        [2.8878e-08],
        [1.8166e-05],
        [3.0099e-06],
        [2.8857e-06],
        [6.1445e-08],
        [5.2797e-05],
        [8.0834e-05],
        [1.8050e-05],
        [1.3435e-06],
        [2.4433e-05],
        [1.3284e-07],
        [2.6455e-08]], device='cuda:0', grad_fn=<GatherBackward>), loss: 13.338929176330566: 
target probs tensor([[2.1185e-06],
        [1.6019e-04],
        [4.0127e-05],
        [7.0579e-07],
        [6.3233e-07],
   

target probs tensor([[3.6327e-07],
        [9.5128e-03],
        [3.4903e-03],
        [1.8213e-05],
        [5.2667e-09],
        [8.3594e-05],
        [3.6037e-05],
        [7.0718e-06],
        [2.3161e-04],
        [5.1331e-06],
        [2.3493e-04],
        [4.5550e-06],
        [2.4599e-06],
        [1.3257e-07],
        [9.9057e-04],
        [1.7918e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.042686462402344: 
target probs tensor([[6.6221e-05],
        [2.1357e-08],
        [1.3118e-05],
        [9.3011e-04],
        [1.0681e-04],
        [6.9152e-04],
        [3.3725e-05],
        [9.0688e-06],
        [1.5041e-04],
        [6.7611e-05],
        [4.0601e-04],
        [3.5636e-07],
        [8.0842e-06],
        [1.0534e-02],
        [5.1460e-05],
        [9.6453e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.01815414428711: 
target probs tensor([[5.0278e-07],
        [1.0323e-07],
        [3.6591e-04],
        [2.8450e-05],
        [2.7962e-03],
    

target probs tensor([[2.2027e-05],
        [3.7567e-06],
        [1.4075e-04],
        [4.9559e-05],
        [3.9092e-04],
        [7.5709e-05],
        [1.6612e-05],
        [4.4167e-04],
        [1.6161e-04],
        [2.7968e-06],
        [5.8724e-05],
        [6.5280e-05],
        [1.2853e-05],
        [1.0773e-05],
        [1.2027e-05],
        [3.1656e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.065347671508789: 
target probs tensor([[1.3983e-03],
        [5.6550e-04],
        [1.1962e-04],
        [7.1004e-04],
        [4.1324e-08],
        [8.6559e-06],
        [4.9923e-06],
        [3.6619e-04],
        [2.3324e-04],
        [2.1521e-04],
        [1.5027e-04],
        [2.3007e-03],
        [4.3750e-05],
        [3.8035e-06],
        [4.1303e-04],
        [3.8189e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.310761451721191: 
target probs tensor([[4.1098e-06],
        [2.8159e-06],
        [7.4837e-07],
        [1.6623e-09],
        [3.9150e-08],
    

target probs tensor([[2.2911e-05],
        [2.8196e-04],
        [3.1313e-05],
        [2.1818e-05],
        [3.9621e-05],
        [1.5139e-04],
        [1.2980e-04],
        [8.4530e-06],
        [1.3828e-05],
        [6.1936e-05],
        [6.6727e-06],
        [7.8644e-03],
        [6.6114e-05],
        [2.1058e-07],
        [1.2174e-05],
        [1.0397e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.021705627441406: 
target probs tensor([[7.6572e-05],
        [4.3705e-04],
        [1.1003e-03],
        [5.2924e-07],
        [3.7640e-04],
        [3.1531e-05],
        [1.7378e-09],
        [2.0676e-05],
        [1.4629e-03],
        [1.7871e-05],
        [3.1185e-04],
        [2.5486e-05],
        [1.9779e-04],
        [6.9987e-05],
        [1.1983e-03],
        [1.1656e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.707133293151855: 
target probs tensor([[2.0160e-05],
        [1.2978e-04],
        [3.6165e-04],
        [2.8123e-03],
        [2.4637e-03],
    

target probs tensor([[5.7001e-08],
        [1.0601e-03],
        [1.1215e-03],
        [2.0752e-04],
        [2.3252e-04],
        [1.0229e-02],
        [2.1317e-06],
        [8.3228e-05],
        [1.2154e-02],
        [2.5712e-05],
        [2.4682e-04],
        [2.9164e-04],
        [1.3075e-03],
        [1.2150e-04],
        [1.0580e-10],
        [9.1409e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.411565780639648: 
target probs tensor([[3.1243e-04],
        [8.4002e-05],
        [2.9103e-04],
        [1.3636e-05],
        [5.2055e-05],
        [2.3345e-10],
        [7.7781e-04],
        [4.1201e-05],
        [2.3492e-04],
        [6.9518e-04],
        [1.5077e-06],
        [3.3792e-04],
        [1.0470e-06],
        [3.8297e-05],
        [1.8671e-05],
        [7.5335e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.73083782196045: 
target probs tensor([[5.9408e-05],
        [2.4015e-05],
        [1.0276e-05],
        [1.2950e-06],
        [5.5864e-05],
     

target probs tensor([[9.9382e-09],
        [9.8881e-06],
        [1.7997e-04],
        [5.6649e-06],
        [4.2636e-06],
        [2.4189e-07],
        [1.7713e-04],
        [1.1769e-04],
        [1.3208e-03],
        [1.3225e-04],
        [9.3519e-07],
        [4.4514e-07],
        [7.7353e-05],
        [2.7382e-04],
        [2.8009e-06],
        [9.9187e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.229520797729492: 
target probs tensor([[3.9238e-03],
        [1.7751e-06],
        [1.3802e-04],
        [4.0767e-04],
        [2.8745e-05],
        [1.0882e-04],
        [6.8044e-05],
        [1.7711e-04],
        [9.2729e-06],
        [5.2580e-06],
        [2.5579e-04],
        [8.8579e-04],
        [1.8205e-05],
        [5.3323e-05],
        [9.3239e-04],
        [3.1675e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.401716232299805: 
target probs tensor([[6.0192e-04],
        [1.5840e-04],
        [3.6352e-04],
        [3.3015e-05],
        [1.6446e-06],
    

target probs tensor([[6.5902e-06],
        [5.9738e-05],
        [4.7369e-04],
        [6.2259e-07],
        [5.0033e-09],
        [6.0190e-04],
        [9.9756e-04],
        [3.9491e-05],
        [4.2240e-05],
        [2.5523e-03],
        [8.4963e-04],
        [4.8487e-05],
        [1.9124e-03],
        [3.0393e-05],
        [3.2879e-04],
        [2.0141e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.019002914428711: 
target probs tensor([[1.7953e-05],
        [6.4325e-05],
        [2.1765e-05],
        [4.0832e-06],
        [1.3810e-04],
        [3.7631e-03],
        [2.5673e-04],
        [3.7491e-05],
        [3.6604e-03],
        [3.3816e-03],
        [1.1524e-03],
        [1.4540e-03],
        [2.3922e-05],
        [1.0023e-04],
        [7.4994e-05],
        [7.0768e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.615713119506836: 
target probs tensor([[4.3908e-04],
        [1.2347e-05],
        [4.3525e-05],
        [2.5811e-03],
        [6.6060e-04],
    

target probs tensor([[7.7690e-03],
        [4.1280e-03],
        [4.3322e-04],
        [2.5785e-05],
        [2.8065e-04],
        [1.4729e-04],
        [2.8619e-04],
        [1.7022e-05],
        [8.3101e-06],
        [1.2391e-05],
        [1.6842e-02],
        [1.3709e-05],
        [3.3676e-05],
        [2.1400e-03],
        [7.5386e-04],
        [5.6233e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.243350982666016: 
target probs tensor([[4.1928e-07],
        [8.3602e-10],
        [9.1610e-04],
        [2.2571e-04],
        [4.6485e-04],
        [1.2919e-06],
        [1.0602e-04],
        [2.9965e-04],
        [4.4652e-06],
        [4.1498e-05],
        [1.9812e-04],
        [4.9941e-04],
        [4.4566e-05],
        [5.7284e-05],
        [3.9165e-04],
        [1.0857e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.291275024414062: 
target probs tensor([[6.4901e-05],
        [1.9609e-04],
        [5.1732e-05],
        [2.8916e-06],
        [4.2613e-07],
    

target probs tensor([[3.9586e-05],
        [2.0322e-04],
        [1.8325e-04],
        [7.4827e-05],
        [3.2147e-05],
        [1.1927e-03],
        [1.0800e-05],
        [3.9088e-06],
        [1.1501e-04],
        [8.4577e-07],
        [5.0885e-05],
        [8.2297e-06],
        [1.7522e-05],
        [8.0959e-06],
        [1.6517e-04],
        [4.1317e-06]], device='cuda:0'), loss: 10.383544921875: 
target probs tensor([[1.4376e-03],
        [8.1655e-05],
        [2.0902e-04],
        [4.0552e-05],
        [1.0484e-05],
        [2.0076e-08],
        [1.6611e-04],
        [2.9119e-06],
        [6.7389e-06],
        [3.1530e-03],
        [2.0823e-04],
        [4.0606e-06],
        [6.1752e-03],
        [1.0718e-04],
        [1.6962e-03],
        [4.3261e-05]], device='cuda:0'), loss: 9.649477005004883: 
target probs tensor([[1.2745e-04],
        [5.1512e-05],
        [1.4027e-09],
        [3.0851e-04],
        [1.2387e-04],
        [3.8579e-09],
        [7.3679e-05],
        [4.9957

target probs tensor([[3.2530e-06],
        [2.1777e-03],
        [2.2581e-03],
        [3.0509e-05],
        [1.4425e-03],
        [9.7630e-05],
        [2.2711e-06],
        [2.6811e-06],
        [1.2688e-04],
        [2.1752e-04],
        [1.1235e-03],
        [3.4676e-03],
        [5.3522e-05],
        [2.8942e-05],
        [6.5978e-05],
        [7.0088e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.424995422363281: 
target probs tensor([[7.1412e-04],
        [5.7660e-04],
        [5.3247e-05],
        [3.4665e-04],
        [1.3104e-05],
        [2.9262e-03],
        [1.9021e-04],
        [7.7639e-04],
        [5.4950e-03],
        [1.0154e-05],
        [3.6007e-05],
        [3.2107e-06],
        [1.5698e-03],
        [8.7445e-03],
        [5.5405e-07],
        [1.2300e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.862852096557617: 
target probs tensor([[3.3408e-06],
        [5.6029e-06],
        [2.9087e-06],
        [9.9763e-05],
        [2.0560e-03],
     

epoch,train_loss,valid_loss,validation,fool_loss,time
0,10.620019,10.66575,0.676,10.665748,03:44
1,10.417414,10.578252,0.695,10.57825,03:34
2,10.181508,10.625263,0.686,10.625265,03:34
3,10.207391,10.472158,0.674,10.47216,03:32
4,10.064328,10.127918,0.689,10.127918,03:33
5,9.926339,10.086761,0.689,10.086759,03:33


target probs tensor([[3.9594e-06],
        [1.3196e-09],
        [6.5978e-07],
        [5.9521e-11],
        [3.5839e-05],
        [7.5972e-08],
        [1.1678e-05],
        [7.5203e-11],
        [1.9190e-05],
        [5.8322e-05],
        [3.2662e-03],
        [1.2797e-07],
        [4.5607e-11],
        [1.0817e-06],
        [1.8051e-07],
        [2.5348e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 14.816123008728027: 
target probs tensor([[1.2707e-04],
        [3.2905e-07],
        [3.8421e-06],
        [1.8112e-05],
        [2.0277e-04],
        [1.9063e-03],
        [2.3297e-06],
        [4.8648e-05],
        [4.0369e-05],
        [3.2839e-06],
        [1.7697e-07],
        [1.0087e-06],
        [2.6909e-06],
        [1.4062e-07],
        [1.8092e-05],
        [2.5163e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.985441207885742: 
target probs tensor([[3.5314e-07],
        [1.0484e-07],
        [1.1375e-10],
        [7.1972e-02],
        [1.1968e-04],
   

target probs tensor([[1.6702e-05],
        [4.4228e-07],
        [9.0219e-07],
        [5.2645e-08],
        [6.8947e-09],
        [7.8491e-06],
        [6.6584e-07],
        [3.3559e-05],
        [3.7989e-08],
        [9.0322e-05],
        [4.8012e-07],
        [3.3497e-05],
        [6.6623e-05],
        [2.2282e-06],
        [1.1738e-05],
        [1.3231e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 12.989347457885742: 
target probs tensor([[9.7396e-04],
        [8.6135e-07],
        [5.5884e-07],
        [1.3157e-08],
        [1.3352e-05],
        [8.1249e-04],
        [3.7520e-05],
        [3.2690e-06],
        [9.5300e-07],
        [2.4027e-04],
        [5.2423e-03],
        [3.3100e-04],
        [1.8671e-04],
        [6.9445e-05],
        [4.2123e-04],
        [1.7632e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.289609909057617: 
target probs tensor([[1.8873e-04],
        [2.1675e-05],
        [2.4582e-04],
        [6.7640e-04],
        [1.6720e-04],
   

target probs tensor([[9.2866e-06],
        [4.2329e-03],
        [2.1075e-07],
        [1.8235e-04],
        [1.8929e-06],
        [6.4058e-05],
        [1.4081e-06],
        [2.5448e-08],
        [1.5080e-06],
        [2.9735e-04],
        [1.5333e-04],
        [1.8113e-04],
        [1.1135e-03],
        [2.6331e-05],
        [1.7039e-03],
        [2.7338e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.498372077941895: 
target probs tensor([[3.0477e-04],
        [3.3348e-05],
        [1.1310e-05],
        [1.1385e-04],
        [3.4786e-06],
        [2.7950e-07],
        [7.4885e-07],
        [9.7059e-09],
        [2.3592e-08],
        [2.9387e-06],
        [9.8377e-04],
        [3.8473e-06],
        [1.5236e-06],
        [4.8185e-04],
        [3.7834e-05],
        [4.9379e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 12.012472152709961: 
target probs tensor([[5.4033e-04],
        [3.6033e-05],
        [8.4587e-06],
        [6.6331e-06],
        [1.0676e-05],
   

target probs tensor([[3.9184e-04],
        [8.9085e-06],
        [4.5133e-04],
        [5.5030e-07],
        [3.8978e-05],
        [1.0783e-04],
        [3.6994e-04],
        [8.1141e-05],
        [2.2682e-05],
        [2.0281e-05],
        [5.1674e-05],
        [3.1785e-04],
        [7.2309e-04],
        [1.9972e-04],
        [8.4951e-04],
        [1.8215e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.315902709960938: 
target probs tensor([[1.1220e-03],
        [2.0696e-04],
        [3.3590e-05],
        [5.2912e-06],
        [3.7614e-08],
        [3.7683e-07],
        [9.9034e-10],
        [9.8685e-07],
        [2.9375e-03],
        [6.5380e-06],
        [2.8094e-06],
        [2.3016e-04],
        [8.1589e-04],
        [9.3569e-05],
        [2.3801e-05],
        [1.3650e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.039617538452148: 
target probs tensor([[9.1642e-05],
        [7.1313e-08],
        [9.2287e-05],
        [4.8642e-04],
        [6.8220e-04],
    

target probs tensor([[8.7845e-06],
        [1.3581e-05],
        [5.8296e-07],
        [7.9595e-06],
        [1.8406e-06],
        [2.6834e-06],
        [7.1641e-06],
        [3.4916e-05],
        [2.2554e-04],
        [6.1612e-05],
        [3.2435e-04],
        [2.0522e-04],
        [6.2664e-05],
        [4.1366e-04],
        [3.4910e-04],
        [1.9170e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.499711990356445: 
target probs tensor([[1.3198e-03],
        [7.8581e-06],
        [8.8056e-05],
        [4.5714e-04],
        [1.2949e-06],
        [4.3540e-03],
        [3.6208e-05],
        [6.4009e-06],
        [2.1851e-04],
        [2.8163e-04],
        [1.6324e-06],
        [2.8159e-03],
        [1.5212e-03],
        [1.1780e-05],
        [3.8570e-05],
        [7.2767e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.37008285522461: 
target probs tensor([[5.1611e-05],
        [8.1054e-07],
        [3.2718e-05],
        [7.1271e-06],
        [2.8657e-05],
     

target probs tensor([[7.4472e-05],
        [9.9812e-06],
        [1.4682e-04],
        [3.4130e-04],
        [3.0032e-06],
        [2.8019e-06],
        [2.9303e-04],
        [3.0750e-04],
        [3.9754e-07],
        [4.2214e-04],
        [4.8268e-05],
        [6.4140e-05],
        [2.4832e-05],
        [7.0775e-06],
        [8.3330e-04],
        [9.7686e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.171358108520508: 
target probs tensor([[7.9680e-09],
        [7.1186e-05],
        [4.4673e-05],
        [3.0429e-05],
        [5.4310e-06],
        [1.7024e-03],
        [8.0411e-06],
        [4.1857e-04],
        [1.4501e-04],
        [1.4687e-06],
        [1.2686e-04],
        [9.6250e-05],
        [5.1612e-06],
        [2.0377e-07],
        [1.4633e-05],
        [1.3470e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.920381546020508: 
target probs tensor([[1.8292e-05],
        [8.8287e-08],
        [2.2291e-04],
        [5.9946e-07],
        [1.6234e-03],
   

target probs tensor([[6.7576e-05],
        [5.1221e-04],
        [3.8780e-05],
        [9.6711e-05],
        [3.8141e-06],
        [1.4259e-08],
        [5.9577e-04],
        [7.0766e-05],
        [4.4401e-06],
        [4.6062e-04],
        [2.3230e-03],
        [2.5504e-04],
        [3.4415e-05],
        [2.5188e-03],
        [3.8087e-03],
        [8.3280e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.498241424560547: 
target probs tensor([[6.3037e-04],
        [8.3633e-04],
        [6.2883e-06],
        [1.5114e-03],
        [1.2604e-04],
        [9.5236e-05],
        [2.5227e-04],
        [2.1018e-04],
        [4.8107e-06],
        [3.3074e-04],
        [6.7801e-04],
        [2.0634e-07],
        [6.6517e-05],
        [1.0606e-05],
        [1.3861e-04],
        [1.8653e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.481985092163086: 
target probs tensor([[7.1715e-05],
        [3.3723e-05],
        [3.2170e-04],
        [3.4289e-05],
        [3.4008e-05],
     

target probs tensor([[2.3317e-04],
        [1.7047e-04],
        [5.9221e-06],
        [8.3314e-05],
        [3.7159e-05],
        [2.4301e-04],
        [1.2913e-05],
        [1.4897e-02],
        [5.2495e-08],
        [7.7497e-04],
        [2.1180e-05],
        [2.3774e-04],
        [1.6876e-05],
        [4.0656e-05],
        [6.7936e-04],
        [2.1167e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.52137565612793: 
target probs tensor([[1.1379e-04],
        [1.2246e-03],
        [5.1629e-05],
        [2.1791e-04],
        [4.2427e-05],
        [6.8360e-06],
        [4.6290e-06],
        [6.0962e-05],
        [5.0920e-05],
        [1.0440e-03],
        [1.7108e-04],
        [1.1141e-04],
        [2.3727e-09],
        [6.3610e-07],
        [2.1562e-05],
        [1.3104e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.539815902709961: 
target probs tensor([[5.2971e-07],
        [2.2909e-04],
        [2.7951e-04],
        [1.9838e-06],
        [1.1588e-03],
     

target probs tensor([[1.2855e-03],
        [3.7374e-04],
        [5.7345e-06],
        [4.4962e-05],
        [2.7440e-04],
        [9.5256e-06],
        [1.2096e-04],
        [6.8502e-04],
        [7.5769e-07],
        [1.1769e-06],
        [4.0508e-02],
        [7.0343e-04],
        [3.6102e-06],
        [1.5947e-06],
        [1.4813e-05],
        [1.5701e-08]], device='cuda:0'), loss: 10.36691665649414: 
target probs tensor([[1.2193e-04],
        [5.4652e-05],
        [1.0516e-04],
        [5.8146e-05],
        [4.8231e-05],
        [1.6278e-06],
        [1.3691e-05],
        [6.1597e-04],
        [6.5749e-07],
        [3.5202e-04],
        [1.3175e-03],
        [1.3041e-03],
        [2.9232e-07],
        [1.2726e-06],
        [1.2114e-04],
        [1.0606e-03]], device='cuda:0'), loss: 9.971602439880371: 
target probs tensor([[1.3829e-02],
        [7.1899e-07],
        [5.0027e-05],
        [5.8978e-05],
        [4.3543e-04],
        [1.8301e-04],
        [4.2681e-04],
        [4.24

target probs tensor([[2.5301e-04],
        [8.7782e-05],
        [5.0625e-04],
        [2.1823e-11],
        [1.4552e-02],
        [3.3905e-04],
        [1.6229e-06],
        [5.5348e-04],
        [2.6300e-04],
        [4.6990e-06],
        [3.3669e-05],
        [1.8345e-05],
        [1.2344e-04],
        [8.6095e-03],
        [3.8667e-07],
        [6.1270e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.920244216918945: 
target probs tensor([[2.5180e-06],
        [1.5793e-04],
        [8.3215e-05],
        [9.5926e-05],
        [1.1647e-04],
        [1.6679e-05],
        [5.4648e-05],
        [2.9906e-04],
        [1.3443e-04],
        [1.3618e-04],
        [5.3162e-06],
        [2.4317e-05],
        [1.5084e-04],
        [3.7295e-04],
        [4.9687e-08],
        [1.0068e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.954691886901855: 
target probs tensor([[1.0728e-04],
        [2.1320e-05],
        [8.1433e-07],
        [2.4021e-06],
        [2.0056e-05],
     

epoch,train_loss,valid_loss,validation,fool_loss,time
0,10.676715,10.714426,0.629,10.714425,03:47
1,10.376349,10.493862,0.647,10.493862,03:33
2,10.066933,10.447284,0.661,10.447285,03:34
3,10.125536,10.213342,0.67,10.213342,03:35
4,9.947161,10.356217,0.666,10.356215,03:36


target probs tensor([[8.2842e-11],
        [1.0297e-05],
        [2.5542e-06],
        [1.1588e-07],
        [3.6103e-08],
        [2.8028e-06],
        [5.7797e-08],
        [7.7086e-06],
        [1.3598e-08],
        [1.9046e-10],
        [4.7935e-07],
        [9.1394e-05],
        [6.1519e-06],
        [2.9312e-10],
        [3.1334e-07],
        [1.1529e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 15.31429672241211: 
target probs tensor([[1.3257e-06],
        [2.2368e-10],
        [1.6061e-07],
        [1.3777e-06],
        [4.9677e-09],
        [8.1548e-10],
        [3.7665e-08],
        [3.6985e-06],
        [1.4765e-08],
        [1.3612e-09],
        [3.8203e-11],
        [2.4371e-05],
        [2.8900e-07],
        [5.2139e-07],
        [6.7901e-08],
        [1.4474e-09]], device='cuda:0', grad_fn=<GatherBackward>), loss: 17.002567291259766: 
target probs tensor([[3.2509e-04],
        [5.7307e-06],
        [1.5161e-05],
        [8.3315e-07],
        [1.4954e-07],
    

target probs tensor([[9.4436e-05],
        [3.4996e-05],
        [3.5472e-05],
        [3.0556e-06],
        [2.1061e-05],
        [3.9716e-08],
        [1.3544e-04],
        [2.4557e-08],
        [2.9128e-06],
        [3.8731e-06],
        [6.4663e-06],
        [1.3878e-03],
        [3.9367e-07],
        [1.2756e-05],
        [1.1850e-04],
        [8.4732e-07]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.84255313873291: 
target probs tensor([[1.7405e-06],
        [3.6732e-09],
        [3.0396e-05],
        [1.5220e-06],
        [1.0108e-02],
        [2.6356e-05],
        [1.9700e-04],
        [4.0869e-06],
        [8.7442e-05],
        [3.3458e-06],
        [2.3236e-06],
        [6.4066e-06],
        [7.7759e-06],
        [1.2114e-03],
        [2.2405e-03],
        [5.2518e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.577810287475586: 
target probs tensor([[1.3411e-03],
        [2.0963e-06],
        [3.3915e-05],
        [7.7318e-04],
        [1.0554e-05],
    

target probs tensor([[8.9622e-06],
        [1.6571e-04],
        [3.2670e-05],
        [1.7494e-05],
        [3.0718e-05],
        [9.3122e-05],
        [1.6874e-04],
        [2.8435e-04],
        [5.6894e-05],
        [1.1642e-04],
        [1.0643e-05],
        [7.7953e-04],
        [9.0922e-05],
        [1.7644e-04],
        [7.6895e-05],
        [2.8640e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.591062545776367: 
target probs tensor([[3.7455e-04],
        [1.0648e-07],
        [1.5385e-04],
        [3.0478e-09],
        [2.4496e-09],
        [9.6754e-04],
        [5.5692e-04],
        [2.3247e-05],
        [5.7610e-04],
        [1.9902e-04],
        [6.3525e-09],
        [2.5448e-05],
        [2.2778e-04],
        [2.5976e-04],
        [4.1800e-04],
        [6.0504e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.902685165405273: 
target probs tensor([[2.3243e-05],
        [5.7877e-10],
        [3.4392e-04],
        [1.4754e-05],
        [8.1957e-07],
    

target probs tensor([[1.3859e-09],
        [2.8354e-04],
        [3.8948e-05],
        [2.7700e-06],
        [4.7886e-04],
        [1.6362e-04],
        [3.8326e-05],
        [6.3625e-09],
        [8.7453e-06],
        [8.9514e-07],
        [2.2194e-03],
        [1.4802e-08],
        [1.9376e-06],
        [4.2331e-06],
        [3.7594e-04],
        [4.3347e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.874895095825195: 
target probs tensor([[1.7108e-06],
        [1.0187e-04],
        [1.0909e-05],
        [1.6684e-05],
        [1.2368e-07],
        [1.5365e-04],
        [2.1036e-05],
        [1.0860e-06],
        [2.9637e-05],
        [3.9929e-05],
        [1.5681e-05],
        [1.3066e-04],
        [1.6173e-03],
        [1.7742e-05],
        [1.5787e-07],
        [8.7923e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.775444984436035: 
target probs tensor([[7.8058e-04],
        [4.6123e-04],
        [2.2554e-06],
        [7.4701e-06],
        [3.9935e-02],
   

target probs tensor([[1.7248e-03],
        [2.4345e-05],
        [2.3069e-05],
        [2.5458e-04],
        [5.2204e-05],
        [8.7372e-06],
        [5.6247e-03],
        [3.7285e-05],
        [1.8376e-05],
        [2.5253e-05],
        [3.2848e-03],
        [3.2031e-04],
        [2.4614e-08],
        [1.8962e-03],
        [2.2597e-03],
        [4.6488e-09]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.820119857788086: 
target probs tensor([[1.3282e-04],
        [4.3487e-06],
        [2.0522e-14],
        [6.2911e-05],
        [1.0325e-04],
        [9.1161e-05],
        [1.5284e-04],
        [8.8465e-05],
        [3.4472e-05],
        [2.1688e-04],
        [2.7352e-04],
        [3.1432e-05],
        [2.0907e-04],
        [1.2647e-06],
        [1.8450e-03],
        [3.2951e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.26386547088623: 
target probs tensor([[4.8548e-05],
        [2.6649e-06],
        [3.8360e-04],
        [6.4275e-04],
        [2.7934e-06],
     

target probs tensor([[4.5167e-05],
        [2.3808e-03],
        [1.9285e-04],
        [2.2108e-04],
        [2.6466e-05],
        [4.6510e-03],
        [1.7963e-06],
        [5.6245e-06],
        [1.0813e-04],
        [6.5432e-04],
        [5.1378e-06],
        [1.5171e-04],
        [2.5882e-05],
        [4.9308e-05],
        [5.3483e-05],
        [6.4158e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.478168487548828: 
target probs tensor([[2.6953e-05],
        [7.1683e-05],
        [1.1311e-04],
        [6.0291e-04],
        [1.8752e-03],
        [2.4741e-06],
        [2.4908e-07],
        [5.8370e-04],
        [1.8730e-04],
        [1.4384e-06],
        [1.3598e-05],
        [9.8051e-04],
        [1.0219e-05],
        [1.6611e-04],
        [7.2185e-05],
        [5.9888e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.020566940307617: 
target probs tensor([[1.3088e-04],
        [1.5560e-09],
        [5.1114e-05],
        [2.5665e-06],
        [7.9549e-03],
    

target probs tensor([[7.4451e-10],
        [1.3345e-02],
        [4.4398e-04],
        [9.4448e-06],
        [6.2345e-05],
        [4.2952e-06],
        [3.2344e-05],
        [9.8872e-07],
        [2.9803e-07],
        [4.0360e-06],
        [1.7025e-04],
        [2.7629e-05],
        [1.5838e-04],
        [4.8100e-05],
        [3.6764e-08],
        [1.7973e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.359933853149414: 
target probs tensor([[1.3702e-06],
        [9.0580e-03],
        [9.9772e-04],
        [2.7769e-04],
        [1.7496e-06],
        [1.3418e-04],
        [9.8988e-07],
        [2.5787e-05],
        [1.5275e-04],
        [4.1024e-05],
        [7.9979e-05],
        [6.6544e-05],
        [3.9574e-04],
        [2.3718e-07],
        [4.6547e-06],
        [1.4411e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.12612533569336: 
target probs tensor([[7.7061e-05],
        [5.2679e-04],
        [1.0356e-04],
        [7.2127e-04],
        [1.2360e-05],
    

target probs tensor([[4.6012e-07],
        [8.9085e-05],
        [2.4034e-05],
        [8.4764e-05],
        [2.2943e-05],
        [4.1835e-05],
        [3.2358e-03],
        [6.2371e-06],
        [8.4379e-05],
        [2.7738e-05],
        [3.4415e-10],
        [3.6319e-05],
        [4.2279e-04],
        [1.6963e-06],
        [6.0567e-05],
        [3.5827e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.940437316894531: 
target probs tensor([[1.4842e-04],
        [2.0186e-06],
        [1.0164e-01],
        [5.7035e-04],
        [2.3823e-04],
        [3.5511e-04],
        [2.6272e-06],
        [1.2316e-04],
        [1.1920e-04],
        [3.8977e-03],
        [7.5346e-06],
        [8.1839e-05],
        [3.4295e-04],
        [6.1019e-04],
        [8.0187e-05],
        [7.1270e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.60420036315918: 
target probs tensor([[8.3376e-06],
        [2.5374e-04],
        [1.5320e-05],
        [1.0855e-07],
        [9.3403e-06],
     

target probs tensor([[9.4839e-03],
        [1.6308e-05],
        [2.5181e-05],
        [5.9079e-05],
        [1.3920e-04],
        [5.8571e-03],
        [1.5601e-04],
        [4.2900e-07],
        [2.3106e-05],
        [1.2894e-05],
        [7.0761e-05],
        [1.1196e-05],
        [4.5995e-05],
        [6.2302e-06],
        [8.9197e-05],
        [4.9324e-04]], device='cuda:0'), loss: 9.70358943939209: 
target probs tensor([[4.6050e-05],
        [1.3425e-04],
        [4.6845e-05],
        [7.0715e-05],
        [3.9277e-04],
        [4.5402e-06],
        [2.3420e-03],
        [2.7555e-07],
        [2.6096e-04],
        [9.8433e-06],
        [2.0517e-04],
        [1.9127e-06],
        [8.5716e-04],
        [3.9230e-05],
        [1.7093e-04],
        [6.4863e-08]], device='cuda:0'), loss: 10.225179672241211: 
target probs tensor([[1.2266e-05],
        [5.6221e-05],
        [1.9035e-09],
        [1.7443e-03],
        [2.0989e-04],
        [4.1925e-09],
        [1.1577e-05],
        [9.92

In [None]:
# Training driver cell: fit the learner with checkpointing enabled, then move
# the run's CSV log and saved models into the environment's archive folders.

# Make it hard to overlook that this run uses the sanity-check dataset.
if mode == "sanity_check":
  print("\n\n\nWARNING: you are training on a sanity_check dataset.\n\n\n\n")

# Two checkpoint callbacks: one monitors the 'validation' metric and saves
# under "<save_filename>-best", the other saves under "<save_filename>" on
# every epoch.
saver_best = SaveModelCallback(
  learn,
  every='improvement',
  monitor='validation',
  name=f"{env.save_filename}-best",
)
saver_every_epoch = SaveModelCallback(
  learn,
  every='epoch',
  name=env.save_filename,
)

# Active schedule: 200 epochs at a fixed learning rate with light weight decay.
learn.fit(
  200,
  lr=1e-2,
  wd=0.001,
  callbacks=[saver_best, saver_every_epoch],
)

# Alternative schedules from earlier experiments, kept for reference:
#
# with Hooks(gen, append_stats_normal) as hooks:
#   learn.fit(1, lr=5e-03, wd = 0., callbacks=[saver_best, saver_every_epoch])
#
# learn.fit(70, lr=1e-02, wd = 0.001, callbacks=[saver_best, saver_every_epoch])
# learn.fit(60, lr=1e-2, wd = 0.001, callbacks=[saver_best, saver_every_epoch])
#
# for i in range(10):
#   learn.fit_one_cycle(7, wd = 0.,max_lr=1., div_factor = 1000.)
#
# learn.fit_one_cycle(5, max_lr=2e-2) #used for vgg-19-bn
# learn.fit_one_cycle(5, max_lr=3e-3) # used for resnet50

# Archive the per-epoch CSV log (presumably written to temp_csv_path by a
# logger configured elsewhere in the notebook -- TODO confirm).
csv_source = f"{env.temp_csv_path}/{env.save_filename}.csv"
csv_target = f"{env.get_csv_path()}.csv"
shutil.copyfile(csv_source, csv_target)

# Copy the checkpoint folder to its archive location, then delete the working
# copy. NOTE(review): copytree raises if the destination already exists, so a
# re-run with the same save_filename will stop here with the working copy intact.
models_source = env.data_path/"models"
shutil.copytree(models_source, env.get_models_path())
shutil.rmtree(models_source)

epoch,train_loss,valid_loss,validation,fool_loss,time
0,9.248246,9.5301,0.754,9.530099,03:37
1,9.131828,9.567307,0.759,9.567307,03:33
2,9.259413,9.435518,0.762,9.435517,03:35
3,9.054097,9.331469,0.771,9.331469,03:35
4,8.939501,9.31979,0.762,9.319793,03:36
5,8.96288,9.222818,0.772,9.222818,03:38
6,8.984812,9.244467,0.768,9.244468,03:37
7,8.922865,9.128101,0.774,9.128102,03:39
8,8.754309,9.067658,0.768,9.067658,03:31
9,8.786857,9.040127,0.776,9.040127,03:33


target probs tensor([[5.0006e-04],
        [1.3348e-03],
        [6.6735e-04],
        [1.8381e-04],
        [3.7858e-09],
        [1.2799e-04],
        [4.5507e-04],
        [8.2727e-04],
        [4.5192e-04],
        [6.5848e-06],
        [5.7615e-04],
        [8.8088e-04],
        [1.5797e-05],
        [2.7098e-05],
        [4.2068e-03],
        [3.5767e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.041454315185547: 
target probs tensor([[6.3078e-09],
        [3.0633e-04],
        [3.4111e-04],
        [5.3928e-04],
        [1.6701e-04],
        [7.0428e-05],
        [1.4906e-03],
        [1.6047e-04],
        [8.3365e-04],
        [1.2788e-04],
        [1.6787e-05],
        [2.0948e-04],
        [4.9507e-05],
        [2.5613e-03],
        [5.7373e-05],
        [1.3192e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.129331588745117: 
target probs tensor([[9.2406e-05],
        [7.7818e-05],
        [8.1836e-04],
        [5.5683e-06],
        [1.6330e-03],
     

target probs tensor([[6.9316e-04],
        [6.1476e-05],
        [3.8512e-03],
        [1.5146e-03],
        [2.0479e-05],
        [1.4209e-05],
        [2.8110e-04],
        [1.8949e-04],
        [1.2970e-03],
        [3.1063e-05],
        [2.5497e-04],
        [7.3798e-06],
        [6.3403e-05],
        [9.7550e-05],
        [7.3415e-05],
        [4.2406e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.958498001098633: 
target probs tensor([[3.5593e-05],
        [2.6169e-04],
        [4.4816e-05],
        [4.3095e-04],
        [1.0003e-04],
        [1.0530e-03],
        [2.3372e-04],
        [8.7362e-04],
        [2.9306e-03],
        [6.5903e-05],
        [2.0953e-04],
        [7.1578e-05],
        [2.1510e-03],
        [3.8071e-03],
        [3.2558e-04],
        [1.4273e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 7.968400001525879: 
target probs tensor([[5.2376e-04],
        [4.0884e-04],
        [1.8743e-05],
        [5.4460e-04],
        [3.4974e-06],
     

target probs tensor([[1.1730e-03],
        [7.4328e-05],
        [7.0149e-05],
        [5.2440e-05],
        [2.9257e-05],
        [3.2245e-04],
        [4.2501e-04],
        [1.0768e-05],
        [1.5667e-05],
        [3.6481e-06],
        [1.8527e-06],
        [8.3368e-06],
        [8.7429e-04],
        [1.1270e-03],
        [2.3633e-05],
        [4.4295e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.771492004394531: 
target probs tensor([[7.8111e-04],
        [7.5038e-05],
        [5.9134e-05],
        [2.2932e-05],
        [2.8808e-04],
        [1.2260e-03],
        [1.5658e-02],
        [2.9331e-05],
        [1.9532e-04],
        [5.2704e-05],
        [1.5277e-03],
        [6.3252e-05],
        [5.0946e-08],
        [5.9258e-03],
        [1.0503e-04],
        [3.5226e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.89993667602539: 
target probs tensor([[2.4391e-03],
        [2.0151e-04],
        [3.4635e-04],
        [3.1185e-06],
        [2.3795e-05],
      

target probs tensor([[5.9053e-04],
        [1.9704e-03],
        [3.4688e-04],
        [3.1450e-04],
        [5.3694e-06],
        [8.7640e-06],
        [1.2961e-03],
        [3.7088e-05],
        [3.2470e-04],
        [1.8887e-05],
        [3.0081e-05],
        [1.4849e-04],
        [4.1603e-05],
        [1.9261e-04],
        [1.0622e-02],
        [1.5715e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.775434494018555: 
target probs tensor([[1.0312e-03],
        [1.0997e-03],
        [3.2407e-03],
        [3.7280e-05],
        [8.0565e-05],
        [2.8263e-06],
        [6.8633e-08],
        [2.2292e-04],
        [8.7065e-06],
        [7.1306e-05],
        [1.7414e-03],
        [1.5131e-04],
        [4.1387e-04],
        [1.0988e-03],
        [1.6499e-05],
        [3.9361e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.158031463623047: 
target probs tensor([[6.1895e-05],
        [6.7329e-06],
        [1.3856e-03],
        [1.5986e-04],
        [5.7967e-02],
     

Better model found at epoch 2 with validation value: 0.7620000243186951.
target probs tensor([[3.7455e-05],
        [3.8997e-04],
        [6.6101e-06],
        [3.2807e-06],
        [5.9892e-04],
        [1.0134e-03],
        [8.2123e-04],
        [8.7166e-04],
        [1.5060e-05],
        [9.8947e-06],
        [8.2039e-03],
        [1.1187e-03],
        [2.4823e-03],
        [1.5994e-04],
        [5.9909e-06],
        [3.4557e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.751331329345703: 
target probs tensor([[9.1323e-06],
        [7.8150e-05],
        [3.8441e-06],
        [9.7724e-05],
        [1.7312e-05],
        [2.1963e-03],
        [9.8374e-08],
        [7.9648e-04],
        [6.5206e-05],
        [3.6635e-04],
        [1.7181e-04],
        [7.4370e-05],
        [4.0206e-04],
        [7.5315e-06],
        [3.4382e-05],
        [7.4498e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.890012741088867: 
target probs tensor([[1.1978e-04],
        [2.6335e-03]

target probs tensor([[2.3738e-04],
        [1.6395e-04],
        [2.0235e-05],
        [8.1146e-05],
        [1.5539e-04],
        [4.8549e-05],
        [3.4061e-05],
        [2.8629e-03],
        [1.6419e-03],
        [7.1337e-04],
        [2.7661e-05],
        [1.1450e-03],
        [2.0531e-03],
        [9.0395e-04],
        [1.0275e-04],
        [2.8704e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.349775314331055: 
target probs tensor([[6.0495e-04],
        [2.2689e-03],
        [3.2489e-05],
        [1.3720e-04],
        [6.6994e-08],
        [7.2552e-07],
        [2.2780e-04],
        [2.2081e-03],
        [7.5759e-04],
        [7.7230e-04],
        [2.6549e-05],
        [2.3010e-04],
        [3.5070e-05],
        [8.5885e-04],
        [1.6131e-03],
        [1.9078e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.11019515991211: 
target probs tensor([[1.9285e-03],
        [1.4368e-01],
        [2.1609e-05],
        [3.3829e-03],
        [1.7226e-05],
      

target probs tensor([[7.4497e-06],
        [2.1454e-03],
        [6.7859e-06],
        [5.0250e-04],
        [3.3629e-05],
        [7.5385e-06],
        [4.3857e-06],
        [5.7465e-04],
        [5.1536e-04],
        [3.4693e-06],
        [4.8959e-06],
        [4.4687e-04],
        [1.1838e-05],
        [1.7747e-04],
        [1.6411e-05],
        [1.8269e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.083383560180664: 
target probs tensor([[1.6714e-02],
        [1.6063e-04],
        [1.8164e-04],
        [2.7045e-08],
        [1.5653e-03],
        [5.2788e-04],
        [1.5912e-04],
        [8.9502e-06],
        [3.5937e-05],
        [2.0879e-04],
        [8.7008e-05],
        [6.0143e-05],
        [8.4026e-05],
        [1.7169e-02],
        [8.1198e-05],
        [4.7409e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.84609317779541: 
target probs tensor([[3.3207e-04],
        [3.8435e-06],
        [3.6426e-04],
        [9.6773e-06],
        [6.8741e-05],
     

target probs tensor([[4.1655e-03],
        [1.2656e-04],
        [1.7101e-04],
        [6.2868e-04],
        [2.0060e-07],
        [6.4823e-04],
        [2.2623e-04],
        [3.3606e-04],
        [1.2003e-02],
        [1.1489e-02],
        [2.3993e-03],
        [5.5814e-04],
        [5.3271e-08],
        [2.0351e-03],
        [3.8899e-04],
        [2.0629e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.209383010864258: 
target probs tensor([[6.0041e-04],
        [1.1289e-05],
        [2.3487e-04],
        [7.0695e-07],
        [1.1338e-03],
        [5.9720e-04],
        [4.7946e-06],
        [3.4609e-06],
        [1.7389e-04],
        [2.7439e-04],
        [4.5797e-04],
        [8.4347e-04],
        [3.2017e-06],
        [9.8326e-05],
        [3.6021e-06],
        [4.6552e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.629107475280762: 
target probs tensor([[2.2935e-04],
        [6.5148e-07],
        [2.4939e-03],
        [9.1050e-04],
        [1.3063e-04],
     

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



target probs tensor([[6.4398e-04],
        [7.6785e-04],
        [6.2049e-03],
        [2.1890e-03],
        [5.5576e-08],
        [2.7797e-09],
        [5.5950e-03],
        [7.0215e-04],
        [4.0214e-03],
        [1.0384e-04],
        [3.0282e-03],
        [2.3261e-04],
        [3.2491e-04],
        [1.0211e-02],
        [4.5816e-05],
        [5.2725e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.490955352783203: 
target probs tensor([[1.0110e-02],
        [1.9081e-03],
        [1.9831e-03],
        [3.4760e-05],
        [1.2229e-04],
        [5.6157e-05],
        [2.0734e-04],
        [8.6936e-04],
        [1.0934e-04],
        [2.8214e-04],
        [5.6469e-04],
        [1.4434e-04],
        [1.8253e-04],
        [6.6549e-03],
        [1.9321e-04],
        [2.2745e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 7.721676349639893: 
target probs tensor([[4.1276e-04],
        [2.1427e-04],
        [3.7139e-05],
        [3.0809e-04],
        [8.7906e-04],
     

target probs tensor([[1.0638e-06],
        [4.4032e-04],
        [1.2489e-05],
        [1.8941e-03],
        [7.5191e-05],
        [9.0963e-09],
        [1.3689e-03],
        [1.3102e-05],
        [3.8554e-05],
        [6.4384e-04],
        [3.5165e-04],
        [1.5956e-05],
        [5.7357e-04],
        [1.3537e-02],
        [8.3880e-04],
        [5.6976e-04]], device='cuda:0'), loss: 9.231719017028809: 
target probs tensor([[9.3852e-05],
        [4.1723e-03],
        [2.3637e-07],
        [2.9420e-05],
        [1.1292e-03],
        [5.2582e-08],
        [1.1761e-03],
        [1.0265e-05],
        [2.0662e-04],
        [5.2060e-06],
        [5.0769e-04],
        [6.2257e-06],
        [5.3864e-05],
        [1.0974e-04],
        [1.0743e-03],
        [9.1111e-05]], device='cuda:0'), loss: 9.84566593170166: 
target probs tensor([[1.6583e-04],
        [5.5576e-05],
        [7.9238e-03],
        [1.0505e-05],
        [8.9354e-06],
        [1.0261e-02],
        [2.3981e-05],
        [9.284

target probs tensor([[9.6218e-05],
        [4.9284e-04],
        [6.6205e-08],
        [1.0680e-06],
        [8.2277e-05],
        [4.5589e-04],
        [2.5586e-04],
        [1.2192e-04],
        [5.0917e-04],
        [2.0254e-04],
        [1.5108e-04],
        [1.7090e-05],
        [5.7090e-05],
        [7.5861e-08],
        [1.0159e-04],
        [5.8159e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 10.012174606323242: 
target probs tensor([[4.9630e-05],
        [2.3823e-02],
        [4.0863e-05],
        [1.6290e-04],
        [2.5499e-04],
        [8.8376e-04],
        [4.0964e-03],
        [1.6791e-05],
        [9.4581e-04],
        [8.4686e-05],
        [6.5444e-06],
        [7.1680e-03],
        [5.3818e-05],
        [1.7875e-05],
        [1.1112e-02],
        [7.2331e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.123878479003906: 
target probs tensor([[2.6697e-05],
        [1.2670e-05],
        [6.6499e-04],
        [1.0627e-03],
        [8.5858e-05],
    

target probs tensor([[8.6305e-07],
        [3.3464e-06],
        [2.2252e-03],
        [6.5904e-04],
        [3.3338e-02],
        [1.1752e-04],
        [6.6285e-04],
        [1.4731e-04],
        [1.4451e-03],
        [1.3316e-06],
        [6.7681e-04],
        [1.6849e-05],
        [9.0280e-04],
        [5.1048e-06],
        [1.4767e-05],
        [6.9526e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.3217191696167: 
target probs tensor([[1.1269e-03],
        [2.2827e-06],
        [1.6027e-04],
        [7.0878e-04],
        [6.9555e-04],
        [2.5292e-03],
        [1.0671e-04],
        [6.6469e-03],
        [7.8486e-06],
        [2.1061e-04],
        [8.6497e-04],
        [6.7409e-05],
        [7.3583e-05],
        [3.2486e-04],
        [4.0431e-04],
        [6.0001e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.44628620147705: 
target probs tensor([[5.4281e-03],
        [1.3440e-03],
        [1.5695e-03],
        [4.6125e-06],
        [3.6315e-04],
        

target probs tensor([[1.7480e-04],
        [9.6766e-04],
        [4.8969e-04],
        [7.4812e-04],
        [4.2225e-04],
        [2.5520e-04],
        [1.7096e-03],
        [4.2949e-04],
        [1.5296e-03],
        [4.2228e-04],
        [2.1465e-05],
        [7.3079e-04],
        [1.8397e-03],
        [4.3219e-04],
        [4.0275e-04],
        [7.1052e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 7.619657516479492: 
target probs tensor([[2.5460e-04],
        [4.6299e-04],
        [1.6355e-05],
        [5.7826e-05],
        [8.0199e-03],
        [8.3910e-04],
        [8.7411e-04],
        [4.7362e-04],
        [3.2176e-04],
        [1.0635e-03],
        [3.5915e-06],
        [1.5632e-04],
        [5.0823e-05],
        [3.9331e-06],
        [3.5317e-05],
        [4.8089e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.734481811523438: 
target probs tensor([[4.4090e-04],
        [4.0091e-03],
        [1.0002e-05],
        [1.2176e-03],
        [4.6553e-05],
     

target probs tensor([[3.9149e-04],
        [1.2043e-04],
        [2.4761e-04],
        [4.9327e-06],
        [3.8620e-03],
        [1.8218e-06],
        [1.4150e-05],
        [3.9143e-05],
        [1.6093e-04],
        [1.2451e-04],
        [5.5147e-05],
        [8.6800e-03],
        [9.6932e-04],
        [5.6086e-04],
        [1.3488e-03],
        [1.8624e-02]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.42339038848877: 
target probs tensor([[5.1505e-05],
        [4.9496e-03],
        [1.3155e-05],
        [2.3478e-03],
        [2.1634e-04],
        [3.6762e-05],
        [2.1511e-02],
        [5.7103e-04],
        [2.1441e-05],
        [1.1121e-04],
        [2.8548e-04],
        [1.7412e-04],
        [5.0539e-04],
        [1.4417e-04],
        [4.0728e-05],
        [3.4086e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.3519287109375: 
target probs tensor([[3.9025e-03],
        [2.2356e-04],
        [4.7492e-05],
        [1.7367e-04],
        [7.9507e-07],
        

target probs tensor([[9.8154e-04],
        [1.9793e-03],
        [8.5458e-04],
        [1.4856e-04],
        [1.9401e-04],
        [9.4895e-05],
        [4.5792e-03],
        [5.9030e-05]], device='cuda:0'), loss: 7.745607852935791: 
target probs tensor([[1.6645e-04],
        [2.7985e-03],
        [1.5135e-02],
        [2.3492e-04],
        [6.2049e-06],
        [2.8481e-03],
        [1.8115e-04],
        [2.5899e-04],
        [4.0398e-08],
        [2.2748e-05],
        [2.7034e-05],
        [3.6341e-05],
        [5.8258e-05],
        [1.9362e-03],
        [9.6413e-05],
        [7.9669e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.06180191040039: 
target probs tensor([[1.1014e-04],
        [1.5615e-04],
        [1.3946e-04],
        [1.1797e-05],
        [1.1801e-04],
        [4.2394e-04],
        [1.1379e-08],
        [3.6304e-03],
        [2.3725e-04],
        [4.0454e-04],
        [2.8616e-03],
        [3.3874e-03],
        [2.6722e-06],
        [5.4937e-04],
        [4

target probs tensor([[4.5950e-03],
        [1.3830e-02],
        [1.1882e-01],
        [7.7077e-04],
        [1.2336e-03],
        [5.1754e-03],
        [1.1116e-02],
        [6.6395e-06],
        [8.6456e-05],
        [6.3855e-05],
        [3.5867e-05],
        [2.1063e-04],
        [1.2809e-04],
        [1.9999e-03],
        [2.4554e-03],
        [6.6185e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 7.385863304138184: 
target probs tensor([[3.3913e-04],
        [1.8343e-03],
        [1.2997e-03],
        [6.8276e-06],
        [6.0055e-05],
        [3.4753e-05],
        [4.4928e-04],
        [4.8413e-06],
        [2.7283e-07],
        [6.9759e-04],
        [3.4811e-05],
        [2.7587e-05],
        [1.8893e-04],
        [8.4438e-05],
        [1.2674e-02],
        [3.3377e-05]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.283699035644531: 
target probs tensor([[1.0343e-04],
        [1.3079e-04],
        [4.1350e-04],
        [1.1263e-04],
        [1.9716e-06],
     

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



target probs tensor([[7.8706e-03],
        [2.3835e-05],
        [1.7786e-03],
        [1.7034e-06],
        [6.3835e-07],
        [4.2686e-04],
        [4.2946e-11],
        [7.5493e-08],
        [7.0661e-08],
        [1.3475e-04],
        [4.7408e-04],
        [9.9641e-09],
        [3.4120e-08],
        [9.4691e-04],
        [2.6008e-03],
        [8.3713e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 11.55176067352295: 
target probs tensor([[3.8259e-03],
        [2.5641e-05],
        [8.2240e-06],
        [3.5102e-04],
        [4.8410e-04],
        [1.7893e-04],
        [5.8821e-05],
        [2.4290e-04],
        [9.5906e-04],
        [2.3592e-06],
        [1.0615e-07],
        [1.0769e-03],
        [5.2427e-05],
        [2.8868e-04],
        [1.1343e-04],
        [1.3377e-04]], device='cuda:0', grad_fn=<GatherBackward>), loss: 9.308362007141113: 
target probs tensor([[2.2933e-03],
        [1.2230e-05],
        [1.1020e-01],
        [2.4972e-03],
        [6.7504e-03],
     

target probs tensor([[1.6994e-03],
        [1.4150e-03],
        [5.0900e-03],
        [9.9608e-06],
        [9.4377e-05],
        [4.1506e-04],
        [1.5570e-04],
        [3.1786e-08],
        [8.0760e-05],
        [4.2688e-03],
        [2.8396e-04],
        [3.7615e-07],
        [3.1922e-02],
        [6.7459e-03],
        [3.3920e-04],
        [7.6216e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.679765701293945: 
target probs tensor([[3.0136e-03],
        [2.0050e-03],
        [3.4297e-04],
        [7.9690e-04],
        [2.9923e-03],
        [1.1565e-03],
        [1.9789e-03],
        [5.6998e-04],
        [2.1836e-04],
        [4.9292e-03],
        [8.8449e-04],
        [8.5908e-04],
        [6.7644e-04],
        [1.1820e-06],
        [3.1589e-04],
        [1.5069e-01]], device='cuda:0', grad_fn=<GatherBackward>), loss: 7.008136749267578: 
target probs tensor([[1.3109e-05],
        [2.5476e-03],
        [1.7749e-03],
        [2.9552e-07],
        [8.4541e-04],
     

target probs tensor([[1.3557e-03],
        [1.2392e-04],
        [1.2876e-03],
        [1.0055e-03],
        [1.3402e-05],
        [2.2333e-05],
        [1.6606e-06],
        [5.9456e-04],
        [3.0105e-04],
        [1.8344e-05],
        [5.0509e-04],
        [7.2657e-04],
        [2.3694e-03],
        [1.5406e-04],
        [1.0789e-03],
        [5.4941e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.713835716247559: 
target probs tensor([[3.2685e-03],
        [1.8905e-05],
        [3.9445e-05],
        [1.9507e-05],
        [8.2859e-05],
        [1.9132e-04],
        [5.4563e-04],
        [3.0514e-04],
        [3.3880e-07],
        [3.4201e-03],
        [2.8300e-05],
        [7.0315e-03],
        [3.7125e-03],
        [4.4172e-04],
        [1.4086e-03],
        [5.8050e-04]], device='cuda:0'), loss: 8.406028747558594: 
target probs tensor([[2.6854e-05],
        [6.0704e-04],
        [2.5387e-14],
        [6.3448e-06],
        [6.7850e-05],
        [1.3834e-03],
        [

target probs tensor([[2.4208e-07],
        [1.6007e-05],
        [2.6588e-04],
        [4.0307e-04],
        [3.7668e-04],
        [1.1120e-03],
        [3.0729e-04],
        [8.2049e-05],
        [6.5653e-03],
        [6.1332e-04],
        [3.1944e-03],
        [3.9247e-05],
        [5.6795e-05],
        [1.1975e-04],
        [8.5479e-05],
        [3.6582e-03]], device='cuda:0', grad_fn=<GatherBackward>), loss: 8.537824630737305: 
target probs tensor([[1.1593e-02],
        [5.4423e-04],
        [2.3079e-03],
        [8.1573e-04],
        [1.0657e-03],
        [1.9502e-03],
        [4.3238e-04],
        [6.6575e-05],
        [1.1969e-03],
        [1.2752e-03],
        [6.4813e-03],
        [4.7918e-04],
        [3.0755e-03],
        [3.0277e-04],
        [7.7579e-02],
        [2.4820e-06]], device='cuda:0', grad_fn=<GatherBackward>), loss: 6.93926477432251: 
target probs tensor([[8.2865e-03],
        [1.4947e-03],
        [1.0752e-02],
        [5.7023e-03],
        [3.2285e-04],
      

In [None]:
# Plot histogram: one subplot per hook, stacked vertically, each showing the
# image returned by get_hist(h).
# squeeze=False makes plt.subplots always return a 2-D array of Axes, so
# axes.flatten() also works when there is only a single hook (otherwise a
# bare Axes object without .flatten() would be returned).
fig, axes = plt.subplots(len(hooks), 1, figsize=(30,12), squeeze=False)
for ax,h in zip(axes.flatten(), hooks):
  ax.imshow(get_hist(h), origin='lower')  # origin='lower' puts row 0 at the bottom
  ax.axis('off')
plt.tight_layout()

In [None]:
# plot mean and std
# One line per hook: the first 100 entries of the first two items of h.stats
# (ms on the left axes, ss on the right axes).
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks:
  ms, ss, _ = h.stats  # h.stats must unpack into exactly three items; the third is unused here
  ax0.plot(ms[:100])
  ax1.plot(ss[:100])
# Legend entries are the hook indices. plt.legend acts on the current axes —
# presumably ax1 (the last subplot created), so ax0 gets no legend; TODO confirm.
plt.legend(range(len(hooks)))

In [None]:
# Same plot as the "first 100" variant, but over the full recorded history:
# one line per hook, ms on the left axes and ss on the right axes.
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks:
  ms, ss, _ = h.stats  # h.stats must unpack into exactly three items; the third is unused here
  ax0.plot(ms)
  ax1.plot(ss)
# Legend entries are the hook indices; plt.legend acts on the current axes only.
plt.legend(range(len(hooks)))

In [None]:
# zero percentage: one subplot per hook, plotting get_min(h) with the y-axis
# fixed to [0, 1].
# squeeze=False makes plt.subplots always return a 2-D array of Axes, so
# axes.flatten() also works when there is only a single hook.
fig,axes = plt.subplots(len(hooks),1, figsize=(30,30), squeeze=False)
for ax,h in zip(axes.flatten(), hooks):
    ax.plot(get_min(h))
    ax.set_ylim(0,1)
plt.tight_layout()

In [None]:
# z1 = torch.empty(10).uniform_(-1,1).cuda()
# z2 = torch.empty(10).uniform_(-1,1).cuda()
# Two fixed 10-dimensional latent vectors used as the interpolation endpoints.
z1 = torch.tensor([0.8, -0.5] * 5).cuda()
z2 = torch.tensor([-1.] * 10).cuda()
print("z1: ", z1)
print("z2: ", z2)
print("distance: ", torch.norm(z1-z2,p=2))  # Euclidean (L2) distance between the endpoints
model = learn.model.eval()

# interpolate is defined elsewhere; presumably it returns the latent vectors
# between z1 and z2 for a step of 0.1 — TODO confirm.
z_s = interpolate(z1, z2, 0.1)
print(len(z_s))

# Convert the model output for each interpolated latent vector to an image and display it.
for i,z in enumerate(z_s):
  img = noise_to_image(model.forward_single_z(z))
  img.show()
  #img.save('./pics/' + str(i) + '.png')

In [27]:
def generate_perturbations(learn, n_perturbations):
  """Run the model `n_perturbations` times on the first validation image.

  The model is put in eval mode for the duration of the call and its previous
  train/eval mode is restored afterwards, even if a forward pass raises.
  Inference runs under `torch.no_grad()` so the returned tensors do not keep
  an autograd graph alive.

  Args:
    learn: learner exposing `model` and `data.valid_ds`.
    n_perturbations: number of forward passes to run on the same input image.

  Returns:
    A list with one tensor per forward pass: `model(input_img)[0].squeeze()`.
  """
  initial_training_mode = learn.model.training
  model = learn.model.eval()
  try:
    # Add a leading batch dimension to the first validation image and move it to the GPU.
    input_img = (learn.data.valid_ds[0][0].data)[None].cuda()
    with torch.no_grad():
      # The same image is fed every time; presumably the model draws fresh
      # noise internally on each call — TODO confirm.
      return [model(input_img)[0].squeeze() for _ in range(n_perturbations)]
  finally:
    # Restore whatever mode the model was in when we were called.
    learn.model.train(initial_training_mode)

def compute_prediction_histogram(learn, perturbation, verbose=False):
  """Count the classes `arch` predicts for the perturbed validation set.

  Args:
    learn: learner exposing `data.valid_dl`, which yields (batch, target) pairs.
    perturbation: tensor added to every batch (a leading batch axis is prepended).
    verbose: when true, print progress every 100 batches.

  Returns:
    A list of 1000 ints; entry i is how many samples were predicted as class i.
  """
  class_counts = [0] * 1000
  for batch_idx, (images, _) in enumerate(learn.data.valid_dl):
    if verbose and batch_idx % 100 == 0:
      print ("at batch no {}".format(batch_idx))
    # arch is the module-level classifier; argmax over dim 1 gives one class index per sample.
    predicted = arch(images + perturbation[None]).argmax(1)
    for class_idx in predicted:
      class_counts[class_idx] += 1
  return class_counts


def compute_mean_prediction_histogram(learn, perturbations):
  """Average the per-class prediction histograms over several perturbations.

  Args:
    learn: learner forwarded to `compute_prediction_histogram`.
    perturbations: iterable (with a length) of perturbation tensors.

  Returns:
    A list of 1000 floats: the element-wise mean of the individual histograms.
  """
  total_counts = torch.zeros(1000, dtype=torch.int64)
  for idx, noise in enumerate(perturbations):
    counts = compute_prediction_histogram(learn, noise, True)
    total_counts += torch.tensor(counts)
    print("finished creating histogram for the {}th perturbation".format(idx))

  # Convert to float before dividing so the mean is not truncated.
  return (total_counts.float() / len(perturbations)).tolist()


def diversity(learn, n_perturbations, percentage = 95):
  """Count how many predicted classes account for `percentage` % of all predictions.

  Generates `n_perturbations` perturbations, builds the mean prediction
  histogram over the validation set, sorts the classes by descending count and
  accumulates them until their cumulative share reaches `percentage`.

  Args:
    learn: learner forwarded to the perturbation and histogram helpers.
    n_perturbations: number of perturbations to average over.
    percentage: cumulative share (0-100) of predictions the top classes must cover.

  Returns:
    A tuple `(n_used_classes, indexed_pred_histogram, top_classes)`:
    the number of classes needed, the full list of `(class_index, mean_count)`
    pairs sorted by descending count, and the class indices that were used.
  """
  pred_histogram = compute_mean_prediction_histogram(
      learn, generate_perturbations(learn, n_perturbations)
  )
  print("finished creating the prediction histogram")
  pred_histogram_sum = np.sum(pred_histogram)

  indexed_pred_histogram = sorted(
      enumerate(pred_histogram), key=lambda x: x[1], reverse=True
  )

  cumulative_percent = 0
  top_classes = []
  # Iterating over the list (instead of indexing inside a while loop) bounds the
  # scan: if rounding keeps the cumulative share just below `percentage`
  # (e.g. percentage=100), we stop at the last class rather than raising IndexError.
  for class_index, mean_count in indexed_pred_histogram:
    if not (cumulative_percent < percentage):
      break
    cumulative_percent += (mean_count / pred_histogram_sum) * 100.
    top_classes.append(class_index)

  n_used_classes = len(top_classes)
  return n_used_classes, indexed_pred_histogram, top_classes

# idea : have 200 noises (1 for each class), then start iterating the dataset, and for each image, randomly apply one noise and record the result
def targeted_diversity(learn, n_perturbations = 200, percentage = 95):
  """Measure prediction diversity when each batch gets a randomly chosen perturbation.

  `n_perturbations` latent vectors of length `z_dim` are drawn uniformly from
  [0, 1) (they are random, not one-hot), turned into perturbations with
  `model.forward_single_z`, and for every validation batch one of them is
  picked at random and added before classifying with `arch`.

  Args:
    learn: learner exposing `model` and `data.valid_dl`.
    n_perturbations: number of perturbations to generate.
    percentage: cumulative share (0-100) of predictions the top classes must cover.

  Returns:
    A tuple `(n_used_classes, indexed_pred_histogram)`: the number of classes
    needed to reach `percentage` and the `(class_index, count)` pairs sorted
    by descending count.
  """
  model = learn.model.eval()

  conditions = [torch.empty(z_dim).uniform_(0,1).cuda().detach() for _ in range(n_perturbations)]

  # Evaluation only: build the perturbations without autograd graphs, as the
  # experiment cells in this notebook do for the same forward_single_z call.
  with torch.no_grad():
    perturbations = [model.forward_single_z(z) for z in conditions]

  # NOTE(review): the histogram has z_dim bins but is indexed by the class
  # predicted by arch; this assumes z_dim equals the number of classes — confirm.
  hist = [0.] * z_dim
  for batch_no, (batch, _) in enumerate(learn.data.valid_dl):
    if batch_no % 100 == 0 : print("at batch_no {}".format(batch_no))
    # One randomly selected perturbation is shared by the whole batch.
    perturbation = perturbations[np.random.randint(0,len(perturbations))]
    preds = arch(batch + perturbation[None]).argmax(1)
    for pred in preds:
      hist[pred] += 1

  pred_histogram_sum = np.sum(hist)
  indexed_pred_histogram = sorted(enumerate(hist), key=lambda x: x[1], reverse=True)

  cumulative_percent = 0
  n_used_classes = 0
  # Bounded scan: never index past the end of the histogram, even if rounding
  # keeps the cumulative share just below `percentage`.
  for _, count in indexed_pred_histogram:
    if not (cumulative_percent < percentage):
      break
    cumulative_percent += (count / pred_histogram_sum) * 100.
    n_used_classes += 1

  return n_used_classes, indexed_pred_histogram

    

In [25]:
#on validation
# IPython magic (not plain Python): sets the float display precision to 2 digits.
%precision 2
# n, hist = targeted_diversity(learn, 150, 95)
# n, hist
# Run the diversity measurement with 10 perturbations and a 95% cumulative cutoff;
# the trailing bare expression displays the three results as the cell output.
n, hist, tk = diversity(learn, 10, 95)
n, hist, tk

at batch no 0
finished creating histogram for the 0th perturbation
at batch no 0
finished creating histogram for the 1th perturbation
at batch no 0
finished creating histogram for the 2th perturbation
at batch no 0
finished creating histogram for the 3th perturbation
at batch no 0
finished creating histogram for the 4th perturbation
at batch no 0
finished creating histogram for the 5th perturbation
at batch no 0
finished creating histogram for the 6th perturbation
at batch no 0
finished creating histogram for the 7th perturbation
at batch no 0
finished creating histogram for the 8th perturbation
at batch no 0
finished creating histogram for the 9th perturbation
finished creating the prediction histogram


(269,
 [(854, 323.60),
  (858, 141.40),
  (79, 48.90),
  (490, 26.90),
  (109, 25.30),
  (898, 14.60),
  (455, 14.00),
  (69, 12.20),
  (878, 10.10),
  (506, 9.20),
  (788, 9.10),
  (815, 8.50),
  (539, 7.80),
  (753, 7.70),
  (987, 6.00),
  (581, 5.50),
  (911, 5.30),
  (340, 4.70),
  (808, 4.60),
  (314, 4.30),
  (363, 4.10),
  (489, 4.10),
  (737, 4.10),
  (794, 4.00),
  (599, 3.80),
  (488, 3.60),
  (640, 3.60),
  (783, 3.60),
  (401, 3.20),
  (533, 3.10),
  (68, 3.00),
  (50, 2.60),
  (790, 2.60),
  (735, 2.50),
  (58, 2.40),
  (76, 2.20),
  (906, 2.20),
  (576, 2.10),
  (626, 2.00),
  (84, 1.90),
  (709, 1.90),
  (398, 1.80),
  (538, 1.80),
  (772, 1.80),
  (464, 1.70),
  (57, 1.60),
  (411, 1.60),
  (440, 1.60),
  (192, 1.50),
  (292, 1.50),
  (334, 1.50),
  (459, 1.50),
  (509, 1.50),
  (512, 1.50),
  (636, 1.50),
  (645, 1.50),
  (868, 1.50),
  (872, 1.50),
  (36, 1.40),
  (151, 1.40),
  (189, 1.40),
  (302, 1.40),
  (342, 1.40),
  (410, 1.40),
  (724, 1.40),
  (953, 1.40),
  

In [None]:
# Re-sort the pairs in `hist` by their first element (ascending) and keep only
# the second element of each pair, so `values` is ordered by that first element.
sorted_hist = sorted(hist, key=lambda x: x[0], reverse = False)
values = [elem[1] for elem in sorted_hist]
import matplotlib.pyplot as plt
plt.plot(values)

In [None]:
# Display entropy(values) as the cell output; `entropy` is defined elsewhere in the notebook.
entropy(values)

In [None]:
# Display distance_from_uniform(values) as the cell output; the helper is defined elsewhere in the notebook.
distance_from_uniform(values)

In [None]:
def make_triplet_samples(z, margin, r2, r3):
  """Build a positive and a negative sample around `z` for a triplet loss.

  Args:
    z: 2-D tensor of anchors; its shape is forwarded to `random_vector_volume`.
    margin: outer radius for the positive offset (inner radius is 0).
    r2: inner radius passed for the negative offset.
    r3: outer radius passed for the negative offset.

  Returns:
    A tuple `(positive_sample, negative_sample)`, each `z` plus a random offset.
  """
  # The positive offset is drawn first, then the negative one.
  near_offset = random_vector_volume(z.shape, 0, margin).cuda()
  far_offset = random_vector_volume(z.shape, r2, r3).cuda()
  return z + near_offset, z + far_offset

def random_vector_surface(shape, r = 1.):
  """Sample random row vectors of L2 norm `r` on the GPU.

  Args:
    shape: 2-D shape `(n_vectors, dim)`; the norm is taken along dim 1.
    r: target length of every row vector.

  Returns:
    A CUDA tensor of the given shape whose rows are Gaussian samples rescaled
    to length `r`.
  """
  directions = torch.randn(size=shape).cuda()
  lengths = directions.norm(p=2, dim=1, keepdim=True)
  return directions / lengths * r

def random_vector_volume(shape, inner_r, outer_r):
  """Sample random row vectors whose lengths lie between two radii.

  A radius is drawn uniformly from [inner_r, outer_r] per row, rescaled as
  `outer_r * (radius / outer_r) ** (1 / dim)` and applied to a random unit
  direction from `random_vector_surface`.

  NOTE: after the rescaling the lengths span
  [outer_r * (inner_r / outer_r) ** (1 / dim), outer_r], so for inner_r > 0
  and dim > 1 the smallest possible length is larger than inner_r.

  Args:
    shape: 2-D shape `(n_vectors, dim)`.
    inner_r: lower bound of the uniform radius draw.
    outer_r: upper bound of the uniform radius draw; also the divisor in the
      rescaling.

  Returns:
    A CUDA tensor of the given shape.
  """
  n_vectors, n_dims = shape[0], shape[1]
  # Radii are drawn before the directions, one per row.
  radii = torch.empty(n_vectors).uniform_(inner_r, outer_r).cuda()
  radii = outer_r * (radii / outer_r) ** (1 / n_dims)  # volume-normalize the radii
  # Trailing axis lets each radius broadcast across its row.
  return random_vector_surface(shape, 1) * radii.unsqueeze(-1)

In [None]:
from collections import Counter

def most_frequent(x):
  """Return the `(element, count)` pair of the most common element of `x`.

  Ties go to the element encountered first. Raises IndexError when `x` is empty.
  """
  ranked = Counter(x).most_common(1)
  return ranked[0]

def preds_around(center, radius, n_preds, model, dummy_img):
  """Classify an image perturbed by noises generated from latents near `center`.

  Args:
    center: latent vector the samples are centred on (offsets are 10-dimensional).
    radius: inner radius of the sampling shell; the outer radius is radius + 0.01.
    n_preds: number of latent vectors to sample.
    model: generator exposing `forward_z`.
    dummy_img: image tensor the generated noises are added to.

  Returns:
    A tensor with the class index `arch` predicts for each perturbed image.
  """
  shell_offsets = random_vector_volume([n_preds, 10], radius, radius + 0.01)
  latent_points = shell_offsets + center[None]
  perturbed_imgs = model.forward_z(latent_points) + dummy_img
  return arch(perturbed_imgs).argmax(1)
  
def most_freq_pred_around(center, radius, n_preds, model, dummy_img):
  """Return the most common predicted label around `center` and its frequency.

  Returns:
    A tuple `(label, fraction)`: the label of the most frequent prediction from
    `preds_around` and the fraction of the `n_preds` samples that produced it.
  """
  predicted = preds_around(center, radius, n_preds, model, dummy_img).tolist()
  top_class, top_count = most_frequent(predicted)
  return (class_index_to_label(top_class), top_count/n_preds)

def investigate_neighborhood(z, step, model, dummy_img):
  """Report the dominant prediction at increasing distances from `z`.

  Radii run from 0.1 up to (but excluding) 6 in increments of `step`; the
  number of samples per radius grows as int(10 + 5 * radius ** 2).

  Returns:
    A list of `(radius, (label, fraction))` tuples, one per radius.
  """
  radii = np.arange(0.1, 6., step)
  with torch.no_grad():
    return [
        (radius, most_freq_pred_around(z, radius, int(10 + 5 * (radius ** 2)), model, dummy_img))
        for radius in radii
    ]

In [None]:
#experiment 1
# For six random latent vectors, print the dominant prediction at increasing
# radii around each of them (see investigate_neighborhood).

# NOTE: this initial z is overwritten inside the loop below before it is used.
z = torch.tensor([0.5] * 10).cuda()
# z = torch.empty(10).uniform_(-1, 1).cuda()
# z_s = z[None]

model = learn.model.eval()
# Training sample 50, normalized, is the image all noises are added to.
x_img = normalize(learn.data.train_ds[50][0].data.cuda())

for i in range(6):
  z = torch.empty(10).uniform_(-1, 1).cuda()  # fresh 10-dim latent with entries drawn uniformly from [-1, 1)
  print("investigation for: ", z)
  for elem in investigate_neighborhood(z, 0.5, model, x_img):
    print(elem)
print("done")

In [None]:
#experiment 1-1: modified investigate_z
# For every (z, noise) pair, compute the prediction histogram over the
# validation set and record the six most frequently predicted labels, both on
# stdout and in a text report. Refuses to overwrite an existing report.
z_investigate_path = '/root/Derakhshani/adversarial/textual_notes/investigate_z_{}.txt'.format(env.save_filename)
if Path(z_investigate_path).exists(): raise FileExistsError("file already exists")

# The with-block guarantees the report file is closed, even if a histogram
# computation fails part-way through.
with open(str(z_investigate_path), 'w') as file:
  for i, (z, noise) in enumerate(zip(pruned_z_s, pruned_noises)):
    hist = compute_prediction_histogram(learn, noise)
    indexed_hist = [(i, val) for i, val in enumerate(hist)]
    sorted_hist = sorted(indexed_hist, key=lambda x: x[1], reverse=True)
    labeled_hist = [(class_index_to_label(i), count) for i, count in sorted_hist]
    print("result {}:".format(i))
    print(big_vector_to_str(z))
    print(labeled_hist[:6])
    print("\n\n")

    file.write("result {}:\n".format(i))
    file.write(big_vector_to_str(z) + "\n")
    file.write(str(labeled_hist[:6]))
    file.write("\n\n\n")
    # Flush after every result so partial output survives an interrupted run.
    file.flush()

In [None]:
#experiment 2
import itertools
# All 2**10 = 1024 ten-dimensional latent vectors whose entries are each -0.33 or 0.33.
z_s = [torch.tensor(t).cuda() for t in itertools.product( *([[-0.33, 0.33]] * 10) )]
model = learn.model.eval()
noises = []
# Evaluation only: generate one noise per latent vector without tracking gradients.
with torch.no_grad():
  for z in z_s:
    noises.append(model.forward_single_z(z))

In [None]:
# Add every noise in `noises` to training sample 50 (normalized) and collect the
# class index that `arch` predicts for each perturbed image.
x_img = normalize(learn.data.train_ds[50][0].data.cuda())

preds = []
for noise in noises:
  perturbed_img = x_img + noise
  # [None] adds a batch axis; [0].item() extracts the single prediction as a Python int.
  preds.append(torch.argmax(arch(perturbed_img[None]), 1)[0].item())

from collections import Counter
# The five most frequently predicted classes as (label, count) pairs; displayed as cell output.
result = [(class_index_to_label(index), count) for index, count in Counter(preds).most_common(5)]
result

In [None]:
#experiment 3
import itertools
# Candidate values per latent dimension: only dimensions whose index is a
# multiple of 100 may take -0.9 or 0.9; every other dimension is fixed at 0.
dimension_values = [[-0.9, 0.9]] * z_dim
for i in range(z_dim):
  if i % 100 != 0:
    dimension_values[i] = [0.]
# dimension_values[0] = [0.]
# dimension_values[3] = [0.]
# dimension_values[6] = [0.]
# dimension_values[9] = [0.]
# Cartesian product of the per-dimension candidates: 2**k latent vectors, where
# k is the number of dimensions left free above.
pruned_z_s = [torch.tensor(t).cuda() for t in itertools.product(*dimension_values)]
model = learn.model.eval()
# Evaluation only: generate one noise per latent vector without tracking gradients.
with torch.no_grad():
  pruned_noises = [model.forward_single_z(z) for z in pruned_z_s]

In [None]:
#experiment 3: for the targeted-attack case
# Build the z_dim one-hot latent vectors (a single 1. at position i, zeros elsewhere).
pruned_z_s = []
for i in range(z_dim):
  new_z = torch.zeros(z_dim).cuda()
  new_z[i] = 1.
  pruned_z_s.append(new_z)

model = learn.model.eval()
# Evaluation only: generate one noise per one-hot vector without tracking gradients.
with torch.no_grad():
  pruned_noises = [model.forward_single_z(z) for z in pruned_z_s]

In [None]:
#experiment 3-1: noises for one-hot z vectors
# Alternative kept for reference: uniformly random z vectors instead.
# for i in range(z_dim):
#   new_z = torch.empty(z_dim).uniform_(0,1).cuda().detach()
#   pruned_z_s.append(new_z)

# One detached all-zero vector per latent dimension, each with a single 1.
pruned_z_s = [torch.zeros(z_dim).cuda().detach() for _ in range(z_dim)]
for axis, one_hot in enumerate(pruned_z_s):
  one_hot[axis] = 1.

model = learn.model.eval()
with torch.no_grad():
  pruned_noises = list(map(model.forward_single_z, pruned_z_s))

In [None]:
# Render the first 200 generated noises as images.
for perturbation in pruned_noises[:200]:
  noise_to_image(perturbation).show()

In [None]:
# spider web
# Hand-picked latent vectors (presumably the ones associated with the
# "spider web" label -- confirm against the experiment notes).
z_values = [
  [ 0.33, -0.33,  0.33,  0.33, -0.33,  0.33, -0.33, -0.33, -0.33, -0.33],
  [ 0.33,  0.33, -0.33, -0.33, -0.33,  0.33, -0.33,  0.33, -0.33, -0.33],
  [ 0.33,  0.33, -0.33,  0.33,  0.33, -0.33, -0.33, -0.33,  0.33, -0.33],
  [-0.33, -0.33, -0.33, -0.33, -0.33, -0.33, -0.33, -0.33, -0.33, -0.33],
  [-0.33, -0.33, -0.33, -0.33, -0.33,  0.33,  0.33, -0.33,  0.33,  0.33],
  [-0.33, -0.33, -0.33,  0.33,  0.33, -0.33, -0.33, -0.33, -0.33, -0.33],
  [-0.33, -0.33,  0.33,  0.33, -0.33, -0.33,  0.33, -0.33,  0.33, -0.33],
  [-0.33,  0.33, -0.33,  0.33,  0.33, -0.33,  0.33,  0.33,  0.33,  0.33],
  [-0.33,  0.33,  0.33,  0.33, -0.33, -0.33,  0.33,  0.33, -0.33, -0.33],
  [ 0.33, -0.33, -0.33, -0.33,  0.33,  0.33, -0.33, -0.33, -0.33, -0.33],
  [ 0.33, -0.33,  0.33,  0.33, -0.33, -0.33,  0.33,  0.33, -0.33,  0.33],
  [ 0.33,  0.33, -0.33,  0.33, -0.33, -0.33,  0.33,  0.33, -0.33,  0.33],
  [ 0.33,  0.33,  0.33, -0.33, -0.33, -0.33, -0.33,  0.33, -0.33, -0.33],
  [ 0.33,  0.33,  0.33,  0.33, -0.33, -0.33, -0.33,  0.33, -0.33, -0.33],
  [ 0.33,  0.33,  0.33,  0.33,  0.33,  0.33,  0.33,  0.33,  0.33, -0.33],
  [ 0.33,  0.33,  0.33, -0.33, -0.33, -0.33, -0.33,  0.33,  0.33, -0.33],
]

# Guard against listing the same vector twice: distinct rows <=> the set of
# rows (as hashable tuples) is as large as the list itself.
if len(set(map(tuple, z_values))) != len(z_values):
  raise Exception("duplicate")

z_s = [torch.tensor(z).cuda() for z in z_values]
model = learn.model.eval()

# The noises are only rendered, never back-propagated through, so skip
# building the autograd graph (same as the experiment cells do).
with torch.no_grad():
  for z in z_s:
    img = noise_to_image(model.forward_single_z(z))
    img.show()


In [None]:
# Hand-picked latent vectors to render as noise images.
z_values = [
  # window screen
  [-0.33,  0.33,  0.33, -0.33, -0.33,  0.33, -0.33, -0.33, -0.33, -0.33],
  [-0.33,  0.33,  0.33, -0.33, -0.33, -0.33, -0.33, -0.33,  0.33,  0.33],
]

# Guard against listing the same vector twice: distinct rows <=> the set of
# rows (as hashable tuples) is as large as the list itself.
if len(set(map(tuple, z_values))) != len(z_values):
  raise Exception("duplicate")

z_s = [torch.tensor(z).cuda() for z in z_values]
model = learn.model.eval()

# The noises are only rendered, never back-propagated through, so skip
# building the autograd graph.
with torch.no_grad():
  for z in z_s:
    img = noise_to_image(model.forward_single_z(z))
    img.show()

In [None]:
#vgg-16_12 most repeated labels:
# (class index, value) pairs; the list is already ordered by descending value.
l = [
  (611, 215.0), (474, 194.1), (398, 120.3),
  (721, 79.6), (741, 73.5), (510, 62.5),
]

# Swap each class index for its human-readable label (cell output).
[(class_index_to_label(class_idx), value) for class_idx, value in l]

In [None]:
# learn.recorder.plot_losses()
# learn.recorder.plot_lr()
# learn.recorder.plot_metrics()

In [None]:
# Run validation 10 times, each time with a freshly generated perturbation
# installed as nag_util.global_perturbations, and report the mean / standard
# deviation of the resulting metric.
# NOTE: this replaces learn.metrics for the rest of the session.
fooling_rates = []
model = learn.model.eval()
learn.metrics = [validation_single_perturbation]
for i in range(10):
  # Dummy input batch of one 3x224x224 image. The width was previously 244,
  # which looks like a typo for 224 -- NOTE(review): confirm.
  # no_grad: the perturbation is only evaluated, so do not keep an autograd
  # graph alive in the module-level global during validation.
  with torch.no_grad():
    global_perturbations = model(torch.rand(1, 3, 224, 224).cuda())[0]
  nag_util.global_perturbations = global_perturbations
  # validate() result index 1 is presumably the first metric (index 0 being
  # the loss) -- confirm against the fastai version in use.
  fooling_rates.append(learn.validate()[1].cpu().item())
  print("%d : %f"%(i, fooling_rates[-1]))

mean = np.mean(fooling_rates)
stddev = np.std(fooling_rates)
print(mean, stddev); print(fooling_rates)

In [None]:
# Image displays correctly for float tensors in the range [0..1]
# Perturb training image #200 with the noise generated from a fixed z, show
# the clean image, the noise and the perturbed image, and print the
# classifier output and label before and after the perturbation.
model = learn.model.eval()

x_img = learn.data.train_ds[200][0]
x = normalize(x_img.data.cuda())
z = torch.tensor([-0.33,  0.33, -0.33, -0.33, -0.33,  0.33,  0.33, -0.33, -0.33, -0.33], dtype=torch.float32).cuda()
# z = torch.empty(z_dim).uniform_(-1,1).cuda()
p = model.forward_single_z(z).detach()

p_x = x + p
# print("img range, noise range")
# print_range(x); print_range(p)

# Run the classifier once on the perturbed image and reuse the output for
# both the label and the printed vector, so the two always agree.
adv_output = arch(p_x[None])
adv_label = class_index_to_label(adv_output.argmax(1).item())
print_big_vector(adv_output[0])

# Back to displayable pixel space: undo the normalization, clip to [0, 1].
p_x = denormalize(p_x)
p_x.clamp_(0,1)


#prepare images
p_x_img = Image(p_x)
# The raw noise is rescaled to [0, 1] purely for display.
p = scale_to_range(p, [0., 1.])
p_img = Image(p)
x_img.show()
p_img.show()
p_x_img.show()


# print_range(p)
# print_range(denormalize(x))
# print_range(p_x)

# Same single-pass treatment for the clean image.
benign_output = arch(x[None])
benign_label = class_index_to_label(benign_output.argmax(1).item())

print_big_vector(benign_output[0])
print(benign_label, adv_label)

In [None]:
# Three latent vectors that differ only in their first one or two
# coordinates: all -1, first flipped to 1, first two flipped to 1.
z1 = torch.tensor([-1] * 10, dtype=torch.float32).cuda()
z2 = torch.tensor([1] + [-1] * 9, dtype=torch.float32).cuda()
z3 = torch.tensor([1, 1] + [-1] * 8, dtype=torch.float32).cuda()

# Noise generated for each of them, in the same order as before.
p1, p2, p3 = (model.forward_single_z(latent) for latent in (z1, z2, z3))

# Cell output: distance between the noises of the first and third vector.
l2_distance(p1, p3)

In [None]:
# Image displays correctly for float tensors in the range [0..1]
# Generate a perturbation from training image #4, apply it to the normalized
# image and display the (rescaled) perturbation.
model = learn.model.eval()

x_img = learn.data.train_ds[4][0]
batch = x_img.data[None].cuda()  # add a leading batch axis for the model
p = model(batch)[0].squeeze().detach()
x = normalize(batch.squeeze())

# Perturbed image in displayable pixel space: undo the normalization, then
# clip in place to [0, 1].
p_x = denormalize(x + p).clamp_(0, 1)


#prepare images
p_x_img = Image(p_x)
p_img = Image(scale_to_range(p, [0., 1.]))
p = p_img.data if False else scale_to_range(p, [0.,1.])
# x_img.show()
p_img.show()
# p_x_img.show()

for tensor in (p, x, p_x):
  print_range(tensor)