## Over parameterized network Non decreasing lr

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torchvision
import torchvision.transforms as transforms
from torch.autograd.functional import hessian
import torch.utils.data as data_utils
import matplotlib.pyplot as plt
import random

from tqdm.notebook import tqdm 
import pickle
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns
import os
import copy
from torch.nn.utils import _stateless

batch_size = 1
num_workers = 1
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
# device

details = {}
details['use_db'] = 'mnist'
details['result_root_dir']='results/t0/'
details['result_path']='try1_t8_w16'
details['g_weight'] = [16]
# details['ratio'] = 15
details['book_keep_freq'] = 20
details['g_times'] = 8
details['g_epochs'] = 10000
details['alpha_0']= 0.003
details['freq_reduce_by'] = 20
details['freq_reduce_after'] = 100

details['training_step_limit'] = 2000000 ## this is to train for max updates per epochs
details['stop_hess_computation'] = 1000000 ## Stop computing hessian after calculated these many times


print(f'selected weight:{details["g_weight"]}')

with open(details['result_root_dir']+'details_'+details['result_path']+'.txt', 'w+') as f:
    for key, val in details.items():
        content = key + ' : '+str(val) + '\n'
        f.write(content)
        
torch.manual_seed(3407)
np.random.seed(3407)
torch.cuda.manual_seed_all(3407)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# torch.use_deterministic_algorithms(True)
# os.environ["CUBLAS_WORKSPACE_CONFIG"]=":4096:2"

train_data_all = torchvision.datasets.MNIST(
    root="./data",
    train=True,
    download=True,
    transform=transforms.ToTensor()
)

test_data_all = torchvision.datasets.MNIST(
    root="./data",
    train=False,
    download=True,
    transform=transforms.ToTensor()
)
# print(f'train data:{train_data}')
# print(f'test data:{test_data}')

def get_random_subset(train_data_all, test_data_all):    
    # train_indices = torch.arange(20000)
    test_indices = torch.arange(256)
    train_indices = torch.randint(60000-1, (2000,))
    # print(f'train indices:{train_indices[:10]}')
    train_data = data_utils.Subset(train_data_all, train_indices)
    test_data = data_utils.Subset(test_data_all, test_indices)
    # print(f'train data:{train_data}')
    # print(f'test data:{test_data}')
    def seed_worker(worker_id):
        worker_seed = torch.initial_seed() % 2**32
        np.random.seed(worker_seed)
        random.seed(worker_seed)

    g = torch.Generator()
    g.manual_seed(0)

    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size,
        num_workers=num_workers,
        worker_init_fn=seed_worker,
        generator=g,
    )
    test_loader = torch.utils.data.DataLoader(
        test_data,
        batch_size=256,
        num_workers=num_workers,
        worker_init_fn=seed_worker,
        generator=g,
    )
    print(f'train data size:{len(train_loader.dataset)}')
    print(f'test data size:{len(test_loader.dataset)}')
    # X_mat, y_mat = torch.Tensor(len(train_loader.dataset),784), torch.Tensor(len(train_loader.dataset)).long()
    # for i, (data, label) in enumerate(train_loader):
    #     X_mat[i] = data.flatten()
    #     y_mat[i] = label.flatten()
    # print(f'X_mat shape:{X_mat.shape}, y_mat shape:{y_mat.shape}')
    return train_loader, test_loader #, X_mat, y_mat


class Net(nn.Module):

    def __init__(self, input_features, hidden_layers, output_size):
        super(Net, self).__init__()
        self.layers = len(hidden_layers) + 1
        self.total_params_len = 0
        self.fc_layers = nn.ModuleList()
        prev_weight = input_features
        
        for i, weight in enumerate(hidden_layers):
            self.fc_layers.append(nn.Linear(prev_weight, weight))
            self.total_params_len += prev_weight*weight + weight
            prev_weight = weight
        
        self.fc_last = nn.Linear(hidden_layers[-1], output_size)
        self.total_params_len += hidden_layers[-1]*output_size + output_size
        
        ### Others required params
        self.param_list = []

    def forward(self, x):
        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        # print('x shape in forward',x.shape)
        for fc_layer in self.fc_layers:
            x = F.relu(fc_layer(x))
        x = self.fc_last(x)
        return x
    
    def fit(self, X, Y, X_val, Y_val, epochs, batch_size=1, **kwargs):
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(self.parameters())
        

class Train_nn:
    
    def __init__(self, input_features, hidden_layers, output_size, lr, decay=True):
        self.model = Net(input_features, hidden_layers=hidden_layers, output_size=output_size)
        self.model.to(device)
        self.loss_fn = nn.CrossEntropyLoss()
        if decay:
            lr_lambda = lambda it: 1/(it+1)
        else:
            lr_lambda = lambda it: 1
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=lr)
        self.scheduler = torch.optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda= lr_lambda)
        
    def get_loss(self, X, y, params=None):
        # if params is not None:
        assert False, "Model not initialized with given params"
        op = self.model(X)
        loss = self.loss_fn(pred, y)
        return loss
    
    def get_gradient(self):
        params = (self.model.parameters())
        grad_norm_sq = torch.tensor(0, dtype=float).to(device)
        # print('grad Norm init:', grad_norm_sq)
        for param in self.model.parameters():
            temp = param.grad.data.pow(2).sum()
            # print(f'param grad norm \n\tsum:{temp.data}')#,\n\tshape:{param.shape}')
            grad_norm_sq += temp
            
        return grad_norm_sq.sqrt().cpu()
    
    def get_gradientv2(self, X, y):
        names = list(n for n, _ in self.model.named_parameters())
        def loss_fun_grad(*params):
            out: torch.Tensor = _stateless.functional_call(self.model, {n: p for n, p in zip(names, params)}, X)
            local_loss = self.loss_fn(out, y)
            return local_loss
        grad_mat = torch.autograd.grad(loss_fun_grad, tuple(self.model.parameters()))
        # print(f'len of hess mat:{len(hess_mat)}')
        # print(f'hess_mat[0] shape:{len(hess_mat[0])}')
        # print(f'hess_mat[0][0] shape:{hess_mat[0][0].shape}')
        grad_norm = torch.tensor(0.).to(device)
        for i in range(len(grad_mat)):
            for j in range(len(grad_mat[0])):
                grad_norm+= grad_mat[i][j].pow(2).sum()
        grad_norm = grad_norm.sqrt()
        # print(f'v2 hess norm{hess_norm}')
        return grad_norm.cpu()
    
    def try_operator_norm(self, hess_mat):
        for i in len(hess_mat):
            for j in len(hess_mat[0]):
                torch.unsqueeze(hess_mat[i][i],0)
        hess_tensor_dim = list(hess_mat[0][0].shape)
        hess_tensor_dim += [n*2,n*2]
        hess_mat_np = np.zeros(shape=hess_tensor_dim)
        hess_tensor = torch.tensor(hess_mat_np)
        torch.cat(hess_mat, out=hess_tensor)
        
        hess_mat.reshpe(n*2,n*2)
        hess_norm = torch.linalg.norm(hess_mat, 2)
        assert False, "Not working"
    
    def get_hessian(self, X, y):
        prev_params = copy.deepcopy(list(self.model.parameters()))
        n = self.model.layers
        def local_model(*params):
            # print(f'len of params:{len(params)}')
            # print(f'shape of params[0]:{params[0].shape}')
            # with torch.no_grad():
            #initialize model with given params
            i = 0
            for i, param in enumerate(self.model.parameters()):
                param.data = params[i]
            pred = self.model(X)
            loss = self.loss_fn(pred, y)
            # print(f'loss type:{type(loss)}')
            return loss
        p =list(self.model.parameters())
        hess_mat = hessian(local_model, tuple(p))
        hess_norm = torch.tensor(0.).to(device)
        for i in range(len(hess_mat)):
            for j in range(len(hess_mat[0])):
                hess_norm+= hess_mat[i][j].pow(2).sum()
        
        # print(f'Hess mat len:{len(hess_mat)}')
        # print(f'Hess mat[0] len:{len(hess_mat[0])}')
        # print(f'Hess mat[0][0] shape:{hess_mat[0][0].shape}')
        
        hess_norm = hess_norm.sqrt()
        # print(f'hess norm:{hess_norm}')
        
        # Reinitialize the original params to model
        for i, param in enumerate(self.model.parameters()):
                param.data = prev_params[i]
        
        return hess_norm
    
    def get_hessianv2(self, X,y):
        names = list(n for n, _ in self.model.named_parameters())
        def loss_fun_hess(*params):
            out: torch.Tensor = _stateless.functional_call(self.model, {n: p for n, p in zip(names, params)}, X)
            local_loss = self.loss_fn(out, y)
            return local_loss
        hess_mat = hessian(loss_fun_hess, tuple(self.model.parameters()))
        # print(f'len of hess mat:{len(hess_mat)}')
        # print(f'hess_mat[0] shape:{len(hess_mat[0])}')
        # print(f'hess_mat[0][0] shape:{hess_mat[0][0].shape}')
        hess_norm = torch.tensor(0.).to(device)
        for i in range(len(hess_mat)):
            for j in range(len(hess_mat[0])):
                hess_norm+= hess_mat[i][j].pow(2).sum()
        hess_norm = hess_norm.sqrt()
        # print(f'v2 hess norm{hess_norm}')
        return hess_norm.cpu()
        
    def fit(self, train_loader, test_loader, epochs, store_grads=False, store_hessian=False, store_gen_err=False, store_weights=False, store_pt_loss=True, store_freq = 20, freq_reduce_by=None, freq_reduce_after=None):
        
        ## For Book keeping results ##
        self.grads_norms = []
        self.grads_normsv2 = []
        self.param_list = []
        self.hess_norms = []
        self.gen_err = []
        self.train_loss = []
        self.val_loss = []
        self.point_loss = []
        ## Initializing values ##
        terminate_training = False
        store_count = 0
        
        for epoch in tqdm(range(epochs), total=epochs, unit="epoch", disable=True):
            if terminate_training == True:
                break
            for batch, (X, y) in tqdm(enumerate(train_loader), total=len(train_loader), unit='batch'):
                if batch>details['training_step_limit']:
                    terminate_training = True
                    break
                
                X, y =X.to(device), y.to(device)
                pred = self.model(X)
                loss = self.loss_fn(pred, y)

                # Backpropagation
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                
                ## Saving point loss
                if store_pt_loss and (batch%store_freq==0):
                    self.point_loss.append(loss.item())
                    
                ## Saving the weights
                if store_weights and (batch%store_freq==0):
                    current_params = tuple(self.model.parameters())
                    self.param_list.append(current_params)
                
                ## computing and saving the gradient
                if store_grads and (batch% store_freq == 0):
                    # store_count += 1
                    # # print(f'\tstore_freq:{store_freq}, batch:{batch}')
                    # if store_count%freq_reduce_after==0:
                    #     store_freq += freq_reduce_by
                    #     # print(f'store freq:{store_freq}, batch:{batch}')
                    grad_norm_per_update = self.get_gradient()
                    print('grad:', grad_norm_per_update)
                    # print('\tgrad norm:', grad_norm_per_update)
                    self.grads_norms.append(grad_norm_per_update)
                    # self.grads_normsv2.append(self.get_gradientv2(X,y))
                ## computing and saving hessian
                if store_hessian and (batch% store_freq==0):
                    #assert False, "Not implemented"
                    self.optimizer.zero_grad()
                    hess_val = self.get_hessianv2(X,y)
                    print('hess:',hess_val)
                    self.hess_norms.append(hess_val)
                    store_count += 1
                    if store_count%freq_reduce_after==0:
                        store_freq += freq_reduce_by
                
                ## computing and storing the generalization error
                if store_gen_err and (batch% store_freq == 0):
                    assert False, "fix reducing freq to get it working and fastX, fasty"
                    train_loss, test_loss, point_loss=0, 0, 0
                    with torch.no_grad():
                        for sub_batch, (X_local,y_local) in enumerate(train_loader):
                            # if sub_batch> batch: # only taking the encountered points to calculate train loss
                            #     break
                            X_local, y_local = X_local.to(device), y_local.to(device)
                            pred_local = self.model(X_local)
                            train_loss += self.loss_fn(pred_local, y_local).item()
                    train_loss = train_loss/(batch+1)
                    with torch.no_grad():
                        for sub_batch, (X_local,y_local) in enumerate(test_loader):
                            X_local, y_local = X_local.to(device), y_local.to(device)
                            pred_local = self.model(X_local)
                            test_loss += self.loss_fn(pred_local, y_local).item()
                    test_batch_size = len(test_loader)
                    # print(f"Number of batches in test:{len(test_loader)}")
                    test_loss = test_loss/ len(test_loader)
                    self.train_loss.append(train_loss)
                    self.val_loss.append(test_loss)
                
                if batch % 1000 == 0:
                    loss, current = loss.item(), batch * len(X)
                    correct = 0
                    test_loss = 0
                    with torch.no_grad():
                        for X, y in test_loader:
                            X, y = X.to(device), y.to(device)
                            pred = self.model(X)
                            test_loss += self.loss_fn(pred, y).item()
                            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
                    acc = 100*correct/len(test_loader.dataset)
                    print(f"\taccuracy:{acc}")#, at batch:{batch}")
                    print(f"\tloss: {loss:>7f}")
                
                    # print(f'Learning rate:{self.scheduler.get_last_lr()}')
                self.scheduler.step()
            
def exp_get_lp_sm(train_data_all, test_data_all, op_features, weight = 10, times = 8, epochs = 1, root_dir='', path=None, clear_file = True, freq_reduce_by=10, freq_reduce_after=100):
    grad_list        = []
    hess_norm_list   = []
    if path is not None:
        grad_file_path = root_dir+'grad_'+path
        hess_file_path = root_dir+'hess_'+path
        # gen_file_path = root_dir+'gen_'+path
        if clear_file:
            with open(grad_file_path, 'w+') as f:
                f.write('')
            with open(hess_file_path, 'w+') as f:
                f.write('')
    
    train_loader, test_loader = get_random_subset(train_data_all, test_data_all)
    for t in range(times):
        print(f'Time:{t}')
        train_model = Train_nn(784, weight, op_features, lr= details['alpha_0'], decay=False)
        train_model.fit(train_loader, test_loader, epochs=epochs, store_grads=True, store_hessian=True, store_freq=details['book_keep_freq'],  store_gen_err=False, store_pt_loss=False, store_weights=False, freq_reduce_by = freq_reduce_by, freq_reduce_after=freq_reduce_after, )
        
        with open(grad_file_path,'a+') as f:
            f.write(' '.join([str(grad) for grad in train_model.grads_norms]) + '\n')
        with open(hess_file_path,'a+') as f:
            f.write(' '.join([str(hess) for hess in train_model.hess_norms]) + '\n')
        
        hess_norm_list.append(train_model.hess_norms)
        grad_list.append(train_model.grads_norms)
         
    return grad_list, hess_norm_list

grad_list, hess_norm_list = exp_get_lp_sm(train_data_all, test_data_all, op_features=10, 
              weight=details['g_weight'], times=details['g_times'], 
              epochs=details['g_epochs'], root_dir=details['result_root_dir'], 
              path=details['result_path'], freq_reduce_by=details['freq_reduce_by'], 
              freq_reduce_after=details['freq_reduce_after'])


selected weight:[16]
train data size:2000
test data size:256
Time:0


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(5.6682, dtype=torch.float64)
hess: tensor(47.9316)
	accuracy:12.109375
	loss: 2.321805
grad: tensor(4.1539, dtype=torch.float64)
hess: tensor(30.3194)
grad: tensor(2.7233, dtype=torch.float64)
hess: tensor(21.9250)
grad: tensor(3.9320, dtype=torch.float64)
hess: tensor(33.5948)
grad: tensor(2.4536, dtype=torch.float64)
hess: tensor(18.0610)
grad: tensor(2.9556, dtype=torch.float64)
hess: tensor(24.9243)
grad: tensor(4.2075, dtype=torch.float64)
hess: tensor(42.0566)
grad: tensor(3.3555, dtype=torch.float64)
hess: tensor(25.6617)
grad: tensor(3.8782, dtype=torch.float64)
hess: tensor(34.7406)
grad: tensor(2.4636, dtype=torch.float64)
hess: tensor(21.2966)
grad: tensor(4.7116, dtype=torch.float64)
hess: tensor(33.4358)
grad: tensor(4.7587, dtype=torch.float64)
hess: tensor(42.1473)
grad: tensor(3.0337, dtype=torch.float64)
hess: tensor(28.4914)
grad: tensor(4.1236, dtype=torch.float64)
hess: tensor(31.7487)
grad: tensor(3.6928, dtype=torch.float64)
hess: tensor(26.0059)
grad

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.4987, dtype=torch.float64)
hess: tensor(26.0852)
	accuracy:77.734375
	loss: 0.084191
grad: tensor(2.6057, dtype=torch.float64)
hess: tensor(16.0827)
grad: tensor(3.3794, dtype=torch.float64)
hess: tensor(21.6846)
grad: tensor(4.4570, dtype=torch.float64)
hess: tensor(48.0313)
grad: tensor(8.6845, dtype=torch.float64)
hess: tensor(62.8313)
grad: tensor(9.1012, dtype=torch.float64)
hess: tensor(56.6166)
grad: tensor(5.2375, dtype=torch.float64)
hess: tensor(33.3023)
grad: tensor(6.4423, dtype=torch.float64)
hess: tensor(31.8750)
grad: tensor(5.5797, dtype=torch.float64)
hess: tensor(52.2950)
grad: tensor(4.9951, dtype=torch.float64)
hess: tensor(34.4495)
grad: tensor(1.3686, dtype=torch.float64)
hess: tensor(18.7939)
grad: tensor(13.1814, dtype=torch.float64)
hess: tensor(72.9718)
grad: tensor(2.4902, dtype=torch.float64)
hess: tensor(36.4559)
grad: tensor(3.9532, dtype=torch.float64)
hess: tensor(25.6626)
grad: tensor(5.5795, dtype=torch.float64)
hess: tensor(47.0928)
gra

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(0.2192, dtype=torch.float64)
hess: tensor(5.6381)
	accuracy:85.546875
	loss: 0.009083
grad: tensor(1.4402, dtype=torch.float64)
hess: tensor(12.6312)
grad: tensor(1.7925, dtype=torch.float64)
hess: tensor(17.0339)
grad: tensor(2.7438, dtype=torch.float64)
hess: tensor(46.9670)
grad: tensor(10.8548, dtype=torch.float64)
hess: tensor(97.8794)
grad: tensor(14.5248, dtype=torch.float64)
hess: tensor(86.8952)
grad: tensor(4.8465, dtype=torch.float64)
hess: tensor(39.4935)
grad: tensor(8.3700, dtype=torch.float64)
hess: tensor(43.4366)
grad: tensor(2.0685, dtype=torch.float64)
hess: tensor(30.4789)
grad: tensor(5.0276, dtype=torch.float64)
hess: tensor(44.3745)
grad: tensor(0.1841, dtype=torch.float64)
hess: tensor(3.7885)
grad: tensor(17.5540, dtype=torch.float64)
hess: tensor(104.7053)
grad: tensor(0.5724, dtype=torch.float64)
hess: tensor(12.4992)
grad: tensor(3.5825, dtype=torch.float64)
hess: tensor(32.2651)
grad: tensor(1.7619, dtype=torch.float64)
hess: tensor(24.4238)
gr

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(0.0477, dtype=torch.float64)
hess: tensor(1.3873)
	accuracy:88.671875
	loss: 0.001780
grad: tensor(0.9067, dtype=torch.float64)
hess: tensor(16.6503)
grad: tensor(2.9936, dtype=torch.float64)
hess: tensor(56.7542)
grad: tensor(12.9979, dtype=torch.float64)
hess: tensor(52.9321)
grad: tensor(4.0614, dtype=torch.float64)
hess: tensor(40.0575)
grad: tensor(0.9960, dtype=torch.float64)
hess: tensor(20.8969)
grad: tensor(6.1704, dtype=torch.float64)
hess: tensor(56.3833)
grad: tensor(0.9177, dtype=torch.float64)
hess: tensor(10.8331)
grad: tensor(0.2300, dtype=torch.float64)
hess: tensor(5.7349)
grad: tensor(0.0380, dtype=torch.float64)
hess: tensor(0.9420)
grad: tensor(0.3739, dtype=torch.float64)
hess: tensor(7.2975)
grad: tensor(3.5172, dtype=torch.float64)
hess: tensor(43.6167)
grad: tensor(0.2186, dtype=torch.float64)
hess: tensor(3.2709)
grad: tensor(2.4278, dtype=torch.float64)
hess: tensor(31.5381)
grad: tensor(0.2821, dtype=torch.float64)
hess: tensor(5.4832)
grad: ten

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(0.0127, dtype=torch.float64)
hess: tensor(0.3780)
	accuracy:90.234375
	loss: 0.000464
grad: tensor(0.7513, dtype=torch.float64)
hess: tensor(15.1207)
grad: tensor(3.3000, dtype=torch.float64)
hess: tensor(65.0386)
grad: tensor(13.9320, dtype=torch.float64)
hess: tensor(57.0249)
grad: tensor(3.4147, dtype=torch.float64)
hess: tensor(39.0144)
grad: tensor(0.4599, dtype=torch.float64)
hess: tensor(10.9462)
grad: tensor(7.7036, dtype=torch.float64)
hess: tensor(69.5767)
grad: tensor(0.6569, dtype=torch.float64)
hess: tensor(8.3212)
grad: tensor(0.1292, dtype=torch.float64)
hess: tensor(3.4645)
grad: tensor(0.0154, dtype=torch.float64)
hess: tensor(0.4021)
grad: tensor(0.1697, dtype=torch.float64)
hess: tensor(3.6699)
grad: tensor(2.8001, dtype=torch.float64)
hess: tensor(39.4077)
grad: tensor(0.1471, dtype=torch.float64)
hess: tensor(2.3891)
grad: tensor(2.6254, dtype=torch.float64)
hess: tensor(37.4479)
grad: tensor(0.1953, dtype=torch.float64)
hess: tensor(4.0848)
grad: tens

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(0.0046, dtype=torch.float64)
hess: tensor(0.1384)
	accuracy:90.625
	loss: 0.000163
grad: tensor(0.6520, dtype=torch.float64)
hess: tensor(13.9003)
grad: tensor(3.1405, dtype=torch.float64)
hess: tensor(65.6738)
grad: tensor(14.4390, dtype=torch.float64)
hess: tensor(61.0567)
grad: tensor(2.9088, dtype=torch.float64)
hess: tensor(36.7095)
grad: tensor(0.2453, dtype=torch.float64)
hess: tensor(6.2682)
grad: tensor(8.9290, dtype=torch.float64)
hess: tensor(80.1573)
grad: tensor(0.5181, dtype=torch.float64)
hess: tensor(6.8708)
grad: tensor(0.0864, dtype=torch.float64)
hess: tensor(2.4381)
grad: tensor(0.0085, dtype=torch.float64)
hess: tensor(0.2287)
grad: tensor(0.1012, dtype=torch.float64)
hess: tensor(2.3447)
grad: tensor(2.2582, dtype=torch.float64)
hess: tensor(34.9565)
grad: tensor(0.1064, dtype=torch.float64)
hess: tensor(1.7751)
grad: tensor(2.7962, dtype=torch.float64)
hess: tensor(41.6233)
grad: tensor(0.1436, dtype=torch.float64)
hess: tensor(3.1647)
grad: tensor(1

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(0.0020, dtype=torch.float64)
hess: tensor(0.0636)
	accuracy:90.625
	loss: 0.000069
grad: tensor(0.4850, dtype=torch.float64)
hess: tensor(6.7384)
grad: tensor(15.8012, dtype=torch.float64)
hess: tensor(155.1491)
grad: tensor(2.5120, dtype=torch.float64)
hess: tensor(34.2986)
grad: tensor(0.3943, dtype=torch.float64)
hess: tensor(9.5305)
grad: tensor(0.0075, dtype=torch.float64)
hess: tensor(0.2032)
grad: tensor(0.0636, dtype=torch.float64)
hess: tensor(1.8691)
grad: tensor(0.2059, dtype=torch.float64)
hess: tensor(4.5187)
grad: tensor(0.0808, dtype=torch.float64)
hess: tensor(1.9828)
grad: tensor(0.0840, dtype=torch.float64)
hess: tensor(1.4642)
grad: tensor(0.4238, dtype=torch.float64)
hess: tensor(4.9062)
grad: tensor(0.0430, dtype=torch.float64)
hess: tensor(1.1464)
grad: tensor(3.3167, dtype=torch.float64)
hess: tensor(49.2107)
	accuracy:87.109375
	loss: 0.014347
grad: tensor(0.0550, dtype=torch.float64)
hess: tensor(1.0629)
grad: tensor(0.0477, dtype=torch.float64)
he

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(0.0010, dtype=torch.float64)
hess: tensor(0.0317)
	accuracy:90.625
	loss: 0.000033
grad: tensor(0.4291, dtype=torch.float64)
hess: tensor(6.1710)
grad: tensor(16.7574, dtype=torch.float64)
hess: tensor(161.8229)
grad: tensor(2.2018, dtype=torch.float64)
hess: tensor(32.0465)
grad: tensor(0.3228, dtype=torch.float64)
hess: tensor(8.2086)
grad: tensor(0.0052, dtype=torch.float64)
hess: tensor(0.1450)
grad: tensor(0.0521, dtype=torch.float64)
hess: tensor(1.5756)
grad: tensor(0.1440, dtype=torch.float64)
hess: tensor(3.3090)
grad: tensor(0.0600, dtype=torch.float64)
hess: tensor(1.5254)
grad: tensor(0.0676, dtype=torch.float64)
hess: tensor(1.2247)
grad: tensor(0.3697, dtype=torch.float64)
hess: tensor(4.3934)
grad: tensor(0.0347, dtype=torch.float64)
hess: tensor(0.9486)
grad: tensor(2.6487, dtype=torch.float64)
hess: tensor(42.3472)
	accuracy:87.109375
	loss: 0.010799
grad: tensor(0.0373, dtype=torch.float64)
hess: tensor(0.7447)
grad: tensor(0.0410, dtype=torch.float64)
he

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(0.0005, dtype=torch.float64)
hess: tensor(0.0177)
	accuracy:90.625
	loss: 0.000017
grad: tensor(0.3961, dtype=torch.float64)
hess: tensor(5.8749)
grad: tensor(16.9614, dtype=torch.float64)
hess: tensor(163.5779)
grad: tensor(1.9567, dtype=torch.float64)
hess: tensor(30.0751)
grad: tensor(0.2679, dtype=torch.float64)
hess: tensor(7.1054)
grad: tensor(0.0037, dtype=torch.float64)
hess: tensor(0.1075)
grad: tensor(0.0449, dtype=torch.float64)
hess: tensor(1.3944)
grad: tensor(0.1044, dtype=torch.float64)
hess: tensor(2.5402)
grad: tensor(0.0470, dtype=torch.float64)
hess: tensor(1.2330)
grad: tensor(0.0548, dtype=torch.float64)
hess: tensor(1.0234)
grad: tensor(0.3215, dtype=torch.float64)
hess: tensor(3.9127)
grad: tensor(0.0296, dtype=torch.float64)
hess: tensor(0.8292)
grad: tensor(2.0682, dtype=torch.float64)
hess: tensor(35.4668)
	accuracy:88.28125
	loss: 0.008258
grad: tensor(0.0258, dtype=torch.float64)
hess: tensor(0.5302)
grad: tensor(0.0361, dtype=torch.float64)
hes

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(0.0003, dtype=torch.float64)
hess: tensor(0.0102)
	accuracy:90.625
	loss: 0.000010
grad: tensor(0.3725, dtype=torch.float64)
hess: tensor(5.6883)
grad: tensor(16.9848, dtype=torch.float64)
hess: tensor(163.8259)
grad: tensor(1.7005, dtype=torch.float64)
hess: tensor(27.5211)
grad: tensor(0.2324, dtype=torch.float64)
hess: tensor(6.3990)
grad: tensor(0.0027, dtype=torch.float64)
hess: tensor(0.0789)
grad: tensor(0.0366, dtype=torch.float64)
hess: tensor(1.1395)
grad: tensor(0.0751, dtype=torch.float64)
hess: tensor(1.8861)
grad: tensor(0.0374, dtype=torch.float64)
hess: tensor(1.0077)
grad: tensor(0.0448, dtype=torch.float64)
hess: tensor(0.8618)
grad: tensor(0.2854, dtype=torch.float64)
hess: tensor(3.5490)
grad: tensor(0.0257, dtype=torch.float64)
hess: tensor(0.7362)
grad: tensor(1.6837, dtype=torch.float64)
hess: tensor(30.4495)
	accuracy:88.671875
	loss: 0.006638
grad: tensor(0.0189, dtype=torch.float64)
hess: tensor(0.3948)
grad: tensor(0.0343, dtype=torch.float64)
he

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(0.0002, dtype=torch.float64)
hess: tensor(0.0062)
	accuracy:91.015625
	loss: 0.000006
grad: tensor(0.4356, dtype=torch.float64)
hess: tensor(6.3598)
grad: tensor(15.7868, dtype=torch.float64)
hess: tensor(144.5128)
grad: tensor(0.0298, dtype=torch.float64)
hess: tensor(0.8704)
grad: tensor(0.0020, dtype=torch.float64)
hess: tensor(0.0602)
grad: tensor(10.6108, dtype=torch.float64)
hess: tensor(131.6051)
grad: tensor(0.0253, dtype=torch.float64)
hess: tensor(0.6994)
grad: tensor(0.4028, dtype=torch.float64)
hess: tensor(10.1712)
grad: tensor(0.2548, dtype=torch.float64)
hess: tensor(3.2332)
grad: tensor(1.6485, dtype=torch.float64)
hess: tensor(24.4691)
grad: tensor(0.1049, dtype=torch.float64)
hess: tensor(2.0065)
	accuracy:88.28125
	loss: 0.005615
grad: tensor(7.7291, dtype=torch.float64)
hess: tensor(66.7951)
grad: tensor(1.3648, dtype=torch.float64)
hess: tensor(32.2913)
grad: tensor(15.4304, dtype=torch.float64)
hess: tensor(91.4263)
grad: tensor(0.8383, dtype=torch.fl

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(0.0001, dtype=torch.float64)
hess: tensor(0.0038)
	accuracy:91.015625
	loss: 0.000003
grad: tensor(0.4045, dtype=torch.float64)
hess: tensor(6.0592)
grad: tensor(14.9998, dtype=torch.float64)
hess: tensor(143.4441)
grad: tensor(0.0235, dtype=torch.float64)
hess: tensor(0.6972)
grad: tensor(0.0016, dtype=torch.float64)
hess: tensor(0.0492)
grad: tensor(9.7708, dtype=torch.float64)
hess: tensor(128.7227)
grad: tensor(0.0202, dtype=torch.float64)
hess: tensor(0.5699)
grad: tensor(0.3824, dtype=torch.float64)
hess: tensor(9.9045)
grad: tensor(0.2276, dtype=torch.float64)
hess: tensor(2.9428)
grad: tensor(1.7470, dtype=torch.float64)
hess: tensor(26.5932)
grad: tensor(0.0870, dtype=torch.float64)
hess: tensor(1.7121)
	accuracy:88.28125
	loss: 0.004522
grad: tensor(7.6955, dtype=torch.float64)
hess: tensor(68.7981)
grad: tensor(1.2351, dtype=torch.float64)
hess: tensor(30.1484)
grad: tensor(15.7554, dtype=torch.float64)
hess: tensor(96.1117)
grad: tensor(0.7228, dtype=torch.floa

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(7.3223e-05, dtype=torch.float64)
hess: tensor(0.0026)
	accuracy:91.015625
	loss: 0.000002
grad: tensor(0.3813, dtype=torch.float64)
hess: tensor(5.8518)
grad: tensor(14.3552, dtype=torch.float64)
hess: tensor(142.9986)
grad: tensor(0.0182, dtype=torch.float64)
hess: tensor(0.5491)
grad: tensor(0.0012, dtype=torch.float64)
hess: tensor(0.0378)
grad: tensor(8.8869, dtype=torch.float64)
hess: tensor(120.3968)
grad: tensor(0.0163, dtype=torch.float64)
hess: tensor(0.4717)
grad: tensor(0.3520, dtype=torch.float64)
hess: tensor(9.3369)
grad: tensor(0.2018, dtype=torch.float64)
hess: tensor(2.6558)
grad: tensor(1.7971, dtype=torch.float64)
hess: tensor(28.0179)
grad: tensor(0.0740, dtype=torch.float64)
hess: tensor(1.4955)
	accuracy:88.671875
	loss: 0.003741
grad: tensor(7.6910, dtype=torch.float64)
hess: tensor(71.0520)
grad: tensor(1.1168, dtype=torch.float64)
hess: tensor(28.1080)
grad: tensor(15.9670, dtype=torch.float64)
hess: tensor(101.0754)
grad: tensor(0.6135, dtype=torc

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(4.7356e-05, dtype=torch.float64)
hess: tensor(0.0017)
	accuracy:91.015625
	loss: 0.000001
grad: tensor(0.3636, dtype=torch.float64)
hess: tensor(5.7027)
grad: tensor(13.3054, dtype=torch.float64)
hess: tensor(139.4002)
grad: tensor(0.0146, dtype=torch.float64)
hess: tensor(0.4490)
grad: tensor(0.0010, dtype=torch.float64)
hess: tensor(0.0317)
grad: tensor(8.3298, dtype=torch.float64)
hess: tensor(121.9097)
grad: tensor(0.0130, dtype=torch.float64)
hess: tensor(0.3860)
grad: tensor(0.3302, dtype=torch.float64)
hess: tensor(8.9353)
grad: tensor(0.1815, dtype=torch.float64)
hess: tensor(2.4447)
grad: tensor(1.8400, dtype=torch.float64)
hess: tensor(29.3913)
grad: tensor(0.0634, dtype=torch.float64)
hess: tensor(1.3153)
	accuracy:88.671875
	loss: 0.003121
grad: tensor(7.5861, dtype=torch.float64)
hess: tensor(72.3276)
grad: tensor(1.0352, dtype=torch.float64)
hess: tensor(26.7802)
grad: tensor(16.1811, dtype=torch.float64)
hess: tensor(105.9831)
grad: tensor(0.5232, dtype=torc

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(3.2702e-05, dtype=torch.float64)
hess: tensor(0.0012)
	accuracy:90.625
	loss: 0.000001
grad: tensor(0.3513, dtype=torch.float64)
hess: tensor(5.6084)
grad: tensor(12.4569, dtype=torch.float64)
hess: tensor(136.4137)
grad: tensor(0.0120, dtype=torch.float64)
hess: tensor(0.3761)
grad: tensor(0.0008, dtype=torch.float64)
hess: tensor(0.0263)
grad: tensor(7.8842, dtype=torch.float64)
hess: tensor(120.2331)
grad: tensor(0.0100, dtype=torch.float64)
hess: tensor(0.3021)
grad: tensor(0.3087, dtype=torch.float64)
hess: tensor(8.5398)
grad: tensor(0.1638, dtype=torch.float64)
hess: tensor(2.2413)
grad: tensor(1.8610, dtype=torch.float64)
hess: tensor(30.4254)
grad: tensor(0.0543, dtype=torch.float64)
hess: tensor(1.1535)
	accuracy:89.0625
	loss: 0.002605
grad: tensor(7.4416, dtype=torch.float64)
hess: tensor(73.3108)
grad: tensor(0.9421, dtype=torch.float64)
hess: tensor(25.0544)
grad: tensor(16.1813, dtype=torch.float64)
hess: tensor(110.9274)
grad: tensor(0.4472, dtype=torch.flo

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(2.4132e-05, dtype=torch.float64)
hess: tensor(0.0009)
	accuracy:91.40625
	loss: 0.000001
grad: tensor(0.8875, dtype=torch.float64)
hess: tensor(29.5173)
grad: tensor(0.6725, dtype=torch.float64)
hess: tensor(12.7534)
grad: tensor(10.0433, dtype=torch.float64)
hess: tensor(114.7950)
grad: tensor(0.0149, dtype=torch.float64)
hess: tensor(0.5147)
grad: tensor(0.0080, dtype=torch.float64)
hess: tensor(0.2487)
grad: tensor(0.0132, dtype=torch.float64)
hess: tensor(0.2919)
grad: tensor(0.0290, dtype=torch.float64)
hess: tensor(0.8273)
grad: tensor(0.5475, dtype=torch.float64)
hess: tensor(12.0733)
	accuracy:89.0625
	loss: 0.002191
grad: tensor(0.0034, dtype=torch.float64)
hess: tensor(0.1192)
grad: tensor(0.8505, dtype=torch.float64)
hess: tensor(23.2615)
grad: tensor(0.1273, dtype=torch.float64)
hess: tensor(4.8661)
grad: tensor(2.1218, dtype=torch.float64)
hess: tensor(42.7726)
grad: tensor(0.0598, dtype=torch.float64)
hess: tensor(2.2628)
grad: tensor(0.5977, dtype=torch.floa

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.7707e-05, dtype=torch.float64)
hess: tensor(0.0007)
	accuracy:91.015625
	loss: 0.000000
grad: tensor(0.7191, dtype=torch.float64)
hess: tensor(24.8650)
grad: tensor(0.6136, dtype=torch.float64)
hess: tensor(11.8649)
grad: tensor(9.4586, dtype=torch.float64)
hess: tensor(113.2425)
grad: tensor(0.0127, dtype=torch.float64)
hess: tensor(0.4461)
grad: tensor(0.0064, dtype=torch.float64)
hess: tensor(0.2023)
grad: tensor(0.0111, dtype=torch.float64)
hess: tensor(0.2490)
grad: tensor(0.0249, dtype=torch.float64)
hess: tensor(0.7207)
grad: tensor(0.4627, dtype=torch.float64)
hess: tensor(10.4186)
	accuracy:89.453125
	loss: 0.001802
grad: tensor(0.0032, dtype=torch.float64)
hess: tensor(0.1123)
grad: tensor(0.7906, dtype=torch.float64)
hess: tensor(22.1613)
grad: tensor(0.1200, dtype=torch.float64)
hess: tensor(4.6514)
grad: tensor(2.0193, dtype=torch.float64)
hess: tensor(41.7320)
grad: tensor(0.0443, dtype=torch.float64)
hess: tensor(1.7069)
grad: tensor(0.5252, dtype=torch.fl

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.3707e-05, dtype=torch.float64)
hess: tensor(0.0006)
	accuracy:91.015625
	loss: 0.000000
grad: tensor(0.5762, dtype=torch.float64)
hess: tensor(20.6275)
grad: tensor(0.5521, dtype=torch.float64)
hess: tensor(10.8807)
grad: tensor(8.7673, dtype=torch.float64)
hess: tensor(110.7119)
grad: tensor(0.0114, dtype=torch.float64)
hess: tensor(0.4044)
grad: tensor(0.0049, dtype=torch.float64)
hess: tensor(0.1585)
grad: tensor(0.0088, dtype=torch.float64)
hess: tensor(0.2004)
grad: tensor(0.0222, dtype=torch.float64)
hess: tensor(0.6543)
grad: tensor(0.4085, dtype=torch.float64)
hess: tensor(9.3939)
	accuracy:89.0625
	loss: 0.001541
grad: tensor(0.0028, dtype=torch.float64)
hess: tensor(0.1025)
grad: tensor(0.7300, dtype=torch.float64)
hess: tensor(20.9769)
grad: tensor(0.1117, dtype=torch.float64)
hess: tensor(4.3934)
grad: tensor(1.9506, dtype=torch.float64)
hess: tensor(41.2333)
grad: tensor(0.0330, dtype=torch.float64)
hess: tensor(1.2943)
grad: tensor(0.4760, dtype=torch.float

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(9.9476e-06, dtype=torch.float64)
hess: tensor(0.0005)
	accuracy:91.015625
	loss: 0.000000
grad: tensor(0.4753, dtype=torch.float64)
hess: tensor(17.5294)
grad: tensor(0.4944, dtype=torch.float64)
hess: tensor(9.9359)
grad: tensor(8.0112, dtype=torch.float64)
hess: tensor(107.3242)
grad: tensor(0.0099, dtype=torch.float64)
hess: tensor(0.3590)
grad: tensor(0.0039, dtype=torch.float64)
hess: tensor(0.1277)
grad: tensor(0.0072, dtype=torch.float64)
hess: tensor(0.1675)
grad: tensor(0.0199, dtype=torch.float64)
hess: tensor(0.5972)
grad: tensor(0.3497, dtype=torch.float64)
hess: tensor(8.1809)
	accuracy:89.0625
	loss: 0.001297
grad: tensor(0.0026, dtype=torch.float64)
hess: tensor(0.0957)
grad: tensor(0.6793, dtype=torch.float64)
hess: tensor(19.9939)
grad: tensor(0.1041, dtype=torch.float64)
hess: tensor(4.1809)
grad: tensor(1.8318, dtype=torch.float64)
hess: tensor(39.7500)
grad: tensor(0.0269, dtype=torch.float64)
hess: tensor(1.0728)
grad: tensor(0.4320, dtype=torch.float6

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(6.3064e-06, dtype=torch.float64)
hess: tensor(0.0003)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.3924, dtype=torch.float64)
hess: tensor(14.8651)
grad: tensor(0.4458, dtype=torch.float64)
hess: tensor(9.1276)
grad: tensor(7.5204, dtype=torch.float64)
hess: tensor(105.3076)
grad: tensor(0.0083, dtype=torch.float64)
hess: tensor(0.3040)
grad: tensor(0.0033, dtype=torch.float64)
hess: tensor(0.1095)
grad: tensor(0.0060, dtype=torch.float64)
hess: tensor(0.1415)
grad: tensor(0.0173, dtype=torch.float64)
hess: tensor(0.5267)
grad: tensor(0.3019, dtype=torch.float64)
hess: tensor(7.1877)
	accuracy:89.84375
	loss: 0.001071
grad: tensor(0.0025, dtype=torch.float64)
hess: tensor(0.0912)
grad: tensor(0.6229, dtype=torch.float64)
hess: tensor(18.7525)
grad: tensor(0.0979, dtype=torch.float64)
hess: tensor(3.9503)
grad: tensor(1.7779, dtype=torch.float64)
hess: tensor(39.3211)
grad: tensor(0.0213, dtype=torch.float64)
hess: tensor(0.8624)
grad: tensor(0.3984, dtype=torch.float64)

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(5.7153e-06, dtype=torch.float64)
hess: tensor(0.0003)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.3157, dtype=torch.float64)
hess: tensor(12.2601)
grad: tensor(0.4086, dtype=torch.float64)
hess: tensor(8.5061)
grad: tensor(6.7813, dtype=torch.float64)
hess: tensor(101.0414)
grad: tensor(0.0075, dtype=torch.float64)
hess: tensor(0.2800)
grad: tensor(0.0026, dtype=torch.float64)
hess: tensor(0.0884)
grad: tensor(0.0049, dtype=torch.float64)
hess: tensor(0.1171)
grad: tensor(0.0154, dtype=torch.float64)
hess: tensor(0.4763)
grad: tensor(0.2642, dtype=torch.float64)
hess: tensor(6.4118)
	accuracy:89.84375
	loss: 0.000887
grad: tensor(0.0023, dtype=torch.float64)
hess: tensor(0.0854)
grad: tensor(0.5864, dtype=torch.float64)
hess: tensor(18.0222)
grad: tensor(0.0932, dtype=torch.float64)
hess: tensor(3.8376)
grad: tensor(1.7056, dtype=torch.float64)
hess: tensor(38.5211)
grad: tensor(0.2730, dtype=torch.float64)
hess: tensor(9.6250)
grad: tensor(0.3648, dtype=torch.float64)

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(5.1519e-06, dtype=torch.float64)
hess: tensor(0.0002)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.0745, dtype=torch.float64)
hess: tensor(1.9229)
grad: tensor(10.6120, dtype=torch.float64)
hess: tensor(93.9147)
grad: tensor(0.3852, dtype=torch.float64)
hess: tensor(6.7345)
grad: tensor(0.0054, dtype=torch.float64)
hess: tensor(0.1778)
grad: tensor(0.2371, dtype=torch.float64)
hess: tensor(7.4107)
grad: tensor(0.0138, dtype=torch.float64)
hess: tensor(0.4306)
grad: tensor(0.0096, dtype=torch.float64)
hess: tensor(0.2836)
	accuracy:89.84375
	loss: 0.000763
grad: tensor(0.0177, dtype=torch.float64)
hess: tensor(0.5517)
grad: tensor(3.9107, dtype=torch.float64)
hess: tensor(87.1342)
grad: tensor(0.1669, dtype=torch.float64)
hess: tensor(4.9553)
grad: tensor(0.2534, dtype=torch.float64)
hess: tensor(9.0455)
grad: tensor(0.3440, dtype=torch.float64)
hess: tensor(9.6301)
grad: tensor(0.0505, dtype=torch.float64)
hess: tensor(1.8319)
grad: tensor(1.2860, dtype=torch.float64)
h

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(4.9176e-06, dtype=torch.float64)
hess: tensor(0.0002)
	accuracy:90.234375
	loss: 0.000000
grad: tensor(0.0704, dtype=torch.float64)
hess: tensor(1.8450)
grad: tensor(10.4037, dtype=torch.float64)
hess: tensor(94.2437)
grad: tensor(0.4084, dtype=torch.float64)
hess: tensor(7.2286)
grad: tensor(0.0047, dtype=torch.float64)
hess: tensor(0.1556)
grad: tensor(0.2300, dtype=torch.float64)
hess: tensor(7.3138)
grad: tensor(0.0123, dtype=torch.float64)
hess: tensor(0.3917)
grad: tensor(0.0078, dtype=torch.float64)
hess: tensor(0.2148)
	accuracy:89.0625
	loss: 0.000648
grad: tensor(0.0166, dtype=torch.float64)
hess: tensor(0.5267)
grad: tensor(3.8872, dtype=torch.float64)
hess: tensor(87.5763)
grad: tensor(0.1475, dtype=torch.float64)
hess: tensor(4.4495)
grad: tensor(0.2609, dtype=torch.float64)
hess: tensor(9.4397)
grad: tensor(0.3421, dtype=torch.float64)
hess: tensor(9.7162)
grad: tensor(0.0488, dtype=torch.float64)
hess: tensor(1.7897)
grad: tensor(1.2278, dtype=torch.float64)

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(4.7040e-06, dtype=torch.float64)
hess: tensor(0.0002)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.0657, dtype=torch.float64)
hess: tensor(1.7384)
grad: tensor(10.1244, dtype=torch.float64)
hess: tensor(93.9364)
grad: tensor(0.4320, dtype=torch.float64)
hess: tensor(7.7575)
grad: tensor(0.0039, dtype=torch.float64)
hess: tensor(0.1327)
grad: tensor(0.2213, dtype=torch.float64)
hess: tensor(7.1383)
grad: tensor(0.0110, dtype=torch.float64)
hess: tensor(0.3522)
grad: tensor(0.0067, dtype=torch.float64)
hess: tensor(0.2030)
	accuracy:89.84375
	loss: 0.000545
grad: tensor(0.0148, dtype=torch.float64)
hess: tensor(0.4758)
grad: tensor(3.6917, dtype=torch.float64)
hess: tensor(85.4377)
grad: tensor(0.1288, dtype=torch.float64)
hess: tensor(3.9438)
grad: tensor(0.2461, dtype=torch.float64)
hess: tensor(9.0188)
grad: tensor(0.3170, dtype=torch.float64)
hess: tensor(9.1640)
grad: tensor(0.0472, dtype=torch.float64)
hess: tensor(1.7496)
grad: tensor(1.2290, dtype=torch.float64)
h

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.8448e-06, dtype=torch.float64)
hess: tensor(0.0001)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.0627, dtype=torch.float64)
hess: tensor(1.6793)
grad: tensor(9.6001, dtype=torch.float64)
hess: tensor(92.4618)
grad: tensor(0.4564, dtype=torch.float64)
hess: tensor(8.3113)
grad: tensor(0.0034, dtype=torch.float64)
hess: tensor(0.1147)
grad: tensor(0.2075, dtype=torch.float64)
hess: tensor(6.7721)
grad: tensor(0.0096, dtype=torch.float64)
hess: tensor(0.3120)
grad: tensor(0.0059, dtype=torch.float64)
hess: tensor(0.1794)
	accuracy:89.84375
	loss: 0.000474
grad: tensor(0.0140, dtype=torch.float64)
hess: tensor(0.4541)
grad: tensor(3.5160, dtype=torch.float64)
hess: tensor(83.4614)
grad: tensor(0.1185, dtype=torch.float64)
hess: tensor(3.6793)
grad: tensor(0.2442, dtype=torch.float64)
hess: tensor(9.0619)
grad: tensor(0.3062, dtype=torch.float64)
hess: tensor(8.9721)
grad: tensor(0.0474, dtype=torch.float64)
hess: tensor(1.7776)
grad: tensor(1.1794, dtype=torch.float64)
he

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.4685e-06, dtype=torch.float64)
hess: tensor(9.1883e-05)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.0583, dtype=torch.float64)
hess: tensor(1.5790)
grad: tensor(9.4259, dtype=torch.float64)
hess: tensor(92.5153)
grad: tensor(0.4715, dtype=torch.float64)
hess: tensor(8.7226)
grad: tensor(0.0028, dtype=torch.float64)
hess: tensor(0.0972)
grad: tensor(0.1996, dtype=torch.float64)
hess: tensor(6.6200)
grad: tensor(0.0089, dtype=torch.float64)
hess: tensor(0.2940)
grad: tensor(0.0046, dtype=torch.float64)
hess: tensor(0.1296)
	accuracy:89.84375
	loss: 0.000409
grad: tensor(0.0133, dtype=torch.float64)
hess: tensor(0.4399)
grad: tensor(3.3464, dtype=torch.float64)
hess: tensor(81.4292)
grad: tensor(0.1063, dtype=torch.float64)
hess: tensor(3.3451)
grad: tensor(0.2388, dtype=torch.float64)
hess: tensor(8.9621)
grad: tensor(0.3038, dtype=torch.float64)
hess: tensor(9.0157)
grad: tensor(0.0481, dtype=torch.float64)
hess: tensor(1.8267)
grad: tensor(1.1273, dtype=torch.float64

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.2843e-06, dtype=torch.float64)
hess: tensor(7.8884e-05)
	accuracy:91.015625
	loss: 0.000000
grad: tensor(0.0544, dtype=torch.float64)
hess: tensor(1.4835)
grad: tensor(8.9972, dtype=torch.float64)
hess: tensor(91.0157)
grad: tensor(0.4858, dtype=torch.float64)
hess: tensor(9.1514)
grad: tensor(0.0024, dtype=torch.float64)
hess: tensor(0.0825)
grad: tensor(0.1868, dtype=torch.float64)
hess: tensor(6.2548)
grad: tensor(0.0081, dtype=torch.float64)
hess: tensor(0.2677)
grad: tensor(0.0040, dtype=torch.float64)
hess: tensor(0.1136)
	accuracy:89.84375
	loss: 0.000345
grad: tensor(0.0119, dtype=torch.float64)
hess: tensor(0.3985)
grad: tensor(3.1914, dtype=torch.float64)
hess: tensor(79.4597)
grad: tensor(0.0962, dtype=torch.float64)
hess: tensor(3.0670)
grad: tensor(0.2410, dtype=torch.float64)
hess: tensor(9.1545)
grad: tensor(0.3153, dtype=torch.float64)
hess: tensor(9.5012)
grad: tensor(0.0492, dtype=torch.float64)
hess: tensor(1.8855)
grad: tensor(1.0942, dtype=torch.floa

  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.0185e-06, dtype=torch.float64)
hess: tensor(5.9684e-05)
	accuracy:91.015625
	loss: 0.000000
grad: tensor(0.0515, dtype=torch.float64)
hess: tensor(1.4216)
grad: tensor(8.6907, dtype=torch.float64)
hess: tensor(90.0077)
grad: tensor(0.5152, dtype=torch.float64)
hess: tensor(9.8286)
grad: tensor(0.0020, dtype=torch.float64)
hess: tensor(0.0707)
grad: tensor(0.1797, dtype=torch.float64)
hess: tensor(6.0949)
grad: tensor(0.0479, dtype=torch.float64)
hess: tensor(0.7660)
grad: tensor(0.1310, dtype=torch.float64)
hess: tensor(3.5247)
	accuracy:89.84375
	loss: 0.000298
grad: tensor(0.0108, dtype=torch.float64)
hess: tensor(0.3651)
grad: tensor(0.0203, dtype=torch.float64)
hess: tensor(0.6765)
grad: tensor(1.2033, dtype=torch.float64)
hess: tensor(30.6375)
grad: tensor(2.0126, dtype=torch.float64)
hess: tensor(45.6293)
grad: tensor(3.6498, dtype=torch.float64)
hess: tensor(71.9626)
grad: tensor(2.3519, dtype=torch.float64)
hess: tensor(46.7475)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(9.0401e-07, dtype=torch.float64)
hess: tensor(5.4076e-05)
	accuracy:91.015625
	loss: 0.000000
grad: tensor(5.9960, dtype=torch.float64)
hess: tensor(116.2932)
grad: tensor(0.0329, dtype=torch.float64)
hess: tensor(1.3106)
grad: tensor(0.0040, dtype=torch.float64)
hess: tensor(0.1601)
grad: tensor(0.0067, dtype=torch.float64)
hess: tensor(0.2585)
grad: tensor(0.0439, dtype=torch.float64)
hess: tensor(0.7081)
grad: tensor(0.1166, dtype=torch.float64)
hess: tensor(3.1782)
	accuracy:89.84375
	loss: 0.000259
grad: tensor(0.0098, dtype=torch.float64)
hess: tensor(0.3351)
grad: tensor(0.0188, dtype=torch.float64)
hess: tensor(0.6317)
grad: tensor(1.0653, dtype=torch.float64)
hess: tensor(27.7799)
grad: tensor(1.9339, dtype=torch.float64)
hess: tensor(44.6625)
grad: tensor(3.9360, dtype=torch.float64)
hess: tensor(79.9969)
grad: tensor(2.2765, dtype=torch.float64)
hess: tensor(45.9001)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(7.2882e-07, dtype=torch.float64)
hess: tensor(4.3693e-05)
	accuracy:91.015625
	loss: 0.000000
grad: tensor(5.6738, dtype=torch.float64)
hess: tensor(114.0902)
grad: tensor(0.0324, dtype=torch.float64)
hess: tensor(1.3073)
grad: tensor(0.0037, dtype=torch.float64)
hess: tensor(0.1525)
grad: tensor(0.0068, dtype=torch.float64)
hess: tensor(0.2668)
grad: tensor(0.0408, dtype=torch.float64)
hess: tensor(0.6615)
grad: tensor(0.1088, dtype=torch.float64)
hess: tensor(2.9959)
	accuracy:89.84375
	loss: 0.000226
grad: tensor(0.0093, dtype=torch.float64)
hess: tensor(0.3216)
grad: tensor(0.0175, dtype=torch.float64)
hess: tensor(0.5931)
grad: tensor(1.0414, dtype=torch.float64)
hess: tensor(27.5179)
grad: tensor(1.8988, dtype=torch.float64)
hess: tensor(44.4607)
grad: tensor(4.1263, dtype=torch.float64)
hess: tensor(83.1256)
grad: tensor(2.0979, dtype=torch.float64)
hess: tensor(43.3346)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(6.4344e-07, dtype=torch.float64)
hess: tensor(3.9524e-05)
	accuracy:90.625
	loss: 0.000000
grad: tensor(5.3820, dtype=torch.float64)
hess: tensor(112.0091)
grad: tensor(0.0260, dtype=torch.float64)
hess: tensor(1.0606)
grad: tensor(0.0035, dtype=torch.float64)
hess: tensor(0.1439)
grad: tensor(0.0062, dtype=torch.float64)
hess: tensor(0.2425)
grad: tensor(0.0382, dtype=torch.float64)
hess: tensor(0.6257)
grad: tensor(0.0989, dtype=torch.float64)
hess: tensor(2.7560)
	accuracy:89.84375
	loss: 0.000195
grad: tensor(0.0086, dtype=torch.float64)
hess: tensor(0.2998)
grad: tensor(0.0163, dtype=torch.float64)
hess: tensor(0.5575)
grad: tensor(1.0053, dtype=torch.float64)
hess: tensor(26.9261)
grad: tensor(1.7687, dtype=torch.float64)
hess: tensor(42.3713)
grad: tensor(3.8647, dtype=torch.float64)
hess: tensor(80.3648)
grad: tensor(1.9763, dtype=torch.float64)
hess: tensor(41.5769)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(5.1307e-07, dtype=torch.float64)
hess: tensor(3.3112e-05)
	accuracy:90.625
	loss: 0.000000
grad: tensor(5.2646, dtype=torch.float64)
hess: tensor(111.3574)
grad: tensor(0.0270, dtype=torch.float64)
hess: tensor(1.1131)
grad: tensor(0.0034, dtype=torch.float64)
hess: tensor(0.1411)
grad: tensor(0.0061, dtype=torch.float64)
hess: tensor(0.2424)
grad: tensor(0.0357, dtype=torch.float64)
hess: tensor(0.5876)
grad: tensor(0.0931, dtype=torch.float64)
hess: tensor(2.6155)
	accuracy:89.84375
	loss: 0.000175
grad: tensor(0.0081, dtype=torch.float64)
hess: tensor(0.2870)
grad: tensor(0.0152, dtype=torch.float64)
hess: tensor(0.5234)
grad: tensor(0.9385, dtype=torch.float64)
hess: tensor(25.5289)
grad: tensor(1.7164, dtype=torch.float64)
hess: tensor(41.7329)
grad: tensor(4.0706, dtype=torch.float64)
hess: tensor(83.7186)
grad: tensor(1.8674, dtype=torch.float64)
hess: tensor(39.9823)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(4.5270e-07, dtype=torch.float64)
hess: tensor(3.0155e-05)
	accuracy:90.625
	loss: 0.000000
grad: tensor(5.1081, dtype=torch.float64)
hess: tensor(110.3401)
grad: tensor(0.0244, dtype=torch.float64)
hess: tensor(1.0199)
grad: tensor(0.0032, dtype=torch.float64)
hess: tensor(0.1339)
grad: tensor(0.0056, dtype=torch.float64)
hess: tensor(0.2258)
grad: tensor(0.0339, dtype=torch.float64)
hess: tensor(0.5616)
grad: tensor(0.0872, dtype=torch.float64)
hess: tensor(2.4780)
	accuracy:90.234375
	loss: 0.000152
grad: tensor(0.0078, dtype=torch.float64)
hess: tensor(0.2776)
grad: tensor(0.0142, dtype=torch.float64)
hess: tensor(0.4922)
grad: tensor(0.8923, dtype=torch.float64)
hess: tensor(24.6107)
grad: tensor(1.6149, dtype=torch.float64)
hess: tensor(40.0460)
grad: tensor(3.8031, dtype=torch.float64)
hess: tensor(80.7714)
grad: tensor(1.7357, dtype=torch.float64)
hess: tensor(37.8956)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(3.6259e-07, dtype=torch.float64)
hess: tensor(2.2228e-05)
	accuracy:91.015625
	loss: 0.000000
grad: tensor(4.8508, dtype=torch.float64)
hess: tensor(108.1689)
grad: tensor(0.0198, dtype=torch.float64)
hess: tensor(0.8359)
grad: tensor(0.0033, dtype=torch.float64)
hess: tensor(0.1374)
grad: tensor(0.0058, dtype=torch.float64)
hess: tensor(0.2366)
grad: tensor(0.0322, dtype=torch.float64)
hess: tensor(0.5372)
grad: tensor(0.0808, dtype=torch.float64)
hess: tensor(2.3078)
	accuracy:90.234375
	loss: 0.000137
grad: tensor(0.0075, dtype=torch.float64)
hess: tensor(0.2706)
grad: tensor(0.0136, dtype=torch.float64)
hess: tensor(0.4749)
grad: tensor(0.9788, dtype=torch.float64)
hess: tensor(23.3069)
grad: tensor(1.6376, dtype=torch.float64)
hess: tensor(40.8401)
grad: tensor(3.9103, dtype=torch.float64)
hess: tensor(82.7851)
grad: tensor(1.6164, dtype=torch.float64)
hess: tensor(35.9233)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(3.3235e-07, dtype=torch.float64)
hess: tensor(2.0684e-05)
	accuracy:91.015625
	loss: 0.000000
grad: tensor(4.5913, dtype=torch.float64)
hess: tensor(105.8161)
grad: tensor(0.0175, dtype=torch.float64)
hess: tensor(0.7456)
grad: tensor(0.0030, dtype=torch.float64)
hess: tensor(0.1277)
grad: tensor(0.0052, dtype=torch.float64)
hess: tensor(0.2141)
grad: tensor(0.0304, dtype=torch.float64)
hess: tensor(0.5102)
grad: tensor(0.0752, dtype=torch.float64)
hess: tensor(2.1681)
	accuracy:90.234375
	loss: 0.000119
grad: tensor(0.0070, dtype=torch.float64)
hess: tensor(0.2539)
grad: tensor(0.0127, dtype=torch.float64)
hess: tensor(0.4457)
grad: tensor(0.8539, dtype=torch.float64)
hess: tensor(24.0443)
grad: tensor(1.5146, dtype=torch.float64)
hess: tensor(38.5896)
grad: tensor(3.5892, dtype=torch.float64)
hess: tensor(78.8178)
grad: tensor(1.5484, dtype=torch.float64)
hess: tensor(34.8727)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(2.6194e-07, dtype=torch.float64)
hess: tensor(1.6764e-05)
	accuracy:90.625
	loss: 0.000000
grad: tensor(1.2079, dtype=torch.float64)
hess: tensor(28.6409)
grad: tensor(2.4699, dtype=torch.float64)
hess: tensor(61.6703)
grad: tensor(0.0003, dtype=torch.float64)
hess: tensor(0.0107)
grad: tensor(0.0004, dtype=torch.float64)
hess: tensor(0.0123)
grad: tensor(1.0573, dtype=torch.float64)
hess: tensor(25.6248)
	accuracy:90.234375
	loss: 0.000107
grad: tensor(0.0010, dtype=torch.float64)
hess: tensor(0.0430)
grad: tensor(1.9531, dtype=torch.float64)
hess: tensor(57.7162)
grad: tensor(0.9395, dtype=torch.float64)
hess: tensor(23.0558)
grad: tensor(0.5928, dtype=torch.float64)
hess: tensor(16.9878)
grad: tensor(0.0042, dtype=torch.float64)
hess: tensor(0.1582)
grad: tensor(0.0192, dtype=torch.float64)
hess: tensor(0.5063)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(2.3928e-07, dtype=torch.float64)
hess: tensor(1.4017e-05)
	accuracy:90.625
	loss: 0.000000
grad: tensor(1.1210, dtype=torch.float64)
hess: tensor(27.0338)
grad: tensor(2.3509, dtype=torch.float64)
hess: tensor(59.8981)
grad: tensor(0.0002, dtype=torch.float64)
hess: tensor(0.0101)
grad: tensor(0.0004, dtype=torch.float64)
hess: tensor(0.0112)
grad: tensor(1.0080, dtype=torch.float64)
hess: tensor(24.7669)
	accuracy:90.234375
	loss: 0.000097
grad: tensor(0.0009, dtype=torch.float64)
hess: tensor(0.0420)
grad: tensor(1.9077, dtype=torch.float64)
hess: tensor(56.8312)
grad: tensor(0.8491, dtype=torch.float64)
hess: tensor(21.1828)
grad: tensor(0.5715, dtype=torch.float64)
hess: tensor(16.5629)
grad: tensor(0.0039, dtype=torch.float64)
hess: tensor(0.1458)
grad: tensor(0.0178, dtype=torch.float64)
hess: tensor(0.4719)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(2.0651e-07, dtype=torch.float64)
hess: tensor(1.2163e-05)
	accuracy:90.625
	loss: 0.000000
grad: tensor(1.0719, dtype=torch.float64)
hess: tensor(26.1889)
grad: tensor(2.2661, dtype=torch.float64)
hess: tensor(58.6791)
grad: tensor(0.0002, dtype=torch.float64)
hess: tensor(0.0094)
grad: tensor(0.0004, dtype=torch.float64)
hess: tensor(0.0106)
grad: tensor(0.9544, dtype=torch.float64)
hess: tensor(23.7746)
	accuracy:90.625
	loss: 0.000087
grad: tensor(0.0009, dtype=torch.float64)
hess: tensor(0.0398)
grad: tensor(1.8886, dtype=torch.float64)
hess: tensor(53.9405)
grad: tensor(0.8470, dtype=torch.float64)
hess: tensor(27.7757)
grad: tensor(0.5793, dtype=torch.float64)
hess: tensor(16.9323)
grad: tensor(0.0036, dtype=torch.float64)
hess: tensor(0.1359)
grad: tensor(0.0171, dtype=torch.float64)
hess: tensor(0.4573)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.7385e-07, dtype=torch.float64)
hess: tensor(1.0264e-05)
	accuracy:90.625
	loss: 0.000000
grad: tensor(1.0161, dtype=torch.float64)
hess: tensor(25.1697)
grad: tensor(2.1775, dtype=torch.float64)
hess: tensor(57.3273)
grad: tensor(0.0002, dtype=torch.float64)
hess: tensor(0.0092)
grad: tensor(0.0003, dtype=torch.float64)
hess: tensor(0.0096)
grad: tensor(0.9092, dtype=torch.float64)
hess: tensor(22.9311)
	accuracy:90.625
	loss: 0.000079
grad: tensor(0.0009, dtype=torch.float64)
hess: tensor(0.0388)
grad: tensor(1.8406, dtype=torch.float64)
hess: tensor(52.9970)
grad: tensor(0.7842, dtype=torch.float64)
hess: tensor(26.1357)
grad: tensor(0.5526, dtype=torch.float64)
hess: tensor(16.3319)
grad: tensor(0.0032, dtype=torch.float64)
hess: tensor(0.1236)
grad: tensor(0.0160, dtype=torch.float64)
hess: tensor(0.4295)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.5229e-07, dtype=torch.float64)
hess: tensor(8.9994e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.9736, dtype=torch.float64)
hess: tensor(24.3984)
grad: tensor(2.0473, dtype=torch.float64)
hess: tensor(55.0465)
grad: tensor(0.0002, dtype=torch.float64)
hess: tensor(0.0087)
grad: tensor(0.0003, dtype=torch.float64)
hess: tensor(0.0088)
grad: tensor(0.8496, dtype=torch.float64)
hess: tensor(21.7205)
	accuracy:90.625
	loss: 0.000072
grad: tensor(0.0008, dtype=torch.float64)
hess: tensor(0.0382)
grad: tensor(1.6481, dtype=torch.float64)
hess: tensor(51.1099)
grad: tensor(0.7736, dtype=torch.float64)
hess: tensor(26.0130)
grad: tensor(0.5646, dtype=torch.float64)
hess: tensor(16.8039)
grad: tensor(0.0032, dtype=torch.float64)
hess: tensor(0.1215)
grad: tensor(0.0158, dtype=torch.float64)
hess: tensor(0.4258)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.3005e-07, dtype=torch.float64)
hess: tensor(7.7133e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.9286, dtype=torch.float64)
hess: tensor(23.5514)
grad: tensor(1.9679, dtype=torch.float64)
hess: tensor(53.7299)
grad: tensor(0.0002, dtype=torch.float64)
hess: tensor(0.0082)
grad: tensor(0.0003, dtype=torch.float64)
hess: tensor(0.0082)
grad: tensor(0.8262, dtype=torch.float64)
hess: tensor(21.3301)
	accuracy:90.625
	loss: 0.000066
grad: tensor(0.0008, dtype=torch.float64)
hess: tensor(0.0375)
grad: tensor(1.7196, dtype=torch.float64)
hess: tensor(50.4551)
grad: tensor(0.7489, dtype=torch.float64)
hess: tensor(25.4592)
grad: tensor(0.5511, dtype=torch.float64)
hess: tensor(16.5518)
grad: tensor(0.0029, dtype=torch.float64)
hess: tensor(0.1103)
grad: tensor(0.0151, dtype=torch.float64)
hess: tensor(0.4089)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.1461e-07, dtype=torch.float64)
hess: tensor(6.8269e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.8799, dtype=torch.float64)
hess: tensor(22.5745)
grad: tensor(1.8608, dtype=torch.float64)
hess: tensor(51.7513)
grad: tensor(0.0002, dtype=torch.float64)
hess: tensor(0.0077)
grad: tensor(0.0003, dtype=torch.float64)
hess: tensor(0.0078)
grad: tensor(0.7846, dtype=torch.float64)
hess: tensor(20.4796)
	accuracy:90.625
	loss: 0.000060
grad: tensor(0.0008, dtype=torch.float64)
hess: tensor(0.0364)
grad: tensor(1.5599, dtype=torch.float64)
hess: tensor(49.2071)
grad: tensor(0.6921, dtype=torch.float64)
hess: tensor(23.8366)
grad: tensor(0.5519, dtype=torch.float64)
hess: tensor(16.6972)
grad: tensor(0.0028, dtype=torch.float64)
hess: tensor(0.1067)
grad: tensor(0.0141, dtype=torch.float64)
hess: tensor(0.3860)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(9.8757e-08, dtype=torch.float64)
hess: tensor(5.9070e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.8401, dtype=torch.float64)
hess: tensor(21.7931)
grad: tensor(1.8732, dtype=torch.float64)
hess: tensor(52.3226)
grad: tensor(0.0002, dtype=torch.float64)
hess: tensor(0.0074)
grad: tensor(0.0003, dtype=torch.float64)
hess: tensor(0.0072)
grad: tensor(0.7371, dtype=torch.float64)
hess: tensor(19.4635)
	accuracy:90.234375
	loss: 0.000055
grad: tensor(0.0008, dtype=torch.float64)
hess: tensor(0.0347)
grad: tensor(1.5698, dtype=torch.float64)
hess: tensor(47.2127)
grad: tensor(0.6321, dtype=torch.float64)
hess: tensor(22.0843)
grad: tensor(0.5441, dtype=torch.float64)
hess: tensor(16.5930)
grad: tensor(0.0025, dtype=torch.float64)
hess: tensor(0.0980)
grad: tensor(0.0132, dtype=torch.float64)
hess: tensor(0.3621)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(9.0481e-08, dtype=torch.float64)
hess: tensor(5.4335e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.8035, dtype=torch.float64)
hess: tensor(21.0551)
grad: tensor(1.8031, dtype=torch.float64)
hess: tensor(51.0631)
grad: tensor(0.0002, dtype=torch.float64)
hess: tensor(0.0069)
grad: tensor(0.0002, dtype=torch.float64)
hess: tensor(0.0069)
grad: tensor(0.0203, dtype=torch.float64)
hess: tensor(0.3564)
grad: tensor(0.0016, dtype=torch.float64)
hess: tensor(0.0493)
	accuracy:90.234375
	loss: 0.000052
grad: tensor(0.1633, dtype=torch.float64)
hess: tensor(6.5456)
grad: tensor(0.0423, dtype=torch.float64)
hess: tensor(1.5572)
grad: tensor(1.0591, dtype=torch.float64)
hess: tensor(30.1710)
grad: tensor(0.0023, dtype=torch.float64)
hess: tensor(0.0910)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(7.7600e-08, dtype=torch.float64)
hess: tensor(4.6790e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(2.0476, dtype=torch.float64)
hess: tensor(54.3258)
grad: tensor(9.6674e-06, dtype=torch.float64)
hess: tensor(0.0004)
grad: tensor(0.0001, dtype=torch.float64)
hess: tensor(0.0047)
grad: tensor(0.0193, dtype=torch.float64)
hess: tensor(0.3370)
grad: tensor(0.0015, dtype=torch.float64)
hess: tensor(0.0467)
	accuracy:90.234375
	loss: 0.000048
grad: tensor(0.1538, dtype=torch.float64)
hess: tensor(6.1976)
grad: tensor(0.0417, dtype=torch.float64)
hess: tensor(1.5451)
grad: tensor(1.0426, dtype=torch.float64)
hess: tensor(29.9202)
grad: tensor(0.0022, dtype=torch.float64)
hess: tensor(0.0872)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(6.9753e-08, dtype=torch.float64)
hess: tensor(4.2216e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(2.0090, dtype=torch.float64)
hess: tensor(53.8166)
grad: tensor(9.3021e-06, dtype=torch.float64)
hess: tensor(0.0004)
grad: tensor(9.6699e-05, dtype=torch.float64)
hess: tensor(0.0043)
grad: tensor(0.0185, dtype=torch.float64)
hess: tensor(0.3242)
grad: tensor(0.0014, dtype=torch.float64)
hess: tensor(0.0430)
	accuracy:90.234375
	loss: 0.000044
grad: tensor(0.1458, dtype=torch.float64)
hess: tensor(5.9122)
grad: tensor(0.0408, dtype=torch.float64)
hess: tensor(1.5205)
grad: tensor(0.9992, dtype=torch.float64)
hess: tensor(28.9793)
grad: tensor(0.0021, dtype=torch.float64)
hess: tensor(0.0810)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(6.1429e-08, dtype=torch.float64)
hess: tensor(3.7311e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(1.9974, dtype=torch.float64)
hess: tensor(53.8843)
grad: tensor(6.2239e-06, dtype=torch.float64)
hess: tensor(0.0003)
grad: tensor(9.2280e-05, dtype=torch.float64)
hess: tensor(0.0042)
grad: tensor(0.0178, dtype=torch.float64)
hess: tensor(0.3135)
grad: tensor(0.0013, dtype=torch.float64)
hess: tensor(0.0408)
	accuracy:91.015625
	loss: 0.000041
grad: tensor(0.1409, dtype=torch.float64)
hess: tensor(5.7419)
grad: tensor(0.0383, dtype=torch.float64)
hess: tensor(1.4334)
grad: tensor(0.9772, dtype=torch.float64)
hess: tensor(28.5642)
grad: tensor(0.0019, dtype=torch.float64)
hess: tensor(0.0749)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(5.5369e-08, dtype=torch.float64)
hess: tensor(3.3752e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(1.9510, dtype=torch.float64)
hess: tensor(53.1335)
grad: tensor(5.9149e-06, dtype=torch.float64)
hess: tensor(0.0003)
grad: tensor(8.6312e-05, dtype=torch.float64)
hess: tensor(0.0039)
grad: tensor(0.0171, dtype=torch.float64)
hess: tensor(0.3017)
grad: tensor(0.0012, dtype=torch.float64)
hess: tensor(0.0384)
	accuracy:91.015625
	loss: 0.000038
grad: tensor(0.1323, dtype=torch.float64)
hess: tensor(5.4184)
grad: tensor(0.0373, dtype=torch.float64)
hess: tensor(1.4008)
grad: tensor(0.9363, dtype=torch.float64)
hess: tensor(27.6405)
grad: tensor(0.0018, dtype=torch.float64)
hess: tensor(0.0710)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(4.6800e-08, dtype=torch.float64)
hess: tensor(2.8627e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(1.9612, dtype=torch.float64)
hess: tensor(53.5984)
grad: tensor(5.6632e-06, dtype=torch.float64)
hess: tensor(0.0003)
grad: tensor(7.8476e-05, dtype=torch.float64)
hess: tensor(0.0036)
grad: tensor(0.0164, dtype=torch.float64)
hess: tensor(0.2911)
grad: tensor(0.0011, dtype=torch.float64)
hess: tensor(0.0363)
	accuracy:91.015625
	loss: 0.000036
grad: tensor(0.1251, dtype=torch.float64)
hess: tensor(5.1491)
grad: tensor(0.0355, dtype=torch.float64)
hess: tensor(1.3396)
grad: tensor(0.8926, dtype=torch.float64)
hess: tensor(26.6223)
grad: tensor(0.0017, dtype=torch.float64)
hess: tensor(0.0665)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(4.2548e-08, dtype=torch.float64)
hess: tensor(2.6115e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(1.9627, dtype=torch.float64)
hess: tensor(53.9072)
grad: tensor(5.3625e-06, dtype=torch.float64)
hess: tensor(0.0002)
grad: tensor(7.5313e-05, dtype=torch.float64)
hess: tensor(0.0034)
grad: tensor(0.0158, dtype=torch.float64)
hess: tensor(0.2814)
grad: tensor(0.0011, dtype=torch.float64)
hess: tensor(0.0344)
	accuracy:91.015625
	loss: 0.000034
grad: tensor(0.1208, dtype=torch.float64)
hess: tensor(4.9917)
grad: tensor(0.0356, dtype=torch.float64)
hess: tensor(1.3503)
grad: tensor(0.9173, dtype=torch.float64)
hess: tensor(27.4039)
grad: tensor(0.0016, dtype=torch.float64)
hess: tensor(0.0621)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(3.7257e-08, dtype=torch.float64)
hess: tensor(2.2943e-06)
	accuracy:90.234375
	loss: 0.000000
grad: tensor(1.9437, dtype=torch.float64)
hess: tensor(53.7163)
grad: tensor(5.1471e-06, dtype=torch.float64)
hess: tensor(0.0002)
grad: tensor(7.3439e-05, dtype=torch.float64)
hess: tensor(0.0033)
grad: tensor(0.0151, dtype=torch.float64)
hess: tensor(0.2714)
grad: tensor(0.0010, dtype=torch.float64)
hess: tensor(0.0327)
	accuracy:91.015625
	loss: 0.000032
grad: tensor(0.1139, dtype=torch.float64)
hess: tensor(4.7269)
grad: tensor(0.0327, dtype=torch.float64)
hess: tensor(1.2475)
grad: tensor(0.8736, dtype=torch.float64)
hess: tensor(26.3597)
grad: tensor(0.0015, dtype=torch.float64)
hess: tensor(0.0581)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(3.3076e-08, dtype=torch.float64)
hess: tensor(2.0434e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(1.9410, dtype=torch.float64)
hess: tensor(53.8969)
grad: tensor(4.9828e-06, dtype=torch.float64)
hess: tensor(0.0002)
grad: tensor(6.4146e-05, dtype=torch.float64)
hess: tensor(0.0030)
grad: tensor(0.0146, dtype=torch.float64)
hess: tensor(0.2622)
grad: tensor(0.0010, dtype=torch.float64)
hess: tensor(0.0310)
	accuracy:91.40625
	loss: 0.000030
grad: tensor(0.1091, dtype=torch.float64)
hess: tensor(4.5496)
grad: tensor(0.0315, dtype=torch.float64)
hess: tensor(1.2069)
grad: tensor(0.8302, dtype=torch.float64)
hess: tensor(25.2943)
grad: tensor(0.0014, dtype=torch.float64)
hess: tensor(0.0555)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(2.8865e-08, dtype=torch.float64)
hess: tensor(1.7888e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(1.9271, dtype=torch.float64)
hess: tensor(53.8060)
grad: tensor(4.8667e-06, dtype=torch.float64)
hess: tensor(0.0002)
grad: tensor(5.9089e-05, dtype=torch.float64)
hess: tensor(0.0028)
grad: tensor(0.0140, dtype=torch.float64)
hess: tensor(0.2522)
grad: tensor(0.0009, dtype=torch.float64)
hess: tensor(0.0296)
	accuracy:91.40625
	loss: 0.000028
grad: tensor(0.1033, dtype=torch.float64)
hess: tensor(4.3266)
grad: tensor(0.0304, dtype=torch.float64)
hess: tensor(1.1704)
grad: tensor(0.8250, dtype=torch.float64)
hess: tensor(25.2613)
grad: tensor(0.0013, dtype=torch.float64)
hess: tensor(0.0527)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(2.5671e-08, dtype=torch.float64)
hess: tensor(1.5956e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(1.9206, dtype=torch.float64)
hess: tensor(53.8873)
grad: tensor(4.7188e-06, dtype=torch.float64)
hess: tensor(0.0002)
grad: tensor(5.4868e-05, dtype=torch.float64)
hess: tensor(0.0026)
grad: tensor(0.0015, dtype=torch.float64)
hess: tensor(0.0499)
grad: tensor(0.0014, dtype=torch.float64)
hess: tensor(0.0646)
	accuracy:91.40625
	loss: 0.000026
grad: tensor(2.0543, dtype=torch.float64)
hess: tensor(45.1217)
grad: tensor(0.0085, dtype=torch.float64)
hess: tensor(0.4420)
grad: tensor(0.1599, dtype=torch.float64)
hess: tensor(7.4889)
grad: tensor(2.8254, dtype=torch.float64)
hess: tensor(73.7460)
grad: tensor(0.0081, dtype=torch.float64)
hess: tensor(0.2312)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(2.2614e-08, dtype=torch.float64)
hess: tensor(1.4098e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(5.7793e-05, dtype=torch.float64)
hess: tensor(0.0023)
grad: tensor(7.4860, dtype=torch.float64)
hess: tensor(124.7300)
grad: tensor(0.0015, dtype=torch.float64)
hess: tensor(0.0492)
grad: tensor(0.0013, dtype=torch.float64)
hess: tensor(0.0604)
	accuracy:91.40625
	loss: 0.000025
grad: tensor(2.0227, dtype=torch.float64)
hess: tensor(44.7823)
grad: tensor(0.0081, dtype=torch.float64)
hess: tensor(0.4243)
grad: tensor(0.1585, dtype=torch.float64)
hess: tensor(7.4443)
grad: tensor(2.8474, dtype=torch.float64)
hess: tensor(74.3806)
grad: tensor(0.0078, dtype=torch.float64)
hess: tensor(0.2234)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(2.0660e-08, dtype=torch.float64)
hess: tensor(1.2913e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(5.3327e-05, dtype=torch.float64)
hess: tensor(0.0021)
grad: tensor(7.3674, dtype=torch.float64)
hess: tensor(124.2744)
grad: tensor(0.0014, dtype=torch.float64)
hess: tensor(0.0446)
grad: tensor(0.0012, dtype=torch.float64)
hess: tensor(0.0553)
	accuracy:91.40625
	loss: 0.000024
grad: tensor(1.9813, dtype=torch.float64)
hess: tensor(44.2514)
grad: tensor(0.0079, dtype=torch.float64)
hess: tensor(0.4112)
grad: tensor(0.1592, dtype=torch.float64)
hess: tensor(7.5251)
grad: tensor(2.6425, dtype=torch.float64)
hess: tensor(70.8877)
grad: tensor(0.0075, dtype=torch.float64)
hess: tensor(0.2169)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.7951e-08, dtype=torch.float64)
hess: tensor(1.1252e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(4.8991e-05, dtype=torch.float64)
hess: tensor(0.0019)
grad: tensor(7.1017, dtype=torch.float64)
hess: tensor(123.0506)
grad: tensor(0.0013, dtype=torch.float64)
hess: tensor(0.0420)
grad: tensor(0.0011, dtype=torch.float64)
hess: tensor(0.0519)
	accuracy:91.40625
	loss: 0.000023
grad: tensor(1.9458, dtype=torch.float64)
hess: tensor(43.8184)
grad: tensor(0.0075, dtype=torch.float64)
hess: tensor(0.3941)
grad: tensor(0.1444, dtype=torch.float64)
hess: tensor(6.8602)
grad: tensor(2.6958, dtype=torch.float64)
hess: tensor(72.1265)
grad: tensor(0.0071, dtype=torch.float64)
hess: tensor(0.2051)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.6481e-08, dtype=torch.float64)
hess: tensor(1.0359e-06)
	accuracy:90.625
	loss: 0.000000
grad: tensor(4.4640e-05, dtype=torch.float64)
hess: tensor(0.0018)
grad: tensor(6.9666, dtype=torch.float64)
hess: tensor(122.4483)
grad: tensor(0.0012, dtype=torch.float64)
hess: tensor(0.0395)
grad: tensor(0.0011, dtype=torch.float64)
hess: tensor(0.0483)
	accuracy:91.015625
	loss: 0.000021
grad: tensor(1.9135, dtype=torch.float64)
hess: tensor(43.4242)
grad: tensor(0.0072, dtype=torch.float64)
hess: tensor(0.3792)
grad: tensor(0.1470, dtype=torch.float64)
hess: tensor(7.0137)
grad: tensor(2.5890, dtype=torch.float64)
hess: tensor(70.3464)
grad: tensor(0.0068, dtype=torch.float64)
hess: tensor(0.1970)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.4139e-08, dtype=torch.float64)
hess: tensor(8.9112e-07)
	accuracy:90.625
	loss: 0.000000
grad: tensor(3.7900e-05, dtype=torch.float64)
hess: tensor(0.0015)
grad: tensor(6.7513, dtype=torch.float64)
hess: tensor(121.3553)
grad: tensor(0.0011, dtype=torch.float64)
hess: tensor(0.0376)
grad: tensor(0.0010, dtype=torch.float64)
hess: tensor(0.0457)
	accuracy:91.40625
	loss: 0.000020
grad: tensor(1.8803, dtype=torch.float64)
hess: tensor(43.0088)
grad: tensor(0.0069, dtype=torch.float64)
hess: tensor(0.3626)
grad: tensor(0.1403, dtype=torch.float64)
hess: tensor(6.7250)
grad: tensor(2.6447, dtype=torch.float64)
hess: tensor(71.6252)
grad: tensor(0.0065, dtype=torch.float64)
hess: tensor(0.1898)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.3269e-08, dtype=torch.float64)
hess: tensor(8.3832e-07)
	accuracy:90.625
	loss: 0.000000
hess: tensor(120.9443)
grad: tensor(0.0010, dtype=torch.float64)
hess: tensor(0.0335)
grad: tensor(0.0009, dtype=torch.float64)
hess: tensor(0.0397)
	accuracy:91.40625
	loss: 0.000018
grad: tensor(1.8178, dtype=torch.float64)
hess: tensor(42.1986)
grad: tensor(0.0066, dtype=torch.float64)
hess: tensor(0.3487)
grad: tensor(0.1238, dtype=torch.float64)
hess: tensor(5.9824)
grad: tensor(2.5001, dtype=torch.float64)
hess: tensor(69.2721)
grad: tensor(0.0059, dtype=torch.float64)
hess: tensor(0.1727)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(1.0511e-08, dtype=torch.float64)
hess: tensor(6.6741e-07)
	accuracy:90.625
	loss: 0.000000
grad: tensor(3.3411e-05, dtype=torch.float64)
hess: tensor(0.0013)
grad: tensor(6.3578, dtype=torch.float64)
hess: tensor(119.3409)
grad: tensor(0.0010, dtype=torch.float64)
hess: tensor(0.0332)
grad: tensor(0.0008, dtype=torch.float64)
hess: tensor(0.0373)
	accuracy:91.40625
	loss: 0.000018
grad: tensor(1.7968, dtype=torch.float64)
hess: tensor(41.9797)
grad: tensor(0.0064, dtype=torch.float64)
hess: tensor(0.3408)
grad: tensor(0.1261, dtype=torch.float64)
hess: tensor(6.1236)
grad: tensor(2.4772, dtype=torch.float64)
hess: tensor(69.0255)
grad: tensor(0.0057, dtype=torch.float64)
hess: tensor(0.1670)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(9.2750e-09, dtype=torch.float64)
hess: tensor(5.9034e-07)
	accuracy:90.625
	loss: 0.000000
grad: tensor(3.2472e-05, dtype=torch.float64)
hess: tensor(0.0012)
grad: tensor(6.2692, dtype=torch.float64)
hess: tensor(118.8811)
grad: tensor(0.0009, dtype=torch.float64)
hess: tensor(0.0300)
grad: tensor(0.0007, dtype=torch.float64)
hess: tensor(0.0349)
	accuracy:91.40625
	loss: 0.000017
grad: tensor(1.7644, dtype=torch.float64)
hess: tensor(41.5236)
grad: tensor(0.0061, dtype=torch.float64)
hess: tensor(0.3243)
grad: tensor(0.1162, dtype=torch.float64)
hess: tensor(5.6633)
grad: tensor(2.4163, dtype=torch.float64)
hess: tensor(68.0105)
grad: tensor(0.0054, dtype=torch.float64)
hess: tensor(0.1587)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(8.4059e-09, dtype=torch.float64)
hess: tensor(5.3629e-07)
	accuracy:90.625
	loss: 0.000000
grad: tensor(2.5943e-05, dtype=torch.float64)
hess: tensor(0.0011)
grad: tensor(5.9921, dtype=torch.float64)
hess: tensor(117.0862)
grad: tensor(0.0024, dtype=torch.float64)
hess: tensor(0.1125)
grad: tensor(8.6440e-05, dtype=torch.float64)
hess: tensor(0.0027)
grad: tensor(0.0229, dtype=torch.float64)
hess: tensor(0.7327)
	accuracy:91.40625
	loss: 0.000016
grad: tensor(0.0641, dtype=torch.float64)
hess: tensor(2.7968)
grad: tensor(0.1868, dtype=torch.float64)
hess: tensor(6.5929)
grad: tensor(0.3645, dtype=torch.float64)
hess: tensor(14.3366)
grad: tensor(0.4484, dtype=torch.float64)
hess: tensor(12.9771)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(7.4177e-09, dtype=torch.float64)
hess: tensor(4.7438e-07)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.0774, dtype=torch.float64)
hess: tensor(2.6464)
grad: tensor(0.0023, dtype=torch.float64)
hess: tensor(0.1099)
grad: tensor(8.3308e-05, dtype=torch.float64)
hess: tensor(0.0026)
grad: tensor(0.0220, dtype=torch.float64)
hess: tensor(0.7077)
	accuracy:91.40625
	loss: 0.000015
grad: tensor(0.0612, dtype=torch.float64)
hess: tensor(2.6800)
grad: tensor(0.1951, dtype=torch.float64)
hess: tensor(6.9013)
grad: tensor(0.3775, dtype=torch.float64)
hess: tensor(14.8908)
grad: tensor(0.4289, dtype=torch.float64)
hess: tensor(12.4675)


  0%|          | 0/2000 [00:00<?, ?batch/s]

grad: tensor(6.8866e-09, dtype=torch.float64)
hess: tensor(4.4142e-07)
	accuracy:90.625
	loss: 0.000000
grad: tensor(0.0755, dtype=torch.float64)
hess: tensor(2.5933)
grad: tensor(0.0023, dtype=torch.float64)
hess: tensor(0.1104)
grad: tensor(7.7660e-05, dtype=torch.float64)
hess: tensor(0.0024)
grad: tensor(0.0215, dtype=torch.float64)
hess: tensor(0.6921)
	accuracy:91.40625
	loss: 0.000015
grad: tensor(0.0586, dtype=torch.float64)
hess: tensor(2.5735)
grad: tensor(0.1893, dtype=torch.float64)
hess: tensor(6.7239)
grad: tensor(0.3749, dtype=torch.float64)
hess: tensor(14.8610)
grad: tensor(0.4219, dtype=torch.float64)


KeyboardInterrupt: 