The objective of this notebook is to do some simple hyperparameter optimization with a callback to Optuna

In [2]:
import os
import neptune.new as neptune
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import LambdaLR, ExponentialLR
from torch import Generator
import math
from collections import OrderedDict
import numpy as np
from itertools import chain
from tqdm import tqdm
import neptune.new.integrations.optuna as optuna_utils
import optuna

# Set the Neptune Logger variables.
# Both values are read from the environment so that no credential is stored in the notebook.
NEPTUNE_API_TOKEN = os.environ.get('NEPTUNE_API_TOKEN')

# Build "<alias>/2to2scattering" only when the alias is available. Concatenating directly with
# os.environ.get(...) raised an opaque TypeError (None + str) on a fresh kernel without the variable.
# When left as None, neptune.init_run falls back to its own NEPTUNE_PROJECT environment variable.
_neptune_alias = os.environ.get('NEPTUNE_ALIAS')
NEPTUNE_PROJECT = _neptune_alias + '/2to2scattering' if _neptune_alias else None

In [3]:
# Pick the compute backend once for the whole notebook: CUDA first, then Apple MPS, else CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available() and torch.backends.mps.is_built()
    else 'cpu'
)

##### Define the networks

In [4]:
class PhiNet(nn.Module):
    """
    Feed-forward network mapping a scalar input to a phase value.

    The model is a stack of ``nn.Linear`` layers (the first takes one input feature, the last
    produces one output feature) separated by the chosen activation and closed by ``final_layer``.
    Unless the final layer is 'Identity', ``forward`` rescales the bounded output affinely.
    """
    def __init__(self, ffnlayers, activation, fund_dom, device='cpu', bound_phi = math.pi, final_layer='Sigmoid'):
        """
        Initialize the network with the number of FC layers, the activation function, and the device to use

        Args:
            ffnlayers: non-empty sequence giving the width of each hidden layer.
            activation: 'ReLU' or 'Tanh', applied after every hidden layer.
            fund_dom: if True the output interval has width pi instead of 2*pi (see ``forward``).
            device: stored as ``self.device`` for backward compatibility; ``forward`` no longer needs it.
            bound_phi: upper end of the output interval (halved when ``fund_dom`` is True).
            final_layer: 'Sigmoid', 'Tanh' or 'Identity'.

        Raises:
            ValueError: for an unsupported activation / final layer or an empty ``ffnlayers``.
        """
        super(PhiNet, self).__init__()

        self.device = device
        self.fund_dom = fund_dom
        self.bound_phi = bound_phi

        supported_activations = {'ReLU': nn.ReLU, 'Tanh': nn.Tanh}
        if activation not in supported_activations:
            raise ValueError("Activation function '{}' not supported. Supported activation functions are 'ReLU' and 'Tanh'".format(activation))
        self.activation = supported_activations[activation]()

        supported_final_layers = {'Sigmoid': nn.Sigmoid, 'Tanh': nn.Tanh, 'Identity': nn.Identity}
        if final_layer not in supported_final_layers:
            raise ValueError("Final layer '{}' not supported. Supported final layers are 'Sigmoid', 'Tanh' and 'Identity'".format(final_layer))
        self.final_layer = supported_final_layers[final_layer]()
        self.final_l = final_layer

        # Fail early with a clear message instead of an IndexError further down
        self.layer_sizes = ffnlayers
        if len(self.layer_sizes) == 0:
            raise ValueError("ffnlayers must contain at least one hidden layer size")

        # Create a list of layers from the parameters given. Add the appropriate activation function.
        # Layers are created in order so that the random initialization is unchanged for a given seed.
        layer_list = []
        in_features = 1
        for i, layer_size in enumerate(self.layer_sizes, start=1):
            layer_list.append(('layer_%d' % i, nn.Linear(in_features, layer_size)))
            layer_list.append(('activation_%d' % i, self.activation))
            in_features = layer_size

        layer_list.append(('layer_%d' % (len(self.layer_sizes)+1), nn.Linear(in_features, 1)))

        # The last layer is sigmoid to constrain outputs between 0 and 1 or tanh for [-1,1]
        layer_list.append(('final_layer', self.final_layer))

        self.model = nn.Sequential(OrderedDict(layer_list))


    def forward(self, x):
        """
        Network forward pass.

        With the 'Identity' final layer the raw network output is returned. Otherwise the bounded
        output is mapped affinely onto (bound_phi - 2*pi, bound_phi), or onto
        (bound_phi/2 - pi, bound_phi/2) when ``fund_dom`` is True. For the default
        bound_phi = pi this gives [-pi, pi] and [-pi/2, pi/2] respectively.
        """
        x = self.model(x)
        if self.final_l == 'Identity':
            return x

        # Sigmoid spans an interval of width 1, Tanh of width 2: compensate so both cover the same range
        const_mult = 2 if self.final_l == 'Sigmoid' else 1
        half = 0.5 if self.fund_dom else 1.0

        # Plain Python scalars follow the device/dtype of x. Building the constants with
        # torch.tensor(..., device=self.device) failed as soon as the module was moved with .to().
        scale = half * const_mult * math.pi
        shift = half * (const_mult * math.pi - self.bound_phi)
        return scale * x - shift
    
    
class ModuleNetManual(nn.Module):
    """
    Parameter-free module returning a fixed, hand-written modulus (differential cross section).

    The optional ``scan_param`` is an overall divisor; when it is None no rescaling is applied.
    """
    def __init__(self, scan_param=None):
        super().__init__()
        self.scan_param = scan_param

    def forward(self, x):
        """
        Evaluate (-x^3 + x^2 + x + 1) / scan_param elementwise on the input tensor.
        """
        scale = 1 if self.scan_param is None else self.scan_param
        numerator = -x.pow(3) + x.pow(2) + x + 1
        return numerator / scale
        #return torch.abs(3*x + 1)/denom
        #return torch.pow(x,2)/denom

class PhaseNetSolver():
    """
    Main class for crafting the simulation.

    Holds one or several phase networks together with a fixed modulus network, and trains the
    phase networks so that, at randomly sampled z points,
        mod(z) * sin(phi(z))
    matches a double trapezoidal integral over (z1, angle) of
        mod(z1) * mod(z2) * cos(phi(z1) - phi(z2)) / (4*pi).
    With two phase networks an additional repulsive term can push the two solutions apart.
    """
    def __init__(self, params_simu, phi_net, mod_net, optimizer, random_generator, scheduler, device='cpu'):
        """
        Initialize the simulation with the networks, the loss function and the optimizer

        Args:
            params_simu: dict providing 'epochs_num', 'batch_size', 'integral_points', 'scaled_loss',
                'method_int', 'p_value', 'lambda_repulsive' and 'loss'.
            phi_net: a single phase network, or a list of phase networks.
            mod_net: module returning the modulus at the given points.
            optimizer: optimizer already bound to the phase network parameters.
            random_generator: ``torch.Generator`` used to draw the training points.
            scheduler: learning rate scheduler, or None.
            device: device on which the networks and the sampled points live.
        """

        self.epochs = params_simu['epochs_num']
        self.batch_size = params_simu['batch_size']
        self.eval_points_num = params_simu['integral_points']
        self.scaled_loss = params_simu['scaled_loss']
        self.method_int = params_simu['method_int']
        self.p_value = params_simu['p_value']
        self.lambda_repulsive = params_simu['lambda_repulsive']
        self.loss_func = params_simu['loss']

        if isinstance(phi_net, list):
            self.multi_nets = [net.to(device) for net in phi_net]
            self.phi_net = None
            self.multiple_nets = True
        else:
            self.multi_nets = None
            self.phi_net = phi_net.to(device)
            self.multiple_nets = False
        self.mod_net = mod_net.to(device)
        self.device = device
        self.rand_gen = random_generator

        self.optimizer = optimizer
        self.scheduler = scheduler

    def _require_two_nets(self):
        """
        Raise NotImplementedError unless exactly two phase networks are registered.
        (The previous ``not multiple and len != 2`` test either crashed on ``len(None)`` or never fired.)
        """
        if not self.multiple_nets or len(self.multi_nets) != 2:
            raise NotImplementedError("The repulsive losses are only implemented for exactly two phase networks")

    def _pointwise_term(self, zsamples):
        """
        Evaluate mod(z) * sin(phi(z)) with the currently selected phase network.
        """
        return self.mod_net(zsamples)*torch.sin(self.phi_net(zsamples.view(-1,1)).squeeze())

    @staticmethod
    def _circle_embedding(net, zsamples):
        """
        Evaluate ``net`` once at the z points and return (cos(phi), sin(phi)).
        """
        phase = net(zsamples.view(-1,1)).squeeze()
        return torch.cos(phase), torch.sin(phase)

    def z2(self, z, z1, phi):
        """
        Compute the z2 parameter (formula in motivation):
            z2 = z*z1 + sqrt(1 - z^2) * sqrt(1 - z1^2) * cos(phi)
        Place the batch size on first dim, z1 on the second and phi on the third
        """
        term1 = z.unsqueeze(dim=1) * z1.unsqueeze(dim=0)
        term2 = torch.sqrt(1 - torch.square(z)).unsqueeze(dim=1) * torch.sqrt(1 - torch.square(z1)).unsqueeze(dim=0)
        return term1.unsqueeze(dim=2) + term2.unsqueeze(dim=2)*torch.cos(phi.unsqueeze(dim=0)).view(1,1,-1)

    def integral_approximator(self, z, method='trapz'):
        """
        Approximate the value of the RHS of the integral equation.
        Compute the grids of z1 and phi points for given z values and integrate over the grids

        Returns:
            One integral value per entry of ``z``.

        Raises:
            NotImplementedError: if ``method`` is not 'trapz' (previously an UnboundLocalError).
        """
        if method != 'trapz':
            raise NotImplementedError("Integration method '{}' is not implemented. Only 'trapz' is supported".format(method))

        # Linear partition of the space
        z1_points = torch.linspace(-1,1,steps=self.eval_points_num, device=self.device, requires_grad=False)
        phi_points = torch.linspace(0, 2*math.pi, steps=self.eval_points_num, device=self.device, requires_grad=False)
        dx_z1 = 2/(self.eval_points_num-1)
        dx_phi = 2*math.pi/(self.eval_points_num-1)

        # The (batch, z1, angle) grid of z2 values is needed twice: compute it once
        z2_points = self.z2(z, z1_points, phi_points)

        modulus_product = self.mod_net(z1_points).view(1,-1,1)*self.mod_net(z2_points)
        phase_difference = self.phi_net(z1_points.view(1,-1,1)) - self.phi_net(z2_points.unsqueeze(dim=-1)).squeeze()
        grid_points = modulus_product*torch.cos(phase_difference)

        # Simple trapezoid exists in PyTorch: integrate over the angle first, then over z1
        return torch.trapezoid(torch.trapezoid(grid_points, dx=dx_phi), dx=dx_z1)/(4*math.pi)

    def loss_function(self, zsamples):
        """
        Compute a loss based on the average residuals for the integral equation

        Raises:
            NotImplementedError: if the configured loss is neither 'Huber' nor 'MSE'.
        """

        # For each value of z we estimate the integral equation
        pointwise = self._pointwise_term(zsamples)
        model_input = self.integral_approximator(zsamples, method=self.method_int)

        # If scaled then all targets should be 1 irrespective of the z point considered
        if self.scaled_loss:
            target = torch.ones(zsamples.size(), device=self.device)
            model_input = model_input / pointwise
        else:
            target = pointwise

        # We have different choices of losses. Huber is more stable to outliers.
        if self.loss_func =='Huber':
            return F.huber_loss(model_input, target, delta=0.1)
        if self.loss_func == 'MSE':
            return F.mse_loss(model_input, target)
        raise NotImplementedError("Loss '{}' is not implemented. Supported losses are 'Huber' and 'MSE'".format(self.loss_func))

    def multi_loss_function(self, zsamples, logger=None):
        """
        Sum the integral equation loss over all the registered phase networks.
        Each individual loss is logged under 'metrics/solution_loss_<i>' if a logger is given.

        Note: ``self.phi_net`` is rebound to each network in turn and is left pointing at the last one.
        """

        if not self.multiple_nets:
            raise NotImplementedError("multi_loss_function requires a list of phase networks")

        total_loss = 0
        for i, net in enumerate(self.multi_nets):
            self.phi_net = net
            individual_loss = self.loss_function(zsamples)

            if logger is not None:
                # Log a plain float so that the logger never keeps the autograd graph alive
                logger['metrics/solution_loss_{}'.format(i)].log(individual_loss.item())
            total_loss = total_loss + individual_loss

        return total_loss

    def dual_loss(self, zsamples):
        """
        Repulsive term between the two phase networks: MSE between their (cos(phi), sin(phi))
        values raised to the power -p_value, so for a positive p_value it grows as the two
        solutions get closer.
        """

        self._require_two_nets()
        target = torch.stack(self._circle_embedding(self.multi_nets[0], zsamples))
        model_input = torch.stack(self._circle_embedding(self.multi_nets[1], zsamples))

        # Maybe MSE makes more sense here since we are just comparing functions in the first place
        return torch.pow(F.mse_loss(model_input, target), -self.p_value)

    def dual_loss_ambiguity(self, zsamples):
        """
        Same as ``dual_loss`` but the second network is also compared after flipping the sign of
        its cosine, i.e. both (cos, sin) and (-cos, sin) are repelled from the first network.
        """

        self._require_two_nets()

        target = torch.stack(self._circle_embedding(self.multi_nets[0], zsamples))
        cos_other, sin_other = self._circle_embedding(self.multi_nets[1], zsamples)
        model_input1 = torch.stack((cos_other, sin_other))
        model_input2 = torch.stack((-cos_other, sin_other))

        # Maybe MSE makes more sense here since we are just comparing functions in the first place
        loss_out1 = torch.pow(F.mse_loss(model_input1, target), -self.p_value)
        loss_out2 = torch.pow(F.mse_loss(model_input2, target), -self.p_value)

        return loss_out1 + loss_out2

    def complete_loss(self, zsamples, dual_active, logger=None):
        """
        Full training loss.
        Single network: the integral equation loss. Two networks: the summed integral equation
        losses plus ``lambda_repulsive`` times the repulsive term (only when ``dual_active``).
        """

        if not self.multiple_nets:
            return self.loss_function(zsamples)

        if len(self.multi_nets)!=2:
            raise NotImplementedError("complete_loss is only implemented for one or two phase networks")

        repulsive_loss = self.dual_loss_ambiguity(zsamples) if dual_active else 0.0

        if logger is not None:
            logger['metrics/repulsive_loss'].log(float(repulsive_loss))

        return self.multi_loss_function(zsamples, logger=logger) + self.lambda_repulsive * repulsive_loss

    def point_loss(self, zsamples):
        """
        If we want to know the residuals at each given z point. This is useful for evaluation purposes
        """
        pointwise = self._pointwise_term(zsamples)
        model_input = self.integral_approximator(zsamples)

        if self.scaled_loss:
            target = torch.ones(zsamples.size(), device=self.device)
            model_input = model_input / pointwise
        else:
            target = pointwise
        return torch.square(model_input - target)

    def train(self, neptune_run):
        """
        Main training loop.
        Each epoch is a single optimization step over a unique batch.
        A batch is composed of a number of randomly selected z points between -1 and 1

        Args:
            neptune_run: Neptune run used for logging the metrics, or None to disable logging.
        """

        tq_iterator = tqdm(range(self.epochs), unit='epoch')
        trained_nets = self.multi_nets if self.multiple_nets else [self.phi_net]
        use_lbfgs = isinstance(self.optimizer, torch.optim.LBFGS)

        for epoch_num in tq_iterator:

            # The repulsive term is only switched on between 10% and 20% of the training
            dual_active = 0.1*self.epochs <= epoch_num <= 0.2*self.epochs

            # Set the network in train mode
            for net in trained_nets:
                net.train()

            # Always draw on the generator's own device and then move the points. The previous
            # `self.device is not torch.device('cuda')` identity test was always True, so this is
            # the path that was effectively taken (and the one that is reproducible across backends).
            zpoints = torch.rand(self.batch_size, requires_grad=False, generator=self.rand_gen, device=self.rand_gen.device).to(self.device)*2 - 1

            if neptune_run is not None:
                neptune_run['metrics/learning_rate'].log(self.optimizer.param_groups[0]["lr"])

            # Zero the grads, get the loss and backpropagate
            if use_lbfgs:

                def closure():
                    if torch.is_grad_enabled():
                        self.optimizer.zero_grad()
                    loss_comp = self.complete_loss(zpoints, dual_active, logger=neptune_run)
                    if loss_comp.requires_grad:
                        loss_comp.backward()
                    return loss_comp

                # LBFGS returns the loss of its first closure evaluation, which spares the extra
                # forward pass (and the duplicated metric logging) that was done beforehand
                loss = self.optimizer.step(closure)

            else:
                self.optimizer.zero_grad()
                loss = self.complete_loss(zpoints, dual_active, logger=neptune_run)
                loss.backward()
                self.optimizer.step()

            if self.scheduler is not None:
                self.scheduler.step()
            if neptune_run is not None:
                neptune_run['metrics/train_loss'].log(loss.item())

            tq_iterator.set_description('Train Epoch: {} ; tLoss: {:.6f}'.format(epoch_num,loss.item()))

def scheduler_rate(step, factor, size_param, warmup):
    """
    Learning rate multiplier with a warmup start followed by an inverse square root decay.

    The result is ``factor * size_param**-0.5 * min(step**-0.5, step * warmup**-1.5)``,
    where step 0 is treated as step 1 so that the negative power is always defined.
    """
    effective_step = step if step != 0 else 1
    decay_branch = effective_step**(-0.5)
    warmup_branch = effective_step*warmup ** (-1.5)
    return factor * size_param**(-0.5) * min(decay_branch, warmup_branch)

##### Define the training and the evaluation runs

In [5]:
from torchinfo import summary

def train_run(parameters, device, run_neptune=None, module_net=None, seed_num=42, optim_name='Adam'):
    """
    Function to call for launching the training run

    Args:
        parameters: dict of hyperparameters. Reads 'num_nets', 'layer_list', 'activation', 'fund_dom',
            'bound_phi', 'final_layer', 'learning_rate', 'lr_scheduler', 'scan_param' (only when
            ``module_net`` is None), 'beta1' / 'beta2' (only for Adam), plus the keys consumed by
            ``PhaseNetSolver``.
        device: device on which the networks are trained.
        run_neptune: Neptune run used for logging during training, or None to disable logging.
        module_net: module giving the modulus; defaults to ``ModuleNetManual``.
        seed_num: seed for the sampling generator and the global torch seeds.
        optim_name: 'Adam' or 'LBFGS'.

    Returns:
        The trained ``PhaseNetSolver``.

    Raises:
        ValueError: if ``optim_name`` is not supported (previously an UnboundLocalError).
    """
    # Reject unknown optimizers before touching any global random state
    if optim_name not in ('Adam', 'LBFGS'):
        raise ValueError("Optimizer '{}' not supported. Supported optimizers are 'Adam' and 'LBFGS'".format(optim_name))

    # Fix for the generator : For reproducibility between cpu and MPS
    random_gen_cpu = Generator(device='cpu')
    random_gen_cpu.manual_seed(seed_num)
    torch.manual_seed(seed_num)
    torch.cuda.manual_seed(seed_num)

    num_phi_nets = parameters['num_nets']

    def build_phi_net():
        return PhiNet(parameters['layer_list'], parameters['activation'], parameters['fund_dom'], device=device, bound_phi=parameters['bound_phi'], final_layer=parameters['final_layer'])

    # Define and register the networks. A list is only handed to the solver for several networks.
    if num_phi_nets>1:
        phinn = [build_phi_net() for _ in range(num_phi_nets)]
        trainable_params = [param for phinet in phinn for param in phinet.parameters()]
    else:
        phinn = build_phi_net()
        trainable_params = list(phinn.parameters())

    modnn = ModuleNetManual(scan_param=parameters['scan_param']) if module_net is None else module_net

    # Define and register the optimizer and scheduler
    if optim_name == 'Adam':
        optimizer = torch.optim.Adam(trainable_params, lr=parameters['learning_rate'], betas=(parameters['beta1'], parameters['beta2']))
    else:
        optimizer = torch.optim.LBFGS(trainable_params, lr=parameters['learning_rate'], max_iter=4)

    # Exponential decay of the learning rate at every epoch when requested
    lr_scheduler = ExponentialLR(optimizer=optimizer, gamma=0.995) if parameters['lr_scheduler'] else None

    # Create the simulation, train it and return the trained result
    netsolver = PhaseNetSolver(parameters, phinn, modnn, optimizer, random_gen_cpu, lr_scheduler, device=device)
    netsolver.train(run_neptune)

    return netsolver

In [6]:
def eval_run_loss(trained_solver, device, steps_eval=100):
    """
    At evaluation we verify if the integral equation is satisfied.
    For this we take a regular grid of z points between -1 and 1

    Args:
        trained_solver: solver exposing ``multiple_nets``, ``multi_nets`` / ``phi_net`` and the
            ``multi_loss_function`` / ``loss_function`` methods.
        device: device on which the evaluation points are created.
        steps_eval: number of evaluation points.

    Returns:
        The integral equation loss on the evaluation grid (summed over the networks if several).
    """

    # Make sure every network is in evaluation mode. Relying on `phi_net` alone only worked
    # because training leaves it pointing at the last of the networks.
    if trained_solver.multiple_nets:
        evaluated_nets = trained_solver.multi_nets
    else:
        evaluated_nets = [trained_solver.phi_net]
    for net in evaluated_nets:
        net.eval()

    with torch.no_grad():

        # Create the evaluation points
        zpointstest = torch.linspace(-1,1,steps=steps_eval, device=device)

        # A single-network solver has no multi-network loss: fall back on the plain loss
        if trained_solver.multiple_nets:
            eval_loss_base = trained_solver.multi_loss_function(zpointstest)
        else:
            eval_loss_base = trained_solver.loss_function(zpointstest)

    return eval_loss_base

In [7]:
class ModuleInterpCrichton(nn.Module):
    """
    Class for parametrizing the given differential cross section

    Parameter-free module: the output is a closed-form function of the input tensor and of the
    scalar ``lambda_param`` given at construction.
    """
    def __init__(self, lambda_param=1):
        # lambda_param: scalar entering the formula only through the arguments of sine functions
        super(ModuleInterpCrichton, self).__init__()
        self.lambda_param = lambda_param
        
    def forward(self, x):
        """
        Network forward pass is given by a specific function. Can iterate on the overall scale if we want to

        Returns the elementwise square root of (P(x) / 16), where P is a polynomial of degree 4 in x.
        Its coefficients are built from cos(59*pi/1200), sin(59*pi/600) and from sines of
        lambda_param times pi/9, 2*pi/9, 59*pi/400 and 59*pi/200.
        x must be a torch tensor since torch.square / torch.pow are applied to it.
        """
        # NOTE(review): the commented expression below looks like the modulus of a complex amplitude
        # from which the expanded real formula was derived - the equivalence is not checked here, confirm.
        #return torch.abs(np.exp(1j *659 * math.pi /1200) * np.cos(59*math.pi /1200) + 5*np.exp(1j*self.lambda_param*math.pi /9)*(3*torch.square(x)-1) * np.sin(self.lambda_param*math.pi /9)/2 - 3*np.exp(-59*1j*self.lambda_param*math.pi /400)* x * np.sin(59 * self.lambda_param*math.pi /400))
        return torch.sqrt((16*np.cos((59*math.pi)/1200.)**4 + 4*np.sin((59*math.pi)/600.)**2 + 100*(1 - 3*torch.square(x))**2*np.sin((math.pi*self.lambda_param)/9.)**4 + 96*x*np.cos((59*math.pi)/1200.)**2*np.sin((59*math.pi*self.lambda_param)/400.)**2 + 144*torch.square(x)*np.sin((59*math.pi*self.lambda_param)/400.)**4 + 80*(-1 + 3*torch.square(x))*np.sin((math.pi*self.lambda_param)/9.)**2*(np.cos((59*math.pi)/1200.)**2 + 3*x*np.sin((59*math.pi*self.lambda_param)/400.)**2) + 20*np.sin((59*math.pi)/600.)*np.sin((2*math.pi*self.lambda_param)/9.) - 60*torch.square(x)*np.sin((59*math.pi)/600.)*np.sin((2*math.pi*self.lambda_param)/9.) + 25*np.sin((2*math.pi*self.lambda_param)/9.)**2 - 150*torch.square(x)*np.sin((2*math.pi*self.lambda_param)/9.)**2 + 225*torch.pow(x,4)*np.sin((2*math.pi*self.lambda_param)/9.)**2 + 24*x*np.sin((59*math.pi)/600.)*np.sin((59*math.pi*self.lambda_param)/200.) + 60*x*np.sin((2*math.pi*self.lambda_param)/9.)*np.sin((59*math.pi*self.lambda_param)/200.) - 180*torch.pow(x,3)*np.sin((2*math.pi*self.lambda_param)/9.)*np.sin((59*math.pi*self.lambda_param)/200.) + 36*torch.square(x)*np.sin((59*math.pi*self.lambda_param)/200.)**2)/16.)

##### Define the Optuna Objective

In [40]:
def objective(trial):
    """
    Optuna objective for the broad search: train two phase networks on the Crichton modulus with
    Adam and return the evaluation loss to minimize.

    Searched: batch size, learning rate, Adam betas, number of integration points, activation,
    depth and width of the networks, final layer, and the repulsive loss settings
    (``p_value``, ``lambda_repulsive``). Everything else is fixed.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The evaluation loss as a Python float.
    """

    # `step` is given by keyword: recent Optuna versions deprecate passing it positionally
    params = {'batch_size': trial.suggest_int("batch_size", 4, 128, step=4),
             'epochs_num': 2000,
             'learning_rate': trial.suggest_float("learning_rate", 0.0001, 0.01, log=True),
             'lr_scheduler': False,
             'beta1' : trial.suggest_float("beta1", 0.8, 0.95, step=0.025),
             'beta2' : trial.suggest_float("beta2", 0.9, 0.9999, log=True),
             'integral_points': trial.suggest_int("integral_points", 15, 50, step=5),
             'method_int': 'trapz',
             'activation': trial.suggest_categorical("activation", ['ReLU', 'Tanh']),
             'fund_dom': False,
             'n_layers': trial.suggest_int("n_layers", 1, 5),
             'n_neurons': trial.suggest_int("n_neurons", 4, 136, step=12),
             'final_layer': trial.suggest_categorical("final_layer", ['Sigmoid', 'Tanh']),
             'loss': 'MSE',
             'scaled_loss': False,
             'scan_param': 1,
             'bound_phi': math.pi,
             'num_nets': 2,
             'p_value': trial.suggest_float("p_value", 1, 3, step=0.25),
             'lambda_repulsive': trial.suggest_float("lambda_repulsive", 0.1, 5, log=True)}

    # All the hidden layers share the same width
    params['layer_list'] = params['n_layers'] * [params['n_neurons']]

    crichtonnet = ModuleInterpCrichton(lambda_param=params['scan_param'])
    netsolver = train_run(params, device, module_net=crichtonnet, seed_num=42, optim_name='Adam')

    eval_loss = eval_run_loss(netsolver, device)

    # Hand Optuna a plain float rather than a (possibly non-CPU) 0-dim tensor
    return float(eval_loss)

In [41]:
# Start the Neptune run
run = neptune.init_run(project=NEPTUNE_PROJECT, api_token=NEPTUNE_API_TOKEN)

# The try/finally guarantees that the run is closed even if a trial fails or the study is interrupted
try:
    # NOTE(review): the '0.1' (Huber) tag is attached although `objective` trains with the 'MSE' loss - confirm
    tags = {'Optimization': 'Full', 'optimizer': 'Adam', 'Huber': '0.1'}
    run["sys/tags"].add(list(tags.values()))

    # Start the parameter study.
    # Alternative kept for reference: log every trial as it finishes through a callback,
    #   neptune_callback = optuna_utils.NeptuneCallback(run, log_plot_contour=False)
    #   study.optimize(objective, n_trials=10, callbacks=[neptune_callback])
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50)

    # Log Optuna Study metadata
    optuna_utils.log_study_metadata(study, run, log_plot_contour=False)
finally:
    # Make sure to kill the Neptune logger run
    run.stop()

https://app.neptune.ai/zulap/2to2scattering/e/TOS-1015
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


[32m[I 2023-01-31 19:27:37,611][0m A new study created in memory with name: no-name-3e70497c-680d-423d-84b3-8dd45e898669[0m
Train Epoch: 1999 ; tLoss: 0.000202: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [06:22<00:00,  5.23epoch/s]
[32m[I 2023-01-31 19:34:00,148][0m Trial 0 finished with value: 0.00013260490959510207 and parameters: {'batch_size': 120, 'learning_rate': 0.005769743573038201, 'beta1': 0.925, 'beta2': 0.9208945856594608, 'integral_points': 15, 'activation': 'ReLU', 'n_layers': 3, 'n_neurons': 52, 'final_layer': 'Sigmoid', 'p_value': 2.25, 'lambda_repulsive': 0.5633962588012414}. Best is trial 0 with value: 0.00013260490959510207.[0m
Train Epoch: 1999 ; tLoss: 0.000037: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [13:54<00:00,  2.40epoch/s]
[32m[I 2023-01-31 19:47:55,371][0m Trial 1 finished with value: 1.3852983101969585e-05 and parameters

Train Epoch: 1999 ; tLoss: 0.831416: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [03:36<00:00,  9.22epoch/s]
[32m[I 2023-01-31 21:42:39,121][0m Trial 14 finished with value: 0.49173545837402344 and parameters: {'batch_size': 76, 'learning_rate': 0.00018035411231854664, 'beta1': 0.875, 'beta2': 0.9799223150373702, 'integral_points': 30, 'activation': 'ReLU', 'n_layers': 5, 'n_neurons': 4, 'final_layer': 'Sigmoid', 'p_value': 2.0, 'lambda_repulsive': 0.1006878594487525}. Best is trial 6 with value: 2.842416733983555e-06.[0m
Train Epoch: 1999 ; tLoss: 0.000442: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [09:48<00:00,  3.40epoch/s]
[32m[I 2023-01-31 21:52:28,028][0m Trial 15 finished with value: 0.0001208117391797714 and parameters: {'batch_size': 108, 'learning_rate': 0.0006716892390874636, 'beta1': 0.9, 'beta2': 0.9563303845240164, 'integral_points': 25, 'ac

    return recv(self.sock, bufsize)
  File "/Users/aurelien/miniforge3/envs/spinorhelicity/lib/python3.9/site-packages/websocket/_socket.py", line 110, in recv
    raise WebSocketTimeoutException("Connection timed out")
websocket._exceptions.WebSocketTimeoutException: Connection timed out
Train Epoch: 1999 ; tLoss: 0.005926: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [04:55<00:00,  6.76epoch/s]
[32m[I 2023-01-31 22:29:35,395][0m Trial 17 finished with value: 6.312422556220554e-06 and parameters: {'batch_size': 32, 'learning_rate': 0.0005867881865135276, 'beta1': 0.9, 'beta2': 0.9532579374311311, 'integral_points': 40, 'activation': 'ReLU', 'n_layers': 3, 'n_neurons': 76, 'final_layer': 'Tanh', 'p_value': 1.75, 'lambda_repulsive': 0.40461815599115825}. Best is trial 6 with value: 2.842416733983555e-06.[0m
Train Epoch: 1999 ; tLoss: 0.000191: 100%|███████████████████████████████████████████████████████████████████████████

Train Epoch: 1999 ; tLoss: 0.000006: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [10:25<00:00,  3.20epoch/s]
[32m[I 2023-02-01 03:13:00,684][0m Trial 44 finished with value: 6.246526481845649e-06 and parameters: {'batch_size': 56, 'learning_rate': 0.00029688134215772954, 'beta1': 0.95, 'beta2': 0.9633350495150731, 'integral_points': 45, 'activation': 'ReLU', 'n_layers': 3, 'n_neurons': 64, 'final_layer': 'Tanh', 'p_value': 1.25, 'lambda_repulsive': 0.12718655372029114}. Best is trial 41 with value: 2.067258947135997e-06.[0m
Train Epoch: 1999 ; tLoss: 0.001261: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [06:47<00:00,  4.91epoch/s]
[32m[I 2023-02-01 03:19:48,214][0m Trial 45 finished with value: 0.0013443351490423083 and parameters: {'batch_size': 60, 'learning_rate': 0.00030542305187745386, 'beta1': 0.95, 'beta2': 0.9649800894680594, 'integral_points': 45, 

Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1054 operations to synchronize with Neptune. Do not kill this process.
All 1054 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/zulap/2to2scattering/e/TOS-1015


In [44]:
# Explicitly stop the Neptune run currently bound to `run`.
run.stop()

Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/zulap/2to2scattering/e/TOS-1017


In [7]:
def objective2(trial):
    """
    Optuna objective for the narrower search: ReLU networks with a Tanh final layer and the
    default Adam betas, trained on the Crichton modulus for 2500 epochs.

    Searched: batch size, learning rate, number of integration points, depth and width of the
    networks, and the repulsive loss settings (``p_value``, ``lambda_repulsive``).

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The evaluation loss as a Python float.
    """

    # `step` is given by keyword: recent Optuna versions deprecate passing it positionally
    params = {'batch_size': trial.suggest_int("batch_size", 28, 128, step=10),
             'epochs_num': 2500,
             'learning_rate': trial.suggest_float("learning_rate", 0.0005, 0.0075),
             'lr_scheduler': False,
             'beta1' : 0.9,
             'beta2' : 0.999,
             'integral_points': trial.suggest_int("integral_points", 20, 50, step=5),
             'method_int': 'trapz',
             'activation': 'ReLU',
             'fund_dom': False,
             'n_layers': trial.suggest_int("n_layers", 3, 5),
             'n_neurons': trial.suggest_int("n_neurons", 16, 96, step=8),
             'final_layer': 'Tanh',
             'loss': 'MSE',
             'scaled_loss': False,
             'scan_param': 1,
             'bound_phi': math.pi,
             'num_nets': 2,
             'p_value': trial.suggest_float("p_value", 1, 2, step=0.25),
             'lambda_repulsive': trial.suggest_float("lambda_repulsive", 0.1, 3)}

    # All the hidden layers share the same width
    params['layer_list'] = params['n_layers'] * [params['n_neurons']]

    crichtonnet = ModuleInterpCrichton(lambda_param=params['scan_param'])
    netsolver = train_run(params, device, module_net=crichtonnet, seed_num=42, optim_name='Adam')

    eval_loss = eval_run_loss(netsolver, device)

    # Hand Optuna a plain float rather than a (possibly non-CPU) 0-dim tensor
    return float(eval_loss)

In [8]:
# Start the Neptune run
run = neptune.init_run(project=NEPTUNE_PROJECT, api_token=NEPTUNE_API_TOKEN)

# The try/finally guarantees that the run is closed even if a trial fails or the study is
# interrupted from the keyboard
try:
    # NOTE(review): the '0.1' (Huber) tag is attached although `objective2` trains with the 'MSE' loss - confirm
    tags = {'Optimization': 'Full', 'optimizer': 'Adam', 'Huber': '0.1'}
    run["sys/tags"].add(list(tags.values()))

    # Start the parameter study.
    # Alternative kept for reference: log every trial as it finishes through a callback,
    #   neptune_callback = optuna_utils.NeptuneCallback(run, log_plot_contour=False)
    #   study.optimize(objective2, n_trials=10, callbacks=[neptune_callback])
    study = optuna.create_study(direction="minimize")
    study.optimize(objective2, n_trials=50)

    # Log Optuna Study metadata
    optuna_utils.log_study_metadata(study, run, log_plot_contour=True)
finally:
    # Make sure to kill the Neptune logger run
    run.stop()

https://app.neptune.ai/zulap/2to2scattering/e/TOS-1025


Info (NVML): NVML Shared Library Not Found. GPU usage metrics may not be reported. For more information, see https://docs.neptune.ai/help/nvml_error/


Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


[32m[I 2023-02-01 12:41:30,150][0m A new study created in memory with name: no-name-ed10882a-e424-44d4-a8bb-477e0661c3de[0m
Train Epoch: 2499 ; tLoss: 0.000205: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [17:41<00:00,  2.36epoch/s]
[32m[I 2023-02-01 12:59:12,241][0m Trial 0 finished with value: 0.00019795855041593313 and parameters: {'batch_size': 88, 'learning_rate': 0.0018165671976861242, 'integral_points': 30, 'n_layers': 4, 'n_neurons': 40, 'p_value': 1.25, 'lambda_repulsive': 1.7068824136252545}. Best is trial 0 with value: 0.00019795855041593313.[0m
Train Epoch: 2499 ; tLoss: 0.000857: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [26:36<00:00,  1.57epoch/s]
[32m[I 2023-02-01 13:25:48,616][0m Trial 1 finished with value: 0.001166922622360289 and parameters: {'batch_size': 68, 'learning_rate': 0.006796276340648397, 'integral_points': 45, 'n_layers': 

KeyboardInterrupt: 

In [None]:
# Single training run on the Crichton modulus with hand-picked hyperparameters
run = neptune.init_run(project=NEPTUNE_PROJECT, api_token=NEPTUNE_API_TOKEN)

# The try/finally guarantees that the run is closed even if the training fails or is interrupted
try:
    # NOTE(review): the '0.1' (Huber) tag is attached although the 'MSE' loss is used below - confirm
    tags = {'Optimization': 'Crichton', 'optimizer': 'Adam', 'Huber': '0.1'}
    run["sys/tags"].add(list(tags.values()))
    params = {'batch_size': 64,
             'epochs_num': 2500,
             'learning_rate': 0.001,
             'lr_scheduler': False,
             'beta1' : 0.9,
             'beta2' : 0.999,
             'integral_points': 25,
             'method_int': 'trapz',
             'activation': 'ReLU',
             'fund_dom': False,
             'n_layers': 4,
             'n_neurons': 64,
             'final_layer': 'Tanh',
             'loss': 'MSE',
             'scaled_loss': False,
             'scan_param': 1,
             'bound_phi': math.pi,
             'num_nets': 2,
             'p_value': 2,
             'lambda_repulsive': 1}

    # All the hidden layers share the same width
    params['layer_list'] = params['n_layers'] * [params['n_neurons']]

    crichtonnet = ModuleInterpCrichton(lambda_param=params['scan_param'])

    # Hand the run to the training loop: without it the run was created and tagged but never
    # received any metric
    netsolver = train_run(params, device, run_neptune=run, module_net=crichtonnet, seed_num=42, optim_name='Adam')
finally:
    run.stop()

https://app.neptune.ai/zulap/2to2scattering/e/TOS-1033


Info (NVML): NVML Shared Library Not Found. GPU usage metrics may not be reported. For more information, see https://docs.neptune.ai/help/nvml_error/


Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


Train Epoch: 974 ; tLoss: 0.005758:  39%|███████████████████████████████████▉                                                        | 975/2500 [02:05<03:14,  7.82epoch/s]