# BO runs

In [1]:
import torch
from botorch.models import FixedNoiseGP, SingleTaskGP
from gpytorch.kernels import ScaleKernel
from gpytorch.mlls import ExactMarginalLogLikelihood
from botorch import fit_gpytorch_model
from botorch.acquisition.analytic import ExpectedImprovement
import numpy as np
import pickle
import sys
import time

Load the data (`X`, `y`) prepared in `prepare_Xy.ipynb`.

In [2]:
# Read the pickle once and close the file handle deterministically.
# NOTE(security): pickle.load can execute arbitrary code; only load files you produced yourself.
with open('inputs_and_outputs.pkl', 'rb') as pkl_file:
    inputs_and_outputs = pickle.load(pkl_file)
X = inputs_and_outputs['X']
y = inputs_and_outputs['y']
y = np.reshape(y, (np.size(y), 1)) # column vector, for the GP
nb_data = np.size(y)
nb_data

69839

convert to torch tensors

In [3]:
# torch.as_tensor wraps a NumPy array without copying (like torch.from_numpy),
# but also accepts an existing tensor, so re-running this cell does not raise.
X = torch.as_tensor(X)
y = torch.as_tensor(y)

In [4]:
# sanity check: (number of data points, number of features)
X.shape

torch.Size([69839, 12])

In [5]:
# sanity check: y must be a column vector with one row per data point
y.shape

torch.Size([69839, 1])

In [6]:
# insert a singleton dimension at position 1; slices of this tensor are what
# `bo_run` feeds to the acquisition function.
X_unsqueezed = torch.unsqueeze(X, 1)

In [7]:
# Dry run of the batching scheme: write each slot's own index into a NaN-filled
# array, batch by batch, then count the NaNs left over (must be 0).
batch_size = 10000
acquisition_values = np.full(nb_data, np.nan)  # np.NaN alias was removed in NumPy 2.0
# stepping by batch_size covers [0, nb_data) exactly once, with no empty trailing batch
for id_start in range(0, nb_data, batch_size):
    id_end = min(id_start + batch_size, nb_data)
    acquisition_values[id_start:id_end] = np.arange(id_start, id_end)

np.sum(np.isnan(acquisition_values))

0

In [8]:
# total number of data points (entries of y) available as BO candidates
nb_data

69839

number of COFs for initialization

In [9]:
# number of randomly selected COFs used as the initial training set in `bo_run`;
# `bo_run` starts counting its iterations from this value.
nb_COFs_initialization = 10

In [10]:
def _standardize(values):
    """Return `values` shifted to zero mean and divided by its standard deviation (`torch.std`)."""
    return (values - torch.mean(values)) / torch.std(values)


def _evaluate_acquisition(acquisition_function, batch_size):
    """Evaluate `acquisition_function` on all of the global `X_unsqueezed`, in batches.

    Batching bounds peak memory. Gradients are disabled because only the values
    are needed for ranking; with autograd on, every batch's graph would be kept
    alive by the result tensor.

    Returns a 1-D tensor of length `nb_data`, in the dtype produced by the
    acquisition function (no down-cast to float32).
    """
    batch_values = []
    with torch.no_grad():
        for id_start in range(0, nb_data, batch_size):
            id_end = min(id_start + batch_size, nb_data)
            batch_values.append(acquisition_function(X_unsqueezed[id_start:id_end]))
    acquisition_values = torch.cat(batch_values)
    assert acquisition_values.shape == (nb_data,) # one value per candidate
    assert acquisition_values.isnan().sum().item() == 0 # so that all are valid.
    return acquisition_values


def bo_run(nb_iterations, batch_size=20000):
    """Run one Bayesian-optimization search over the global data set `X`, `y`.

    Starts from `nb_COFs_initialization` randomly chosen COFs, then repeatedly
    fits a `SingleTaskGP` to the acquired data (outputs re-standardized every
    iteration) and acquires the not-yet-acquired COF with maximal expected
    improvement, until `nb_iterations` COFs have been acquired in total.

    Parameters
    ----------
    nb_iterations : int
        Total number of COFs to acquire, including the random initial ones.
        Must satisfy `nb_COFs_initialization < nb_iterations <= nb_data`.
    batch_size : int, optional
        Number of candidates scored per acquisition-function call.

    Returns
    -------
    numpy.ndarray
        IDs (row indices into `X` / `y`) of the acquired COFs in acquisition
        order; the first `nb_COFs_initialization` entries are the random ones.

    Raises
    ------
    AssertionError
        If the arguments are out of range or an acquisition value is NaN.
    """
    assert nb_iterations > nb_COFs_initialization
    assert nb_iterations <= nb_data # cannot acquire more COFs than exist
    assert batch_size > 0

    # select initial COFs for training data randomly.
    # idea is to keep populating this ids_acquired and return it for analysis.
    ids_acquired = np.random.choice(np.arange(nb_data), size=nb_COFs_initialization, replace=False)

    # standardized outputs of the acquired COFs
    y_acquired = _standardize(y[ids_acquired])

    for i in range(nb_COFs_initialization, nb_iterations):
        print("iteration:", i)
        # construct and fit GP model
        model = SingleTaskGP(X[ids_acquired, :], y_acquired)
        mll = ExactMarginalLogLikelihood(model.likelihood, model)
        fit_gpytorch_model(mll)

        # acquisition function value at each COF in the database
        acquisition_function = ExpectedImprovement(model, best_f=y_acquired.max().item())
        acquisition_values = _evaluate_acquisition(acquisition_function, batch_size)

        # exclude already-acquired COFs, then take the maximizer among the rest.
        # (ties are resolved by argmax; the previous unstable argsort left them arbitrary.)
        acquisition_values[torch.as_tensor(ids_acquired, dtype=torch.long)] = -float("inf")
        id_max_acquisition = int(torch.argmax(acquisition_values).item())

        # acquire this COF
        ids_acquired = np.concatenate((ids_acquired, [id_max_acquisition]))
        assert np.size(ids_acquired) == i + 1

        # start over from the raw outputs so the standardization uses all acquired COFs
        y_acquired = _standardize(y[ids_acquired, :])

        print("\tacquired COF", id_max_acquisition, "with y = ", y[id_max_acquisition].item())
        print("\tbest y acquired:", y[ids_acquired].max().item())

    assert np.size(ids_acquired) == nb_iterations
    return ids_acquired

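`bo_res['ids_acquired'][r][i]` gives the ID of the COF acquired during iteration `i` of run `r` (the first `nb_COFs_initialization` entries of each run are the randomly chosen initial COFs).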

In [None]:
# Run `nb_runs` BO searches of `nb_iterations` acquisitions each and collect
# the acquired COF IDs of every run, timing each run.
bo_res = {
    'nb_runs': 1,
    'nb_iterations': 200,
    'ids_acquired': [],
}
for r in range(bo_res['nb_runs']):
    print("\n\nRUN", r)
    t0 = time.time()
    ids_acquired = bo_run(bo_res['nb_iterations'])
    bo_res['ids_acquired'].append(ids_acquired)
    minutes_elapsed = (time.time() - t0) / 60
    print("took time t = ", minutes_elapsed, "min")



RUN 0
iteration: 10
	acquired COF 56259 with y =  182.416471606
	best y acquired: 182.416471606
iteration: 11
	acquired COF 44551 with y =  188.642146113
	best y acquired: 188.642146113
iteration: 12
	acquired COF 59749 with y =  183.06633314099997
	best y acquired: 188.642146113
iteration: 13
	acquired COF 43434 with y =  166.762639788
	best y acquired: 188.642146113
iteration: 14
	acquired COF 57294 with y =  166.196918004
	best y acquired: 188.642146113
iteration: 15
	acquired COF 65585 with y =  173.44669686900002
	best y acquired: 188.642146113
iteration: 16
	acquired COF 441 with y =  186.034221186
	best y acquired: 188.642146113
iteration: 17
	acquired COF 20675 with y =  167.532168988
	best y acquired: 188.642146113
iteration: 18
	acquired COF 12418 with y =  176.910634695
	best y acquired: 188.642146113
iteration: 19
	acquired COF 13260 with y =  153.441277223
	best y acquired: 188.642146113
iteration: 20
	acquired COF 12402 with y =  175.504448723
	best y acquired: 188.6421

In [14]:
# persist the run settings and acquired IDs for later analysis
with open('bo_results.pkl', 'wb') as results_file:
    pickle.dump(bo_res, results_file)