## Multi-Fidelity BO with Discrete Fidelities using KG

### Set dtype and device

In [1]:
import os
import torch

torch.set_default_dtype(torch.double)
tkwargs = {
    "dtype": torch.double,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
}

print('Running on PyTorch {}'.format(torch.__version__))

# Get cpu or gpu device for training
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

SMOKE_TEST = os.environ.get("SMOKE_TEST")

Running on PyTorch 1.13.0+cpu
Using cpu device


### Problem setup

In [2]:
import pandas as pd
#df = pd.read_csv('Case3_2nd_launch_WingsConvCoeffs_Info.csv', sep=";")
df = pd.read_excel('Case5_1st_launch_WingsConvCoeffs_Info.xlsx')
df_conv = df[df['Unnamed: 47'] == 'CONVERGED']

In [19]:
from sklearn.model_selection import train_test_split

dataset, hifi = train_test_split(df_conv, test_size=0.02)
lofi, test = train_test_split(dataset, test_size=0.2, random_state=42 )

hifi_y = hifi.Cy0Mean.to_numpy()
lofi_y = lofi.Cy0Mean.to_numpy()
hifi_x = hifi.alpha0.to_numpy()
lofi_x = lofi.alpha0.to_numpy()
test_y = test.Cy0Mean.to_numpy()
test_x = test.alpha0.to_numpy()

# normalize features
hifi_x = (hifi_x - lofi_x.min()) / (lofi_x.max() - lofi_x.min()) 
test_x = (test_x - lofi_x.min()) / (lofi_x.max() - lofi_x.min()) 
lofi_x = (lofi_x - lofi_x.min()) / (lofi_x.max() - lofi_x.min()) 

# normalize labels
mean, std = lofi_y.mean(), lofi_y.std()
hifi_y = (hifi_y - mean) / std
lofi_y = (lofi_y - mean) / std
test_y = (test_y - mean) / std

# Cast them
noise = torch.rand(lofi_y.shape).unsqueeze(-1)

X_hifi = torch.Tensor(hifi_x).unsqueeze(-1)
X_lofi = torch.Tensor(lofi_x).unsqueeze(-1)
Y_hifi = torch.Tensor(hifi_y).unsqueeze(-1)
Y_lofi = noise + torch.Tensor(lofi_y).unsqueeze(-1)
X_test = torch.Tensor(test_x).unsqueeze(-1)
Y_test = torch.Tensor(test_y).unsqueeze(-1)

print("Shape of low fidelity  X and y: ",X_lofi.shape, Y_lofi.shape)
print("Shape of high fidelity X and y: ",X_hifi.shape, Y_hifi.shape)
print("Shape of test set      X and y: ",X_test.shape, Y_test.shape)

Shape of low fidelity  X and y:  torch.Size([938, 1]) torch.Size([938, 1])
Shape of high fidelity X and y:  torch.Size([24, 1]) torch.Size([24, 1])
Shape of test set      X and y:  torch.Size([235, 1]) torch.Size([235, 1])


In [20]:
from torch import nn
# define the neural network
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(2, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        x = self.fc2(x)
        return x

In [5]:
# train the neural network
nn_model = NeuralNetwork()

#--- OPTIMIZING THE PARAMETERS OF THE NN ---#

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(nn_model.parameters(), lr=1e-2)
# '''
# In a single training epoch, the model makes predictions on the training dataset (fed to it in batches)
# and backpropagates the prediction error to update the model’s parameters
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad() # clears old gradients from the last step
        loss.backward()
        optimizer.step()

        if batch % 10 == 0:
           loss, current = loss.item(), batch * len(X)
           print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

# Check the model’s performance against the test dataset to ensure it is learning
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error ->  Avg loss: {test_loss:>8f} \n")


def pred(new_point, model):
    model.eval()
    with torch.no_grad():
            pred_point = model(new_point)
    return pred_point

#### Model initialization

We use a `SingleTaskMultiFidelityGP` as the surrogate model, which uses a kernel from [2] that is well-suited for multi-fidelity applications. The `SingleTaskMultiFidelityGP` models the design and fidelity parameters jointly, so its domain is $[0,1]^7$.

In [8]:
from botorch.models.gp_regression_fidelity import SingleTaskMultiFidelityGP
from botorch.models.transforms.outcome import Standardize
from gpytorch.mlls.exact_marginal_log_likelihood import ExactMarginalLogLikelihood

lo_fi = 0.33
hi_fi = 1
def generate_initial_data():
    # generate training data
    '''train_x = torch.rand(n, 1, **tkwargs)
    train_f = lofidelities[torch.ones(2)]'''
    train_f = lo_fi*torch.ones(X_lofi.shape, **tkwargs)
    train_x_lofi = torch.cat((X_lofi, train_f), dim=1)
    train_x_hifi = torch.cat((X_hifi, torch.ones(X_hifi.shape, **tkwargs)), dim=1)
    train_x_full = torch.cat([train_x_lofi, train_x_hifi])
    train_obj = torch.cat([Y_lofi, Y_hifi])  # add output dimension
    test_X = torch.cat((X_test, torch.ones(X_test.shape, **tkwargs)), dim=1) 

    return train_x_full, train_obj, test_X


def initialize_model(train_x, train_obj):
    # define a surrogate model suited for a "training data"-like fidelity parameter
    # in dimension 6, as in [2]
    model = SingleTaskMultiFidelityGP(train_x, train_obj, outcome_transform=Standardize(m=1), data_fidelity=1)
    mll = ExactMarginalLogLikelihood(model.likelihood, model)
    return mll, model

In [9]:
train_x, train_obj, test_X = generate_initial_data()

from torch.utils.data import TensorDataset, DataLoader
hifi_dataset = TensorDataset(train_x, train_obj)
test_dataset = TensorDataset(test_X, Y_test)
# Create data loaders
hifi_dataloader = DataLoader(hifi_dataset, batch_size = 32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size = 32, shuffle=True)

In [10]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(hifi_dataloader, nn_model, loss_fn, optimizer)
    test(test_dataloader, nn_model, loss_fn)
print("Finish")

Epoch 1
-------------------------------
loss: 1.782607  [    0/ 1002]
loss: 0.854304  [  320/ 1002]
loss: 0.784380  [  640/ 1002]
loss: 0.896530  [  960/ 1002]
Test Error ->  Avg loss: 0.853241 

Epoch 2
-------------------------------
loss: 1.226382  [    0/ 1002]
loss: 0.814704  [  320/ 1002]
loss: 0.634191  [  640/ 1002]
loss: 0.574358  [  960/ 1002]
Test Error ->  Avg loss: 0.646475 

Epoch 3
-------------------------------
loss: 0.395952  [    0/ 1002]
loss: 0.367285  [  320/ 1002]
loss: 0.336117  [  640/ 1002]
loss: 0.363503  [  960/ 1002]
Test Error ->  Avg loss: 0.580859 

Epoch 4
-------------------------------
loss: 0.222086  [    0/ 1002]
loss: 0.206795  [  320/ 1002]
loss: 0.272136  [  640/ 1002]
loss: 0.238652  [  960/ 1002]
Test Error ->  Avg loss: 0.654549 

Epoch 5
-------------------------------
loss: 0.226314  [    0/ 1002]
loss: 0.233278  [  320/ 1002]
loss: 0.218726  [  640/ 1002]
loss: 0.213913  [  960/ 1002]
Test Error ->  Avg loss: 0.589970 

Finish


#### Define a helper function to construct the MFKG acquisition function
The helper function illustrates how one can initialize an $q$MFKG acquisition function. In this example, we assume that the affine cost is known. We then use the notion of a `CostAwareUtility` in BoTorch to scalarize the "competing objectives" of information gain and cost. The MFKG acquisition function optimizes the ratio of information gain to cost, which is captured by the `InverseCostWeightedUtility`.

In order for MFKG to evaluate the information gain, it uses the model to predict the function value at the highest fidelity after conditioning on the observation. This is handled by the `project` argument, which specifies how to transform a tensor `X` to its target fidelity. We use a default helper function called `project_to_target_fidelity` to achieve this.

An important point to keep in mind: in the case of standard KG, one can ignore the current value and simply optimize the expected maximum posterior mean of the next stage. However, for MFKG, since the goal is optimize information *gain* per cost, it is important to first compute the current value (i.e., maximum of the posterior mean at the target fidelity). To accomplish this, we use a `FixedFeatureAcquisitionFunction` on top of a `PosteriorMean`.

In [11]:
from botorch import fit_gpytorch_mll
from botorch.models.cost import AffineFidelityCostModel
from botorch.acquisition.cost_aware import InverseCostWeightedUtility
from botorch.acquisition import PosteriorMean
from botorch.acquisition.knowledge_gradient import qMultiFidelityKnowledgeGradient
from botorch.acquisition.fixed_feature import FixedFeatureAcquisitionFunction
from botorch.optim.optimize import optimize_acqf
from botorch.acquisition.utils import project_to_target_fidelity

bounds = torch.Tensor([[0]*(1 + X_lofi.shape[1]), [1]*(1 + X_lofi.shape[1])])
print("Buonds shape: ", bounds.shape)

target_fidelities = {1: 1.0}

cost_model = AffineFidelityCostModel(fidelity_weights={1: 1.0}, fixed_cost=5.0)
cost_aware_utility = InverseCostWeightedUtility(cost_model=cost_model)


def project(X):
    return project_to_target_fidelity(X=X, target_fidelities=target_fidelities)

def get_mfkg(model):
    
    curr_val_acqf = FixedFeatureAcquisitionFunction(
        acq_function=PosteriorMean(model),
        d=2,
        columns=[0],
        values=[1],
    )
    
    _, current_value = optimize_acqf(
        acq_function=curr_val_acqf,
        bounds=bounds[:,:-1],
        q=1,
        num_restarts=10 if not SMOKE_TEST else 2,
        raw_samples=1024 if not SMOKE_TEST else 4,
        options={"batch_limit": 10, "maxiter": 200},
    ) 
        
    return qMultiFidelityKnowledgeGradient(
        model=model,
        num_fantasies=128 if not SMOKE_TEST else 2,
        current_value=current_value,
        cost_aware_utility=cost_aware_utility,
        project=project,
    )

Buonds shape:  torch.Size([2, 2])


#### Define a helper function that performs the essential BO step
This helper function optimizes the acquisition function and returns the batch $\{x_1, x_2, \ldots x_q\}$ along with the observed function values. The function `optimize_acqf_mixed` sequentially optimizes the acquisition function over $x$ for each value of the fidelity $s \in \{0, 0.5, 1.0\}$.

In [12]:
from botorch.optim.optimize import optimize_acqf_mixed


torch.set_printoptions(precision=3, sci_mode=False)

NUM_RESTARTS = 5 if not SMOKE_TEST else 2
RAW_SAMPLES = 128 if not SMOKE_TEST else 4
BATCH_SIZE = 1


def optimize_mfkg_and_get_observation(mfkg_acqf):
    """Optimizes MFKG and returns a new candidate, observation, and cost."""

    # generate new candidates
    candidates, _ = optimize_acqf_mixed(
        acq_function=mfkg_acqf,
        bounds=bounds,
        fixed_features_list=[{1: 0.33}, {1: 1.}],
        q=BATCH_SIZE,
        num_restarts=NUM_RESTARTS,
        raw_samples=RAW_SAMPLES,
        # batch_initial_conditions=X_init,
        options={"batch_limit": 5, "maxiter": 200},
    )

    # observe new values
    cost = cost_model(candidates).sum()
    new_x = candidates.detach()
    new_obj = pred(new_x, nn_model)
    print(f"candidates:\n{new_x}\n")
    print(f"observations:\n{new_obj}\n\n")
    return new_x, new_obj, cost

### Perform a few steps of multi-fidelity BO
First, let's generate some initial random data and fit a surrogate model.

We can now use the helper functions above to run a few iterations of BO.

In [13]:
cumulative_cost = 0.0
N_ITER = 3 if not SMOKE_TEST else 1

for i in range(N_ITER):
    mll, model = initialize_model(train_x, train_obj)
    fit_gpytorch_mll(mll)
    mfkg_acqf = get_mfkg(model)
    new_x, new_obj, cost = optimize_mfkg_and_get_observation(mfkg_acqf)
    train_x = torch.cat([train_x, new_x])
    train_obj = torch.cat([train_obj, new_obj])
    cumulative_cost += cost

candidates:
tensor([[0.939, 0.330]])

observations:
tensor([[-0.595]])


candidates:
tensor([[0.000, 0.330]])

observations:
tensor([[2.133]])


candidates:
tensor([[0.000, 0.330]])

observations:
tensor([[2.133]])




### Make a final recommendation
In multi-fidelity BO, there are usually fewer observations of the function at the target fidelity, so it is important to use a recommendation function that uses the correct fidelity. Here, we maximize the posterior mean with the fidelity dimension fixed to the target fidelity of 1.0.

In [14]:
def get_recommendation(model):
    rec_acqf = FixedFeatureAcquisitionFunction(
        acq_function=PosteriorMean(model),
        d=2,
        columns=[0],
        values=[1],
    )

    final_rec, _ = optimize_acqf(
        acq_function=rec_acqf,
        bounds=bounds[:,:-1],
        q=1,
        num_restarts=10,
        raw_samples=512,
        options={"batch_limit": 5, "maxiter": 200},
    )
    
    final_rec = rec_acqf._construct_X_full(final_rec)
    
    objective_value = pred(final_rec,nn_model)
    print(f"recommended point:\n{final_rec}\n\nobjective value:\n{objective_value}")
    return final_rec

In [15]:
final_rec = get_recommendation(model)
print(f"\ntotal cost: {cumulative_cost}\n")

NameError: name 'problem' is not defined

### Comparison to standard EI (always use target fidelity)
Let's now repeat the same steps using a standard EI acquisition function (note that this is not a rigorous comparison as we are only looking at one trial in order to keep computational requirements low).

In [None]:
from botorch.acquisition import qExpectedImprovement


def get_ei(model, best_f):

    return FixedFeatureAcquisitionFunction(
        acq_function=qExpectedImprovement(model=model, best_f=best_f),
        d=2,
        columns=[0],
        values=[1],
    )


def optimize_ei_and_get_observation(ei_acqf):
    """Optimizes EI and returns a new candidate, observation, and cost."""

    candidates, _ = optimize_acqf(
        acq_function=ei_acqf,
        bounds=bounds[:, :-1],
        q=BATCH_SIZE,
        num_restarts=10,
        raw_samples=512,
        options={"batch_limit": 5, "maxiter": 200},
    )

    # add the fidelity parameter
    candidates = ei_acqf._construct_X_full(candidates)

    # observe new values
    cost = cost_model(candidates).sum()
    new_x = candidates.detach()
    new_obj = problem(new_x).unsqueeze(-1)
    print(f"candidates:\n{new_x}\n")
    print(f"observations:\n{new_obj}\n\n")
    return new_x, new_obj, cost

In [None]:
cumulative_cost = 0.0

train_x, train_obj = generate_initial_data(n=16)

for _ in range(N_ITER):
    mll, model = initialize_model(train_x, train_obj)
    fit_gpytorch_mll(mll)
    ei_acqf = get_ei(model, best_f=train_obj.max())
    new_x, new_obj, cost = optimize_ei_and_get_observation(ei_acqf)
    train_x = torch.cat([train_x, new_x])
    train_obj = torch.cat([train_obj, new_obj])
    cumulative_cost += cost

candidates:
tensor([[0.247, 0.687, 0.581, 0.760, 0.093, 0.132, 1.000],
        [0.319, 0.850, 0.639, 0.865, 0.000, 0.120, 1.000],
        [0.349, 0.666, 0.555, 0.986, 0.000, 0.126, 1.000],
        [0.297, 0.792, 0.450, 0.889, 0.034, 0.028, 1.000]], device='cuda:0',
       dtype=torch.float64)

observations:
tensor([[0.973],
        [1.091],
        [0.340],
        [0.902]], device='cuda:0', dtype=torch.float64)




candidates:
tensor([[0.194, 0.858, 0.622, 0.799, 0.000, 0.095, 1.000],
        [0.341, 0.854, 0.590, 0.767, 0.000, 0.085, 1.000],
        [0.999, 0.439, 0.828, 0.975, 0.633, 0.176, 1.000],
        [0.296, 0.859, 0.677, 0.806, 0.119, 0.054, 1.000]], device='cuda:0',
       dtype=torch.float64)

observations:
tensor([[    0.862],
        [    1.975],
        [    0.000],
        [    1.514]], device='cuda:0', dtype=torch.float64)





A not p.d., added jitter of 1.0e-08 to the diagonal



candidates:
tensor([[0.360, 0.891, 0.588, 0.749, 0.019, 0.036, 1.000],
        [0.049, 0.894, 0.345, 0.210, 0.482, 0.463, 1.000],
        [0.398, 0.970, 0.504, 0.213, 0.814, 0.724, 1.000],
        [0.817, 0.879, 0.691, 0.842, 0.455, 0.937, 1.000]], device='cuda:0',
       dtype=torch.float64)

observations:
tensor([[2.271],
        [0.216],
        [0.055],
        [0.036]], device='cuda:0', dtype=torch.float64)




In [None]:
final_rec = get_recommendation(model)
print(f"\ntotal cost: {cumulative_cost}\n")

recommended point:
tensor([[0.352, 0.874, 0.589, 0.756, 0.008, 0.060, 1.000]], device='cuda:0',
       dtype=torch.float64)

objective value:
tensor([2.166], device='cuda:0', dtype=torch.float64)

total cost: 72.0

