# Hartmann function

**Goal**: apply Bayesian Optimisation (BO) strategy to minimize Hartmann function with pyro.

In [1]:
import copy
import math
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.autograd as autograd
import torch.optim as optim
from torch.distributions import constraints, transform_to

import pyro
import pyro.contrib.gp as gp
import pyro.distributions as dist

pyro.enable_validation(True)

In [2]:
print("torch:", torch.__version__)
print("pyro:", pyro.__version__)

if not pyro.__version__.startswith("1"):
    raise ValueError("incompatible version of pyro")

torch: 1.3.0.post2
pyro: 1.0.0


In [3]:
seed_number = 444

def set_random_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    
set_random_seed(seed_number)

## Objective function

In [4]:
const_hf_a = torch.tensor([[10.0, 3.0, 17.0, 3.5, 1.7, 8.0],
                           [0.05, 10.0, 17.0, 0.1, 8.0, 14.0],
                           [3.0, 3.5, 1.7, 10.0, 17.0, 8.0],
                           [17.0, 8.0, 0.05, 10.0, 0.1, 14.0]])

const_hf_c = torch.tensor([1.0, 1.2, 3.0, 3.2])

const_hf_p = torch.tensor([[0.1312, 0.1696, 0.5569, 0.0124, 0.8283, 0.5886], 
                           [0.2329, 0.4135, 0.8307, 0.3736, 0.1004, 0.9991],
                           [0.2348, 0.1451, 0.3522, 0.2883, 0.3047, 0.6650],
                           [0.4047, 0.8828, 0.8732, 0.5743, 0.1091, 0.0381]])

const_hf_x_min = 0
const_hf_x_max = 1

def hartmann_func(x):
    """
    Compute Hartmann function
    Args:
        x - [N, 6] dimensional torch tensor.
    Returns:
        function value
    """
    
    no_dims = x.shape[0]
    
    result = torch.zeros((no_dims))
    
    for d in range(no_dims):
                
        for i in range(4):            
            sm = torch.dot(const_hf_a[i], (x[d] - const_hf_p[i])**2)
        
            result[d] += const_hf_c[i]*torch.exp(-sm)
            
    return -result

# Checking GM
assert np.allclose(
        hartmann_func(torch.tensor([[0.20169, 0.150011, 0.476874, 0.275332, 0.311652, 0.6573]])).numpy(),
        np.array([-3.32237], dtype=np.float32))

# Constructing BO strategy

### Defining acquisition function

In [5]:
def lower_confidence_bound(gpmodel, x, kappa=2):
    """
    Lower Confidence Bound (LCB): $\alpha(x)=\mu(x) - \kappa\sigma(x)$
    
    """
    mu, variance = gpmodel(x, full_cov=False, noiseless=False)
    sigma = variance.sqrt()
    
    return mu - kappa * sigma

normal_phi = lambda x: torch.exp(-x.pow(2)/2)/np.sqrt(2*np.pi)
normal_Phi = lambda x: (1 + torch.erf(x / np.sqrt(2))) / 2

def expected_improvement(gpmodel, x):
    """
    Brooks' implementation of expected improvement (EI).
    
    """
    y_min = gpmodel.y.min()
    
    mu, variance = gpmodel(x, full_cov=False, noiseless=False)
    
    sigma = variance.sqrt()
    
    delta = y_min - mu
    
    EI = delta.clamp_min(0.0) + sigma*normal_phi(delta/sigma) - delta.abs()*normal_Phi(delta/sigma)
    
    return -EI

def acquisition_func(gpmodel, x, af='EI'):
    """
    Defines acquisition function.
    """
    
    if af == "EI":
        return expected_improvement(gpmodel, x)
    
    elif af == "LCB":
        return lower_confidence_bound(gpmodel, x)
    
    else:
        return None

## Minimalistic BO Algorithm

### Function to find minimizing points for an acquisition function

In [6]:
def find_a_candidate(gpmodel, x_init):
   
    # Creating constrains
    constraint_x = constraints.interval(const_hf_x_min, const_hf_x_max)
    
    # transform x_init to an unconstrained domain as we use an unconstrained optimizer
    unconstrained_x1_init = transform_to(constraint_x).inv(x_init[:, 0])
    unconstrained_x2_init = transform_to(constraint_x).inv(x_init[:, 1])
    unconstrained_x3_init = transform_to(constraint_x).inv(x_init[:, 2])
    unconstrained_x4_init = transform_to(constraint_x).inv(x_init[:, 3])
    unconstrained_x5_init = transform_to(constraint_x).inv(x_init[:, 4])
    unconstrained_x6_init = transform_to(constraint_x).inv(x_init[:, 5])
    
    x_uncon_init = torch.stack((unconstrained_x1_init, 
                                unconstrained_x2_init,
                                unconstrained_x3_init,
                                unconstrained_x4_init,
                                unconstrained_x5_init,
                                unconstrained_x6_init), dim=1)
    
    x_uncon = x_uncon_init.clone().detach().requires_grad_(True)
    
    # unconstrained minimiser
    minimizer = optim.Adam([x_uncon])

    def closure():
        # clear gradients
        minimizer.zero_grad()
                
        x1_tmp = transform_to(constraint_x)(x_uncon[:, 0])
        x2_tmp = transform_to(constraint_x)(x_uncon[:, 1])
        x3_tmp = transform_to(constraint_x)(x_uncon[:, 2])
        x4_tmp = transform_to(constraint_x)(x_uncon[:, 3])
        x5_tmp = transform_to(constraint_x)(x_uncon[:, 4])
        x6_tmp = transform_to(constraint_x)(x_uncon[:, 5])
        
        x = torch.stack((x1_tmp, x2_tmp, x3_tmp, x4_tmp, x5_tmp, x6_tmp), dim=1)
        
        y = acquisition_func(gpmodel, x)
        
        autograd.backward(x_uncon, autograd.grad(y, x_uncon))
                
        return y
    
    for _ in range(100):
        minimizer.step(closure)
        
    # after finding a candidate in the unconstrained domain,
    # convert it back to original domain.
    x1_tmp = transform_to(constraint_x)(x_uncon[:, 0])
    x2_tmp = transform_to(constraint_x)(x_uncon[:, 1])
    x3_tmp = transform_to(constraint_x)(x_uncon[:, 2])
    x4_tmp = transform_to(constraint_x)(x_uncon[:, 3])
    x5_tmp = transform_to(constraint_x)(x_uncon[:, 4])
    x6_tmp = transform_to(constraint_x)(x_uncon[:, 5])
    
    x = torch.stack((x1_tmp, x2_tmp, x3_tmp, x4_tmp, x5_tmp, x6_tmp), dim=1)
    
    return x.detach()

### A single step of BO

LBFGS optimiser used in `find_a_candidate` is a gradient based method and can get stuck at a local minimum. A simple approach to address this is to try several attemps (5) to find the best candidate to minimize the acquisition function.

In [7]:
def next_x(gpmodel, num_candidates=5):
    
    candidates = []
    values = []
    
    # take the best (lowest) point as the first attempt
    x_init = gpmodel.X[[gpmodel.y.argmin()], :].detach().requires_grad_(True)
        
    for i in range(num_candidates):
        
        x = find_a_candidate(gpmodel, x_init)
        y = acquisition_func(gpmodel, x)
    
        candidates.append(x)
        values.append(y)
        
        # a new random attempt initial point
        x_init = torch.stack((
                    x[:,0].new_empty(1).uniform_(const_hf_x_min, const_hf_x_max),
                    x[:,1].new_empty(1).uniform_(const_hf_x_min, const_hf_x_max),
                    x[:,2].new_empty(1).uniform_(const_hf_x_min, const_hf_x_max),
                    x[:,3].new_empty(1).uniform_(const_hf_x_min, const_hf_x_max),
                    x[:,4].new_empty(1).uniform_(const_hf_x_min, const_hf_x_max),
                    x[:,5].new_empty(1).uniform_(const_hf_x_min, const_hf_x_max)), dim=1)
    
        print("Candidate ", i, x, y)
        
    argmin = torch.min(torch.cat(values), dim=0)[1].item()
    
    print("Result: ", candidates[argmin], values[argmin])
    
    return candidates[argmin], candidates

### Updating posterior

Each time we evaluate `f` at a new value x, we update the `gpmodel`.

In [8]:
def update_posterior(gpmodel, x_new, svi_mode=False):
        
    # evaluate f at new point
    bh_y = hartmann_func(x_new) 
    
    # incorporate new evaluation
    X = torch.cat([gpmodel.X, x_new]) 
    y = torch.cat([gpmodel.y, bh_y])
        
    gpmodel.set_data(X, y)
    
    # optimising hyper paramters
    
    optimiser = torch.optim.Adam(gpmodel.parameters(), lr=0.001)
    
    if svi_mode:
        loss_fn = pyro.infer.TraceMeanField_ELBO().differentiable_loss
        gp.util.train(gpmodel, optimiser, loss_fn, num_steps=2000)
    else:
        gp.util.train(gpmodel, optimiser) 

# Training data

In [9]:
train_cnt = 30

X_train = torch.rand(train_cnt, 6)
Y_train = hartmann_func(X_train)

# Deterministic approach

In [10]:
set_random_seed(333)

gp_model = gp.models.GPRegression(X_train, Y_train, 
    gp.kernels.Matern52(input_dim=6, lengthscale=torch.ones(6)))

optimizer = torch.optim.Adam(gp_model.parameters(), lr=0.001)
losses = gp.util.train(gp_model, optimizer);

bo_steps = 20

for i in range(bo_steps):
    print("-"*50)
    print("BO STEP: ", i)
    print("-"*50)
    
    gp_model_ = copy.copy(gp_model)
    
    xmin, xcans = next_x(gp_model)
    
    update_posterior(gp_model, xmin)


--------------------------------------------------
BO STEP:  0
--------------------------------------------------
Candidate  0 tensor([[0.4310, 0.7526, 0.3725, 0.7096, 0.6665, 0.0046]]) tensor([-0.0016], grad_fn=<NegBackward>)
Candidate  1 tensor([[0.0753, 0.8654, 0.5869, 0.7903, 0.3471, 0.1851]]) tensor([-0.0010], grad_fn=<NegBackward>)
Candidate  2 tensor([[0.5416, 0.9229, 0.3689, 0.6293, 0.3431, 0.3446]]) tensor([-0.0008], grad_fn=<NegBackward>)
Candidate  3 tensor([[0.7392, 0.1793, 0.9383, 0.8214, 0.8374, 0.3040]]) tensor([-0.0004], grad_fn=<NegBackward>)
Candidate  4 tensor([[0.5888, 0.4643, 0.1963, 0.1713, 0.4578, 0.9334]]) tensor([-0.0003], grad_fn=<NegBackward>)
Result:  tensor([[0.4310, 0.7526, 0.3725, 0.7096, 0.6665, 0.0046]]) tensor([-0.0016], grad_fn=<NegBackward>)
--------------------------------------------------
BO STEP:  1
--------------------------------------------------
Candidate  0 tensor([[0.4335, 0.7709, 0.3491, 0.7302, 0.6886, 0.0042]]) tensor([-0.0210], grad_fn=

Candidate  1 tensor([[0.9265, 0.2576, 0.3642, 0.1772, 0.3089, 0.9994]]) tensor([-7.4444e-19], grad_fn=<NegBackward>)
Candidate  2 tensor([[0.3462, 0.8991, 0.3599, 0.6164, 0.2959, 0.7436]]) tensor([-1.3918e-13], grad_fn=<NegBackward>)
Candidate  3 tensor([[0.2846, 0.4429, 0.1432, 0.0341, 0.6348, 0.0013]]) tensor([-0.0234], grad_fn=<NegBackward>)
Candidate  4 tensor([[0.4674, 0.8096, 0.9618, 0.9360, 0.5053, 0.5390]]) tensor([-5.2337e-18], grad_fn=<NegBackward>)
Result:  tensor([[0.3797, 0.7961, 0.2794, 0.6784, 0.5599, 0.0030]]) tensor([-0.0341], grad_fn=<NegBackward>)
--------------------------------------------------
BO STEP:  11
--------------------------------------------------
Candidate  0 tensor([[0.3785, 0.7990, 0.2593, 0.6771, 0.5341, 0.0027]]) tensor([-0.0486], grad_fn=<NegBackward>)
Candidate  1 tensor([[0.8064, 0.7688, 0.8100, 0.3800, 0.1871, 0.6969]]) tensor([-1.1308e-29], grad_fn=<NegBackward>)
Candidate  2 tensor([[0.3730, 0.0993, 0.3069, 0.4249, 0.4430, 0.6283]]) tensor([-3

# Stochastic approach

In [11]:
set_random_seed(333)

pyro.clear_param_store()
gp_model_svi = gp.models.GPRegression(X_train, Y_train, gp.kernels.Matern52(input_dim=6, lengthscale=torch.ones(6)))

# Set priors
gp_model_svi.kernel.lengthscale = pyro.nn.PyroSample(dist.LogNormal(0, 1).expand([6]).to_event())
gp_model_svi.kernel.variance = pyro.nn.PyroSample(dist.LogNormal(0, 1))
gp_model_svi.noise = pyro.nn.PyroSample(dist.LogNormal(0, 1))

# Set guides
gp_model_svi.kernel.autoguide("lengthscale", dist.Normal)
gp_model_svi.kernel.autoguide("variance", dist.Normal)
gp_model_svi.autoguide("noise", dist.Normal)

# optimise
optimizer = torch.optim.Adam(gp_model_svi.parameters(), lr=0.005)
loss_fn = pyro.infer.TraceMeanField_ELBO().differentiable_loss
losses = gp.util.train(gp_model_svi, optimizer, loss_fn, num_steps=2000)

bo_steps = 20

for i in range(bo_steps):
    print("-"*50)
    print("BO STEP: ", i)
    print("-"*50)
    
    gp_model_svi_ = copy.copy(gp_model_svi)
    
    xmin, xcans = next_x(gp_model_svi)
    
    update_posterior(gp_model_svi, xmin, svi_mode=True)
        

Model sites:
  kernel.lengthscale
  kernel.variance
  noiseGuide sites:
  noise
  kernel.lengthscale
  kernel.variance
  "Guide sites:\n  " + "\n  ".join(guide_sites))


--------------------------------------------------
BO STEP:  0
--------------------------------------------------
Candidate  0 tensor([[0.4699, 0.7447, 0.3841, 0.6920, 0.6353, 0.0048]]) tensor([-0.0353], grad_fn=<NegBackward>)
Candidate  1 tensor([[0.4679, 0.3049, 0.5816, 0.2563, 0.4632, 0.0345]]) tensor([-1.2469e-07], grad_fn=<NegBackward>)
Candidate  2 tensor([[0.4056, 0.3312, 0.2102, 0.7912, 0.8868, 0.6790]]) tensor([-6.7885e-06], grad_fn=<NegBackward>)
Candidate  3 tensor([[2.7676e-01, 3.9869e-04, 7.8953e-01, 3.4166e-02, 3.5278e-01, 8.9763e-01]]) tensor([-4.0613e-06], grad_fn=<NegBackward>)
Candidate  4 tensor([[0.6033, 0.5760, 0.0712, 0.6738, 0.8409, 0.3037]]) tensor([-6.9344e-06], grad_fn=<NegBackward>)
Result:  tensor([[0.4699, 0.7447, 0.3841, 0.6920, 0.6353, 0.0048]]) tensor([-0.0353], grad_fn=<NegBackward>)
--------------------------------------------------
BO STEP:  1
--------------------------------------------------
Candidate  0 tensor([[0.4695, 0.7458, 0.3775, 0.6907, 0.62

Candidate  1 tensor([[0.1821, 0.4349, 0.3009, 0.9712, 0.5084, 0.3888]]) tensor([-8.5361e-07], grad_fn=<NegBackward>)
Candidate  2 tensor([[0.2149, 0.9227, 0.0193, 0.3689, 0.6019, 0.4095]]) tensor([-4.5501e-05], grad_fn=<NegBackward>)
Candidate  3 tensor([[0.2168, 0.1710, 0.5498, 0.0197, 0.6153, 0.5227]]) tensor([-8.9280e-09], grad_fn=<NegBackward>)
Candidate  4 tensor([[0.3865, 0.5748, 0.1963, 0.6617, 0.4324, 0.1784]]) tensor([-3.6837e-07], grad_fn=<NegBackward>)
Result:  tensor([[0.4124, 0.8216, 0.2913, 0.6577, 0.5551, 0.0030]]) tensor([-0.0316], grad_fn=<NegBackward>)
--------------------------------------------------
BO STEP:  11
--------------------------------------------------
Candidate  0 tensor([[0.4019, 0.8323, 0.2728, 0.6475, 0.5393, 0.0027]]) tensor([-0.0323], grad_fn=<NegBackward>)
Candidate  1 tensor([[0.5433, 0.5827, 0.6691, 0.6629, 0.0451, 0.3136]]) tensor([-6.8619e-10], grad_fn=<NegBackward>)
Candidate  2 tensor([[0.0061, 0.9186, 0.0090, 0.3332, 0.5563, 0.1864]]) tensor