In [2]:
import os
import sys

import gpytorch as gp
import numpy as np
import torch

# Put the parent directory on the import path; the later cells import
# `bi_gp` and `experiments`, which presumably live there (verify layout).
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

# os.environ['CUDA_VISIBLE_DEVICES'] = "7"
device = 'cuda:4' if torch.cuda.is_available() else None

# torch.cuda.set_device() rejects None, so only select a GPU when one was
# actually chosen above; on CPU-only machines this step is skipped.
if device is not None:
    torch.cuda.set_device(device)

In [3]:
from bi_gp.bilateral_kernel import BilateralKernel, MaternLattice, RBFLattice

class BilateralGPModel(gp.models.ExactGP):
    """Exact GP with a constant mean and a scaled lattice kernel.

    Args:
        train_x: Training inputs handed to ``ExactGP``.
        train_y: Training targets handed to ``ExactGP``.
        nu: When not ``None``, a ``MaternLattice(nu=nu, order=order)`` base
            kernel is used; when ``None``, ``RBFLattice(order=order)`` is used.
        order: Forwarded to the lattice kernel constructor.
        min_noise: Lower bound applied to the Gaussian likelihood noise via a
            ``GreaterThan`` constraint.
    """

    def __init__(self, train_x, train_y, nu=None, order=1, min_noise=1e-4):
        noise_floor = gp.constraints.GreaterThan(min_noise)
        likelihood = gp.likelihoods.GaussianLikelihood(noise_constraint=noise_floor)
        super().__init__(train_x, train_y, likelihood)

        # NOTE: the assignment order below fixes the order in which
        # `parameters()` yields values; downstream cells index into it.
        self.mean_module = gp.means.ConstantMean()
        if nu is None:
            self.base_covar_module = RBFLattice(order=order)
        else:
            self.base_covar_module = MaternLattice(nu=nu, order=order)
        self.covar_module = gp.kernels.ScaleKernel(self.base_covar_module)

    def forward(self, x):
        """Return the multivariate normal built from the mean and covariance at ``x``."""
        return gp.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

In [4]:
from experiments.utils import prepare_dataset

# Build an iterator over the 'houseelectric' dataset on the selected device.
# NOTE(review): with uci_data_dir=None the loader presumably falls back to a
# default data location — confirm in experiments.utils.
data_iter = prepare_dataset('houseelectric', uci_data_dir=None, device=device)
# Consume only the first item the iterator yields: its first element is
# discarded, the other two are kept as the training inputs and targets.
_, train_x, train_y = next(data_iter)

## Autograd

This currently uses an approximation to the gradient, which is itself computed as another collection of filtering operations.

In [5]:
# Evaluate the negative marginal log-likelihood once and back-propagate
# through it, with the GPyTorch solver settings below active for both the
# forward and the backward pass.
with gp.settings.cg_tolerance(1.0):
    with gp.settings.max_preconditioner_size(50):
        with gp.settings.max_root_decomposition_size(100):
            model = BilateralGPModel(train_x, train_y, nu=1.5, order=1).to(device)
            mll = gp.mlls.ExactMarginalLogLikelihood(model.likelihood, model)

            loss = -mll(model(train_x), train_y)
            print(f'Loss: {loss.item()}')

            loss.backward()

# Show every parameter next to the gradient autograd produced for it; the
# printed index is the one the finite-difference check refers to.
for idx, p in enumerate(model.parameters()):
    print(f'[{idx}] {p} ---> {p.grad}')

Discretized kernel coeffs: tensor([0.3404, 1.0000, 0.3404])
Discretized kernel deriv coeffs: tensor([0.3463, 1.0000, 0.3463])
Using /home/sanyam_s/.cache/torch_extensions as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/sanyam_s/.cache/torch_extensions/gpu_lattice/build.ninja...
Building extension module gpu_lattice...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module gpu_lattice...
Loss: 0.8699606657028198
[0] Parameter containing:
tensor([0.], device='cuda:4', requires_grad=True) ---> tensor([0.3078], device='cuda:4')
[1] Parameter containing:
tensor([0.], device='cuda:4', requires_grad=True) ---> tensor([-0.0941], device='cuda:4')
[2] Parameter containing:
tensor([[0.]], device='cuda:4', requires_grad=True) ---> tensor([[-0.2925]], device='cuda:4')
[3] 0.0 ---> -0.010579816997051239


## Central Difference

In [6]:
def f(eps=1e-2, p_idx=None):
    """Return the negative MLL of a fresh model with one parameter shifted by ``eps``.

    A new ``BilateralGPModel`` (``nu=1.5``, ``order=1``) is built on the
    notebook-level ``train_x`` / ``train_y`` and moved to ``device``. The
    parameter at position ``p_idx`` in ``model.parameters()`` is shifted in
    place by ``eps`` before the loss is evaluated.

    Args:
        eps: Amount added to the selected parameter (may be negative).
        p_idx: Index into ``model.parameters()`` of the parameter to perturb.
            With ``None`` (or an out-of-range index) nothing is perturbed.

    Returns:
        The loss tensor ``-mll(model(train_x), train_y)``.
    """
    model = BilateralGPModel(train_x, train_y, nu=1.5, order=1).to(device)
    mll = gp.mlls.ExactMarginalLogLikelihood(model.likelihood, model)

    for i, p in enumerate(model.parameters()):
        if p_idx == i:
            print(f'[{p_idx}] {p} + {eps}')
            ## Assumes no ARD, scalar params
            # In-place edits of a leaf parameter that requires grad are only
            # allowed with autograd disabled; do it locally so the function
            # does not depend on the caller being inside torch.no_grad().
            with torch.no_grad():
                p += eps
            # An index matches at most one parameter, so stop scanning.
            break

    loss = -mll(model(train_x), train_y)
    print(f'Loss: {loss.item()}')
    return loss

In [7]:
# Central finite-difference estimate of d(loss)/d(parameter), repeated
# several times so the spread between runs can be reported.
eps = 1e-2
p_idx = 2  ## lengthscale at index 2 (see above)
grads = []
with torch.no_grad():
    for _ in range(5):
        # Evaluate at +eps first, then at -eps, each on a freshly built model.
        loss_plus = f(p_idx=p_idx, eps=eps)
        loss_minus = f(p_idx=p_idx, eps=-eps)
        grad = (loss_plus - loss_minus) / (2. * eps)
        print(f'Finite Diff: {grad}')
        grads.append(grad.item())

# Mean estimate with a two-standard-deviation band.
print(f'{np.mean(grads)} +/- {2 * np.std(grads)}')

Discretized kernel coeffs: tensor([0.3404, 1.0000, 0.3404])
Discretized kernel deriv coeffs: tensor([0.3463, 1.0000, 0.3463])
[2] Parameter containing:
tensor([[0.]], device='cuda:4', requires_grad=True) + 0.01
Loss: 0.8672400712966919
Discretized kernel coeffs: tensor([0.3404, 1.0000, 0.3404])
Discretized kernel deriv coeffs: tensor([0.3463, 1.0000, 0.3463])
[2] Parameter containing:
tensor([[0.]], device='cuda:4', requires_grad=True) + -0.01
Loss: 0.8714877963066101
Finite Diff: -0.21238625049591064
Discretized kernel coeffs: tensor([0.3404, 1.0000, 0.3404])
Discretized kernel deriv coeffs: tensor([0.3463, 1.0000, 0.3463])
[2] Parameter containing:
tensor([[0.]], device='cuda:4', requires_grad=True) + 0.01
Loss: 0.8674075603485107
Discretized kernel coeffs: tensor([0.3404, 1.0000, 0.3404])
Discretized kernel deriv coeffs: tensor([0.3463, 1.0000, 0.3463])
[2] Parameter containing:
tensor([[0.]], device='cuda:4', requires_grad=True) + -0.01
Loss: 0.8715829253196716
Finite Diff: -0.2087