In [1]:
import torch
import gpytorch

from gpytorch.distributions import MultivariateNormal
from gpytorch.kernels import AdditiveKernel, NewtonGirardAdditiveKernel, RBFKernel, ScaleKernel
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.means import ConstantMean
from gpytorch.mlls import ExactMarginalLogLikelihood
from gpytorch.models import ExactGP
from sklearn.preprocessing import StandardScaler
import numpy as np





In [2]:
import random
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)
# Define the number of samples
num_samples = 200

# Generate features X1 to X10 from N(0,1)
X = np.random.normal(0, 1, (num_samples, 10))
y = np.sin(X[:, 0]**2) + 2 * X[:, 0] + X[:, 1] + np.exp(X[:, 0]**2 * X[:, 1])


# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
train_x, test_x, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 20% data as test

# Initialize the scaler for features and target
scaler_x = StandardScaler()
scaler_y = StandardScaler()
# Fit and transform the training data
train_x_scaled = scaler_x.fit_transform(train_x)
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()

# Transform the test data
test_x_scaled = scaler_x.transform(test_x)
y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

# Convert to torch tensors
train_x = torch.tensor(train_x_scaled, dtype=torch.float32)
test_x = torch.tensor(test_x_scaled, dtype=torch.float32)
y_train = torch.tensor(y_train_scaled, dtype=torch.float32)
y_test = torch.tensor(y_test_scaled, dtype=torch.float32)

print(train_x.shape[-1])
print(test_x.shape)
print(y_test)
# train_x = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
# test_x = torch.tensor([[2.0,3.0]])
# y_train = torch.tensor([1,2])



10
torch.Size([40, 10])
tensor([-0.0794, -0.0794, -0.0795, -0.0794, -0.0794, -0.0795, -0.0794, -0.0794,
        -0.0795, -0.0794, -0.0795, -0.0794, -0.0794, -0.0795, -0.0794, -0.0794,
        -0.0794, -0.0794, -0.0794, -0.0794, -0.0795, -0.0794, -0.0794, -0.0794,
        -0.0791, -0.0794, -0.0794, -0.0794, -0.0794, -0.0794, -0.0794, -0.0794,
        -0.0794, -0.0795, -0.0794, -0.0794, -0.0795, -0.0794, -0.0794, -0.0795])


In [5]:
from typing import Optional, Tuple

import torch
from linear_operator import to_dense

from gpytorch.kernels import Kernel
from gpytorch.constraints import Positive


class NewtonGirardAdditiveKernel(Kernel):
    def __init__(
        self,
        base_kernel: Kernel,
        num_dims: int,
        max_degree: Optional[int] = None,
        active_dims: Optional[Tuple[int, ...]] = None,
        **kwargs,
    ):
        """Create an Additive Kernel a la https://arxiv.org/abs/1112.4394 using Newton-Girard Formulae

        :param base_kernel: a base 1-dimensional kernel. NOTE: put ard_num_dims=d in the base kernel...
        :param max_degree: the maximum numbers of kernel degrees to compute
        :param active_dims:
        :param kwargs:
        """
        super(NewtonGirardAdditiveKernel, self).__init__(active_dims=active_dims, **kwargs)

        self.base_kernel = base_kernel
        self.num_dims = num_dims
        if max_degree is None:
            self.max_degree = self.num_dims
        elif max_degree > self.num_dims:  # force cap on max_degree (silently)
            self.max_degree = self.num_dims
        else:
            self.max_degree = max_degree

        self.register_parameter(
            name="raw_outputscale", parameter=torch.nn.Parameter(torch.zeros(*self.batch_shape, self.max_degree))
        )
        outputscale_constraint = Positive()
        self.register_constraint("raw_outputscale", outputscale_constraint)
        self.outputscale_constraint = outputscale_constraint
        self.outputscale = [1 / self.max_degree for _ in range(self.max_degree)]

    @property
    def outputscale(self):
        return self.raw_outputscale_constraint.transform(self.raw_outputscale)

    @outputscale.setter
    def outputscale(self, value):
        self._set_outputscale(value)

    def _set_outputscale(self, value):
        if not torch.is_tensor(value):
            value = torch.as_tensor(value).to(self.raw_outputscale)

        self.initialize(raw_outputscale=self.outputscale_constraint.inverse_transform(value))

    def forward(self, x1, x2, diag=False, last_dim_is_batch=False, **params):
        """Forward proceeds by Newton-Girard formulae"""
        if last_dim_is_batch:
            raise RuntimeError("NewtonGirardAdditiveKernel does not accept the last_dim_is_batch argument.")

        # NOTE: comments about shape are only correct for the single-batch cases.
        # kern_values is just the order-1 terms
        # kern_values = D x n x n unless diag=True
        kern_values = to_dense(self.base_kernel(x1, x2, diag=diag, last_dim_is_batch=True, **params))
        # last dim is batch, which gets moved up to pos. 1

        kernel_dim = -3 if not diag else -2

        shape = [1 for _ in range(len(kern_values.shape) + 1)]
        shape[kernel_dim - 1] = -1
        kvals = torch.arange(1, self.max_degree + 1, device=kern_values.device).reshape(*shape)
        # kvals = R x 1 x 1 x 1 (these are indexes only)

        # e_n = torch.ones(self.max_degree+1, *kern_values.shape[1:], device=kern_values.device)  # includes 0
        # e_n: elementary symmetric polynomial of degree n (e.g. z1 z2 + z1 z3 + z2 z3)
        # e_n is R x n x n, and the array is properly 0 indexed.
        shape = [d_ for d_ in kern_values.shape]
        shape[kernel_dim] = self.max_degree + 1
        e_n = torch.empty(*shape, device=kern_values.device)
        if kernel_dim == -3:
            e_n[..., 0, :, :] = 1.0
        else:
            e_n[..., 0, :] = 1.0

        # power sums s_k (e.g. sum_i^num_dims z_i^k
        # s_k is R x n x n
        s_k = kern_values.unsqueeze(kernel_dim - 1).pow(kvals).sum(dim=kernel_dim)

        # just the constant -1
        m1 = torch.tensor([-1], dtype=torch.float, device=kern_values.device)

        shape = [1 for _ in range(len(kern_values.shape))]
        shape[kernel_dim] = -1
        for deg in range(1, self.max_degree + 1):  # deg goes from 1 to R (it's 1-indexed!)
            # we avg over k [1, ..., deg] (-1)^(k-1)e_{deg-k} s_{k}

            ks = torch.arange(1, deg + 1, device=kern_values.device, dtype=torch.float).reshape(*shape)  # use for pow
            kslong = torch.arange(1, deg + 1, device=kern_values.device, dtype=torch.long)  # use for indexing

            # note that s_k is 0-indexed, so we must subtract 1 from kslong
            sum_ = (
                m1.pow(ks - 1) * e_n.index_select(kernel_dim, deg - kslong) * s_k.index_select(kernel_dim, kslong - 1)
            ).sum(dim=kernel_dim) / deg
            if kernel_dim == -3:
                e_n[..., deg, :, :] = sum_
            else:
                e_n[..., deg, :] = sum_

        if kernel_dim == -3:
            return (self.outputscale.unsqueeze(-1).unsqueeze(-1) * e_n.narrow(kernel_dim, 1, self.max_degree)).sum(
                dim=kernel_dim
            ) +1
        else:
            return (self.outputscale.unsqueeze(-1) * e_n.narrow(kernel_dim, 1, self.max_degree)).sum(dim=kernel_dim) +1


In [6]:

class ConstrainedRBFKernel(gpytorch.kernels.Kernel):
    has_lengthscale = True
    
    def __init__(self, mu=0, var=1, **kwargs):
        super(ConstrainedRBFKernel, self).__init__(**kwargs)
        self.mu = torch.tensor(mu, dtype=torch.float)
        self.var = torch.tensor(var, dtype=torch.float)
        self.register_parameter(name="raw_lengthscale",  parameter= torch.nn.Parameter(torch.tensor(1.0).view(1, 1, 1))) #initialize the lenghthsacel to 1 #torch.nn.Parameter(torch.ones(*self.batch_shape, 1, 1)))
        self.register_constraint("raw_lengthscale", gpytorch.constraints.Positive())

    @property
    def lengthscale(self):
        return self.raw_lengthscale_constraint.transform(self.raw_lengthscale)
    
    # @lengthscale.setter
    # def lengthscale(self, value):
    #     # self.raw_lengthscale = self.raw_lengthscale_constraint.inverse_transform(value)
    #     self.raw_lengthscale = self.raw_lengthscale_constraint
 
    @lengthscale.setter
    def lengthscale(self, value):
        # Transform the value using the inverse of the constraint and set it to raw_lengthscale
        self.raw_lengthscale.data = self.raw_lengthscale_constraint.inverse_transform(torch.tensor(value).view(1, 1, 1))

    def forward(self, x1, x2, diag = False, **params):
        x1_ = x1.unsqueeze(1)  # Shape: [B1, 1, D]
        x2_ = x2.unsqueeze(0)  # Shape: [1, B2, D]
        mu_ = self.mu.expand_as(x1_)  # Ensures mu is the same shape as x1_
        
        l = self.lengthscale
        l_sq = l**2#.pow(2)
        var_sq = self.var**2#.pow(2)

        # Base RBF kernel calculation
        diff = x1_ - x2_
        dists = torch.sum(diff ** 2, -1)
        base = torch.exp(-0.5 * dists / l_sq)

        # Constraint term calculation
        term1 = (x1_ - mu_)**2 + (x2_ - mu_)**2
        scaled_l_sq = l_sq + var_sq
        constraint = torch.exp(-0.5 * term1 / scaled_l_sq).prod(dim=-1)
        scaling_factor = (l* torch.sqrt(l_sq +2*var_sq)) / torch.sqrt(scaled_l_sq )

        # Constrained kernel
        constrained_kernel = base - scaling_factor * constraint

#
        return constrained_kernel if not diag else constrained_kernel.diag()

    def evaluate(self, x1, x2=None):
        if x2 is None:
            x2 = x1
        return self.forward(x1, x2)
    
class TestGPModel(ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ZeroMean()
        self.covar_module = NewtonGirardAdditiveKernel(base_kernel=ConstrainedRBFKernel(), num_dims=train_x.shape[-1])

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return MultivariateNormal(mean_x, covar_x)

likelihood = GaussianLikelihood()
model = TestGPModel(train_x, y_train, likelihood)

# Initialize lengthscale to 1.0
# model.covar_module.base_kernel.raw_lengthscale = torch.nn.Parameter(torch.tensor(1.0).view(1, 1, 1))

# Training procedure
model.train()
likelihood.train()

optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
mll = ExactMarginalLogLikelihood(likelihood, model)

for i in range(1000):
    optimizer.zero_grad()
    output = model(train_x)
    loss = -mll(output, y_train)
    loss.backward()
    optimizer.step()
    print(f'Iteration {i+1}/1000 - Loss: {loss.item()}')

Iteration 1/1000 - Loss: 1.451851487159729
Iteration 2/1000 - Loss: 1.4428479671478271
Iteration 3/1000 - Loss: 1.437516212463379
Iteration 4/1000 - Loss: 1.4351696968078613
Iteration 5/1000 - Loss: 1.4349397420883179
Iteration 6/1000 - Loss: 1.4358514547348022
Iteration 7/1000 - Loss: 1.4370301961898804
Iteration 8/1000 - Loss: 1.4378963708877563
Iteration 9/1000 - Loss: 1.4382150173187256
Iteration 10/1000 - Loss: 1.4379994869232178
Iteration 11/1000 - Loss: 1.4373937845230103
Iteration 12/1000 - Loss: 1.436592936515808
Iteration 13/1000 - Loss: 1.4357882738113403
Iteration 14/1000 - Loss: 1.4351394176483154
Iteration 15/1000 - Loss: 1.4347490072250366
Iteration 16/1000 - Loss: 1.4346510171890259
Iteration 17/1000 - Loss: 1.4348021745681763
Iteration 18/1000 - Loss: 1.435101866722107
Iteration 19/1000 - Loss: 1.4354150295257568
Iteration 20/1000 - Loss: 1.43562650680542
Iteration 21/1000 - Loss: 1.4356725215911865
Iteration 22/1000 - Loss: 1.4355543851852417
Iteration 23/1000 - Loss:

In [7]:
for param_name, param in model.named_parameters():
    print(f'Parameter name: {param_name:42} value = {param.data}')
    # Evaluating with standard deviations
model.eval()
likelihood.eval()
with torch.no_grad():
    output = model(test_x)
    
    # Extracting means and standard deviations
    predicted_means = output.loc
    predicted_covariance_matrix = output.covariance_matrix
    predicted_stddevs = output.stddev.numpy()  # Extract standard deviations
print("Predicted Means:")
print(predicted_means)

print("Predicted Standard Deviations:")
print(predicted_stddevs.shape)


print(predicted_covariance_matrix.shape)

Parameter name: likelihood.noise_covar.raw_noise           value = tensor([-1.8949])
Parameter name: covar_module.raw_outputscale               value = tensor([ 0.3248, -2.2522, -2.3655, -2.1463, -2.3094, -2.2029, -2.2836, -2.2094,
        -2.2805, -2.2200])
Parameter name: covar_module.base_kernel.raw_lengthscale   value = tensor([[[-0.5972]]])
Predicted Means:
tensor([ 0.0225, -0.0037,  0.0614,  0.0645, -0.0288, -0.0300,  0.0131, -0.0116,
         0.0661,  0.0020, -0.0294,  0.0682,  0.0265, -0.0082, -0.0280,  0.0081,
        -0.0296, -0.0278, -0.0283,  0.0384, -0.0049, -0.0241, -0.0294, -0.0272,
        -0.0268, -0.0297, -0.0246, -0.0292, -0.0255, -0.0037,  0.0779, -0.0058,
         0.1828, -0.0291,  0.0352, -0.0277, -0.0294, -0.0286, -0.0263, -0.0296])
Predicted Standard Deviations:
(40,)
torch.Size([40, 40])


In [8]:
trained_kernel = model.covar_module.base_kernel
# l = model.covar_module.base_kernel.raw_lengthscale.data
# base_kernel = ConstrainedRBFKernel()
# base_kernel.lengthscale = l
outputscale = model.covar_module.outputscale
kernel = NewtonGirardAdditiveKernel(base_kernel=trained_kernel, num_dims=train_x.shape[-1])
kernel.outputscale = outputscale
# output = kernel(train_x, train_x).evaluate()
# print(output)
noise_variance = likelihood.noise.data
# Compute training and test kernel matrices

with torch.no_grad():
    model.eval()

    likelihood.eval()
    K_train = kernel(train_x, train_x).evaluate() + noise_variance * torch.eye(train_x.size(0))
    K_s = kernel(test_x,train_x).evaluate()
    K_ss = kernel(test_x, test_x).evaluate() +  torch.eye(test_x.size(0))  # Added jitter for numerical stability

    K_inv =  torch.inverse(K_train)
    alpha = torch.matmul(K_inv, y_train)
    predicted_mean =torch.matmul(K_s,alpha)
    v = torch.linalg.solve_triangular(torch.linalg.cholesky(K_train), K_s.t(), upper=False)
    predicted_covariance_matrix = K_ss - v.t().matmul(v)


In [9]:
predicted_mean

tensor([ 0.0225, -0.0037,  0.0614,  0.0645, -0.0288, -0.0300,  0.0131, -0.0116,
         0.0661,  0.0020, -0.0294,  0.0682,  0.0265, -0.0082, -0.0280,  0.0081,
        -0.0296, -0.0278, -0.0283,  0.0384, -0.0049, -0.0241, -0.0294, -0.0272,
        -0.0268, -0.0297, -0.0246, -0.0292, -0.0255, -0.0037,  0.0779, -0.0058,
         0.1828, -0.0291,  0.0352, -0.0277, -0.0294, -0.0286, -0.0263, -0.0296])

In [10]:
print(type(model.covar_module.outputscale))

<class 'torch.Tensor'>


# start of Shapley calculations

## K per feature

In [11]:
import torch
import gpytorch

# Assuming train_x and other necessary imports and initializations are done above

n_samples, n_features = train_x.shape

K_per_feature = torch.zeros(n_samples, n_features)

with torch.no_grad():
    model.eval()
    likelihood.eval()
    
    # Get lengthscale from model's kernel if it's already defined
    l = model.covar_module.base_kernel.lengthscale.item()
    
    # Define the ConstrainedRBFKernel

    constrained_kernel = ConstrainedRBFKernel()

    constrained_kernel.lengthscale = l
    
    # Extract the instance's features; assuming you want the 4th sample (index 3)
    instance_features = train_x[3].unsqueeze(0)  # Shape (1, d)

    # Loop over each feature dimension
    for i in range(n_features):
        # Extract the i-th feature across all samples
        feature_column = train_x[:, i].unsqueeze(1)  # Shape (n, 1)
        instance_feature = instance_features[:, i].unsqueeze(1)  # Shape (1, 1)
       
        # Compute the kernel matrix for the i-th feature
        # K_per_feature[:, i] = constrained_kernel.evaluate(instance_feature, feature_column)
        K_per_feature[:, i] = constrained_kernel.evaluate(instance_feature, feature_column)
        # print(K_per_feature)
# Do something with K_per_feature or print it
print(K_per_feature)


tensor([[-0.3874,  0.2199,  0.3397,  ..., -0.3648, -0.1430,  0.1106],
        [-0.3671, -0.0310,  0.4900,  ..., -0.2932, -0.0905,  0.4345],
        [ 0.6923,  0.0877, -0.0707,  ..., -0.0735,  0.0042, -0.3816],
        ...,
        [-0.2566, -0.2379,  0.1517,  ..., -0.4040,  0.8251, -0.0142],
        [-0.0453,  0.3056, -0.2298,  ...,  0.6157,  0.1103,  0.0995],
        [-0.3780,  0.1142, -0.0707,  ..., -0.3244, -0.1171, -0.3112]])


In [12]:

def Omega(X, i,sigmas,q_additivity=None):
    
    n, d = X.shape
    if q_additivity is None:
        q_additivity = d
    
    # Reorder columns so that the i-th column is first
    idx = torch.arange(d)
    idx[i] = 0
    idx[0] = i
    X = X[:, idx]

    # Initialize dp array
    dp = torch.zeros((q_additivity, d, n))

    # Initial sum of features across the dataset
    sum_current = torch.zeros((n,))
    
    # Fill the first order dp (base case)
    for j in range(d):
        dp[0, j, :] = X[:, j]
        sum_current += X[:, j]

    # Fill the dp table for higher orders
    for i in range(1, q_additivity):
        temp_sum = torch.zeros((n,))
        for j in range(d):
            # Subtract the previous contribution of this feature when moving to the next order
            sum_current -= dp[i - 1, j, :]

            dp[i, j, :] =  X[:,j] * sum_current
            dp[i, j, :] = dp[i, j, :] * (i/(i+1)) 
            dp[i,j,:] = dp[i,j,:]
            temp_sum += dp[i, j, :]
        
        sum_current = temp_sum
    for i in range(q_additivity):
        dp[i,:,:] = dp[i,:,:] * sigmas[i]
    # Sum the first row of each slice
    omega = torch.sum(dp[:, 0, :], axis=0)

    return omega , dp

sigmas = outputscale.data.unsqueeze(-1)
val = torch.zeros(n_features)
for i in range(n_features):
    omega_dp, dp = Omega(K_per_feature, i, sigmas,q_additivity=None)
    val[i] = torch.matmul(omega_dp, alpha)

In [13]:
print(torch.sum(val))

tensor(-12.0453)


In [14]:
with torch.no_grad():
    instance_features = train_x[3].unsqueeze(0) 
    model.eval()

    likelihood.eval()
    K_sample = kernel(instance_features, train_x).evaluate()
    prediction = torch.matmul(K_sample,alpha)

prediction

tensor([-0.0692])

In [15]:
prediction- torch.sum(alpha)

tensor([-0.0390])