In [1]:
import torch
import random
import gpytorch

from gpytorch.distributions import MultivariateNormal
from gpytorch.kernels import AdditiveKernel, NewtonGirardAdditiveKernel, RBFKernel, ScaleKernel
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.means import ConstantMean
from gpytorch.mlls import ExactMarginalLogLikelihood
from gpytorch.models import ExactGP
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split




In [2]:

# Seed every random number generator used in this notebook so that a
# Restart-and-Run-All reproduces the same data and initialisation.
SEED = 8
random.seed(SEED)        # Python standard-library RNG
torch.manual_seed(SEED)  # PyTorch RNG
np.random.seed(SEED)     # NumPy global RNG (used by np.random.normal below)

## Generate normally distributed data

In [3]:

# Synthetic regression problem: 10 standard-normal features per sample, of
# which only the first two (columns 0 and 1) enter the target.
num_samples = 200

X = np.random.normal(loc=0, scale=1, size=(num_samples, 10))
y = np.sin(X[:, 0]**2) + 2 * X[:, 0] + X[:, 1] + np.exp(X[:, 0]**2 * X[:, 1])

# Hold out 20% of the samples for testing.
train_x, test_x, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features and target with statistics of the training split only;
# the test split is transformed with those same statistics.
scaler_x = StandardScaler().fit(train_x)
scaler_y = StandardScaler().fit(y_train[:, None])

train_x_scaled = scaler_x.transform(train_x)
y_train_scaled = scaler_y.transform(y_train[:, None]).ravel()
test_x_scaled = scaler_x.transform(test_x)
y_test_scaled = scaler_y.transform(y_test[:, None]).ravel()

# From here on the split names refer to float32 torch tensors of the scaled data.
train_x, test_x, y_train, y_test = (
    torch.tensor(scaled, dtype=torch.float32)
    for scaled in (train_x_scaled, test_x_scaled, y_train_scaled, y_test_scaled)
)

print(train_x.shape[-1])
print(test_x.shape)



10
torch.Size([40, 10])


## Kernel definitions

In [4]:
class ConstrainedRBFKernel(gpytorch.kernels.Kernel):
    """RBF kernel minus a correction term parameterised by ``mu`` and ``var``.

    With lengthscale ``l``, ``forward`` evaluates

        k(x, x') = exp(-||x - x'||^2 / (2 l^2))
                   - l * sqrt(l^2 + 2 var) / (l^2 + var)
                     * exp(-(||x - mu||^2 + ||x' - mu||^2) / (2 (l^2 + var)))

    Args:
        mu: Centre used in the correction term (default 0).
        var: Variance used in the correction term (default 1).
        **kwargs: Forwarded to ``gpytorch.kernels.Kernel.__init__``.
    """

    has_lengthscale = True

    def __init__(self, mu=0, var=1, **kwargs):
        super(ConstrainedRBFKernel, self).__init__(**kwargs)
        # mu / var are fixed constants stored as plain float tensors
        # (they are not registered as parameters or buffers).
        self.mu = torch.tensor(mu, dtype=torch.float)
        self.var = torch.tensor(var, dtype=torch.float)
        # A single lengthscale of shape (1, 1, 1). Note that it is the *raw*
        # value that starts at 1.0; the lengthscale itself is the Positive()
        # transform of that raw value.
        # NOTE(review): presumably the base class also sets up a lengthscale
        # because has_lengthscale is True; this explicit registration pins the
        # shape to (1, 1, 1) -- confirm against the installed GPyTorch version.
        self.register_parameter(
            name="raw_lengthscale",
            parameter=torch.nn.Parameter(torch.tensor(1.0).view(1, 1, 1)),
        )
        self.register_constraint("raw_lengthscale", gpytorch.constraints.Positive())

    @property
    def lengthscale(self):
        """Constrained (positive) lengthscale derived from ``raw_lengthscale``."""
        return self.raw_lengthscale_constraint.transform(self.raw_lengthscale)

    @lengthscale.setter
    def lengthscale(self, value):
        """Set the lengthscale from a Python scalar or a single-element tensor.

        The value is mapped through the inverse of the positivity constraint
        and written into ``raw_lengthscale``.
        """
        # torch.as_tensor accepts both scalars and tensors without re-wrapping.
        # The previous code called torch.tensor(value) on what was already a
        # tensor, which emits PyTorch's copy-construct UserWarning on every
        # assignment.
        value = torch.as_tensor(value).to(self.raw_lengthscale)
        raw_value = self.raw_lengthscale_constraint.inverse_transform(value.detach().reshape(1, 1, 1))
        self.initialize(raw_lengthscale=raw_value)

    def forward(self, x1, x2, diag = False, **params):
        """Evaluate the constrained kernel between ``x1`` and ``x2``.

        Args:
            x1: First input; the shape comments below assume a 2-D (n1, D) tensor.
            x2: Second input; assumed 2-D (n2, D).
            diag: If True, ``.diag()`` of the computed result is returned.
            **params: Accepted for interface compatibility; not used here.

        Returns:
            The kernel values. The (1, 1, 1) lengthscale takes part in the
            broadcasting, so for 2-D inputs the result appears to carry a
            leading singleton dimension, i.e. (1, n1, n2) -- TODO confirm.
        """
        x1_ = x1.unsqueeze(1)  # (n1, 1, D) for a 2-D x1
        x2_ = x2.unsqueeze(0)  # (1, n2, D) for a 2-D x2
        mu_ = self.mu

        l = self.lengthscale
        l_sq = l**2
        variance = self.var  # used as a variance directly (not squared)

        # Base RBF kernel calculation
        diff = x1_ - x2_
        dists = torch.sum(diff ** 2, -1)
        base = torch.exp(-0.5 * dists / l_sq)

        # Constraint term calculation
        term1 = torch.sum((x1_ - mu_)**2, -1) + torch.sum((x2_ - mu_)**2, -1)
        scaled_l_sq = l_sq + variance
        constraint = torch.exp(-0.5 * term1 / scaled_l_sq)
        scaling_factor = (l* torch.sqrt(l_sq +2*variance)) / scaled_l_sq

        # Constrained kernel
        constrained_kernel = base - scaling_factor * constraint

        # NOTE(review): Tensor.diag() is defined for 1-D/2-D tensors; given the
        # leading singleton dimension noted above, verify the diag=True path.
        return constrained_kernel if not diag else constrained_kernel.diag()

    def evaluate(self, x1, x2=None):
        """Call ``forward`` directly on ``x1`` and ``x2`` (``x2`` defaults to ``x1``)."""
        if x2 is None:
            x2 = x1
        return self.forward(x1, x2)
    
class TestGPModel(ExactGP):
    """Exact GP with a zero mean and a Newton-Girard additive kernel.

    The additive kernel is built on a ``ConstrainedRBFKernel`` base kernel and
    spans as many dimensions as ``train_x`` has features (its last axis).
    """

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        input_dim = train_x.shape[-1]
        self.mean_module = gpytorch.means.ZeroMean()
        self.covar_module = NewtonGirardAdditiveKernel(
            base_kernel=ConstrainedRBFKernel(),
            num_dims=input_dim,
        )

    def forward(self, x):
        """Return the prior ``MultivariateNormal`` at inputs ``x``."""
        return MultivariateNormal(self.mean_module(x), self.covar_module(x))

# Build the likelihood / model pair and switch both to training mode.
likelihood = GaussianLikelihood()
model = TestGPModel(train_x, y_train, likelihood)
model.train()
likelihood.train()

# Objective (exact marginal log-likelihood) and Adam over the model's parameters.
mll = ExactMarginalLogLikelihood(likelihood, model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

for i in range(1000):
    optimizer.zero_grad()
    output = model(train_x)
    # The loss is evaluated with the Cholesky jitter setting raised to 2e-4.
    with gpytorch.settings.cholesky_jitter(2e-4):
        loss = -mll(output, y_train)
    loss.backward()
    optimizer.step()
    print(f'Iteration {i+1}/1000 - Loss: {loss.item()}')

Iteration 1/1000 - Loss: 1.4318050146102905
Iteration 2/1000 - Loss: 1.4232187271118164
Iteration 3/1000 - Loss: 1.4178907871246338
Iteration 4/1000 - Loss: 1.4152036905288696
Iteration 5/1000 - Loss: 1.414358139038086
Iteration 6/1000 - Loss: 1.4144659042358398
Iteration 7/1000 - Loss: 1.4147462844848633
Iteration 8/1000 - Loss: 1.4147002696990967
Iteration 9/1000 - Loss: 1.4141401052474976
Iteration 10/1000 - Loss: 1.4131066799163818
Iteration 11/1000 - Loss: 1.4117610454559326
Iteration 12/1000 - Loss: 1.4102990627288818
Iteration 13/1000 - Loss: 1.4088943004608154
Iteration 14/1000 - Loss: 1.4076683521270752
Iteration 15/1000 - Loss: 1.406672716140747
Iteration 16/1000 - Loss: 1.405897617340088
Iteration 17/1000 - Loss: 1.4053800106048584
Iteration 18/1000 - Loss: 1.4057871103286743
Iteration 19/1000 - Loss: 1.4052900075912476
Iteration 20/1000 - Loss: 1.4056861400604248
Iteration 21/1000 - Loss: 1.4057955741882324
Iteration 22/1000 - Loss: 1.4055414199829102
Iteration 23/1000 - Lo



Iteration 38/1000 - Loss: 1.4047826528549194
Iteration 39/1000 - Loss: 1.405696988105774
Iteration 40/1000 - Loss: 1.4062927961349487
Iteration 41/1000 - Loss: 1.4066720008850098
Iteration 42/1000 - Loss: 1.4068931341171265
Iteration 43/1000 - Loss: 1.4070007801055908
Iteration 44/1000 - Loss: 1.4070289134979248
Iteration 45/1000 - Loss: 1.4070048332214355
Iteration 46/1000 - Loss: 1.4069499969482422
Iteration 47/1000 - Loss: 1.4068758487701416
Iteration 48/1000 - Loss: 1.4067866802215576
Iteration 49/1000 - Loss: 1.4066792726516724
Iteration 50/1000 - Loss: 1.4065427780151367
Iteration 51/1000 - Loss: 1.4063608646392822
Iteration 52/1000 - Loss: 1.4061156511306763
Iteration 53/1000 - Loss: 1.405784010887146
Iteration 54/1000 - Loss: 1.405339002609253
Iteration 55/1000 - Loss: 1.4047414064407349
Iteration 56/1000 - Loss: 1.4039335250854492
Iteration 57/1000 - Loss: 1.4029620885849
Iteration 58/1000 - Loss: 1.6675331592559814
Iteration 59/1000 - Loss: 1.4049571752548218
Iteration 60/100

In [5]:
# Dump the learned parameters as stored by the model (the raw_* values).
for param_name, param in model.named_parameters():
    print(f'Parameter name: {param_name:42} value = {param.data}')

# Evaluate the trained model at the test inputs and pull out the mean,
# full covariance and per-point standard deviations.
model.eval()
likelihood.eval()
with torch.no_grad():
    output = model(test_x)
    predicted_means, predicted_covariance_matrix = output.loc, output.covariance_matrix
    predicted_stddevs = output.stddev.numpy()  # standard deviations as a NumPy array

print("Predicted Means:")
print(predicted_means)
print("Predicted Standard Deviations:")
print(predicted_stddevs.shape)
print(predicted_covariance_matrix.shape)

Parameter name: likelihood.noise_covar.raw_noise           value = tensor([0.7078])
Parameter name: covar_module.raw_outputscale               value = tensor([-1.7381, -2.2710, -2.2770, -2.3386, -2.2976, -2.2260, -2.2388, -2.2593,
        -2.2372, -2.2552])
Parameter name: covar_module.base_kernel.raw_lengthscale   value = tensor([[[3.8822]]])
Predicted Means:
tensor([-0.1218,  0.1457,  0.0025,  0.0292,  0.0702,  0.0669,  0.0534, -0.0349,
         0.0137, -0.1629, -0.0670, -0.1332,  0.0954, -0.0653, -0.0366, -0.0161,
         0.0554,  0.0381, -0.0118,  0.2769,  0.2200, -0.0724,  0.0885,  0.0226,
         0.0040,  0.0038,  0.2994, -0.1375, -0.0317, -0.0514, -0.1664,  0.0962,
        -0.1320, -0.0476, -0.0706, -0.0564, -0.1195,  0.2446, -0.1421, -0.0233])
Predicted Standard Deviations:
(40,)
torch.Size([40, 40])


In [6]:
# Rebuild the additive kernel from the trained hyperparameters and compute the
# GP posterior by hand, so it can be compared with the output of model(test_x).
trained_kernel = model.covar_module.base_kernel
outputscale = model.covar_module.outputscale
kernel = NewtonGirardAdditiveKernel(base_kernel=trained_kernel, num_dims=train_x.shape[-1])
kernel.outputscale = outputscale
noise_variance = likelihood.noise.data

# Small diagonal term added to the test covariance for numerical stability.
jitter = 1e-6

with torch.no_grad():
    model.eval()
    likelihood.eval()

    # Training covariance including observation noise, and the cross / test blocks.
    K_train = kernel(train_x, train_x).evaluate() + noise_variance * torch.eye(train_x.size(0))
    K_s = kernel(test_x, train_x).evaluate()
    # Fix: this line used to add a full identity matrix (1.0 on the diagonal)
    # while being described as jitter, inflating every predictive variance by 1.
    K_ss = kernel(test_x, test_x).evaluate() + jitter * torch.eye(test_x.size(0))

    # Factor K_train once and reuse the factor for the solve, the inverse and
    # the covariance update, instead of mixing torch.inverse with a second
    # Cholesky factorisation.
    chol_train = torch.linalg.cholesky(K_train)
    alpha = torch.cholesky_solve(y_train.unsqueeze(-1), chol_train).squeeze(-1)
    # K_inv is kept only because the original cell defined this name.
    K_inv = torch.cholesky_inverse(chol_train)

    predicted_mean = torch.matmul(K_s, alpha)
    v = torch.linalg.solve_triangular(chol_train, K_s.t(), upper=False)
    predicted_covariance_matrix = K_ss - v.t().matmul(v)


In [7]:
# Posterior mean from the manual computation above; compare it with the
# "Predicted Means" printed from model(test_x).
predicted_mean

tensor([-0.1218,  0.1457,  0.0025,  0.0292,  0.0702,  0.0669,  0.0534, -0.0349,
         0.0137, -0.1629, -0.0670, -0.1332,  0.0954, -0.0653, -0.0366, -0.0161,
         0.0554,  0.0381, -0.0118,  0.2769,  0.2200, -0.0724,  0.0885,  0.0226,
         0.0040,  0.0038,  0.2994, -0.1375, -0.0317, -0.0514, -0.1664,  0.0962,
        -0.1320, -0.0476, -0.0706, -0.0564, -0.1195,  0.2446, -0.1421, -0.0233])

In [8]:
# Check which type the kernel's outputscale attribute returns.
print(type(model.covar_module.outputscale))

<class 'torch.Tensor'>


# Start of Shapley calculations

## K per feature

In [9]:
import torch
import gpytorch

# Assuming train_x and other necessary imports and initializations are done above

n_samples, n_features = train_x.shape

model.eval()
likelihood.eval()

with torch.no_grad():
    # Fresh base kernel that reuses the trained lengthscale.
    l = model.covar_module.base_kernel.lengthscale.item()
    constrained_kernel = ConstrainedRBFKernel()
    constrained_kernel.lengthscale = l

    # Instance to explain: the training sample at index 3, kept 2-D as (1, d).
    instance_features = train_x[3:4]

    # Column i holds the base-kernel values between feature i of the instance
    # and feature i of every training sample.
    K_per_feature = torch.zeros(n_samples, n_features)
    for i in range(n_features):
        feature_column = train_x[:, i:i + 1]              # (n, 1)
        instance_feature = instance_features[:, i:i + 1]  # (1, 1)
        K_per_feature[:, i] = constrained_kernel.evaluate(instance_feature, feature_column)

print(K_per_feature)


tensor([[ 0.0283, -0.1020, -0.0108,  ...,  0.0449,  0.0039,  0.0052],
        [ 0.0161, -0.0518, -0.0641,  ..., -0.0147,  0.0425,  0.0043],
        [-0.0215,  0.0487, -0.0786,  ..., -0.0019,  0.0253,  0.0049],
        ...,
        [-0.0123,  0.0151,  0.0152,  ..., -0.0419, -0.0396, -0.0042],
        [ 0.0558, -0.0227,  0.0541,  ...,  0.0258,  0.0205,  0.0063],
        [ 0.0598,  0.0095,  0.0121,  ...,  0.0650,  0.0008,  0.0008]])


  self.initialize(raw_lengthscale=self.raw_lengthscale_constraint.inverse_transform(torch.tensor(value).view(1, 1, 1)))


In [10]:
# Shape of a single feature column once it is kept 2-D via unsqueeze(1).
train_x[:, 0].unsqueeze(1).shape

torch.Size([160, 1])

In [11]:
# Expected: one row per training sample, one column per feature.
K_per_feature.shape

torch.Size([160, 10])

In [12]:

def Omega(X, i, sigmas, q_additivity=None):
    """Attribution of feature ``i`` under an additive (interaction-order) kernel.

    For every interaction order ``k = 1 .. q_additivity`` the function sums the
    products of the columns of ``X`` over all feature subsets of size ``k``
    that contain feature ``i``, weights that sum by ``1 / k`` and by
    ``sigmas[k - 1]``, and adds the orders up. The ``1 / k`` weight arises as
    the running product of the ``order / (order + 1)`` factors applied below.

    Args:
        X: Tensor of shape (n, d); in this notebook column ``j`` holds the
            base-kernel values of feature ``j``.
        i: Index of the feature to attribute.
        sigmas: Per-order weights; ``sigmas[k]`` scales the order ``k + 1``
            slice and must broadcast against a (d, n) tensor.
        q_additivity: Highest interaction order to include. Defaults to ``d``.

    Returns:
        Tuple ``(omega, dp)``: ``omega`` has shape (n,) and is the attribution
        of feature ``i`` for every row of ``X``; ``dp`` is the full table of
        shape (q_additivity, d, n), in the column order that has feature ``i``
        swapped into position 0.
    """
    n, d = X.shape
    if q_additivity is None:
        q_additivity = d

    # Swap columns 0 and i so that the feature of interest sits in position 0.
    idx = torch.arange(d, device=X.device)
    idx[i] = 0
    idx[0] = i
    X = X[:, idx]

    # Work tensors follow X's device and (floating) dtype instead of being
    # hard-coded to float32 on the CPU.
    dtype = X.dtype if X.is_floating_point() else torch.get_default_dtype()
    dp = torch.zeros((q_additivity, d, n), dtype=dtype, device=X.device)
    sum_current = torch.zeros((n,), dtype=dtype, device=X.device)

    # Order 1 (base case): the columns themselves, plus their running total.
    for j in range(d):
        dp[0, j, :] = X[:, j]
        sum_current += X[:, j]

    # Higher orders. A dedicated loop variable is used so that the parameter
    # `i` is no longer shadowed.
    for order in range(1, q_additivity):
        temp_sum = torch.zeros_like(sum_current)
        for j in range(d):
            # Drop column j's previous-order term so that sum_current only
            # covers the columns after j.
            sum_current -= dp[order - 1, j, :]
            dp[order, j, :] = X[:, j] * sum_current * (order / (order + 1))
            temp_sum += dp[order, j, :]
        sum_current = temp_sum

    # Apply the per-order weights.
    for order in range(q_additivity):
        dp[order] = dp[order] * sigmas[order]

    # Position 0 is feature i: add up its contribution over all orders.
    omega = dp[:, 0, :].sum(dim=0)

    return omega, dp

# Per-order weights as a column vector, then one attribution per feature:
# the Omega vector of feature i dotted with the GP weights alpha.
sigmas = outputscale.data[:, None]
val = torch.zeros(n_features)
for i in range(n_features):
    omega_dp, dp = Omega(K_per_feature, i, sigmas)
    val[i] = omega_dp @ alpha

In [13]:
# Total of the per-feature attributions stored in val.
print(torch.sum(val))

tensor(0.6987)


# Prediction using only the base kernel (which is computed separately for each feature in K_per_feature)

In [29]:

def K_S_instance(X, sigmas, q_additivity=None):
    """Additive-kernel value of every row of ``X``, summed over all orders.

    For every interaction order ``k = 1 .. q_additivity`` the function sums
    the products of the columns of ``X`` over all feature subsets of size
    ``k`` (the k-th elementary symmetric polynomial of the row), scales that
    sum by ``sigmas[k - 1]`` and adds the orders up.

    Args:
        X: Tensor of shape (n, d); in this notebook column ``j`` holds the
            base-kernel values of feature ``j``.
        sigmas: Per-order weights; ``sigmas[k]`` scales the order ``k + 1``
            slice and must broadcast against a (d, n) tensor.
        q_additivity: Highest interaction order to include. Defaults to ``d``.

    Returns:
        Tuple ``(result, dp)``: ``result`` has shape (n,) and ``dp`` is the
        full table of shape (q_additivity, d, n).
    """
    n, d = X.shape
    if q_additivity is None:
        q_additivity = d

    # Work tensors follow X's device and (floating) dtype instead of being
    # hard-coded to float32 on the CPU.
    dtype = X.dtype if X.is_floating_point() else torch.get_default_dtype()
    dp = torch.zeros((q_additivity, d, n), dtype=dtype, device=X.device)
    sum_current = torch.zeros((n,), dtype=dtype, device=X.device)

    # Order 1 (base case): the columns themselves, plus their running total.
    for j in range(d):
        dp[0, j, :] = X[:, j]
        sum_current += X[:, j]

    # Higher orders: dp[order, j] collects the products of order + 1 columns
    # whose first column is j.
    for order in range(1, q_additivity):
        temp_sum = torch.zeros_like(sum_current)
        for j in range(d):
            # Drop column j's previous-order term so that sum_current only
            # covers the columns after j.
            sum_current -= dp[order - 1, j, :]
            dp[order, j, :] = X[:, j] * sum_current
            temp_sum += dp[order, j, :]
        sum_current = temp_sum

    # Apply the per-order weights.
    for order in range(q_additivity):
        dp[order] = dp[order] * sigmas[order]

    # Sum over orders and features. The result is sized from X itself; the
    # earlier version allocated it from the notebook-level global n_samples.
    result = dp.sum(dim=(0, 1))

    return result, dp

# Additive-kernel values between the chosen instance and every training
# sample, rebuilt from the per-feature base-kernel values.
sigmas = outputscale.data[:, None]
K_s_instance, dp = K_S_instance(K_per_feature, sigmas)

# (1, n) @ (n, 1) -> (1, 1): the GP mean prediction for that instance.
prediction = K_s_instance[None, :] @ alpha[:, None]
print(prediction)

tensor([[0.6987]])


In [16]:
# Per-feature attributions computed with Omega for the chosen instance.
val

tensor([ 0.1792,  0.1127,  0.1404,  0.0018,  0.0008,  0.0409,  0.0166,  0.1642,
         0.0447, -0.0027])

In [17]:
# Predictive distribution at the test inputs: the model output passed through
# the likelihood.
model.eval()
with torch.no_grad():
    trained_pred_dist = likelihood(model(test_x))
    predictive_mean = trained_pred_dist.mean
    lower, upper = trained_pred_dist.confidence_region()

# The metric and its report were duplicated verbatim; compute and print once.
# y_test is the standardised target, so the MSE is on the standardised scale.
final_mse = gpytorch.metrics.mean_squared_error(trained_pred_dist, y_test, squared=True)

print(f'Trained model MSE: {final_mse:.2f}')

Trained model MSE: 0.02
Trained model MSE: 0.02


In [None]:
# with torch.no_grad():
#     instance_features = train_x[3].unsqueeze(0) 
#     model.eval()

#     likelihood.eval()
#     K_sample = kernel(instance_features, train_x).evaluate()
#     prediction = torch.matmul(K_sample,alpha)

# prediction