In [338]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score,train_test_split
import torch
import gpytorch
from gpytorch.models import ExactGP
from gpytorch.mlls import ExactMarginalLogLikelihood

from gpytorch.distributions import MultivariateNormal
from gpytorch.likelihoods import GaussianLikelihood

import torch
from linear_operator import to_dense
from gpytorch.constraints import Positive
from gpytorch.kernels import Kernel
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))
from sklearn.preprocessing import StandardScaler


from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Load data
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
X = X[:, :3]

# Split the data into training and testing sets
train_x, test_x, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 20% data as test
# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to the training data and transform it
train_x = scaler.fit_transform(train_x)

# Transform the test data using the same scaler
test_x = scaler.transform(test_x)

# Convert to torch tensors
train_x = torch.tensor(train_x,dtype=torch.float32)
test_x = torch.tensor(test_x, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)






'''
k(x_1, x_2) = \exp\left(-\frac{1}{2 \l^2} \sum_{k=1}^d \left(\left| x_{1k} - x_{2k} \right|\right)^2 \right)
'''
    
class CustomRBFKernel(gpytorch.kernels.Kernel):

    has_lengthscale = True

    def forward(self, x1, x2, diag=False, **params):
        # Compute squared distance
        # squared_dist = self.covar_dist(x1, x2, square_dist=True, diag=diag, **params)
        diff = x1.unsqueeze(1) - x2.unsqueeze(0)
        # diff = torch.abs(diff)
        squared_dist = (diff ** 2).sum(-1)

        # Divide by 2 * lengthscale^2
        scaled_squared_dist = squared_dist.div(2 * self.lengthscale.pow(2)) #.div

        # Compute exponential
        covar_matrix = scaled_squared_dist.mul_(-1).exp_()

        return covar_matrix


class DPkernel(gpytorch.kernels.Kernel):
    def __init__(self, base_kernel, num_dims, q_additivity, **kwargs):
        super().__init__(**kwargs)
        self.base_kernel = base_kernel
        self.num_dims = num_dims
        self.q_additivity = q_additivity
        self.register_parameter(
            name="raw_outputscale", 
            parameter=torch.nn.Parameter(torch.zeros(1, self.q_additivity))
        )
        self.outputscale_constraint = gpytorch.constraints.Positive()
        self.register_constraint("raw_outputscale", self.outputscale_constraint)

    @property
    def outputscale(self):
        return self.outputscale_constraint.transform(self.raw_outputscale).squeeze()

    @outputscale.setter
    def outputscale(self, value):
        if not torch.is_tensor(value):
            value = torch.tensor(value, device=self.raw_outputscale.device)
        self.initialize(raw_outputscale=self.outputscale_constraint.inverse_transform(value))

    def forward(self, x1, x2, diag=False, **params):
    # Determine sizes based on input matrices
        x1_size = x1.size(0)
        x2_size = x2.size(0)
        
        # Initialize matrices based on input sizes
        result = torch.zeros(x1_size, x2_size, device=x1.device) #initialize the result matrix
        sum_order_b = torch.zeros(x1_size, x2_size, device=x1.device) # initialize the matrix for the matrix for a single order
        kernels =[] # list were the z1, z2,... would be stored

        # print(f"Initial x1 shape: {x1.shape}, x2 shape: {x2.shape}")
        
        #calculations for first order
        #calcualte the kernels for each dimentions
        for d in range(self.num_dims):
            x1_d = x1[:, d:d+1]
            x2_d = x2[:, d:d+1]
            k_d = self.base_kernel(x1_d, x2_d).evaluate() # change thek to k0
            kernels.append(k_d) #save them in order in the kernels list
            # print(f"Kernel k_d at dim {d} shape: {k_d.shape}, sum_order_b shape: {sum_order_b.shape}")

            sum_order_b += k_d # add each one dimension kernels to one matrix for first order
    
        # first_kernels = kernels
        outputscale = self.outputscale.unsqueeze(0) if len(self.outputscale.shape) == 0 else self.outputscale
        result += sum_order_b * self.outputscale[0] #add the first order kernel miltiplied by first outputscale

        # Compute higher order interactions
        for i in range(1, self.q_additivity):
            temp_sum = torch.zeros(x1_size, x2_size, device=x1.device)
            new_kernels = []
            for j in range(self.num_dims):
                for k in range(j + 1, self.num_dims):
                    new_kernel = kernels[j] * kernels[k]
                    new_kernels.append(new_kernel)
                    temp_sum += new_kernel

            kernels = new_kernels  # update kernels list with new order interactions
            result += temp_sum * self.outputscale[i]

        return result

# Example usage in a GP model
class MyGP(gpytorch.models.ExactGP): # i need to find a diferent model
    def __init__(self, train_x, train_y, likelihood):
        super(MyGP, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ZeroMean()
        # self.base_kernel = gpytorch.kernels.RBFKernel()
        self.base_kernel = CustomRBFKernel()
        self.covar_module = DPkernel(base_kernel=self.base_kernel, num_dims=train_x.size(-1), q_additivity=train_x.size(-1))

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x,x)  # Make sure to pass x twice WHY
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Create the GP model
likelihood = gpytorch.likelihoods.GaussianLikelihood()


model = MyGP(train_x, y_train.squeeze(-1), likelihood)
model.eval()
# with torch.no_grad():
#     untrained_pred_dist = likelihood(model(test_x))
#     predictive_mean = untrained_pred_dist.mean
#     lower, upper = untrained_pred_dist.confidence_region()
# Set up optimizer and marginal log likelihood
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

model.train()
likelihood.train()
# Training loop
training_iter = 1000
for i in range(training_iter):
    optimizer.zero_grad()
    output = model(train_x)
    # print(output)
    loss = -mll(output, y_train)
    loss = loss.mean() 
    loss.backward()
    # print('Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f' % (
    #     i + 1, training_iter, loss.item(),
    #     model.covar_module.base_kernel.lengthscale.item(),
    #     model.likelihood.noise.item()
    # ))
    optimizer.step()
# print('likelihood noise', likelihood.noise)
# print('likelihood noise raw', likelihood.noise_covar.raw_noise)
model.eval()
# with torch.no_grad():
#     trained_pred_dist = likelihood(model(test_x))
#     predictive_mean = trained_pred_dist.mean
#     lower, upper = trained_pred_dist.confidence_region()
# Viewing model parameters after training
for param_name, param in model.named_parameters():
    print(f'Parameter name: {param_name:42} value = {param.data}')


#evaluating there is a problem when the test_y and test_x have float numbers
with torch.no_grad():
    model.eval()  # Set the model to evaluation mode (mode is for computing predictions through the model posterior.)
    likelihood.eval()
    output = likelihood(model(test_x))  # Make predictions on new data 
    


for constraint_name, constraint in model.named_constraints():
    print(f'Constraint name: {constraint_name:55} constraint = {constraint}')


# Extracting means and standard deviations
predicted_means = output.mean.numpy() 
predicted_stddevs = output.stddev.numpy()  # Extract standard deviations

print("Predicted Means:")
print(predicted_means)

print("Predicted Standard Deviations:")
print(predicted_stddevs)




Parameter name: likelihood.noise_covar.raw_noise           value = tensor([3.8435])
Parameter name: base_kernel.raw_lengthscale                value = tensor([[-4.1559]])
Parameter name: covar_module.raw_outputscale               value = tensor([[20.0976, 23.1188, 14.7172]])
Constraint name: likelihood.noise_covar.raw_noise_constraint             constraint = GreaterThan(1.000E-04)
Constraint name: base_kernel.raw_lengthscale_constraint                  constraint = Positive()
Constraint name: covar_module.raw_outputscale_constraint                 constraint = Positive()
Predicted Means:
[136.43709  126.317535 128.54861  142.43285  132.39062  128.48618
 156.03854  203.0178   149.64705  170.51485  134.30383  150.25023
 154.87848  204.96857   99.698975 165.05661  193.5575   196.85657
 155.33037  200.43794  159.80981  131.59915  160.64688  205.76811
 149.31332  174.03635  217.24796  169.20798  109.80788  126.742065
 141.68932  122.119675 166.41295  196.33499  142.92044  166.7475
  93.424

In [339]:
# calculate teh alpha_hat_eta
with torch.no_grad():
    # Evaluate the kernel matrix
    t_k_matrix = model.covar_module(train_x).evaluate()
    
    # Ensure the noise variance is non-zero and sufficiently large to avoid singularity
    noise_variance = likelihood.noise_covar.noise if likelihood.noise_covar.noise > 1e-6 else 1e-6
    n_matrix = torch.eye(t_k_matrix.size(-1), device=t_k_matrix.device) * noise_variance
    
    # Add regularization to avoid singular matrix

    K_inv = torch.inverse(t_k_matrix + n_matrix + torch.eye(t_k_matrix.size(-1), device=t_k_matrix.device))

    # Compute alpha_hat_eta using the inverse (dot product)
    alpha_hat_eta = torch.matmul(K_inv, y_train)

In [345]:
# print(K.shape)  # Should be (n_samples, n_features)
n_samples, n_features = train_x.shape

# Initialize a tensor to hold the full kernel matrix K
K = torch.zeros(n_samples, n_features)

with torch.no_grad():
    kernel = model.covar_module

    for j in range(n_features):
        # Extract the j-th feature from all samples, ensure it is 2D: [n_samples, 1]
        x_feature_j = train_x[:, j].unsqueeze(1)
        
        # Compute the kernel matrix for this feature across all samples
        # This assumes the kernel can process a matrix of shape [n_samples, 1]
        # and output a distance matrix of shape [n_samples, n_samples]
        pred_dist_matrix = kernel( x_feature_j).evaluate()

        # Since we want to store only specific values or summarization, let's store the mean of each row
        # This operation depends on your specific requirement, here we assume mean for example purposes
        K[:, j] = pred_dist_matrix.mean(dim=1)
print(K)

tensor([[ 65.5050, 115.2710,  64.4729],
        [ 66.1310, 115.2710,  64.0486],
        [ 66.7570, 121.8438,  64.2640],
        ...,
        [ 66.4440, 121.8438,  64.8722],
        [ 64.2530, 121.8438,  66.0921],
        [ 64.2530, 121.8438,  64.0591]])


In [347]:
import itertools

n_samples, n_features = train_x.size()
val = torch.zeros(n_features)
#### calculate the extendent K


# Generate all combinations of feature indices
feature_combinations = list(itertools.chain.from_iterable(
    itertools.combinations(range(n_features), r) for r in range(1, n_features + 1)
))

# Initialize an extended kernel matrix to store the results
extended_K = torch.zeros((n_samples, len(feature_combinations)))

# Compute the product of kernels for each combination
for idx, combination in enumerate(feature_combinations):
    # print(type(combination[0]))
    # Start with the first column in the combination
    product_kernel = K[:, combination[0]]
    if len(combination)>1:
        # Multiply by each subsequent column in the combination
        for col in combination[1:]:
            product_kernel *= K[:, col]
    
    # Store the result in the corresponding column of extended_K
    extended_K[:, idx] = product_kernel

# print(extended_K)

# print("THIS IS EXTENDED", extended_K)
## 

# create a loop for each feature
for j in range(n_features):
    
    # find the subsets where the j-th feature was used to create (find the index)
    # print('pre', indices_of_kj_columns)
    indices_of_kj_columns = []
    
        # Check if kj is present in the combination
    for idx, combination in enumerate(feature_combinations):
        # print(idx,combination)
        if j in combination:
            indices_of_kj_columns.append(idx)
    indices_of_kj_columns = torch.tensor(indices_of_kj_columns) # change it from a list to a tensor
    
  
    
    
    ### updated extended K
    updated_extended_K = torch.zeros(n_samples, len(indices_of_kj_columns)) # initialise
    # update the K_extended matrix so that only the columns with the indexes found 
    # Copy the columns from extended_K corresponding to indices_of_kj_columns
    for i, idx in enumerate(indices_of_kj_columns):
        # idx = int(idx)
        updated_extended_K[:, i] = extended_K[:, idx]
  

    #create a vector of weights for the corresponding columns (a weight i sthe 1/len(number of indences) that build this column) 
    weights = torch.zeros((len(indices_of_kj_columns))).unsqueeze(-1)

    #Set the diagonal elements to the number of kernels needed to cover each feature combination
    for i, idx in enumerate(indices_of_kj_columns):
        weights[i] = 1/len(feature_combinations[idx])
    # print('weights', weights)
    omega = torch.matmul( updated_extended_K, weights)
    
    # Transpose the result back to its original shape
    omega = omega.T.squeeze(-1)
    

    val[j] = torch.matmul(omega, alpha_hat_eta)
    


print('this is shapley value',val)



this is shapley value tensor([1.4436e+25, 1.4436e+25, 1.4436e+25])
