In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams.update(plt.rcParamsDefault)
import os, sys
from scipy.stats import norm, skewnorm
from scipy.stats import gaussian_kde
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, classification_report, make_scorer, log_loss, roc_auc_score, brier_score_loss
from sklearn.metrics import mean_squared_error, mean_absolute_error

import gpytorch
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
proj_dir = 'C:/Users/ady05/Desktop/NU/DANA/NVQI/prob_learning_new/'
workspace = proj_dir + 'OtherModels/VGP-mrs/'
util_dir = proj_dir + 'OtherModels/utils/'
data_dir = proj_dir + 'datasets/'
proc_dir = proj_dir + 'data processing/'

In [3]:
sys.path.insert(0, util_dir)
from data_proc import data_proc_mrs6
from plot_measures import (
    plot_confusion_matrix,
    plot_roc,
    plot_outcome_prob_relation,
    plot_feature_importance
)

# Data processing

In [4]:
df_comb = pd.read_excel(proc_dir + 'comb.xlsx')
df_num = pd.read_excel(data_dir + 'vargroups_numeric_new.xlsx')
df_cat = pd.read_excel(data_dir + 'vargroups_categorical_new.xlsx')

groupname = 'group 24h'

In [6]:
X_data, y_data, num_names, cat_names = data_proc_mrs6(df_comb, df_num, df_cat, groupname)

(X_data.shape, y_data.shape)

((3588, 76), (3588,))

In [8]:
if hasattr(y_data, "toarray"):  # Check if y_data is a sparse matrix
    y_data = y_data.toarray().ravel() 
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, stratify=y_data, test_size=0.2, random_state=1121218
)

(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

((2870, 76), (2870,), (718, 76), (718,))

In [9]:
X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train).float()

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)  # Adjust batch size as needed

# Variational Gaussian Process Model

In [10]:
class VariationalGPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
            inducing_points.size(0)
        )
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True
        )
        super().__init__(variational_strategy)

        self.mean_module = gpytorch.means.LinearMean(input_size=inducing_points.size(1))
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel() + gpytorch.kernels.LinearKernel()
        )

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

inducing_points = torch.randn(100, X_train_tensor.shape[1])  
model = VariationalGPModel(inducing_points)

likelihood = gpytorch.likelihoods.GaussianLikelihood()
mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=X_train_tensor.shape[0])

optimizer = optim.Adam(model.parameters(), lr=0.01) 
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

epochs = 100
model.train()
likelihood.train()

for epoch in range(epochs):
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()  
        output = model(X_batch)

        loss = -mll(output, y_batch)
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()

    scheduler.step()  # Update learning rate
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_loader)}')

model.eval()
likelihood.eval()
X_test_tensor = torch.from_numpy(X_test).float()

with torch.no_grad(), gpytorch.settings.fast_pred_var():
    predictions = likelihood(model(X_test_tensor))
    mean = predictions.mean.numpy()
    variance = predictions.variance.numpy()



Epoch 1/100, Loss: 64.65619320339627
Epoch 2/100, Loss: 25.795291731092664
Epoch 3/100, Loss: 12.104550711313884
Epoch 4/100, Loss: 6.993372387356228
Epoch 5/100, Loss: 5.36964439286126
Epoch 6/100, Loss: 5.271765454610189
Epoch 7/100, Loss: 4.33830222023858
Epoch 8/100, Loss: 4.579877620273166
Epoch 9/100, Loss: 4.231889496909248
Epoch 10/100, Loss: 4.3026762061648895
Epoch 11/100, Loss: 3.784212907155355
Epoch 12/100, Loss: 3.3583807839287654
Epoch 13/100, Loss: 3.3068451510535346
Epoch 14/100, Loss: 3.2719899548424616
Epoch 15/100, Loss: 3.2397481812371147
Epoch 16/100, Loss: 3.2322032663557265
Epoch 17/100, Loss: 3.235787105560303
Epoch 18/100, Loss: 3.1900858137342665
Epoch 19/100, Loss: 3.1535080009036593
Epoch 20/100, Loss: 3.125660112169054
Epoch 21/100, Loss: 3.082212787204319
Epoch 22/100, Loss: 3.062887689802382
Epoch 23/100, Loss: 3.0536483658684626
Epoch 24/100, Loss: 3.0407443046569824
Epoch 25/100, Loss: 3.0375585397084555
Epoch 26/100, Loss: 3.0264973322550457
Epoch 27/

# Measures

In [11]:
def root_mean_squared_error(y_pred, y_test):
    return np.sqrt(mean_squared_error(y_pred, y_test))
def normal_nll(loc, scale, y_test):
    return -norm.logpdf(y_test.flatten(), loc=loc, scale=scale).mean()

In [12]:
print(root_mean_squared_error(mean, y_test))
print(normal_nll(mean, variance**0.5, y_test))

1.5640403784103054
2.005787681464216
