# GP Initial Implementation: GPyTorch

In [2]:
# Import needed libraries and modules
from codecarbon import EmissionsTracker
import numpy as np
import torch
import gpytorch
from torch.optim import Adam
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Fetch dataset from UCI Repository
# (id=45 is the repository entry requested here; the variable name suggests it
# is the Heart Disease dataset -- TODO confirm against the UCI catalogue.)
from ucimlrepo import fetch_ucirepo
heart_disease = fetch_ucirepo(id=45)
# Work on the `original` frame exposed under `.data`; the pre-processing below
# reads both the feature columns and the `num` target column from it.
df = heart_disease.data.original

In [3]:

# ---------------------------------------------------------------------------- #
#                                PRE-PROCESSING                                #
# ---------------------------------------------------------------------------- #

##### SETTINGS #####
PC_Features = True      # Append a PCA step after scaling / one-hot encoding
PCA_Components = 12     # Number of principal components kept when PC_Features is on
Random_Seed = 82024     # Seed for the shuffled cross-validation split
K_Folds = 10            # Number of cross-validation folds
Max_Iterations = 200    # Optimizer steps per fold
####################

# Drop rows with missing values and renumber the remaining rows from 0
df = df.dropna().reset_index(drop=True)

# Binarize target: every non-zero `num` becomes 1, zero stays 0
df.loc[df['num'] != 0, 'num'] = 1

# Define features and target vectors.
# The target is removed by name so the split does not depend on `num` being
# the last column of the frame.
X = df.drop(columns='num')
y = df['num']

# Separate numeric from categorical features
int_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
cat_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Define preprocessing: standardize the numeric columns, one-hot encode the
# categorical ones.
# - handle_unknown='ignore': a category that only shows up in a test fold is
#   encoded as all zeros instead of making `transform` raise.
# - sparse_threshold=0: always return a dense array, which is what the
#   tensor conversion (and PCA on older scikit-learn releases) requires.
preprocessor = ColumnTransformer(
    transformers=[
        ('int', StandardScaler(), int_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ],
    sparse_threshold=0,
)

# Define pipeline depending on whether PCA is requested or not
if PC_Features:
    preprocessor = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=PCA_Components)),
    ])

In [4]:
# ---------------------------------------------------------------------------- #
#                                     MODEL                                    #
# ---------------------------------------------------------------------------- #

# Start CodeCarbon so the emissions of the modelling code below are tracked
tracker = EmissionsTracker(
    project_name='GP gpytorch model',
    log_level='warning',
)
tracker.start()

# Exact GP model: constant mean + scaled RBF kernel
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact Gaussian process with a constant mean and a scaled RBF kernel.

    Args:
        train_X: Training inputs, forwarded to ``gpytorch.models.ExactGP``.
        train_y: Training targets, forwarded to ``gpytorch.models.ExactGP``.
        likelihood: Likelihood object, forwarded to ``gpytorch.models.ExactGP``.
    """

    def __init__(self, train_X, train_y, likelihood):
        super().__init__(train_X, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        """Return the multivariate normal given by the mean and kernel at ``x``."""
        prior_mean = self.mean_module(x)
        prior_covar = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(prior_mean, prior_covar)

# K-Fold cross-validation
kfold = KFold(n_splits=K_Folds, shuffle=True, random_state=Random_Seed)
roc_aucs, accs = [], []

# The emissions tracker was started before this point. Everything below runs
# inside try/finally so the tracker is stopped even when a fold raises; a
# tracker that is never stopped is a likely cause of CodeCarbon's
# "Another instance of codecarbon is already running" error on the next run.
try:
    for train_idx, test_idx in kfold.split(X):
        # Split data into training and testing sets
        train_X, test_X = X.iloc[train_idx], X.iloc[test_idx]
        train_y, test_y = y.iloc[train_idx], y.iloc[test_idx]

        # Preprocess data: fit on the training fold only, then apply the
        # fitted transform to the held-out fold (no leakage from test data)
        train_X = preprocessor.fit_transform(train_X)
        test_X = preprocessor.transform(test_X)

        # Convert to PyTorch tensors
        train_X = torch.tensor(train_X, dtype=torch.float32)
        train_y = torch.tensor(train_y.to_numpy(), dtype=torch.float32)
        test_X = torch.tensor(test_X, dtype=torch.float32)
        test_y = torch.tensor(test_y.to_numpy(), dtype=torch.float32)

        # Initialize a fresh model and likelihood for this fold
        likelihood = gpytorch.likelihoods.GaussianLikelihood()
        model = ExactGPModel(train_X, train_y, likelihood)

        # Switch to training mode
        model.train()
        likelihood.train()

        # Use Adam optimizer on the negative exact marginal log-likelihood
        optimizer = Adam(model.parameters(), lr=0.1)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        for _ in range(Max_Iterations):
            optimizer.zero_grad()
            output = model(train_X)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()

        # Evaluate model on the held-out fold
        model.eval()
        likelihood.eval()
        with torch.no_grad(), gpytorch.settings.fast_pred_var():
            test_pred = likelihood(model(test_X))
            # Predictive mean of a GP regression on the 0/1 labels. It is used
            # as a ranking score for AUC and thresholded at 0.5 for accuracy;
            # it is not constrained to lie in [0, 1].
            pred_probs = test_pred.mean.numpy()
            roc_aucs.append(roc_auc_score(test_y.numpy(), pred_probs))
            accs.append(accuracy_score(test_y.numpy(), (pred_probs > 0.5).astype(int)))

    # Calculate and display results (mean and standard deviation across folds)
    acc = np.mean(accs)
    acc_std = np.std(accs)
    roc_auc = np.mean(roc_aucs)
    roc_auc_std = np.std(roc_aucs)

    print(f"Accuracy: {acc:.4f} ± {acc_std:.4f}")
    print(f"AUC-ROC: {roc_auc:.4f} ± {roc_auc_std:.4f}")
finally:
    # Stop emission tracking, whether or not the run above succeeded
    _ = tracker.stop()

[codecarbon ERROR @ 16:31:56] Error: Another instance of codecarbon is already running. Turn off the other instance to be able to run this one. Exiting.




Accuracy: 0.8421 ± 0.0388
AUC-ROC: 0.9119 ± 0.0369
