# Gaussian Process (GP) Initial Implementation: Scikit-Learn

In [1]:
# Import needed libraries and modules
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import roc_auc_score, accuracy_score
import torch
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# Fetch dataset from UCI Repository
from ucimlrepo import fetch_ucirepo
# Request dataset id 45 from the UCI repository and keep the returned object
# around as `heart_disease`.
# NOTE(review): `data.original` is presumably the single frame holding both the
# features and the target -- the pre-processing cell reads the `num` target
# column and the feature columns from `df`; confirm against the ucimlrepo docs.
heart_disease = fetch_ucirepo(id=45)
df = heart_disease.data.original

In [2]:
# ---------------------------------------------------------------------------- #
#                                PRE-PROCESSING                                #
# ---------------------------------------------------------------------------- #
# Cleans `df`, then builds the feature matrix `X` and the binary target `y`.
# When Cross_Validation is False it additionally produces the hold-out split
# train_X / train_y / test_X / test_y (and train_index / test_index).

# --------------------------------- SETTINGS --------------------------------- #
Normalize = True          # Scale numeric columns and one-hot encode categorical ones
PC_Features = True        # Replace the encoded features by principal components
                          # (this also forces the scaling/encoding step)
Num_Components = 12       # Number of principal components kept when PC_Features is set
Test_Size = 0.2           # Hold-out fraction; only read when Cross_Validation is False
Random_Seed = 82024       # Seeds the train/test split
Torch = False             # Convert the resulting arrays to torch tensors
Num_iterations = 100      # Not read anywhere in this cell
Cross_Validation = True   # True: no split is made here; False: make a hold-out split

# Drop missing values and renumber the rows 0..n-1 so that positional indices
# and index labels coincide (the split below relies on that for `y.loc`).
df = df.dropna().reset_index(drop=True)

# Binarize target: every non-zero value of `num` becomes 1
df.loc[df['num'] != 0, 'num'] = 1

# Define features and target vectors. The target is excluded by name rather than
# by position so a different column order cannot leak it into the features.
X = df.drop(columns='num')
y = df['num']

# Split row indices *before* fitting any transformer, so that on the hold-out
# path the scaler / encoder / PCA only ever learn from the training rows.
if not Cross_Validation:
    index = list(range(len(df)))
    train_index, test_index = train_test_split(
        index, test_size=Test_Size, random_state=Random_Seed
    )

# Scale + encode if requested (PCA needs the encoded matrix, hence the `or`)
if Normalize or PC_Features:
    int_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
    cat_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

    preprocessor = ColumnTransformer(
        transformers=[
            ('int', StandardScaler(), int_features),
            # 'ignore' keeps transform() from raising when a category level is
            # absent from the rows the encoder was fitted on.
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ],
        # Always return a dense array: PCA and the list-based row indexing below
        # should not depend on the density heuristic of the transformer.
        sparse_threshold=0,
    )
    if Cross_Validation:
        # NOTE(review): this fits on every row, so the folds created later by
        # cross-validation still share scaling/encoding statistics. Removing
        # that requires fitting the transformers inside each fold.
        X = preprocessor.fit_transform(X)
    else:
        preprocessor.fit(X.iloc[train_index])
        X = preprocessor.transform(X)
else:
    X = X.values

# Apply PCA if requested
if PC_Features:
    pca = PCA(n_components=Num_Components)
    if Cross_Validation:
        X = pca.fit_transform(X)
    else:
        pca.fit(X[train_index])
        X = pca.transform(X)

if not Cross_Validation:
    # Slice train and test data out of the transformed matrix
    train_X = X[train_index]
    train_y = y.loc[train_index].values

    test_X = X[test_index]
    test_y = y.loc[test_index].values

# Convert to torch tensors if requested. This is done last: the split above
# needs pandas/NumPy objects (`y.loc`, list indexing), which tensors lack.
if Torch:
    X = torch.tensor(X)
    y = torch.tensor(y.to_numpy()).double()
    if not Cross_Validation:
        train_X = torch.tensor(train_X)
        test_X = torch.tensor(test_X)
        train_y = torch.tensor(train_y).double()
        test_y = torch.tensor(test_y).double()



In [3]:
# ---------------------------------------------------------------------------- #
#                                     MODEL                                    #
# ---------------------------------------------------------------------------- #
# Gaussian-process classifier with an RBF kernel (initial length scale 1.0),
# evaluated either by cross-validation or on the hold-out split, depending on
# the Cross_Validation setting.
rbf_kernel = RBF(length_scale=1.0)
model = GaussianProcessClassifier(kernel=rbf_kernel, random_state=Random_Seed)

if not Cross_Validation:
    # Hold-out evaluation: fit on the training split ...
    model.fit(train_X, train_y)

    # ... then score hard labels and class probabilities on the test split.
    y_pred = model.predict(test_X)
    pred_probs = model.predict_proba(test_X)

    acc = accuracy_score(test_y, y_pred)
    # Column 1 of predict_proba is passed as the score for AUC.
    roc_auc = roc_auc_score(test_y, pred_probs[:, 1])
else:
    # Cross-validated evaluation with cross_validate's default `cv` splitter;
    # each metric is averaged over the folds.
    cv_results = cross_validate(
        model,
        X,
        y,
        scoring=['accuracy', 'roc_auc'],
    )
    acc = np.mean(cv_results['test_accuracy'])
    roc_auc = np.mean(cv_results['test_roc_auc'])

print(f"Accuracy: {acc:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

Accuracy: 0.8551
AUC-ROC: 0.9061
