# Sparse Variational GP Baseline for Diabetes Data

This notebook walks through a centralized Sparse Variational Gaussian Process (SVGP) pipeline, which serves as the baseline before moving to the federated-learning setting. Each major step lives in its own section so that individual components can be adapted or swapped out later.


In [None]:
# Step 0 – Core imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report,
    f1_score,
    roc_auc_score,
    precision_recall_curve,
)
import torch
from torch.utils.data import TensorDataset, DataLoader
import gpytorch


## Step 1 – Load the diabetes dataset


In [None]:
# Location of the raw CSV; a relative path, so it is resolved against the
# notebook's working directory.
DATA_PATH = "diabetes_1.csv"

# Read the whole file into a DataFrame and show its size plus the first rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print("Raw shape:", df.shape)
df.head(n=5)


## Step 2 – Basic preprocessing (drop NA + one-hot categorical columns)


In [None]:
# Discard rows with any missing value, then renumber the index so it is
# contiguous again (the old index is thrown away, not kept as a column).
df = df.dropna()
df = df.reset_index(drop=True)

# One-hot encode the columns pandas treats as categorical; drop_first=True
# omits the first level of each encoded column.
df = pd.get_dummies(df, drop_first=True)

print("After preprocessing:", df.shape)
df.head(n=5)


## Step 3 – Train/test split and scaling


In [None]:
target_col = "diabetes"

# Separate the label column from the feature matrix.
X = df.drop(columns=[target_col])
y = df[target_col]

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions comparable between the two parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Train/Test shapes:", X_train_scaled.shape, X_test_scaled.shape)
# Per-class row counts in the training split.
np.bincount(y_train.to_numpy().astype(int))


## Step 4 – Move data into PyTorch tensors and loaders


In [None]:
batch_size = 512
num_inducing = 200
SEED = 42  # same value as the random_state used for the train/test split

# Dedicated, seeded generator: makes the inducing-point selection and the
# mini-batch shuffling repeatable after a kernel restart without touching
# torch's global RNG state.
torch_rng = torch.Generator().manual_seed(SEED)

# Features and labels are stored as float32 tensors.
X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.float32)

dataset = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch_rng,
)

# Initialise the inducing points from a random subset of training rows.
# The slice simply returns fewer rows if the training set is smaller than
# num_inducing.
inducing_idx = torch.randperm(X_train_t.shape[0], generator=torch_rng)[:num_inducing]
inducing_points = X_train_t[inducing_idx]

X_train_t.shape, X_test_t.shape


## Step 5 – Define the SVGP model and likelihood


In [None]:
class SVGPClassificationModel(gpytorch.models.ApproximateGP):
    """Sparse variational GP with a constant mean and a scaled ARD RBF kernel.

    Args:
        inducing_points: Tensor of initial inducing locations. Its first
            dimension sets the number of inducing points and its last
            dimension sets the number of input features (one RBF
            lengthscale is created per feature).
    """

    def __init__(self, inducing_points):
        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
            inducing_points.size(0)
        )
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            learn_inducing_locations=True,
        )
        super().__init__(variational_strategy)

        self.mean_module = gpytorch.means.ConstantMean()
        # The feature count comes from the constructor argument rather than
        # from a notebook-level variable, so the class does not depend on
        # which cells happened to run before it.
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel(ard_num_dims=inducing_points.size(-1))
        )

    def forward(self, x):
        """Return the GP prior at ``x`` as a MultivariateNormal built from the mean and kernel modules."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Build the GP from the initial inducing locations, and pair it with a
# Bernoulli likelihood.
model = SVGPClassificationModel(inducing_points=inducing_points)
likelihood = gpytorch.likelihoods.BernoulliLikelihood()


## Step 6 – Train with variational ELBO


In [None]:
training_iter = 100
learning_rate = 0.01

# Switch both modules to training mode before optimising.
for module in (model, likelihood):
    module.train()

# Variational ELBO objective; num_data is the size of the full training set,
# not of a single mini-batch.
mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(dataset))
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(1, training_iter + 1):
    running_loss = 0.0

    for xb, yb in train_loader:
        optimizer.zero_grad()
        output = model(xb)
        # The optimiser minimises, so the objective is negated.
        loss = -mll(output, yb)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report only the first epoch and every tenth one after that.
    if epoch != 1 and epoch % 10 != 0:
        continue
    print(f"Epoch {epoch:03d} | Loss: {running_loss / len(train_loader):.4f}")


## Step 7 – Evaluate (threshold sweep + metrics)


In [None]:
# Put both modules into evaluation mode for prediction.
for module in (model, likelihood):
    module.eval()

# Predict on the held-out rows without tracking gradients.
with torch.inference_mode():
    latent_pred = model(X_test_t)
    label_dist = likelihood(latent_pred)
    test_probs = label_dist.probs.squeeze().cpu().numpy()

# NOTE(review): the threshold below is chosen on the same test split that the
# metrics are reported on, so the reported F1 is likely optimistic. Consider
# tuning it on a separate validation split.
precision, recall, thresholds = precision_recall_curve(y_test, test_probs)

# Drop the final precision/recall entry so both arrays line up with
# `thresholds`; the small constant guards against division by zero.
prec, rec = precision[:-1], recall[:-1]
f1_values = 2 * prec * rec / (prec + rec + 1e-12)

best_idx = f1_values.argmax()
best_thr = thresholds[best_idx]
y_pred = (test_probs >= best_thr).astype(int)

print(f"Best threshold: {best_thr:.3f}")
print(f"F1@best: {f1_values[best_idx]:.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test, test_probs):.3f}")
print(classification_report(y_test, y_pred))
