In [1]:
# imports
import os
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Read the cleaned dataset written by the EDA step into a DataFrame.
# The path is relative to the notebook's working directory.
file_path = "../data/cleaned_diabetes_health_indicators_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Target is the 3-class "Diabetes_012" column; every other column is a feature.
y = df["Diabetes_012"]
X = df.drop(columns="Diabetes_012")

# Binary target for stage 1: label 2 is folded into label 1, label 0 is kept.
# `mask` returns a new Series, so `y` itself is left untouched.
y_binary = y.mask(y == 2, 1)

# 80/20 train/test split, stratified on the binary target so both partitions
# keep the same class proportions; the seed makes the split repeatable.
X_train, X_test, y_train_binary, y_test_binary = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    random_state=42,
    stratify=y_binary,
)
print("Train/Test Split done")

Train/Test Split done


In [4]:
# Two-stage SVM cascade:
#   stage 1 separates class 0 from the merged class (1 + 2);
#   stage 2 separates class 1 from class 2, trained on the training rows
#   whose binary label is 1.
#
# y_train_binary / y_test_binary are produced by train_test_split from a
# pandas Series alongside X_train / X_test, so they already carry the matching
# index and are used as-is (no re-wrapping in pd.Series is needed).

# Stage 1: SVM RBF - 0 vs (1+2)
# random_state fixes the internal shuffling that probability=True relies on,
# so probability estimates are repeatable across runs (same seed as the split).
svm_stage1 = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)
svm_stage1.fit(X_train, y_train_binary)

y_pred_binary = svm_stage1.predict(X_test)

print("\nStage 1 SVM Classification Report (0 vs 1+2):")
print(classification_report(y_test_binary, y_pred_binary))

# Stage 2: SVM RBF - 1 vs 2

# Get original test labels with 0/1/2.
# y_test_binary.index holds index *labels*, so they must be looked up with
# .loc (as is done for the training labels below). .iloc would interpret them
# as positions, which is only correct while the frame has a default RangeIndex.
y_test_full = y.loc[y_test_binary.index]

# Positions (within X_test) of the samples stage 1 predicted as diabetic
indices_pred_diabetes = np.where(y_pred_binary == 1)[0]
X_test_diabetes = X_test.iloc[indices_pred_diabetes]
y_test_diabetes = y_test_full.iloc[indices_pred_diabetes]

# Keep only true class 1 and 2 so the stage-2 report is computed on samples
# for which a 1-vs-2 label exists (stage-1 false positives are excluded from
# this evaluation only).
mask_12 = (y_test_diabetes == 1) | (y_test_diabetes == 2)
X_test_diabetes = X_test_diabetes[mask_12]
y_test_diabetes = y_test_diabetes[mask_12]

# Prepare second-stage training data: training rows whose binary label is 1,
# paired with their original 1/2 labels.
y_train_full = y.loc[X_train.index]
mask_train_diabetes = (y_train_binary == 1)
X_train_diabetes = X_train[mask_train_diabetes]
y_train_diabetes = y_train_full[mask_train_diabetes]

# Train second SVM
svm_stage2 = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)
svm_stage2.fit(X_train_diabetes, y_train_diabetes)

y_pred_second_stage = svm_stage2.predict(X_test_diabetes)

print("\nStage 2 SVM Classification Report (1 vs 2):")
print(classification_report(y_test_diabetes, y_pred_second_stage))


Stage 1 SVM Classification Report (0 vs 1+2):
              precision    recall  f1-score   support

         0.0       0.94      0.68      0.79     38012
         1.0       0.34      0.79      0.48      7945

    accuracy                           0.70     45957
   macro avg       0.64      0.73      0.63     45957
weighted avg       0.84      0.70      0.73     45957


Stage 2 SVM Classification Report (1 vs 2):
              precision    recall  f1-score   support

         1.0       0.14      0.34      0.20       645
         2.0       0.91      0.76      0.83      5628

    accuracy                           0.72      6273
   macro avg       0.53      0.55      0.51      6273
weighted avg       0.83      0.72      0.77      6273



In [7]:
# Reconstruct the final 3-class prediction array for the whole test set.
#
# Start from the stage-1 output (0 = no diabetes, 1 = "1 or 2") and replace
# every stage-1 positive with the stage-2 decision (1 vs 2).
#
# The routing must depend only on model outputs: selecting the samples to
# refine by their TRUE label would leak ground truth into the saved
# predictions and would leave stage-1 false positives with the placeholder
# value 1 instead of an actual stage-2 prediction.
y_pred_final = y_pred_binary.copy()
pred_diabetes_indices = np.where(y_pred_binary == 1)[0]

# Guard the empty case: predict() cannot be called on zero samples.
if pred_diabetes_indices.size > 0:
    y_pred_final[pred_diabetes_indices] = svm_stage2.predict(
        X_test.iloc[pred_diabetes_indices]
    )

# --------------------------------------------
# Save final predictions
# --------------------------------------------
# `os` is imported in the notebook's first cell.
os.makedirs("results", exist_ok=True)
np.save("results/y_pred_svm.npy", y_pred_final)