In [15]:
# imports
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [16]:
# Read the cleaned dataset written by the EDA step into a DataFrame.
# The path is relative to the notebook's working directory.
file_path = "../data/cleaned_diabetes_health_indicators_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [17]:
# Build the design matrix, the binary target, and a stratified train/test split.
#
# Outputs used by later cells: X, y, y_binary, X_train, X_test,
# y_train_binary, y_test_binary.
dependent_column = ["Diabetes_012"]
numerical_predictor_columns = ["BMI", "Age", "Income"]

# Categorical predictors that are one-hot encoded into X (order = column order in X).
# NOTE(review): the previous version of this cell also dummy-encoded MentHlth and
# PhysHlth but never concatenated them into X. They are left out here so the
# feature set is unchanged -- confirm whether excluding them is intended.
categorical_predictor_columns = [
    "HighBP", "HighChol", "CholCheck", "Smoker", "Stroke", "HeartDiseaseorAttack",
    "PhysActivity", "Fruits", "Veggies", "HvyAlcoholConsump", "AnyHealthcare",
    "NoDocbcCost", "GenHlth", "DiffWalk", "Sex", "Education",
]

# Numerical predictors, cast to float so the scaled values can be written back
# without dtype-coercion warnings.
numeric_features = df[numerical_predictor_columns].astype(float)

# One-hot encode every categorical predictor in a single call.
# Passing `columns=` makes pandas prefix each dummy with its source column
# ("HighBP_1.0", "GenHlth_2.0", ...), so names are unique; without a prefix the
# dummies of different predictors all collide on names such as "1.0".
# dtype=float keeps the whole design matrix numeric instead of mixing bool/float.
dummy_features = pd.get_dummies(
    df[categorical_predictor_columns],
    columns=categorical_predictor_columns,
    drop_first=True,
    dtype=float,
)

# Unscaled design matrix: numerical predictors first, then the dummy columns.
X_unscaled = pd.concat([numeric_features, dummy_features], axis=1)

# Flatten the single-column target to 1-D to avoid sklearn's column-vector warning.
y = df[dependent_column].values.ravel()

# Merge class 1 and class 2 into a single positive class for stage 1.
y_binary = np.where(y == 2, 1, y)

# Split BEFORE scaling so that no test-set statistics reach the training features.
X_train, X_test, y_train_binary, y_test_binary = train_test_split(
    X_unscaled, y_binary, test_size=0.2, random_state=42, stratify=y_binary
)

# Fit the z-score parameters on the training rows only.
scaler = preprocessing.StandardScaler().fit(X_train[numerical_predictor_columns])


def _standardize_numeric(frame):
    """Return a copy of `frame` whose numerical predictors are z-scored with the train-set statistics."""
    scaled = frame.copy()
    scaled[numerical_predictor_columns] = scaler.transform(frame[numerical_predictor_columns])
    return scaled


X_train = _standardize_numeric(X_train)
X_test = _standardize_numeric(X_test)
# Full design matrix, standardized with the same (train-fitted) parameters.
X = _standardize_numeric(X_unscaled)
print("Train/Test Split done")

Train/Test Split done


In [None]:
# Two-stage SVM: stage 1 separates class 0 from classes 1+2, stage 2 separates 1 from 2.

# Wrap the split labels in Series that share the feature frames' index so that
# boolean masks built from them align with the rows of X_train / X_test.
# np.asarray makes the assignment positional, so re-running the cell is safe.
y_train_binary = pd.Series(np.asarray(y_train_binary), index=X_train.index)
y_test_binary = pd.Series(np.asarray(y_test_binary), index=X_test.index)

# `y` is a plain NumPy array, which has no .iloc / .loc. Give the original
# 0/1/2 labels the same index as X so they can be looked up by row label.
y_full = pd.Series(np.asarray(y).ravel(), index=X.index)
y_train_full = y_full.loc[X_train.index]
# Original test labels with 0/1/2, in the same row order as X_test
y_test_full = y_full.loc[X_test.index]

# Stage 1: SVM RBF - 0 vs (1+2)
# random_state only affects the internal shuffling used for probability=True;
# fixing it makes the probability estimates reproducible.
# NOTE(review): probability=True adds an internal cross-validation and is only
# needed if predict_proba is used -- drop it if it is not.
svm_stage1 = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)
svm_stage1.fit(X_train, y_train_binary)

y_pred_binary = svm_stage1.predict(X_test)

print("\nStage 1 SVM Classification Report (0 vs 1+2):")
print(classification_report(y_test_binary, y_pred_binary))

# Stage 2: SVM RBF - 1 vs 2

# Test rows that stage 1 flagged as diabetic (positions within X_test)
indices_pred_diabetes = np.flatnonzero(y_pred_binary == 1)
X_test_diabetes = X_test.iloc[indices_pred_diabetes]
y_test_diabetes = y_test_full.iloc[indices_pred_diabetes]

# For the stage-2 report keep only rows whose true class is 1 or 2; the
# 1-vs-2 classifier cannot be scored on true class-0 rows.
mask_12 = y_test_diabetes.isin([1, 2])
X_test_diabetes = X_test_diabetes[mask_12]
y_test_diabetes = y_test_diabetes[mask_12]

# Second-stage training data: training rows whose true class is 1 or 2
mask_train_diabetes = (y_train_binary == 1)
X_train_diabetes = X_train[mask_train_diabetes]
y_train_diabetes = y_train_full[mask_train_diabetes]

# Train second SVM
svm_stage2 = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)
svm_stage2.fit(X_train_diabetes, y_train_diabetes)

y_pred_second_stage = svm_stage2.predict(X_test_diabetes)

print("\nStage 2 SVM Classification Report (1 vs 2):")
print(classification_report(y_test_diabetes, y_pred_second_stage))

In [None]:
# Reconstruct the final 0/1/2 prediction for every test row.
# Start from the stage-1 output: rows predicted 0 keep that label.
y_pred_final = y_pred_binary.copy()

# Positions (within X_test) of the rows stage 1 flagged as diabetic
pred_diabetes_indices = np.flatnonzero(y_pred_binary == 1)

# Refine EVERY stage-1 positive with the stage-2 model. The selection must not
# depend on the true test labels: filtering on them would leak ground truth into
# the saved predictions and leave false positives hard-coded as class 1.
if pred_diabetes_indices.size:  # SVC.predict raises on an empty input
    y_pred_final[pred_diabetes_indices] = svm_stage2.predict(
        X_test.iloc[pred_diabetes_indices]
    )

# --------------------------------------------
# Save final predictions
# --------------------------------------------
# `os` is imported in the notebook's import cell.
os.makedirs("results", exist_ok=True)
np.save("results/y_pred_svm.npy", y_pred_final)