In [7]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, classification_report, roc_auc_score, log_loss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter
import matplotlib.pyplot as plt
import joblib

original_df = pd.read_csv('Data/diabetes_012_health_indicators_BRFSS2015.csv')

y = original_df.Diabetes_012
X = original_df.drop('Diabetes_012', axis=1)
X.head(1)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0


In [8]:
# Numerical Predictors
numerical_cols = ['BMI', 'MentHlth', 'PhysHlth', 'Age']
# numerical_cols = list(dict.fromkeys(numerical_cols))  # Remove duplicates just in case

# Categorical Predictors
categorical_cols = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
    'NoDocbcCost', 'GenHlth', 'DiffWalk', 'Sex', 'Education', 'Income'
]

# One-hot encode categorical features and drop the first category of each
# Feature Engineering - Turns categorical data into a numerical format that a model can understand
X_cat = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Extract the numerical features
X_num = X[numerical_cols]

# Concatenate numerical and encoded categorical features
X = pd.concat([X_num, X_cat], axis=1)

# Remove any duplicated columns that may result from concat
X = X.loc[:, ~X.columns.duplicated()]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaler will only fit on training data so, calculates Mean & STD and stores this on the scaler
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols]) # Uses scaler that has only seen the traning data -> standardize 

# Combine transformed training group +  assigned clean, matching row indices
df_train = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)

In [9]:
# Removing 'MentHlth' from being an outlier as a single unique value is being treated as an outlier but, is a valid data point
numerical_cols2 = ['BMI', 'Age']

# Set a z-score threshold: ±1 SD → ~68.3% of data, ±2 SD → ~95.5% of data, ±3 SD → ~99.7% of data
threshold = 3    

# Calculate absolute z-scores for numerical columns
z_scores = np.abs(df_train[numerical_cols2])

# Mask: rows where any numerical feature has a z-score >= threshold of 3
outlier_mask = (z_scores >= threshold).any(axis=1)

# Get the outlier rows
outliers = df_train[outlier_mask]

# Get the cleaned training data (rows that are NOT outliers)
df_train_cleaned = df_train[~outlier_mask].reset_index(drop=True)

# Show the outliers
# print("Outlier rows removed from training data:")
# print(outliers)

# Print how many were removed
print(f"\nRemoved {len(outliers)} outliers from training data.")


Removed 2366 outliers from training data.


In [10]:
# Refit scaler on cleaned training data
scaler = StandardScaler()
df_train_cleaned[numerical_cols] = scaler.fit_transform(df_train_cleaned[numerical_cols])

# Apply same transformation to test data
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [11]:
print(set(X_test.columns) - set(df_train_cleaned.drop('Diabetes_012', axis=1).columns))
print(set(df_train_cleaned.drop('Diabetes_012', axis=1).columns) - set(X_test.columns))

set()
set()


In [13]:
# Split training data (features and labels) ===
X_train = df_train_cleaned.drop('Diabetes_012', axis=1)
y_train = df_train_cleaned.Diabetes_012  # or your actual target column

# Reindex X_test to match final X_train
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [14]:
from sklearn.svm import OneClassSVM
import numpy as np

# Define nu values to try
nu_values = [0.01, 0.1, 0.3]

# Dictionary to hold models: models[nu][class] = OneClassSVM
models_by_nu = {}

for nu in nu_values:
    print(f"\nTraining OneClassSVM models with nu = {nu}")
    models = {}

    for cls in y_train.unique():
        # Subset training data for the current class
        X_cls = X_train[y_train == cls]

        # Train OneClassSVM for this class and nu value
        model = OneClassSVM(kernel='rbf', gamma='auto', nu=nu)
        model.fit(X_cls)

        models[cls] = model

    # Store all models for this nu value
    models_by_nu[nu] = models



Training OneClassSVM models with nu = 0.01

Training OneClassSVM models with nu = 0.1

Training OneClassSVM models with nu = 0.3


In [15]:
# Store results
scores_by_nu = {}

for nu, models in models_by_nu.items():
    print(f"\nScoring with nu = {nu}")

    # Compute decision function scores for each class
    scores = {cls: model.decision_function(X_test) for cls, model in models.items()}

    # Combine scores into matrix: (rows = samples, cols = classes)
    X_test_scores = np.column_stack([scores[cls] for cls in sorted(models)])

    # Predict class with highest score (most likely inlier)
    y_pred = np.argmax(X_test_scores, axis=1)

    # Store results
    scores_by_nu[nu] = {
        'scores': X_test_scores,
        'y_pred': y_pred
    }



Scoring with nu = 0.01

Scoring with nu = 0.1

Scoring with nu = 0.3


In [16]:
from sklearn.metrics import classification_report

for nu in scores_by_nu:
    print(f"\n--- Classification Report for nu = {nu} ---")
    y_pred = scores_by_nu[nu]['y_pred']
    print(classification_report(y_test, y_pred))



--- Classification Report for nu = 0.01 ---
              precision    recall  f1-score   support

         0.0       0.85      0.98      0.91     42795
         1.0       0.03      0.02      0.02       944
         2.0       0.47      0.03      0.05      6997

    accuracy                           0.83     50736
   macro avg       0.45      0.34      0.33     50736
weighted avg       0.78      0.83      0.77     50736


--- Classification Report for nu = 0.1 ---
              precision    recall  f1-score   support

         0.0       0.86      0.89      0.87     42795
         1.0       0.02      0.09      0.03       944
         2.0       0.39      0.13      0.19      6997

    accuracy                           0.77     50736
   macro avg       0.42      0.37      0.37     50736
weighted avg       0.78      0.77      0.76     50736


--- Classification Report for nu = 0.3 ---
              precision    recall  f1-score   support

         0.0       0.89      0.68      0.77     42

My data is too big, can we use undersampling then, remove highly overlapping samples for class 2 (e.g., ones with lowest z-score distance from class 0 to build clearer margins helping SVM focus), train OneClassSVM boosting nu and gamma specifically for class 2, Use class-wise score thresholds instead of strict argmax

In [17]:
from imblearn.under_sampling import RandomUnderSampler

# Only use classes 0 and 1 for undersampling
rus = RandomUnderSampler(random_state=42)

# Combine X and y (excluding class 2)
mask = y_train != 2
X_temp, y_temp = rus.fit_resample(X_train[mask], y_train[mask])

# Now add all of class 2 back in
X_balanced = pd.concat([X_temp, X_train[y_train == 2]], axis=0)
y_balanced = pd.concat([y_temp, y_train[y_train == 2]], axis=0)


In [19]:
# For class 2, remove overlapping samples using z-score distance from class 0 mean

from scipy.stats import zscore

# Get means for class 0
X0 = X_balanced[y_balanced == 0]
X2 = X_balanced[y_balanced == 2]

# Compute z-scores (distance from class 0 mean)
z_scores = np.abs((X2 - X0.mean()) / X0.std())

# Keep only top X% furthest points (e.g., top 70%)
X2 = X2.copy()  
X2.loc[:, 'z_dist'] = z_scores.mean(axis=1)
X2_filtered = X2.sort_values(by='z_dist', ascending=False).drop('z_dist', axis=1)
X2_filtered = X2_filtered.iloc[:int(len(X2_filtered) * 0.7)]  # top 70% most distinct


In [22]:
model_2 = OneClassSVM(kernel='rbf', gamma='scale', nu=0.4)
model_2.fit(X2_filtered)

# Train OneClassSVM for class 0
X0 = X_balanced[y_balanced == 0]
model_0 = OneClassSVM(kernel='rbf', gamma='scale', nu=0.01)
model_0.fit(X0)

# Train OneClassSVM for class 1
X1 = X_balanced[y_balanced == 1]
model_1 = OneClassSVM(kernel='rbf', gamma='scale', nu=0.1)
model_1.fit(X1)

# Get decision scores per class
scores = {
    0: model_0.decision_function(X_test),
    1: model_1.decision_function(X_test),
    2: model_2.decision_function(X_test)
}

# Apply a class-wise score threshold boost (favor class 2)
scores[2] += 0.2  # adjust this to tune recall vs precision

# Stack and predict class with highest adjusted score
score_matrix = np.column_stack([scores[cls] for cls in sorted(scores)])
y_pred = np.argmax(score_matrix, axis=1)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.85      0.15      0.26     42795
         1.0       0.01      0.24      0.02       944
         2.0       0.18      0.66      0.29      6997

    accuracy                           0.22     50736
   macro avg       0.35      0.35      0.19     50736
weighted avg       0.75      0.22      0.26     50736



In [26]:
# Post-predict filter:
# This compares actual class 2 score to the best of class 0 or 1
# It only keeps class 2 predictions when they are meaningfully stronger
# This lets you keep strong class 2 predictions, while rejecting borderline false positives → precision improves.

# Step 1: Stack score matrix
score_matrix = np.column_stack([scores[cls] for cls in sorted(scores)])
y_pred = np.argmax(score_matrix, axis=1)

# Step 2: Filter weak class 2 predictions
for i, pred in enumerate(y_pred):
    if pred == 2:
        second_best = max(score_matrix[i, 0], score_matrix[i, 1])
        margin = score_matrix[i, 2] - second_best
        if margin < 0.2:  # tweak this value
            y_pred[i] = np.argmax(score_matrix[i, :2])  # fallback to class 0 or 1

# Step 5: Evaluate results
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.85      0.15      0.26     42795
         1.0       0.01      0.24      0.02       944
         2.0       0.18      0.66      0.29      6997

    accuracy                           0.22     50736
   macro avg       0.35      0.35      0.19     50736
weighted avg       0.75      0.22      0.26     50736



In [28]:
# Hybrid: Train binary classifier for class 2 vs rest (only for refining predictions)

from sklearn.linear_model import LogisticRegression

# Binary classifier for class 2 (vs. all others)
y_binary = (y_train == 2).astype(int)
clf_bin = LogisticRegression(class_weight='balanced', max_iter=1000)
clf_bin.fit(X_train, y_binary)

# OneClassSVM predictions
y_pred = np.argmax(score_matrix, axis=1)

# Filter borderline class 2s using classifier
for i, pred in enumerate(y_pred):
    if pred == 2:
        proba = clf_bin.predict_proba(X_test.iloc[[i]])[0][1]  
        if proba < 0.5:  # You can tune this threshold
            y_pred[i] = np.argmax(score_matrix[i, :2])

# Evaluate
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.85      0.15      0.26     42795
         1.0       0.01      0.44      0.03       944
         2.0       0.31      0.53      0.39      6997

    accuracy                           0.21     50736
   macro avg       0.39      0.37      0.23     50736
weighted avg       0.76      0.21      0.27     50736

