In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix, make_scorer
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor


### Read Data

In [2]:
# Load the dataset CSV (path is relative to the working directory) and
# preview its first rows.
diabetes = pd.read_csv(
    "diabetes_binary_health_indicators_BRFSS2015.csv"
)
diabetes.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


### Remove Outliers

In [3]:
def drop_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]``. Both fences are
    inclusive. Rows whose value is NaN fail the comparison and are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) within the fences.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Series.between is inclusive on both ends by default, matching >= / <=.
    return df[df[column].between(lower_bound, upper_bound)]

# Filter outliers column by column: BMI first, then GenHlth on the result.
diabetes_cleaned = diabetes
for outlier_column in ('BMI', 'GenHlth'):
    diabetes_cleaned = drop_outliers_iqr(diabetes_cleaned, outlier_column)

### Remove Correlated Columns

In [41]:
# Build a table with one variance inflation factor (VIF) per predictor column.
# NOTE(review): no intercept/constant column is appended before calling
# variance_inflation_factor — confirm the resulting values are the ones intended.
diabetes_copy = diabetes_cleaned.drop(columns=['Diabetes_binary'])
predictor_values = diabetes_copy.values
vif_data = pd.DataFrame({
    'Feature': diabetes_copy.columns,
    'VIF': [
        variance_inflation_factor(predictor_values, col_idx)
        for col_idx in range(predictor_values.shape[1])
    ],
})
print(vif_data)

                 Feature        VIF
0                 HighBP   2.195656
1               HighChol   1.982777
2              CholCheck  23.083016
3                    BMI  27.334500
4                 Smoker   1.891714
5                 Stroke   1.099193
6   HeartDiseaseorAttack   1.235291
7           PhysActivity   4.983267
8                 Fruits   3.089197
9                Veggies   6.036738
10     HvyAlcoholConsump   1.085847
11         AnyHealthcare  21.665983
12           NoDocbcCost   1.190272
13               GenHlth  10.300190
14              MentHlth   1.341179
15              PhysHlth   1.574188
16              DiffWalk   1.544783
17                   Sex   1.948048
18                   Age   9.819236
19             Education  31.048162
20                Income  15.146232


In [4]:
# Remove the predictors dropped in the "Remove Correlated Columns" step.
# errors='ignore' keeps this cell idempotent: it reassigns diabetes_cleaned
# from itself, so without it a second run would raise KeyError because the
# columns are already gone.
diabetes_cleaned = diabetes_cleaned.drop(
    columns=['CholCheck', 'AnyHealthcare', 'Education'],
    errors='ignore',
)

### Split Data

In [5]:

# Hold out 20% of the rows for testing (fixed seed), separating the
# predictors from the Diabetes_binary target in the same call.
x_train, x_test, y_train, y_test = train_test_split(
    diabetes_cleaned.drop(['Diabetes_binary'], axis=1),
    diabetes_cleaned['Diabetes_binary'],
    test_size=0.20,
    random_state=42,
)


### Standardize Data

In [6]:
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both splits.
scaler = StandardScaler()
x_train_scaler = scaler.fit_transform(x_train)
x_test_scaler = scaler.transform(x_test)

### Search for Best Parameters with Cross-Validation (with SMOTE)

In [7]:
# Tune the Perceptron with 5-fold cross-validation, scoring on the recall of
# the positive class (label 1).
#
# SMOTE is a pipeline step rather than a one-off resampling of the whole
# training set. Inside GridSearchCV the sampler is then fit on the training
# folds of each split only, so no synthetic sample derived from a validation
# row reaches training, and recall is measured on un-resampled validation data.
# NOTE: because the CV scores change, the selected hyper-parameters may differ
# from those obtained when resampling before the search.
recall_class1_scorer = make_scorer(
    recall_score,
    pos_label=1,
    average='binary'
)

smote = SMOTE(random_state=42)
perceptron = Perceptron(random_state=42)
smote_perceptron = ImbPipeline(steps=[
    ('smote', smote),
    ('perceptron', perceptron),
])

# Parameter names carry the pipeline step prefix.
param_grid = [
    {
        'perceptron__penalty': ['l1', 'l2', 'elasticnet'],
        'perceptron__alpha': [0.0001, 0.001, 0.01, 0.1],
        'perceptron__eta0': [0.001, 0.01, 0.1],
        'perceptron__class_weight': ['balanced', None]
    }
]
grid_search = GridSearchCV(
    smote_perceptron,
    param_grid,
    cv=5,
    scoring=recall_class1_scorer,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(x_train_scaler, y_train)

# best_estimator_ is the whole pipeline refit on the full training set;
# the sampler step is skipped when predicting.
best_model = grid_search.best_estimator_

y_pred = best_model.predict(x_test_scaler)

# Strip the step prefix so the printed names match the Perceptron arguments.
best_params = {
    name.split('__', 1)[-1]: value
    for name, value in grid_search.best_params_.items()
}
print("Best Parameters:", best_params)
print("\nTest Accuracy: {:.2%}".format(accuracy_score(y_test, y_pred)))


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'alpha': 0.0001, 'class_weight': 'balanced', 'eta0': 0.1, 'penalty': 'l1'}

Test Accuracy: 71.06%


### Test 1: With SMOTE

In [None]:

# Oversample the scaled training set with SMOTE, then fit a Perceptron with
# fixed hyper-parameters and report accuracy on both sets.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train_scaler, y_train)

model = Perceptron(
    penalty='l1',
    alpha=0.0001,
    eta0=0.1,
    class_weight='balanced',
    random_state=42,
).fit(x_train_resampled, y_train_resampled)

# Training accuracy is measured on the resampled data; test accuracy on the
# untouched test split.
y_train_pred = model.predict(x_train_resampled)
y_test_pred = model.predict(x_test_scaler)

train_accuracy = 100 * accuracy_score(y_train_resampled, y_train_pred)
test_accuracy = 100 * accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.2f}%")
print(f"Test Accuracy: {test_accuracy:.2f}%")
print(classification_report(y_test, y_test_pred, target_names=['No Diabetes', 'Diabetes']))

def plot_confusion_matrix(y_true, y_pred, title):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Rows are the actual classes and columns the predicted classes, both
    labeled 'No Diabetes' / 'Diabetes'. ``title`` is used as the figure title.
    """
    class_names = ['No Diabetes', 'Diabetes']
    _, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt='d',
        cmap='Greens',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

# plot_confusion_matrix(y_train_resampled, y_train_pred, "Training Set Confusion Matrix")
# plot_confusion_matrix(y_test, y_test_pred, "Test Set Confusion Matrix")


Training Accuracy: 70.77%
Test Accuracy: 71.06%
              precision    recall  f1-score   support

 No Diabetes       0.95      0.71      0.81     40903
    Diabetes       0.25      0.70      0.37      5689

    accuracy                           0.71     46592
   macro avg       0.60      0.71      0.59     46592
weighted avg       0.86      0.71      0.76     46592



### Search for Best Parameters with Cross-Validation (without SMOTE)

In [46]:
# Grid-search Perceptron hyper-parameters on the scaled (non-resampled)
# training set, using 5-fold CV scored on recall of the positive class.
recall_class1_scorer = make_scorer(recall_score, pos_label=1, average='binary')

perceptron = Perceptron(random_state=42)

param_grid = [
    dict(
        penalty=['l1', 'l2', 'elasticnet'],
        alpha=[0.0001, 0.001, 0.01, 0.1],
        eta0=[0.001, 0.01, 0.1],
        class_weight=['balanced', None],
    )
]

grid_search = GridSearchCV(
    estimator=perceptron,
    param_grid=param_grid,
    scoring=recall_class1_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(x_train_scaler, y_train)

# Evaluate the refit best estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test_scaler)

print("Best Parameters:", grid_search.best_params_)
print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.2%}")


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'alpha': 0.001, 'class_weight': 'balanced', 'eta0': 0.1, 'penalty': 'l1'}

Test Accuracy: 61.17%


### Test 2: Without SMOTE

In [10]:
# Fit a class-weighted Perceptron directly on the scaled training set
# (no resampling) and report accuracy on both splits.
perceptron = Perceptron(
    penalty='l1',
    alpha=0.001,
    eta0=0.1,
    class_weight='balanced',
    random_state=42,
)
perceptron.fit(x_train_scaler, y_train)

y_train_pred = perceptron.predict(x_train_scaler)
y_test_pred = perceptron.predict(x_test_scaler)

train_accuracy = 100 * accuracy_score(y_train, y_train_pred)
test_accuracy = 100 * accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.2f}%")
print(f"Test Accuracy: {test_accuracy:.2f}%")

def plot_confusion_matrix(y_true, y_pred, title, labels=('No Diabetes', 'Diabetes')):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted class labels.
    title : str
        Figure title.
    labels : sequence of str, default ('No Diabetes', 'Diabetes')
        Tick labels for the classes, in the order used by the confusion
        matrix. The default has one name per class of the binary target;
        a three-name list containing 'Prediabetes' does not correspond to
        the two classes and mislabels the axes.
    """
    cm = confusion_matrix(y_true, y_pred)
    tick_labels = list(labels)
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.title(title)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

# plot_confusion_matrix(y_train, y_train_pred, "Training Set Confusion Matrix")
# plot_confusion_matrix(y_test, y_test_pred, "Test Set Confusion Matrix")

# Per-class precision/recall/F1 on the test split.
print("\nClassification Report (Test Set):")
test_report = classification_report(
    y_test,
    y_test_pred,
    target_names=['No Diabetes', 'Diabetes'],
)
print(test_report)

Training Accuracy: 61.29%
Test Accuracy: 61.17%

Classification Report (Test Set):
              precision    recall  f1-score   support

 No Diabetes       0.97      0.58      0.72     40903
    Diabetes       0.22      0.85      0.35      5689

    accuracy                           0.61     46592
   macro avg       0.59      0.72      0.54     46592
weighted avg       0.88      0.61      0.68     46592



### Test 3 Change Threshold

In [14]:
# Re-label the fitted Perceptron's outputs with a custom cut-off on its
# decision-function scores: scores at or above the threshold become class 1.
threshold = 0.1

y_train_scores = perceptron.decision_function(x_train_scaler)
y_test_scores = perceptron.decision_function(x_test_scaler)

y_train_pred = np.where(y_train_scores >= threshold, 1, 0)
y_test_pred = np.where(y_test_scores >= threshold, 1, 0)

train_accuracy = 100 * accuracy_score(y_train, y_train_pred)
test_accuracy = 100 * accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy with Threshold {threshold}: {train_accuracy:.2f}%")
print(f"Test Accuracy with Threshold {threshold}: {test_accuracy:.2f}%")

print("\nClassification Report (Test Set) with Threshold =", threshold)
threshold_report = classification_report(
    y_test,
    y_test_pred,
    target_names=['No Diabetes', 'Diabetes'],
)
print(threshold_report)


Training Accuracy with Threshold 0.1: 62.79%
Test Accuracy with Threshold 0.1: 62.78%

Classification Report (Test Set) with Threshold = 0.1
              precision    recall  f1-score   support

 No Diabetes       0.96      0.60      0.74     40903
    Diabetes       0.23      0.84      0.36      5689

    accuracy                           0.63     46592
   macro avg       0.59      0.72      0.55     46592
weighted avg       0.87      0.63      0.69     46592



### Test 4 Get the top 10 Features

In [49]:

# Rank the (unscaled) training features with the ANOVA F-test and keep the
# ten highest-scoring ones.
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(x_train, y_train)

# Boolean mask over the original columns marking the selected features.
support_mask = selector.get_support()
selected_features = x_train.columns[support_mask]
print("Best features:", selected_features)

Best features: Index(['HighBP', 'HighChol', 'BMI', 'HeartDiseaseorAttack', 'PhysActivity',
       'GenHlth', 'PhysHlth', 'DiffWalk', 'Age', 'Income'],
      dtype='object')


In [50]:

# Train and evaluate a Perceptron on the features kept by `selector` only.
# NOTE(review): this cell rebinds `scaler`, `x_train_scaler` and
# `x_test_scaler`, which the standardization cell also assigns. Cells that
# expect the full-feature arrays (e.g. the threshold test calling
# perceptron.decision_function) will receive the reduced arrays if re-run
# after this one — confirm whether distinct names are wanted.
X_train_selected = selector.transform(x_train)
X_test_selected = selector.transform(x_test)

# Re-fit the scaler on the reduced training matrix.
scaler = StandardScaler()
x_train_scaler = scaler.fit_transform(X_train_selected)
x_test_scaler = scaler.transform(X_test_selected)

model_selected_features = Perceptron(
    penalty='l2',
    alpha=0.001,
    eta0=0.1,
    class_weight='balanced',
    random_state=42,
)
model_selected_features.fit(x_train_scaler, y_train)

y_train_pred = model_selected_features.predict(x_train_scaler)
y_test_pred = model_selected_features.predict(x_test_scaler)

train_accuracy = 100 * accuracy_score(y_train, y_train_pred)
test_accuracy = 100 * accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.2f}%")
print(f"Test Accuracy: {test_accuracy:.2f}%")
print(classification_report(y_test, y_test_pred, target_names=['No Diabetes', 'Diabetes']))


Training Accuracy: 67.74%
Test Accuracy: 67.83%
              precision    recall  f1-score   support

 No Diabetes       0.95      0.67      0.78     40903
    Diabetes       0.24      0.77      0.37      5689

    accuracy                           0.68     46592
   macro avg       0.60      0.72      0.58     46592
weighted avg       0.87      0.68      0.73     46592

