In [1]:
#Naive bayes
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score



diab = pd.read_csv("diabetes.csv")


X = diab.drop('Outcome', axis=1)
y = diab['Outcome']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)


X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.4, random_state=25)


nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
acc=accuracy*100
print("Naive Bayes Accuracy:",acc)
def calculate_sensitivity(y_true, y_pred):
    true_positives = sum((y_true == 1) & (y_pred == 1))
    actual_positives = sum(y_true == 1)
    sensitivity = true_positives / actual_positives
    return sensitivity


def calculate_f1_score(y_true, y_pred):
    return f1_score(y_true, y_pred)

def calculate_specificity(TN, FP):
    return TN / (TN + FP)

threshold = 0.2
y_pred_proba = nb_classifier.predict_proba(X_test)[:, 1]
y_pred_adjusted = (y_pred_proba > threshold).astype(int)

sensitivity = calculate_sensitivity(y_test, y_pred_adjusted)
f1_score = calculate_f1_score(y_test, y_pred_adjusted)
cm = confusion_matrix(y_test, y_pred)
TN = cm[0, 0]
FP = cm[0, 1]
specificity=calculate_specificity(TN,FP)

print("Sensitivity:", sensitivity*100)
print("Specificity: ",specificity*100)
print("F1 Score:", f1_score*100)

Naive Bayes Accuracy: 77.59740259740259
Sensitivity: 87.61904761904762
Specificity:  87.192118226601
F1 Score: 63.230240549828174


In [2]:
#knn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,f1_score
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
diab = pd.read_csv("diabetes.csv")

X = diab.drop('Outcome', axis=1)
y = diab['Outcome']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=5)  
X_pca = pca.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.4, random_state=125)
k = 3
algorithm = 'auto'  
metric = 'euclidean'  
knn_classifier = KNeighborsClassifier(n_neighbors=k, algorithm=algorithm, metric=metric)
n_splits = 5
batch_size = 100
performance = []
kf = KFold(n_splits=n_splits)
for train_index, test_index in kf.split(X_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index] 
    num_batches = len(X_train_fold) // batch_size
    for i in range(num_batches):
        batch_start = i * batch_size
        batch_end = (i + 1) * batch_size
        X_batch = X_train_fold[batch_start:batch_end]
        y_batch = y_train_fold.iloc[batch_start:batch_end] 
        knn_classifier.fit(X_batch, y_batch)

    
    y_pred_fold = knn_classifier.predict(X_test_fold)
    score = accuracy_score(y_test_fold, y_pred_fold)
    performance.append(score)

avg_performance = np.mean(performance)

threshold=0.33
y_pred_proba=knn_classifier.predict_proba(X_test)[:,1]
y_pred_adjusted=(y_pred_proba>threshold).astype(int)
f1=f1_score(y_test,y_pred_adjusted)
sensitivity=calculate_sensitivity(y_test,y_pred_adjusted)
print("KNN:", avg_performance*100)
cm = confusion_matrix(y_test, y_pred)
TN = cm[0, 0]
FP = cm[0, 1]
specificity=calculate_specificity(TN,FP)

print("Sensitivity:", sensitivity*100)
print("Specificity: ",specificity*100)
print("f1 score:",f1*100)

KNN: 76.95652173913044
Sensitivity: 89.43089430894308
Specificity:  69.72972972972973
f1 score: 65.67164179104478


In [3]:
#SVM
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

diab = pd.read_csv("diabetes.csv")

X = diab
y = diab['Outcome']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.4, random_state=0)

svm = SVC(kernel="sigmoid", gamma=0.5, C=1.0, class_weight='balanced', probability=True)
svm.fit(X_train, y_train)
pred_proba = svm.predict_proba(X_test)
threshold = 0.25 
y_pred_adjusted = (pred_proba[:, 1] > threshold).astype(int)
acc1 = accuracy_score(y_test, y_pred_adjusted)
sensitivity = calculate_sensitivity(y_test, y_pred_adjusted)
f1 = f1_score(y_test, y_pred_adjusted)

print("SVM Accuracy:", acc1 * 100)
cm = confusion_matrix(y_test, y_pred)
TN = cm[0, 0]
FP = cm[0, 1]
specificity=calculate_specificity(TN,FP)

print("Sensitivity:", sensitivity*100)
print("Specificity: ",specificity*100)
print("F1 Score:", f1 * 100)

SVM Accuracy: 85.06493506493507
Sensitivity: 81.55339805825243
Specificity:  71.21951219512195
F1 Score: 78.50467289719626


In [4]:
#Decision Tree
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
diab = pd.read_csv("diabetes.csv")
X = diab.drop('Outcome', axis=1)
y = diab['Outcome']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.4, random_state=2)

ent = DecisionTreeClassifier(criterion="entropy", random_state=2, max_depth=100, splitter="best", min_samples_split=2, min_samples_leaf=1, ccp_alpha=0.10)
ent.fit(X_train, y_train)
pred = ent.predict(X_test)
acc = accuracy_score(y_test, pred)

threshold = 0.25 
y_pred_proba = ent.predict_proba(X_test)[:, 1]
y_pred_adjusted = (y_pred_proba > threshold).astype(int)
f1 = f1_score(y_test, y_pred_adjusted)
sensitivity = calculate_sensitivity(y_test, y_pred_adjusted)

print("DECISION TREE Accuracy:", acc*100)
cm = confusion_matrix(y_test, y_pred)
TN = cm[0, 0]
FP = cm[0, 1]
specificity=calculate_specificity(TN,FP)

print("Sensitivity:", sensitivity*100)
print("Specificity: ",specificity*100)
print("DECISION TREE F1 Score:", f1*100)

DECISION TREE Accuracy: 72.07792207792207
Sensitivity: 100.0
Specificity:  70.24390243902438
DECISION TREE F1 Score: 50.121654501216554


In [5]:
#Random Forest
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
diab = pd.read_csv("diabetes.csv")
X = diab.drop('Outcome', axis=1)
y = diab['Outcome']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.4, random_state=0)
rf_classifier = RandomForestClassifier(random_state=42,max_depth=None,n_estimators=4,max_samples=52)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
threshold = 0.3  
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]
y_pred_adjusted = (y_pred_proba > threshold).astype(int)
f1 = f1_score(y_test, y_pred_adjusted)
sensitivity = calculate_sensitivity(y_test, y_pred_adjusted)

print("Random Forest Accuracy:", accuracy*100)
cm = confusion_matrix(y_test, y_pred)
TN = cm[0, 0]
FP = cm[0, 1]
specificity=calculate_specificity(TN,FP)

print("Sensitivity:", sensitivity*100)
print("Specificity: ",specificity*100)
print("Random Forest F1 Score:", f1*100)

Random Forest Accuracy: 72.07792207792207
Sensitivity: 61.165048543689316
Specificity:  92.6829268292683
Random Forest F1 Score: 58.333333333333336


In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
diab = pd.read_csv("diabetes.csv")

scaler = StandardScaler()
Z = scaler.fit_transform(diab.drop('Outcome', axis=1))

pca = PCA(n_components=5)
x_pca = pca.fit_transform(Z)
y = diab['Outcome']
x_train, x_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.4, random_state=0)
mlp = MLPClassifier(hidden_layer_sizes=(20,), momentum=0.9, learning_rate_init=0.001, max_iter=500, activation='logistic', solver='adam', random_state=1)
mlp.fit(x_train, y_train)
pred_mlp = mlp.predict(x_test)
acc_mlp = accuracy_score(y_test, pred_mlp)
threshold = 0. 
y_pred_proba = mlp.predict_proba(x_test)[:, 1]
y_pred_adjusted = (y_pred_proba > threshold).astype(int)
f1 = f1_score(y_test, y_pred_adjusted)
sensitivity = calculate_sensitivity(y_test, y_pred_adjusted)
f1=f1*100
acc_mlp=acc_mlp*100
sensitivity=sensitivity*100

print("MLP Accuracy:", acc_mlp)
cm = confusion_matrix(y_test, y_pred)
TN = cm[0, 0]
FP = cm[0, 1]
specificity=calculate_specificity(TN,FP)

print("Sensitivity:", sensitivity)
print("Specificity: ",specificity*100)
print("MLP F1 Score:", f1)

MLP Accuracy: 74.67532467532467
Sensitivity: 100.0
Specificity:  92.6829268292683
MLP F1 Score: 50.121654501216554
