# KNN

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Importing the dataset
dataset = pd.read_csv('dataset/diabetes.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 8].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting K-NN to the Training set
knnClassifier = KNeighborsClassifier(n_neighbors=18)  # Fixed number of neighbors
knnClassifier.fit(X_train, y_train)

# Print accuracy on training and test set
print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knnClassifier.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(knnClassifier.score(X_test, y_test)))

# Predicting the Test set results
y_pred = knnClassifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

print('TP - True Negative {}'.format(cm[0, 0]))
print('FP - False Positive {}'.format(cm[0, 1]))
print('FN - False Negative {}'.format(cm[1, 0]))
print('TP - True Positive {}'.format(cm[1, 1]))
print('Accuracy Rate: {}'.format(np.divide(np.sum([cm[0, 0], cm[1, 1]]), np.sum(cm))))
print('Misclassification Rate: {}'.format(np.divide(np.sum([cm[0, 1], cm[1, 0]]), np.sum(cm))))

# Calculate and print ROC-AUC score
print(round(roc_auc_score(y_test, y_pred), 5))


Accuracy of K-NN classifier on training set: 0.79
Accuracy of K-NN classifier on test set: 0.71
TP - True Negative 108
FP - False Positive 15
FN - False Negative 40
TP - True Positive 29
Accuracy Rate: 0.7135416666666666
Misclassification Rate: 0.2864583333333333
0.64917


# Decision Tree

In [3]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Importing the dataset
dataset = pd.read_csv('dataset/diabetes.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 8].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Decision Tree to the Training set with fixed hyperparameters
tree = DecisionTreeClassifier(max_depth=6, max_features=4, min_samples_split=5, random_state=42)
tree.fit(X_train, y_train)

# Print accuracy on training and test set
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

# Predicting the Test set results
y_pred = tree.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

print('TP - True Negative {}'.format(cm[0, 0]))
print('FP - False Positive {}'.format(cm[0, 1]))
print('FN - False Negative {}'.format(cm[1, 0]))
print('TP - True Positive {}'.format(cm[1, 1]))
print('Accuracy Rate: {}'.format(np.divide(np.sum([cm[0, 0], cm[1, 1]]), np.sum(cm))))
print('Misclassification Rate: {}'.format(np.divide(np.sum([cm[0, 1], cm[1, 0]]), np.sum(cm))))

# Calculate and print ROC-AUC score
print(round(roc_auc_score(y_test, y_pred), 5))


Accuracy on training set: 0.852
Accuracy on test set: 0.729
TP - True Negative 92
FP - False Positive 31
FN - False Negative 21
TP - True Positive 48
Accuracy Rate: 0.7291666666666666
Misclassification Rate: 0.2708333333333333
0.72181


# Logistic Regression 

In [6]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Importing the dataset
dataset = pd.read_csv('dataset/diabetes.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 8].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Logistic Regression to the Training set
logistic_regression_classifier = LogisticRegression(random_state=42)
logistic_regression_classifier.fit(X_train, y_train)

# Print accuracy on training and test set
print("Accuracy on training set: {:.3f}".format(logistic_regression_classifier.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(logistic_regression_classifier.score(X_test, y_test)))

# Predicting the Test set results
y_pred = logistic_regression_classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

print('TP - True Negative {}'.format(cm[0, 0]))
print('FP - False Positive {}'.format(cm[0, 1]))
print('FN - False Negative {}'.format(cm[1, 0]))
print('TP - True Positive {}'.format(cm[1, 1]))
print('Accuracy Rate: {:.3f}'.format(accuracy_score(y_test, y_pred)))
print('Misclassification Rate: {:.3f}'.format(1 - accuracy_score(y_test, y_pred)))

# Calculate and print ROC-AUC score
print('ROC-AUC Score: {:.5f}'.format(roc_auc_score(y_test, y_pred)))


Accuracy on training set: 0.780
Accuracy on test set: 0.729
TP - True Negative 95
FP - False Positive 28
FN - False Negative 24
TP - True Positive 45
Accuracy Rate: 0.729
Misclassification Rate: 0.271
ROC-AUC Score: 0.71227


# SVM 

In [4]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Importing the dataset
dataset = pd.read_csv('dataset/diabetes.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 8].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting SVM to the Training set with fixed hyperparameters
svm_classifier = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_classifier.fit(X_train, y_train)

# Print accuracy on training and test set
print("Accuracy on training set: {:.3f}".format(svm_classifier.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(svm_classifier.score(X_test, y_test)))

# Predicting the Test set results
y_pred = svm_classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

print('TP - True Negative {}'.format(cm[0, 0]))
print('FP - False Positive {}'.format(cm[0, 1]))
print('FN - False Negative {}'.format(cm[1, 0]))
print('TP - True Positive {}'.format(cm[1, 1]))
print('Accuracy Rate: {:.3f}'.format(accuracy_score(y_test, y_pred)))
print('Misclassification Rate: {:.3f}'.format(1 - accuracy_score(y_test, y_pred)))

# Calculate and print ROC-AUC score
print('ROC-AUC Score: {:.5f}'.format(roc_auc_score(y_test, y_pred)))


Accuracy on training set: 0.832
Accuracy on test set: 0.729
TP - True Negative 101
FP - False Positive 22
FN - False Negative 30
TP - True Positive 39
Accuracy Rate: 0.729
Misclassification Rate: 0.271
ROC-AUC Score: 0.69318


# Gaussian Naive bayes 

In [5]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Importing the dataset
dataset = pd.read_csv('dataset/diabetes.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 8].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Feature Scaling (Naive Bayes is not sensitive to feature scaling, but it's good practice)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Gaussian Naive Bayes to the Training set
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(X_train, y_train)

# Print accuracy on training and test set
print("Accuracy on training set: {:.3f}".format(naive_bayes_classifier.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(naive_bayes_classifier.score(X_test, y_test)))

# Predicting the Test set results
y_pred = naive_bayes_classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

print('TP - True Negative {}'.format(cm[0, 0]))
print('FP - False Positive {}'.format(cm[0, 1]))
print('FN - False Negative {}'.format(cm[1, 0]))
print('TP - True Positive {}'.format(cm[1, 1]))
print('Accuracy Rate: {:.3f}'.format(accuracy_score(y_test, y_pred)))
print('Misclassification Rate: {:.3f}'.format(1 - accuracy_score(y_test, y_pred)))

# Calculate and print ROC-AUC score (Note: Naive Bayes does not provide probability estimates, so ROC-AUC may not be meaningful)
# print('ROC-AUC Score: {:.5f}'.format(roc_auc_score(y_test, y_pred)))


Accuracy on training set: 0.764
Accuracy on test set: 0.734
TP - True Negative 94
FP - False Positive 29
FN - False Negative 22
TP - True Positive 47
Accuracy Rate: 0.734
Misclassification Rate: 0.266


# Corss Validation

In [8]:
# Importing the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Importing the dataset
dataset = pd.read_csv('dataset/diabetes.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 8].values

# Feature Scaling (optional)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create a 5-fold cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# SVM
svm_classifier = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_scores = cross_val_score(svm_classifier, X_scaled, y, cv=cv, scoring='accuracy')
print("SVM Accuracy: {:.3f} (+/- {:.3f})".format(np.mean(svm_scores), np.std(svm_scores)))

# K-NN
knn_classifier = KNeighborsClassifier(n_neighbors=5)  # You can choose an appropriate k value
knn_scores = cross_val_score(knn_classifier, X_scaled, y, cv=cv, scoring='accuracy')
print("K-NN Accuracy: {:.3f} (+/- {:.3f})".format(np.mean(knn_scores), np.std(knn_scores)))

# Logistic Regression
logistic_regression_classifier = LogisticRegression(random_state=42)
log_reg_scores = cross_val_score(logistic_regression_classifier, X_scaled, y, cv=cv, scoring='accuracy')
print("Logistic Regression Accuracy: {:.3f} (+/- {:.3f})".format(np.mean(log_reg_scores), np.std(log_reg_scores)))

# Decision Tree
tree_classifier = DecisionTreeClassifier(max_depth=6, max_features=4, min_samples_split=5, random_state=42)
tree_scores = cross_val_score(tree_classifier, X, y, cv=cv, scoring='accuracy')
print("Decision Tree Accuracy: {:.3f} (+/- {:.3f})".format(np.mean(tree_scores), np.std(tree_scores)))

# Naive Bayes (Gaussian)
naive_bayes_classifier = GaussianNB()
nb_scores = cross_val_score(naive_bayes_classifier, X, y, cv=cv, scoring='accuracy')
print("Naive Bayes Accuracy: {:.3f} (+/- {:.3f})".format(np.mean(nb_scores), np.std(nb_scores)))


SVM Accuracy: 0.760 (+/- 0.017)
K-NN Accuracy: 0.729 (+/- 0.024)
Logistic Regression Accuracy: 0.775 (+/- 0.015)
Decision Tree Accuracy: 0.710 (+/- 0.040)
Naive Bayes Accuracy: 0.755 (+/- 0.034)
