In [72]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [73]:
# Load the diabetes dataset; the path is relative to the notebook's
# working directory.
data_path = Path("Resources/Healthcare-Diabetes.csv")
df = pd.read_csv(data_path)

# Preview the first five rows as the cell's output.
df.head()

Unnamed: 0,Id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,6,148,72,35,0,33.6,0.627,50,1
1,2,1,85,66,29,0,26.6,0.351,31,0
2,3,8,183,64,0,0,23.3,0.672,32,1
3,4,1,89,66,23,94,28.1,0.167,21,0
4,5,0,137,40,35,168,43.1,2.288,33,1


In [74]:
# Split the frame into the target (y) and the predictors (X).
# 'Outcome' is the label column; 'Id' is dropped from the features as well
# so that only the remaining columns are used as model inputs.
y = df['Outcome']
X = df.drop(columns=['Outcome', 'Id'])

In [75]:
# Check the balance of our target values. The models below are compared on
# plain accuracy, so the class proportions are needed to interpret those
# scores. The last expression is rendered as the cell's output.
# (For ad-hoc inspection, evaluate `y.head()` or `X.head()` here instead.)
y.value_counts()

In [76]:
# Partition features and labels into train/test subsets. No test_size is
# passed, so train_test_split's default proportions apply; random_state=1
# makes the shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [77]:
def _fit_and_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    X_train, X_test, y_train, y_test
        The four splits, in the order returned by ``train_test_split``.

    Returns
    -------
    tuple
        ``(fitted_model, test_predictions, test_accuracy)``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return model, predictions, accuracy_score(y_test, predictions)


# Seed for the estimators that use internal randomness (trees, forests,
# boosting). Without it the accuracy table changes on every run.
MODEL_SEED = 1
_split = (X_train, X_test, y_train, y_test)

# Support Vector Machines (SVM)
svm_model, svm_predictions, svm_accuracy = _fit_and_score(SVC(), *_split)

# K-Nearest Neighbors (KNN)
knn_model, knn_predictions, knn_accuracy = _fit_and_score(
    KNeighborsClassifier(), *_split
)

# Naive Bayes
nb_model, nb_predictions, nb_accuracy = _fit_and_score(GaussianNB(), *_split)

# Decision Trees
dt_model, dt_predictions, dt_accuracy = _fit_and_score(
    DecisionTreeClassifier(random_state=MODEL_SEED), *_split
)

# Random Forest
rf_model, rf_predictions, rf_accuracy = _fit_and_score(
    RandomForestClassifier(random_state=MODEL_SEED), *_split
)

# Gradient Boosting Machines (GBM)
gbm_model, gbm_predictions, gbm_accuracy = _fit_and_score(
    GradientBoostingClassifier(random_state=MODEL_SEED), *_split
)

# Logistic Regression (max_iter raised above the default via max_iter=1000)
lr_model, lr_predictions, lr_accuracy = _fit_and_score(
    LogisticRegression(max_iter=1000), *_split
)

# NOTE(review): in the recorded output the tree-based models score ~0.99 while
# Logistic Regression scores ~0.77. A gap that large may indicate duplicated
# rows shared between the train and test splits -- confirm with
# `X.duplicated().sum()` before trusting these numbers.

# Summary table: one row per model with its test-set accuracy.
ml_models = pd.DataFrame({
    'Model': ['SVM', 'KNN', 'Naive Bayes', 'Decision Tree', 'Random Forest', 'GBM', 'Logistic Regression'],
    'Accuracy': [svm_accuracy, knn_accuracy, nb_accuracy, dt_accuracy, rf_accuracy, gbm_accuracy, lr_accuracy],
})
ml_models

Unnamed: 0,Model,Accuracy
0,SVM,0.742775
1,KNN,0.872832
2,Naive Bayes,0.741329
3,Decision Tree,0.992775
4,Random Forest,0.995665
5,GBM,0.869942
6,Logistic Regression,0.767341


In [78]:
# Confusion matrix for the Decision Tree's predictions on the test split
# (rows are true labels, columns are predicted labels).
confusion = confusion_matrix(y_true=y_test, y_pred=dt_predictions)
print(f"Confusion matrix:\n{confusion}")

Confusion matrix:
[[451   3]
 [  2 236]]


In [79]:
# Per-class precision / recall / F1 for the Decision Tree on the test split.
classification_rep = classification_report(y_test, dt_predictions)
# No space after the newline: a leading space shifts the report's header row
# one column to the right of its data rows.
print(f"Classification report:\n{classification_rep}")

Classification report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       454
           1       0.99      0.99      0.99       238

    accuracy                           0.99       692
   macro avg       0.99      0.99      0.99       692
weighted avg       0.99      0.99      0.99       692

