In [None]:
#importing libraries
import pandas as pd
import gdown

# Google Drive address of the diabetes CSV and the local file name it is saved under
url = "https://drive.google.com/uc?id=1QlYmjMeLj3TcZWbfzI7GHulfYw5Zdzmx"
output = "diabetes.csv"

# Fetch the file; quiet=False keeps gdown's progress output visible in the cell
gdown.download(url=url, output=output, quiet=False)

# Parse the downloaded CSV into a DataFrame and show it as the cell's result
df = pd.read_csv(filepath_or_buffer=output)
df

Downloading...
From: https://drive.google.com/uc?id=1QlYmjMeLj3TcZWbfzI7GHulfYw5Zdzmx
To: /content/diabetes.csv
100%|██████████| 34.7k/34.7k [00:00<00:00, 41.3MB/s]


Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,Positive
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,Positive
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,Positive
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Negative


In [None]:
from sklearn.preprocessing import MinMaxScaler

#data preprocessing: drop incomplete rows, then map every categorical answer
#to its numeric counterpart in a single pass (the three label sets are disjoint)
df = df.dropna().replace({
    'Yes': 1, 'No': 0,
    'Positive': 1, 'Negative': 0,
    'Male': 1, 'Female': 0,
})

#min-max scale Age into the 0..1 range so it matches the 0/1 encoded features.
#NOTE: the scaler is fitted on every row here, i.e. before any train/test split.
scaler = MinMaxScaler()
df['Age'] = scaler.fit_transform(df[['Age']])

df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,0.324324,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,0.567568,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,0.337838,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,0.391892,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,0.594595,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,0.310811,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,0.432432,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,0.567568,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,0.216216,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0


In [None]:
from sklearn.model_selection import train_test_split

#target is the 'class' column; every remaining column is a feature
y = df['class']
x = df.drop('class', axis=1)

#hold out 25% of the rows for testing (75% training); the seed fixes the split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

#per-classifier result lists, filled by the model cells and tabulated at the end
classifier_names, accuracies, f1_scores = [], [], []
precisions, recalls, ROC = [], [], []

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc

#hyper-parameter tuning for adaboost
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1.0]
}

#adaboost classifier model; seeded (same seed as the train/test split) so a
#re-run of the notebook reproduces the same fitted model
adaboost = AdaBoostClassifier(random_state=42)

#5-fold cross-validated grid search on the training split only
grid_search = GridSearchCV(adaboost, param_grid, cv=5)
grid_search.fit(x_train, y_train)
adaboost_model = grid_search.best_estimator_
adaboost_y_pred = adaboost_model.predict(x_test)
#ROC AUC is a ranking metric: it needs a continuous score for the positive
#class (column 1 = label 1), not the thresholded 0/1 predictions
adaboost_y_score = adaboost_model.predict_proba(x_test)[:, 1]

#evaluating the model: each metric is computed once, then printed and recorded
adaboost_metrics = {
    'Accuracy': accuracy_score(y_test, adaboost_y_pred),
    'f1': f1_score(y_test, adaboost_y_pred),
    'Precision': precision_score(y_test, adaboost_y_pred),
    'Recall': recall_score(y_test, adaboost_y_pred),
    'AUC': roc_auc_score(y_test, adaboost_y_score),
}
print("Adaboost Stats:")
for metric_name, metric_value in adaboost_metrics.items():
    print(f"{metric_name}: {metric_value}")

#adding scores onto the array created earlier
classifier_names.append('Adaboost')
accuracies.append(adaboost_metrics['Accuracy'])
f1_scores.append(adaboost_metrics['f1'])
precisions.append(adaboost_metrics['Precision'])
recalls.append(adaboost_metrics['Recall'])
ROC.append(adaboost_metrics['AUC'])

Adaboost Stats:
Accuracy: 0.9461538461538461
f1: 0.9585798816568047
Precision: 0.9529411764705882
Recall: 0.9642857142857143
AUC: 0.938664596273292


In [None]:
from sklearn.tree import DecisionTreeClassifier

#hyper-parameter tuning for Decision Tree Classifier
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

#seeded (same seed as the train/test split) so tie-breaking between equally
#good splits is repeatable across notebook runs
decisionTree = DecisionTreeClassifier(random_state=42)

#5-fold cross-validated grid search on the training split only
grid_search = GridSearchCV(decisionTree, param_grid, cv=5)
grid_search.fit(x_train, y_train)
dt_model = grid_search.best_estimator_
decisiontree_y_pred = dt_model.predict(x_test)
#ROC AUC is a ranking metric: it needs a continuous score for the positive
#class (column 1 = label 1), not the thresholded 0/1 predictions
decisiontree_y_score = dt_model.predict_proba(x_test)[:, 1]

#evaluating the model: each metric is computed once, then printed and recorded
decisiontree_metrics = {
    'Accuracy': accuracy_score(y_test, decisiontree_y_pred),
    'f1': f1_score(y_test, decisiontree_y_pred),
    'Precision': precision_score(y_test, decisiontree_y_pred),
    'Recall': recall_score(y_test, decisiontree_y_pred),
    'AUC': roc_auc_score(y_test, decisiontree_y_score),
}
print("Decision Tree Stats:")
for metric_name, metric_value in decisiontree_metrics.items():
    print(f"{metric_name}: {metric_value}")

#adding scores onto the array created earlier
classifier_names.append('DecisionTree')
accuracies.append(decisiontree_metrics['Accuracy'])
f1_scores.append(decisiontree_metrics['f1'])
precisions.append(decisiontree_metrics['Precision'])
recalls.append(decisiontree_metrics['Recall'])
ROC.append(decisiontree_metrics['AUC'])

Decision Tree Stats:
Accuracy: 0.9615384615384616
f1: 0.9696969696969696
Precision: 0.9876543209876543
Recall: 0.9523809523809523
AUC: 0.9653209109730848


In [None]:
from sklearn.neighbors import KNeighborsClassifier

#hyper-parameter tuning for KNN
param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}

knn = KNeighborsClassifier()

#5-fold cross-validated grid search on the training split only
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(x_train, y_train)
knn_model = grid_search.best_estimator_
knn_y_pred = knn_model.predict(x_test)
#ROC AUC is a ranking metric: it needs a continuous score for the positive
#class (column 1 = label 1), not the thresholded 0/1 predictions
knn_y_score = knn_model.predict_proba(x_test)[:, 1]

#evaluating the model: each metric is computed once, then printed and recorded
knn_metrics = {
    'Accuracy': accuracy_score(y_test, knn_y_pred),
    'f1': f1_score(y_test, knn_y_pred),
    'Precision': precision_score(y_test, knn_y_pred),
    'Recall': recall_score(y_test, knn_y_pred),
    'AUC': roc_auc_score(y_test, knn_y_score),
}
print("KNN Stats:")
for metric_name, metric_value in knn_metrics.items():
    print(f"{metric_name}: {metric_value}")

#adding to the arrays made earlier
classifier_names.append('KNN')
accuracies.append(knn_metrics['Accuracy'])
f1_scores.append(knn_metrics['f1'])
precisions.append(knn_metrics['Precision'])
recalls.append(knn_metrics['Recall'])
ROC.append(knn_metrics['AUC'])

KNN Stats:
Accuracy: 0.9615384615384616
f1: 0.9693251533742331
Precision: 1.0
Recall: 0.9404761904761905
AUC: 0.9702380952380952


In [None]:
from sklearn.svm import SVC

#hyper-parameter tuning for SVC
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

svc = SVC()

#5-fold cross-validated grid search on the training split only
grid_search = GridSearchCV(svc, param_grid, cv=5)
grid_search.fit(x_train, y_train)
svc_model = grid_search.best_estimator_
svc_y_pred = svc_model.predict(x_test)
#ROC AUC is a ranking metric and needs a continuous score rather than the
#thresholded 0/1 predictions. SVC() is built without probability=True, so the
#signed distance to the separating boundary is used as the score.
svc_y_score = svc_model.decision_function(x_test)

#evaluating the model: each metric is computed once, then printed and recorded
svc_metrics = {
    'Accuracy': accuracy_score(y_test, svc_y_pred),
    'f1': f1_score(y_test, svc_y_pred),
    'Precision': precision_score(y_test, svc_y_pred),
    'Recall': recall_score(y_test, svc_y_pred),
    'AUC': roc_auc_score(y_test, svc_y_score),
}
print("SVC Stats:")
for metric_name, metric_value in svc_metrics.items():
    print(f"{metric_name}: {metric_value}")

#adding to the arrays from earlier
classifier_names.append('SVC')
accuracies.append(svc_metrics['Accuracy'])
f1_scores.append(svc_metrics['f1'])
precisions.append(svc_metrics['Precision'])
recalls.append(svc_metrics['Recall'])
ROC.append(svc_metrics['AUC'])

SVC Stats:
Accuracy: 0.9769230769230769
f1: 0.9820359281437125
Precision: 0.9879518072289156
Recall: 0.9761904761904762
AUC: 0.9772256728778469


In [None]:
#hyper-parameter tuning for Bagging
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 1.0],
    'max_features': [0.5, 1.0]
}

#seeded (same seed as the train/test split) so the bootstrap samples and
#feature subsets are repeatable across notebook runs
bagging = BaggingClassifier(random_state=42)

#5-fold cross-validated grid search on the training split only
grid_search = GridSearchCV(bagging, param_grid, cv=5)
grid_search.fit(x_train, y_train)
bagging_model = grid_search.best_estimator_
bagging_y_pred = bagging_model.predict(x_test)
#ROC AUC is a ranking metric: it needs a continuous score for the positive
#class (column 1 = label 1), not the thresholded 0/1 predictions
bagging_y_score = bagging_model.predict_proba(x_test)[:, 1]

#evaluating the model: each metric is computed once, then printed and recorded
bagging_metrics = {
    'Accuracy': accuracy_score(y_test, bagging_y_pred),
    'f1': f1_score(y_test, bagging_y_pred),
    'Precision': precision_score(y_test, bagging_y_pred),
    'Recall': recall_score(y_test, bagging_y_pred),
    'AUC': roc_auc_score(y_test, bagging_y_score),
}
print("Bagging Stats:")
for metric_name, metric_value in bagging_metrics.items():
    print(f"{metric_name}: {metric_value}")

#adding to arrays from earlier
classifier_names.append('Bagging')
accuracies.append(bagging_metrics['Accuracy'])
f1_scores.append(bagging_metrics['f1'])
precisions.append(bagging_metrics['Precision'])
recalls.append(bagging_metrics['Recall'])
ROC.append(bagging_metrics['AUC'])

Bagging Stats:
Accuracy: 0.9692307692307692
f1: 0.9764705882352942
Precision: 0.9651162790697675
Recall: 0.9880952380952381
AUC: 0.9614389233954452


In [None]:
#hyper-parameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

#seeded (same seed as the train/test split) so the forest, and therefore the
#selected hyper-parameters and reported scores, are repeatable across runs
rfc = RandomForestClassifier(random_state=42)

#5-fold cross-validated grid search on the training split only
grid_search = GridSearchCV(rfc, param_grid, cv=5)
grid_search.fit(x_train, y_train)
rfc_model = grid_search.best_estimator_
rfc_y_pred = rfc_model.predict(x_test)
#ROC AUC is a ranking metric: it needs a continuous score for the positive
#class (column 1 = label 1), not the thresholded 0/1 predictions
rfc_y_score = rfc_model.predict_proba(x_test)[:, 1]

#evaluating the model: each metric is computed once, then printed and recorded
rfc_metrics = {
    'Accuracy': accuracy_score(y_test, rfc_y_pred),
    'f1': f1_score(y_test, rfc_y_pred),
    'Precision': precision_score(y_test, rfc_y_pred),
    'Recall': recall_score(y_test, rfc_y_pred),
    'AUC': roc_auc_score(y_test, rfc_y_score),
}
print("Random Forest Stats:")
for metric_name, metric_value in rfc_metrics.items():
    print(f"{metric_name}: {metric_value}")

#adding to the arrays from earlier
classifier_names.append('RandomForest')
accuracies.append(rfc_metrics['Accuracy'])
f1_scores.append(rfc_metrics['f1'])
precisions.append(rfc_metrics['Precision'])
recalls.append(rfc_metrics['Recall'])
ROC.append(rfc_metrics['AUC'])

Random Forest Stats:
Accuracy: 0.9923076923076923
f1: 0.9940119760479043
Precision: 1.0
Recall: 0.9880952380952381


In [None]:
# Pair each column title with the list collected for it by the model cells,
# in the same left-to-right order the summary table is displayed in
metrics_df = pd.DataFrame(dict(zip(
    ('Classifier', 'Accuracy', 'F1 Score', 'Precision', 'Recall', 'AUC'),
    (classifier_names, accuracies, f1_scores, precisions, recalls, ROC),
)))

# Emit the comparison table as plain text
print(metrics_df)

     Classifier  Accuracy  F1 Score  Precision    Recall       AUC
0      Adaboost  0.946154  0.958580   0.952941  0.964286  0.938665
1  DecisionTree  0.961538  0.969697   0.987654  0.952381  0.965321
2           KNN  0.961538  0.969325   1.000000  0.940476  0.970238
3           SVC  0.976923  0.982036   0.987952  0.976190  0.977226
4       Bagging  0.969231  0.976471   0.965116  0.988095  0.961439
5  RandomForest  0.992308  0.994012   1.000000  0.988095  0.994048


In [None]:
# Function to get validated user input for prediction
def get_user_input():
    """Prompt on stdin for one patient's features and return them as a one-row DataFrame.

    Each question is repeated until the reply is valid: age must be a
    non-negative integer, every other answer must be exactly 0 or 1
    (surrounding whitespace is ignored).

    Returns:
        pandas.DataFrame: a single row with the 16 feature columns
        ('Age', 'Gender', 'Polyuria', ..., 'Obesity'), all stored as ints.
        'Age' holds the raw number of years as typed; no scaling is applied here.
    """
    def get_valid_input(prompt, valid_fn, error_msg):
        """Ask `prompt` until `valid_fn` accepts the reply.

        `valid_fn` converts the raw string and must raise ValueError to reject
        it; on rejection `error_msg` is printed and the question is asked again.
        """
        while True:
            try:
                return valid_fn(input(prompt))
            except ValueError:
                print(error_msg)

    def parse_age(text):
        """Convert the reply to an int, rejecting negative ages."""
        years = int(text)
        if years < 0:
            raise ValueError("age must not be negative")
        return years

    def parse_binary(text):
        """Convert the reply to 0 or 1, raising ValueError for anything else."""
        answer = text.strip()
        # Raising (not returning) ValueError is what makes get_valid_input re-prompt.
        if answer not in ("0", "1"):
            raise ValueError("expected 0 or 1")
        return int(answer)

    yes_no_error = "Please enter 0 for no or 1 for yes."
    # (column name, prompt, error message) for every 0/1 feature, in column order.
    binary_questions = [
        ('Gender', "Enter the patient's gender (0 for female, 1 for male): ", "Please enter 0 for female or 1 for male."),
        ('Polyuria', "Enter if patient has Polyuria (0 for no, 1 for yes) ", yes_no_error),
        ('Polydipsia', "Enter if patient has Polydipsia (0 for no, 1 for yes) ", yes_no_error),
        ('sudden weight loss', "Enter if patient has a sudden wight loss (0 for no, 1 for yes) ", yes_no_error),
        ('weakness', "Enter if patient show signs of weakness (0 for no, 1 for yes) ", yes_no_error),
        ('Polyphagia', "Enter if patient has Polyphagia (0 for no, 1 for yes) ", yes_no_error),
        ('Genital thrush', "Enter if patient has Genital Thrush (0 for no, 1 for yes) ", yes_no_error),
        ('visual blurring', "Enter if patient has Visual Blurring (0 for no, 1 for yes) ", yes_no_error),
        ('Itching', "Enter if patient has itching (0 for no, 1 for yes) ", yes_no_error),
        ('Irritability', "Enter if patient has irritability (0 for no, 1 for yes) ", yes_no_error),
        ('delayed healing', "Enter if patient has delayed healing (0 for no, 1 for yes) ", yes_no_error),
        ('partial paresis', "Enter if patient has partial peresis (0 for no, 1 for yes) ", yes_no_error),
        ('muscle stiffness', "Enter if patient has muscle stiffness (0 for no, 1 for yes) ", yes_no_error),
        ('Alopecia', "Enter if patient has alopecia (0 for no, 1 for yes) ", yes_no_error),
        ('Obesity', "Enter if patient has obesity (0 for no, 1 for yes) ", yes_no_error),
    ]

    # Insertion order of this dict becomes the column order of the frame.
    answers = {
        'Age': get_valid_input("Enter the patient's age (in years): ", parse_age, "Please enter a valid integer for age."),
    }
    for column, prompt, error_msg in binary_questions:
        answers[column] = get_valid_input(prompt, parse_binary, error_msg)

    user_data = pd.DataFrame({column: [value] for column, value in answers.items()})
    return user_data

user_data = get_user_input()

# The classifiers were fitted on min-max scaled Age, so the raw age typed by the
# user must go through the same fitted scaler before it reaches the models;
# otherwise a value like 46 is compared against training ages in the 0..1 range.
# user_data itself is left untouched so it still shows what was entered.
model_input = user_data.copy()
model_input['Age'] = scaler.transform(model_input[['Age']]).ravel()

ada_pred = adaboost_model.predict(model_input)
dt_pred = dt_model.predict(model_input)
knn_pred = knn_model.predict(model_input)
svc_pred = svc_model.predict(model_input)
bagging_pred = bagging_model.predict(model_input)
rfc_pred = rfc_model.predict(model_input)

# Blank line separates the prompts from the results
print()
print("Predictions: 0 for negative 1 for positive")
print("Adaboost Prediction: ", ada_pred)
print("Decision Tree Prediction: ", dt_pred)
print("KNN Prediction: ", knn_pred)
print("SVC Prediction: ", svc_pred)
print("Bagging Prediciton: ", bagging_pred)
print("Random Forest Prediciton: ", rfc_pred)

Enter the patient's age (in years): 46
Enter the patient's gender (0 for female, 1 for male): 1
Enter if patient has Polyuria (0 for no, 1 for yes) 0
Enter if patient has Polydipsia (0 for no, 1 for yes) 1
Enter if patient has a sudden wight loss (0 for no, 1 for yes) 1
Enter if patient show signs of weakness (0 for no, 1 for yes) 0
Enter if patient has Polyphagia (0 for no, 1 for yes) 0
Enter if patient has Genital Thrush (0 for no, 1 for yes) 0
Enter if patient has Visual Blurring (0 for no, 1 for yes) 1
Enter if patient has itching (0 for no, 1 for yes) 0
Enter if patient has irritability (0 for no, 1 for yes) 1
Enter if patient has delayed healing (0 for no, 1 for yes) 0
Enter if patient has partial peresis (0 for no, 1 for yes) 0
Enter if patient has muscle stiffness (0 for no, 1 for yes) 1
Enter if patient has alopecia (0 for no, 1 for yes) 0
Enter if patient has obesity (0 for no, 1 for yes) 1

Predictions: 0 for negative 1 for positive
Adaboost Prediction:  [1]
Decision Tree Pr