In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from IPython.display import display
import warnings

Data Collection & Preprocessing

In [2]:
# Read the diabetes CSV into a DataFrame
DATASET_PATH = '/kaggle/input/diabetes-dataset/diabetes.csv'
data = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first rows of the dataset
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Number of rows and columns, as a (rows, columns) tuple
data.shape

(768, 9)

In [5]:
# Summary statistics for every column, transposed so each column becomes a row
summary_stats = data.describe()
summary_stats.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


Splitting the data

In [6]:
# Separate the label column from the predictor columns
target = data['Outcome']
features = data.drop(columns=['Outcome'])

In [7]:
# Inspect the predictor columns (every column of `data` except 'Outcome')
features

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [8]:
# Print the label column taken from data['Outcome']
print(target)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


 Data Standardization

In [9]:
# Scaler object used in the following cells to standardize the feature columns
scaler = StandardScaler()

In [10]:
# Learn the per-column scaling statistics from the feature table.
# NOTE(review): this fit uses every row, including the rows that the later
# train/test split assigns to the test set, so test-set statistics leak into
# preprocessing. Consider splitting first and fitting on the training rows only.
scaler.fit(features)

In [11]:
# Apply the fitted scaling to the features; the result is a NumPy array
# (see the output of the cell that displays `features, target`)
standardized_data = scaler.transform(features)

In [12]:
# From here on the model inputs are the standardized array; labels still come
# straight from the raw DataFrame
features, target = standardized_data, data['Outcome']

In [13]:
# Display the standardized feature array together with the labels
features, target

(array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
          0.46849198,  1.4259954 ],
        [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
         -0.36506078, -0.19067191],
        [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
          0.60439732, -0.10558415],
        ...,
        [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
         -0.68519336, -0.27575966],
        [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
         -0.37110101,  1.17073215],
        [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
         -0.47378505, -0.87137393]]),
 0      1
 1      0
 2      1
 3      0
 4      1
       ..
 763    0
 764    0
 765    0
 766    1
 767    0
 Name: Outcome, Length: 768, dtype: int64)

Train Test Split

In [14]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=2,
)
print(features.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


Training the Model

In [15]:
# Suppress the warning
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')

# LogisticRegression

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
# Fit a logistic-regression classifier on the training split
# (fit() returns the estimator itself, so it can be chained)
classifier = LogisticRegression().fit(X_train, Y_train)

# Accuracy on the held-out test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)
print('Accuracy score on test data = ', test_data_accuracy)

Accuracy score on test data =  0.7662337662337663


import tkinter as tk
from tkinter import Entry, Button, Label

# Column order the scaler and classifier were fitted with (same as the dataset header).
FEATURE_COLUMNS = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]


# Define a function to predict diabetes
def predict_diabetes():
    """Button callback: read the eight form fields, scale them, show the prediction.

    Reads the module-level ``*_input`` Entry widgets and writes the result to
    ``prediction_output``. Empty or non-numeric input is reported in the
    output label instead of raising inside the Tk event loop.
    """
    entries = [
        pregnancies_input, glucose_input, blood_pressure_input, skin_thickness_input,
        insulin_input, bmi_input, pedigree_input, age_input,
    ]
    try:
        values = [float(entry.get()) for entry in entries]
    except ValueError:
        prediction_output.config(text="Please enter a number in every field.")
        return

    # The classifier was trained on standardized features, so the raw user
    # input has to pass through the same fitted scaler before prediction.
    user_frame = pd.DataFrame([values], columns=FEATURE_COLUMNS)
    user_data = scaler.transform(user_frame)
    outcome = classifier.predict(user_data)

    if outcome[0] == 1:
        prediction_output.config(text="The person is diabetic.")
    else:
        prediction_output.config(text="The person is not diabetic.")


def _add_field(parent, caption):
    """Pack a caption Label followed by an Entry into ``parent``; return both widgets."""
    label = Label(parent, text=caption)
    label.pack()
    entry = Entry(parent)
    entry.pack()
    return label, entry


# Create a new tkinter window
# NOTE(review): tk.Tk() needs a graphical display; hosted/headless kernels
# typically cannot open a Tk window - confirm this cell is run on a desktop.
window = tk.Tk()
window.title("Diabetes Prediction")

# Create input fields, one per model feature, in FEATURE_COLUMNS order
pregnancies_label, pregnancies_input = _add_field(window, "Pregnancies:")
glucose_label, glucose_input = _add_field(window, "Glucose:")
blood_pressure_label, blood_pressure_input = _add_field(window, "Blood Pressure:")
skin_thickness_label, skin_thickness_input = _add_field(window, "Skin Thickness:")
insulin_label, insulin_input = _add_field(window, "Insulin:")
bmi_label, bmi_input = _add_field(window, "BMI:")
pedigree_label, pedigree_input = _add_field(window, "Diabetes Pedigree Function:")
age_label, age_input = _add_field(window, "Age:")

# Create a button to trigger the prediction
predict_button = Button(window, text="Predict", command=predict_diabetes)
predict_button.pack()

# Create an output label
prediction_output = Label(window, text="")
prediction_output.pack()

# Start the GUI main loop (blocks until the window is closed)
window.mainloop()


# Define a function to predict diabetes with a descriptive sentence
def predict_diabetes(pregnancies, glucose, blood_pressure, skin_thickness, insulin, bmi, pedigree, age):
    """Classify one person from raw (unscaled) measurements.

    Parameters are the eight dataset features in their original units, in the
    dataset's column order. The values are standardized with the fitted
    module-level ``scaler`` before being passed to ``classifier``, because the
    classifier was trained on standardized features.

    Returns:
        str: "The person is diabetic." when the predicted class is 1,
        otherwise "The person is not diabetic."
    """
    user_data = pd.DataFrame({
        'Pregnancies': [pregnancies],
        'Glucose': [glucose],
        'BloodPressure': [blood_pressure],
        'SkinThickness': [skin_thickness],
        'Insulin': [insulin],
        'BMI': [bmi],
        'DiabetesPedigreeFunction': [pedigree],
        'Age': [age]
    })

    # Apply the same scaling the training data went through; predicting on raw
    # values would feed the model inputs on a completely different scale.
    scaled_user_data = scaler.transform(user_data)

    outcome = classifier.predict(scaled_user_data)
    if outcome[0] == 1:
        return "The person is diabetic."
    else:
        return "The person is not diabetic."




# Example call: classify the measurements from the first row of the dataset.
# (Tk widgets have no notebook representation, so the prediction helper is
# exercised directly instead of passing widgets to display().)
print(predict_diabetes(6, 148, 72, 35, 0, 33.6, 0.627, 50))

# Support Vector Machine Classifier

In [18]:
from sklearn import svm

In [19]:
# Linear-kernel support vector classifier fitted on the training split
# (fit() returns the estimator itself, so it can be chained)
model = svm.SVC(kernel='linear').fit(X_train, Y_train)

# Score the held-out test split
predicted_Y = model.predict(X_test)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=predicted_Y)
print(test_accuracy)

0.7662337662337663


# DecisionTree

In [20]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier

In [21]:
# Create an instance of the Decision Tree classifier.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and therefore the reported accuracy, is the same on every run.
decision = DecisionTreeClassifier(random_state=2)

# Train the Decision Tree classifier on the training data
decision.fit(X_train, Y_train)

# Evaluate the Decision Tree classifier on the test data
decision_prediction = decision.predict(X_test)

# Calculate accuracy for the Decision Tree classifier
decision_accuracy = accuracy_score(Y_test, decision_prediction)

# Display accuracy for the Decision Tree classifier
print("Decision Tree Accuracy:", decision_accuracy)


Decision Tree Accuracy: 0.7077922077922078


# K-Nearest Neighbors

In [22]:
# Import necessary libraries
from sklearn.neighbors import KNeighborsClassifier

In [23]:
# k-nearest-neighbours classifier that votes among the 3 closest training
# points, fitted on the training split (fit() returns the estimator)
knn_classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)

# Predict labels for the held-out test split
knn_test_data_prediction = knn_classifier.predict(X_test)

# Fraction of test labels predicted correctly
knn_accuracy = accuracy_score(y_true=Y_test, y_pred=knn_test_data_prediction)

print("KNN Accuracy:", knn_accuracy)

KNN Accuracy: 0.7597402597402597


# Naive Bayes

In [24]:
# Import necessary libraries
from sklearn.naive_bayes import GaussianNB

In [25]:
# Gaussian Naive Bayes classifier fitted on the training split
# (fit() returns the estimator itself, so it can be chained)
naive_bayes_classifier = GaussianNB().fit(X_train, Y_train)

# Predict labels for the held-out test split
naive_bayes_test_data_prediction = naive_bayes_classifier.predict(X_test)

# Fraction of test labels predicted correctly
naive_bayes_accuracy = accuracy_score(y_true=Y_test, y_pred=naive_bayes_test_data_prediction)

print("Naive Bayes Accuracy:", naive_bayes_accuracy)


Naive Bayes Accuracy: 0.7597402597402597


# RandomForest

In [26]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Create an instance of the Random Forest classifier.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature choices, and therefore the reported accuracy, are the
# same on every run.
random_forest_classifier = RandomForestClassifier(random_state=2)

# Train the Random Forest classifier on the training data
random_forest_classifier.fit(X_train, Y_train)

# Evaluate the Random Forest classifier on the test data
random_forest_test_data_prediction = random_forest_classifier.predict(X_test)

# Calculate accuracy for the Random Forest classifier
random_forest_accuracy = accuracy_score(Y_test, random_forest_test_data_prediction)

# Display accuracy for the Random Forest classifier
print("Random Forest Accuracy:", random_forest_accuracy)


Random Forest Accuracy: 0.7402597402597403


# Neural network

In [28]:
# Import necessary libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed the Python, NumPy and backend random generators so weight
# initialisation and batch shuffling are repeatable between runs
keras.utils.set_random_seed(2)

# Create a neural network model.
# NOTE: `model` and `test_accuracy` reuse the names assigned in the Support
# Vector Machine section; after this cell they refer to the neural network.
model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),  # Input layer: one value per feature column
    layers.Dense(64, activation='relu'),      # Hidden layer with 64 units and ReLU activation
    layers.Dense(1, activation='sigmoid')     # Output layer with 1 unit and sigmoid activation (binary classification)
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the neural network on the training data
model.fit(X_train, Y_train, epochs=10, batch_size=32)

# Evaluate the neural network on the test data
test_loss, test_accuracy = model.evaluate(X_test, Y_test)

# Display the test accuracy
print("Neural Network Test Accuracy:", test_accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Test Accuracy: 0.7662337422370911


# Accuracy score for Logistic Regression = 0.7662337662337663
# Accuracy score for Support Vector Machine Classifier = 0.7662337662337663
# Accuracy score for Decision Tree = 0.7077922077922078
# Accuracy score for K-Nearest Neighbors = 0.7597402597402597
# Accuracy score for Naive Bayes = 0.7597402597402597
# Accuracy score for Random Forest = 0.7402597402597403
# Accuracy score for Neural Network = 0.7662337422370911
# (Figures are the outputs recorded in this notebook; the Decision Tree, Random Forest and Neural Network scores may differ from run to run unless the models are seeded.)

### 