# Diabetic Disease Prediction

In [None]:
# importing necessary libraries
import numpy as np
import pandas as pd
import plotly.express as px

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import precision_recall_fscore_support
import joblib

## Loading dataSet and exploration

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Data Science My Repository/Projects/AI Health Guard Research /AI Health Guard Datasets/diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
df.shape

(768, 9)

In [None]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
# check null values
null_checker = df.apply(lambda x: sum(x.isnull())).to_frame(name='count')
print(null_checker)

                          count
Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                       0
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0


In [None]:
df['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

* 0 - Non Diabetic
* 1- Diabetic

In [None]:
df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [None]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [None]:
# Distribution of Outcome
fig = px.histogram(df, x='Outcome', title='Distribution of Outcome')
fig.show()

In [None]:
# Distribution of Numerical Features
fig = px.histogram(df, x='Age', nbins=50, title='Age Distribution')
fig.show()

* `20 - 30` Age Group count is `396`
* `30 - 40` Age Group count is `165`
* `40 - 50` Age Group count is `118`
* `50 - 60` Age Group count is `57`
* `60 - 70` Age Group count is `29`
* `70 - 80` Age Group count is `2`

So, 20 - 30 age group get the maximum number of diabetic patients.

In [None]:
# Scatter plot to show relationships between features
fig = px.scatter(df, x='Glucose', y='BMI', color='Outcome', title='Glucose vs BMI')
fig.show()

In [None]:
fig = px.scatter(df, x='Insulin', y='SkinThickness', color='Outcome', title='Insulin vs SkinThickness')
fig.show()

In [None]:
# Heatmap to show correlation
corr = df.corr()
fig = px.imshow(corr, text_auto=True, title='Feature Correlation Heatmap')
fig.show()

## Model Building

In [None]:
# Splitting data into train and test sets
X = df.drop('Outcome', axis=1)
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (614, 8)
Shape of X_test: (154, 8)
Shape of y_train: (614,)
Shape of y_test: (154,)


In [None]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

In [None]:
# Dictionary to store accuracies and confusion matrices
accuracies = {}

# Loop through the models, train, test, and store results
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Test the model
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[model_name] = accuracy

In [None]:
# Print accuracies
for model_name, accuracy in accuracies.items():
    print(f"{model_name} Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.7532467532467533
Decision Tree Accuracy: 0.7467532467532467
Random Forest Accuracy: 0.7402597402597403
Support Vector Machine Accuracy: 0.7337662337662337
K-Nearest Neighbors Accuracy: 0.6948051948051948


In [None]:
def get_metrics(y_test, y_pred):
    metrics = {}
    MSE = mean_squared_error(y_test, y_pred)
    RMSE = np.sqrt(MSE)
    MAE = mean_absolute_error(y_test, y_pred)
    R2 = r2_score(y_test, y_pred)

    metrics['MSE'] = MSE
    metrics['RMSE'] = RMSE
    metrics['MAE'] = MAE
    metrics['R2'] = R2

    return metrics
# Create an empty DataFrame to store metrics
metrics_df = pd.DataFrame(columns=['Model', 'MSE', 'RMSE', 'MAE', 'R2'])
# Iterate through each model in the dictionary
for model_name, model in models.items():
    metrics = get_metrics(y_test, y_pred)
    metrics['Model'] = model_name
    metrics_df = pd.concat([metrics_df, pd.DataFrame(metrics, index=[0])], ignore_index=True)

# Print the DataFrame
print(metrics_df)

                    Model       MSE      RMSE       MAE        R2
0     Logistic Regression  0.305195  0.552444  0.305195 -0.329293
1           Decision Tree  0.305195  0.552444  0.305195 -0.329293
2           Random Forest  0.305195  0.552444  0.305195 -0.329293
3  Support Vector Machine  0.305195  0.552444  0.305195 -0.329293
4     K-Nearest Neighbors  0.305195  0.552444  0.305195 -0.329293


## Hyperparameter Tuning

In [None]:
# Define the parameter grid for RandomForestClassifier
params_RF = {"min_samples_split": [2, 6, 20],
              "min_samples_leaf": [1, 2, 4],
              "n_estimators" :[50,100,200,300,400],
              "max_depth": [None, 10, 20, 30],
              "criterion": ["gini", "entropy"]
              }

# Initialize GridSearchCV
GridSearchCV_RF = GridSearchCV(estimator=RandomForestClassifier(), param_grid=params_RF, cv=3, scoring="accuracy", return_train_score=True)

# Fit the model
GridSearchCV_RF.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", GridSearchCV_RF.best_params_)

# Best estimator
RF_model = GridSearchCV_RF.best_estimator_

# Evaluate the best model
y_pred = RF_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best RandomForestClassifier Model after Hyperparameter Tuning:")
print(f"Accuracy: {accuracy}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R2 Score: {r2}")
#11 m

Best parameters found:  {'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best RandomForestClassifier Model after Hyperparameter Tuning:
Accuracy: 0.7597402597402597
MSE: 0.24025974025974026
RMSE: 0.4901629731627434
MAE: 0.24025974025974026
R2 Score: -0.046464646464646764


In [None]:
# Define the parameter grid for LogisticRegression
params_LR = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear']
}

# Initialize GridSearchCV
GridSearchCV_LR = GridSearchCV(estimator=LogisticRegression(max_iter=1000), param_grid=params_LR, cv=3, scoring="accuracy", return_train_score=True)

# Fit the model
GridSearchCV_LR.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", GridSearchCV_LR.best_params_)

# Best estimator
LR_model = GridSearchCV_LR.best_estimator_

# Evaluate the best model
y_pred = LR_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best LogisticRegression Model after Hyperparameter Tuning:")
print(f"Accuracy: {accuracy}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R2 Score: {r2}")

Best parameters found:  {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best LogisticRegression Model after Hyperparameter Tuning:
Accuracy: 0.7597402597402597
MSE: 0.24025974025974026
RMSE: 0.4901629731627434
MAE: 0.24025974025974026
R2 Score: -0.046464646464646764


In [None]:
# Create a DataFrame to store model evaluation results
results = pd.DataFrame({
    "Model": ["K-Nearest Neighbors", "Logistic Regression", "Decision Tree", "Random Forest", "SVM"],
    "Train Score": [
        # Calculate mean cross-validated accuracy for training set
        cross_val_score(KNeighborsClassifier().fit(X_train, y_train), X_train, y_train, cv=3).mean(),
        cross_val_score(LR_model, X_train, y_train, cv=3).mean(),
        cross_val_score(DecisionTreeClassifier().fit(X_train, y_train), X_train, y_train, cv=3).mean(),
        cross_val_score(RF_model, X_train, y_train, cv=3).mean(),
        cross_val_score(SVC().fit(X_train, y_train), X_train, y_train, cv=3).mean(),
    ],
    "Test Score": [
        # Calculate accuracy on the test set
        KNeighborsClassifier().fit(X_train, y_train).score(X_test, y_test),
        LR_model.score(X_test, y_test),
        DecisionTreeClassifier().fit(X_train, y_train).score(X_test, y_test),
        RF_model.score(X_test, y_test),
        SVC().fit(X_train, y_train).score(X_test, y_test),
    ]
})
# Additional Metrics (precision, recall, F1 score)
metrics = ["precision", "recall", "f1"]
# Fit models before calculating metrics
knn_model = KNeighborsClassifier().fit(X_train, y_train)
dt_model = DecisionTreeClassifier().fit(X_train, y_train)
svm_model = SVC().fit(X_train, y_train)

for metric in metrics:
    results[f"{metric.capitalize()}"] = [
        precision_recall_fscore_support(y_test, model.predict(X_test), average="weighted")[metrics.index(metric)]
        for model in [knn_model, LR_model, dt_model, RF_model, svm_model]
    ]

result_df = results.sort_values(by="Test Score", ascending=False)
result_df = result_df.set_index("Test Score")
result_df

Unnamed: 0_level_0,Model,Train Score,Precision,Recall,F1
Test Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.75974,Logistic Regression,0.770333,0.757191,0.75974,0.758165
0.75974,Decision Tree,0.723139,0.750916,0.74026,0.743596
0.75974,Random Forest,0.781763,0.75881,0.75974,0.759242
0.733766,SVM,0.757325,0.727959,0.733766,0.729265
0.694805,K-Nearest Neighbors,0.744301,0.687444,0.694805,0.689645


In [None]:
# Cross-validation for the best model
cv_scores = cross_val_score(LR_model, X, y, cv=5)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Average Cross-Validation Score: {cv_scores.mean()}')

Cross-Validation Scores: [0.77922078 0.72727273 0.74675325 0.79084967 0.75163399]
Average Cross-Validation Score: 0.7591460826754944


## Model Saving and Predictions

In [None]:
# Save the best model
joblib.dump(LR_model, 'diabetic_model.pkl')
# Load the model
loaded_model = joblib.load('diabetic_model.pkl')

In [None]:
feature_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

def predict_diabetes(input_data):
    input_df = pd.DataFrame([input_data], columns=feature_names)
    input_data_scaled = scaler.transform(input_df)
    prediction = loaded_model.predict(input_data_scaled)
    return 'Patient Diabetic' if prediction[0] == 1 else 'Patient Non-Diabetic'

# Example prediction
example_input = [6, 148, 72, 35, 0, 33.6, 0.627, 50]
print(predict_diabetes(example_input))

Patient Diabetic


In [None]:
# Example prediction
example_input = [1, 89, 66, 23, 94, 28.1, 0.167, 21]
print(predict_diabetes(example_input))

Patient Non-Diabetic


In [None]:
# Example prediction
example_input = [0, 137,	40,	35,	168,	43.1,	2.288,	33]
print(predict_diabetes(example_input))

Patient Diabetic


In [None]:
# Example prediction
example_input = [1,	85,	66,	29,	0,	26.6,	0.351,	31]
print(predict_diabetes(example_input))

Patient Non-Diabetic


In [None]:
import sklearn
print(sklearn.__version__)

1.2.2
