In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
import joblib

In [4]:
# Load the dataset
df = pd.read_csv('diabetes_prediction_dataset.csv')

# Data Preprocessing
df['age-label'] = pd.cut(df['age'], bins=[0, 9, 19, 59, 100], labels=['Child', 'Young_Adult', 'Adult', 'Elderly'])
df.drop(['age'], axis=1, inplace=True)

# Prepare the data
X = df.drop(['diabetes'], axis=1)
y = df['diabetes']

In [5]:
# Separate categorical and numerical columns
cat_cols = X.select_dtypes(exclude=np.number).columns
num_cols = X.select_dtypes(include=np.number).columns


In [6]:
# Pipeline and Column Transformation
category_pipeline = Pipeline(steps=[("one_hotencoding", OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse=True))])
numeric_pipeline = Pipeline(steps=[('scaling', StandardScaler())])
full_processor = ColumnTransformer(transformers=[('numeric', numeric_pipeline, num_cols),
                                                  ('categorical', category_pipeline, cat_cols)])
X_preprocessed = full_processor.fit_transform(X)



In [7]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)


In [8]:
# Logistic Regression
lc_reg = LogisticRegression()
lc_reg.fit(X_train, y_train)
log_predict = lc_reg.predict(X_test)

print("Logistic Regression:")
print(confusion_matrix(y_test, log_predict))
print(classification_report(y_test, log_predict))

Logistic Regression:
[[18142   150]
 [  665  1043]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     18292
           1       0.87      0.61      0.72      1708

    accuracy                           0.96     20000
   macro avg       0.92      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Random Forest
rm_forest = RandomForestClassifier()
rf_predict = rm_forest.fit(X_train, y_train).predict(X_test)

print("Random Forest:")
print(confusion_matrix(y_test, rf_predict))
print(classification_report(y_test, rf_predict))


Random Forest:
[[18174   118]
 [  513  1195]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     18292
           1       0.91      0.70      0.79      1708

    accuracy                           0.97     20000
   macro avg       0.94      0.85      0.89     20000
weighted avg       0.97      0.97      0.97     20000



In [10]:
# Handling Imbalance with SMOTE
sm = SMOTE(random_state=2)
X_smote, y_smote = sm.fit_resample(X_preprocessed, y)

# Split the balanced data into train and test sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)


In [11]:
# Logistic Regression with balanced data
lc_reg1 = LogisticRegression()
log_predict1 = lc_reg1.fit(X1_train, y1_train).predict(X1_test)

print("Logistic Regression (Balanced Data):")
print(confusion_matrix(y1_test, log_predict1))
print(classification_report(y1_test, log_predict1))

Logistic Regression (Balanced Data):
[[16219  2074]
 [ 2300 16007]]
              precision    recall  f1-score   support

           0       0.88      0.89      0.88     18293
           1       0.89      0.87      0.88     18307

    accuracy                           0.88     36600
   macro avg       0.88      0.88      0.88     36600
weighted avg       0.88      0.88      0.88     36600



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Random Forest with balanced data
rm_forest1 = RandomForestClassifier()
rf_predict1 = rm_forest1.fit(X1_train, y1_train).predict(X1_test)

print("Random Forest (Balanced Data):")
print(confusion_matrix(y1_test, rf_predict1))
print(classification_report(y1_test, rf_predict1))

Random Forest (Balanced Data):
[[17766   527]
 [  518 17789]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     18293
           1       0.97      0.97      0.97     18307

    accuracy                           0.97     36600
   macro avg       0.97      0.97      0.97     36600
weighted avg       0.97      0.97      0.97     36600



In [13]:
# Calculate AUC for each model (continued)
pred_prob2 = rm_forest.predict_proba(X_test)
auc_score2 = roc_auc_score(y_test, pred_prob2[:, 1])
print('AUC Score Random Forest (Imbalanced Data):', auc_score2)

pred_prob3 = lc_reg1.predict_proba(X1_test)
auc_score3 = roc_auc_score(y1_test, pred_prob3[:, 1])
print('AUC Score for Logistic Regression (Balanced Data):', auc_score3)

pred_prob4 = rm_forest1.predict_proba(X1_test)
auc_score4 = roc_auc_score(y1_test, pred_prob4[:, 1])
print('AUC Score for Random Forest (Balanced Data):', auc_score4)

AUC Score Random Forest (Imbalanced Data): 0.9541785969064936
AUC Score for Logistic Regression (Balanced Data): 0.9597676700666362
AUC Score for Random Forest (Balanced Data): 0.9957755241810763


In [15]:
import joblib

# Save the model
joblib.dump(lc_reg, 'logistic_regression_model.pkl')
joblib.dump(rm_forest, 'random_forest_model.pkl')
joblib.dump(lc_reg1, 'logistic_regression_balanced_model.pkl')
joblib.dump(rm_forest1, 'random_forest_balanced_model.pkl')

# Load the model
loaded_lc_reg = joblib.load('logistic_regression_model.pkl')
loaded_rm_forest = joblib.load('random_forest_model.pkl')
loaded_lc_reg1 = joblib.load('logistic_regression_balanced_model.pkl')
loaded_rm_forest1 = joblib.load('random_forest_balanced_model.pkl')


In [22]:
import pandas as pd
import numpy as np
import joblib

# Load the saved models
loaded_lc_reg = joblib.load('logistic_regression_model.pkl')
loaded_rm_forest = joblib.load('random_forest_model.pkl')
loaded_lc_reg1 = joblib.load('logistic_regression_balanced_model.pkl')
loaded_rm_forest1 = joblib.load('random_forest_balanced_model.pkl')

# Create a new DataFrame for the new data
new_data = pd.DataFrame({
    'gender': ['Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female'],
    'age': [80.0, 54.0, 28.0, 36.0, 76.0, 20.0, 44.0],
    'hypertension': [0, 0, 0, 0, 1, 0, 0],
    'heart_disease': [1, 0, 0, 0, 1, 0, 0],
    'smoking_history': ['never', 'No Info', 'never', 'current', 'current', 'never', 'never'],
    'bmi': [25.19, 27.32, 27.32, 23.45, 20.14, 27.32, 19.31],
    'HbA1c_level': [6.6, 6.6, 5.7, 5.0, 4.8, 6.6, 6.5],
    'blood_glucose_level': [140, 80, 158, 155, 155, 85, 200]
})

# Preprocess the new data
new_data['age-label'] = pd.cut(new_data['age'], bins=[0, 9, 19, 59, 100], labels=['Child', 'Young_Adult', 'Adult', 'Elderly'])
new_data.drop(['age'], axis=1, inplace=True)

# Apply the same transformations as before
new_data_preprocessed = full_processor.transform(new_data)

# Make predictions using the loaded models
logistic_regression_predictions = loaded_lc_reg.predict(new_data_preprocessed)
random_forest_predictions = loaded_rm_forest.predict(new_data_preprocessed)
logistic_regression_balanced_predictions = loaded_lc_reg1.predict(new_data_preprocessed)
random_forest_balanced_predictions = loaded_rm_forest1.predict(new_data_preprocessed)

# Print the predictions
print("Logistic Regression Predictions:")
print(logistic_regression_predictions)

print("Random Forest Predictions:")
print(random_forest_predictions)

print("Logistic Regression (Balanced Data) Predictions:")
print(logistic_regression_balanced_predictions)

print("Random Forest (Balanced Data) Predictions:")
print(random_forest_balanced_predictions)


Logistic Regression Predictions:
[0 0 0 0 0 0 0]
Random Forest Predictions:
[0 0 0 0 0 0 1]
Logistic Regression (Balanced Data) Predictions:
[1 0 0 0 0 0 1]
Random Forest (Balanced Data) Predictions:
[0 0 0 0 0 0 1]


In [19]:
import joblib

# Save the preprocessor pipelines
joblib.dump(full_processor, 'preprocessor_pipeline.pkl')

# Load the preprocessor pipelines
full_processor = joblib.load('preprocessor_pipeline.pkl')



In [21]:
import pandas as pd
import numpy as np
import joblib

# Load the saved models
loaded_lc_reg = joblib.load('logistic_regression_model.pkl')
loaded_rm_forest = joblib.load('random_forest_model.pkl')
loaded_lc_reg1 = joblib.load('logistic_regression_balanced_model.pkl')
loaded_rm_forest1 = joblib.load('random_forest_balanced_model.pkl')

# Create a new DataFrame for the new data
new_data = pd.DataFrame({
    'gender': ['Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female'],
    'age': [80.0, 54.0, 28.0, 36.0, 76.0, 20.0, 44.0],
    'hypertension': [0, 0, 0, 0, 1, 0, 0],
    'heart_disease': [1, 0, 0, 0, 1, 0, 0],
    'smoking_history': ['never', 'No Info', 'never', 'current', 'current', 'never', 'never'],
    'bmi': [25.19, 27.32, 27.32, 23.45, 20.14, 27.32, 19.31],
    'HbA1c_level': [6.6, 6.6, 5.7, 5.0, 4.8, 6.6, 6.5],
    'blood_glucose_level': [140, 80, 158, 155, 155, 85, 200]
})

# Preprocess the new data
new_data['age-label'] = pd.cut(new_data['age'], bins=[0, 9, 19, 59, 100], labels=['Child', 'Young_Adult', 'Adult', 'Elderly'])
new_data.drop(['age'], axis=1, inplace=True)

# Apply the same transformations as before
new_data_preprocessed = full_processor.transform(new_data)

# Make predictions using the loaded models
logistic_regression_probabilities = loaded_lc_reg.predict_proba(new_data_preprocessed)
random_forest_probabilities = loaded_rm_forest.predict_proba(new_data_preprocessed)
logistic_regression_balanced_probabilities = loaded_lc_reg1.predict_proba(new_data_preprocessed)
random_forest_balanced_probabilities = loaded_rm_forest1.predict_proba(new_data_preprocessed)

# Convert probabilities to percentages
logistic_regression_percentages = logistic_regression_probabilities[:, 1] * 100
random_forest_percentages = random_forest_probabilities[:, 1] * 100
logistic_regression_balanced_percentages = logistic_regression_balanced_probabilities[:, 1] * 100
random_forest_balanced_percentages = random_forest_balanced_probabilities[:, 1] * 100

# Round the percentages to two decimal places
logistic_regression_percentages = np.round(logistic_regression_percentages, 2)
random_forest_percentages = np.round(random_forest_percentages, 2)
logistic_regression_balanced_percentages = np.round(logistic_regression_balanced_percentages, 2)
random_forest_balanced_percentages = np.round(random_forest_balanced_percentages, 2)

# Print the risk percentages
print("Logistic Regression Risk Percentages:")
for percentage in logistic_regression_percentages:
    print(f"{percentage}%")

print("Random Forest Risk Percentages:")
for percentage in random_forest_percentages:
    print(f"{percentage}%")

print("Logistic Regression (Balanced Data) Risk Percentages:")
for percentage in logistic_regression_balanced_percentages:
    print(f"{percentage}%")

print("Random Forest (Balanced Data) Risk Percentages:")
for percentage in random_forest_balanced_percentages:
    print(f"{percentage}%")


Logistic Regression Risk Percentages:
31.82%
0.52%
1.92%
0.22%
2.43%
1.07%
16.76%
Random Forest Risk Percentages:
4.0%
0.0%
0.0%
0.0%
4.0%
0.0%
61.0%
Logistic Regression (Balanced Data) Risk Percentages:
83.35%
5.74%
17.97%
2.24%
25.59%
10.56%
62.11%
Random Forest (Balanced Data) Risk Percentages:
43.0%
0.0%
0.0%
0.0%
1.0%
0.0%
65.0%
