In [44]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [45]:
file_path = Path("Final_Dataset.csv")
df_diabetes = pd.read_csv(file_path)
df_diabetes.head()

Unnamed: 0,patient_no,gender_numeric,age_range,hypertension_numeric,heart_disease_numeric,smoking_history_numeric,bmi,hba1c_level,blood_glucose_level,diabetes_numeric
0,1,0,5,0,1,0,25.19,6.6,140,0
1,3,1,2,0,0,0,27.32,5.7,158,0
2,4,0,3,0,0,1,23.45,5.0,155,0
3,5,1,5,1,1,1,20.14,4.8,155,0
4,6,0,1,0,0,0,27.32,6.6,85,0


In [46]:
X = df_diabetes.copy()
X = X.drop("diabetes_numeric", axis=1)
X.head()

Unnamed: 0,patient_no,gender_numeric,age_range,hypertension_numeric,heart_disease_numeric,smoking_history_numeric,bmi,hba1c_level,blood_glucose_level
0,1,0,5,0,1,0,25.19,6.6,140
1,3,1,2,0,0,0,27.32,5.7,158
2,4,0,3,0,0,1,23.45,5.0,155
3,5,1,5,1,1,1,20.14,4.8,155
4,6,0,1,0,0,0,27.32,6.6,85


In [47]:
y = df_diabetes["diabetes_numeric"].values
y[:5]

array([0, 0, 0, 0, 0])

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [49]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(48138, 9)
(16046, 9)
(48138,)
(16046,)


In [50]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [51]:
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(51347, 9)
(12837, 9)
(51347,)
(12837,)


In [52]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [53]:
model = tree.DecisionTreeClassifier()

In [54]:
model = model.fit(X_train_scaled, y_train)

In [55]:
predictions = model.predict(X_test_scaled)

In [56]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [57]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13792,481
Actual 1,454,1319


Accuracy Score : 0.9417300261747475
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     14273
           1       0.73      0.74      0.74      1773

    accuracy                           0.94     16046
   macro avg       0.85      0.86      0.85     16046
weighted avg       0.94      0.94      0.94     16046



In [58]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [59]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [60]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13790,483
Actual 1,454,1319


In [61]:
acc_score = accuracy_score(y_test, predictions)

In [62]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13790,483
Actual 1,454,1319


Accuracy Score : 0.9416053845195064
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     14273
           1       0.73      0.74      0.74      1773

    accuracy                           0.94     16046
   macro avg       0.85      0.86      0.85     16046
weighted avg       0.94      0.94      0.94     16046



In [63]:
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [64]:
rf_model = rf_model.fit(X_train_scaled, y_train)

In [65]:
predictions = rf_model.predict(X_test_scaled)

In [66]:
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14226,47
Actual 1,546,1227


In [67]:
acc_score = accuracy_score(y_test, predictions)

In [68]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14226,47
Actual 1,546,1227


Accuracy Score : 0.9630437492209897
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14273
           1       0.96      0.69      0.81      1773

    accuracy                           0.96     16046
   macro avg       0.96      0.84      0.89     16046
weighted avg       0.96      0.96      0.96     16046



In [69]:
importances = rf_model.feature_importances_
importances

array([0.11198723, 0.00692996, 0.03027236, 0.01416123, 0.00891592,
       0.00916959, 0.10908354, 0.39796976, 0.31151041])

In [70]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3979697589845624, 'hba1c_level'),
 (0.31151040702921473, 'blood_glucose_level'),
 (0.11198723123831995, 'patient_no'),
 (0.10908353794617606, 'bmi'),
 (0.030272363172485245, 'age_range'),
 (0.014161233126391216, 'hypertension_numeric'),
 (0.009169594654452788, 'smoking_history_numeric'),
 (0.008915918504071257, 'heart_disease_numeric'),
 (0.006929955344326306, 'gender_numeric')]

In [71]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
                                            learning_rate=learning_rate,
                                            max_features=5,
                                            max_depth=3,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.963
Accuracy score (validation): 0.964

Learning rate:  0.1
Accuracy score (training): 0.963
Accuracy score (validation): 0.964

Learning rate:  0.25
Accuracy score (training): 0.963
Accuracy score (validation): 0.964

Learning rate:  0.5
Accuracy score (training): 0.964
Accuracy score (validation): 0.965

Learning rate:  0.75
Accuracy score (training): 0.964
Accuracy score (validation): 0.965

Learning rate:  1
Accuracy score (training): 0.964
Accuracy score (validation): 0.965



In [72]:
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.5,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,1
7,0,0
8,0,0
9,0,1


In [73]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.965287299015331


In [74]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14249,24
Actual 1,533,1240


In [75]:
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14273
           1       0.98      0.70      0.82      1773

    accuracy                           0.97     16046
   macro avg       0.97      0.85      0.90     16046
weighted avg       0.97      0.97      0.96     16046



In [76]:
from collections import Counter
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify = y)
Counter(y_train)

Counter({0: 42854, 1: 5284})

In [77]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 42854, 1: 42854})

In [78]:
from sklearn.linear_model import LogisticRegression

In [79]:
model = LogisticRegression(solver='lbfgs', random_state=1, max_iter=1000)
model.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=1000, random_state=1)

In [80]:
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[11697,  2587],
       [  358,  1404]])

In [81]:
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.8078550300042815

In [82]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.82      0.80      0.89      0.81      0.65     14284
          1       0.35      0.80      0.82      0.49      0.81      0.65      1762

avg / total       0.90      0.82      0.80      0.84      0.81      0.65     16046



In [83]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 42854, 1: 42854})

In [84]:
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8078550300042815

In [85]:
confusion_matrix(y_test, y_pred)

array([[11697,  2587],
       [  358,  1404]])

In [86]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.82      0.80      0.89      0.81      0.65     14284
          1       0.35      0.80      0.82      0.49      0.81      0.65      1762

avg / total       0.90      0.82      0.80      0.84      0.81      0.65     16046

