In [49]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [50]:
file_path = Path("full_numeric_table.csv")
df_diabetes = pd.read_csv(file_path)
df_diabetes.head()

Unnamed: 0,patient_no,gender_numeric,age_range,hypertension_numeric,heart_disease_numeric,smoking_history_numeric,bmi,hba1c_level,blood_glucose_level,diabetes_numeric
0,1,0,5,0,1,0,25.19,6.6,140,0
1,3,1,2,0,0,0,27.32,5.7,158,0
2,4,0,3,0,0,1,23.45,5.0,155,0
3,5,1,5,1,1,1,20.14,4.8,155,0
4,6,0,1,0,0,0,27.32,6.6,85,0


In [51]:
X = df_diabetes.copy()
X = X.drop("diabetes_numeric", axis=1)
X.head()

Unnamed: 0,patient_no,gender_numeric,age_range,hypertension_numeric,heart_disease_numeric,smoking_history_numeric,bmi,hba1c_level,blood_glucose_level
0,1,0,5,0,1,0,25.19,6.6,140
1,3,1,2,0,0,0,27.32,5.7,158
2,4,0,3,0,0,1,23.45,5.0,155
3,5,1,5,1,1,1,20.14,4.8,155
4,6,0,1,0,0,0,27.32,6.6,85


In [52]:
y = df_diabetes["diabetes_numeric"].values
y[:5]

array([0, 0, 0, 0, 0])

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [54]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10569, 9)
(3523, 9)
(10569,)
(3523,)


In [55]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [56]:
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(11273, 9)
(2819, 9)
(11273,)
(2819,)


In [57]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [58]:
model = tree.DecisionTreeClassifier()

In [59]:
model = model.fit(X_train_scaled, y_train)

In [60]:
predictions = model.predict(X_test_scaled)

In [61]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [62]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1696,84
Actual 1,55,1688


Accuracy Score : 0.9605449900652853
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1780
           1       0.95      0.97      0.96      1743

    accuracy                           0.96      3523
   macro avg       0.96      0.96      0.96      3523
weighted avg       0.96      0.96      0.96      3523



In [63]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [64]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [65]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1700,80
Actual 1,57,1686


In [66]:
acc_score = accuracy_score(y_test, predictions)

In [67]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1700,80
Actual 1,57,1686


Accuracy Score : 0.9611126880499574
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1780
           1       0.95      0.97      0.96      1743

    accuracy                           0.96      3523
   macro avg       0.96      0.96      0.96      3523
weighted avg       0.96      0.96      0.96      3523



In [68]:
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [69]:
rf_model = rf_model.fit(X_train_scaled, y_train)

In [70]:
predictions = rf_model.predict(X_test_scaled)

In [71]:
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1775,5
Actual 1,73,1670


In [72]:
acc_score = accuracy_score(y_test, predictions)

In [73]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1775,5
Actual 1,73,1670


Accuracy Score : 0.977859778597786
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1780
           1       1.00      0.96      0.98      1743

    accuracy                           0.98      3523
   macro avg       0.98      0.98      0.98      3523
weighted avg       0.98      0.98      0.98      3523



In [74]:
importances = rf_model.feature_importances_
importances

array([0.5709181 , 0.0026532 , 0.03609661, 0.00748086, 0.00400117,
       0.00271139, 0.03472492, 0.20268206, 0.13873169])

In [75]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5709181037327437, 'patient_no'),
 (0.20268206154836022, 'hba1c_level'),
 (0.13873168763568744, 'blood_glucose_level'),
 (0.03609661202185851, 'age_range'),
 (0.03472492226110394, 'bmi'),
 (0.00748086370717771, 'hypertension_numeric'),
 (0.0040011681729291235, 'heart_disease_numeric'),
 (0.0027113854239360588, 'smoking_history_numeric'),
 (0.0026531954962033897, 'gender_numeric')]

In [76]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
                                            learning_rate=learning_rate,
                                            max_features=5,
                                            max_depth=3,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.979
Accuracy score (validation): 0.978

Learning rate:  0.1
Accuracy score (training): 0.979
Accuracy score (validation): 0.978

Learning rate:  0.25
Accuracy score (training): 0.979
Accuracy score (validation): 0.978

Learning rate:  0.5
Accuracy score (training): 0.981
Accuracy score (validation): 0.977

Learning rate:  0.75
Accuracy score (training): 0.982
Accuracy score (validation): 0.976

Learning rate:  1
Accuracy score (training): 0.983
Accuracy score (validation): 0.976



In [77]:
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.5,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,1,1
3,1,1
4,0,0
5,0,0
6,1,1
7,1,1
8,0,0
9,1,1


In [78]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.9772920806131138


In [79]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1776,4
Actual 1,76,1667


In [80]:
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1780
           1       1.00      0.96      0.98      1743

    accuracy                           0.98      3523
   macro avg       0.98      0.98      0.98      3523
weighted avg       0.98      0.98      0.98      3523



In [81]:
from collections import Counter
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify = y)
Counter(y_train)

Counter({0: 5281, 1: 5288})

In [82]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 5288, 1: 5288})

In [83]:
from sklearn.linear_model import LogisticRegression

In [84]:
model = LogisticRegression(solver='lbfgs', random_state=1, max_iter=1000)
model.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=1000, random_state=1)

In [85]:
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[1713,   52],
       [  85, 1673]])

In [86]:
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.9610939227231563

In [87]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.95      0.97      0.95      0.96      0.96      0.93      1765
          1       0.97      0.95      0.97      0.96      0.96      0.92      1758

avg / total       0.96      0.96      0.96      0.96      0.96      0.92      3523



In [88]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 5288, 1: 5288})

In [89]:
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9610939227231563

In [90]:
confusion_matrix(y_test, y_pred)

array([[1713,   52],
       [  85, 1673]])

In [91]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.95      0.97      0.95      0.96      0.96      0.93      1765
          1       0.97      0.95      0.97      0.96      0.96      0.92      1758

avg / total       0.96      0.96      0.96      0.96      0.96      0.92      3523

