In [1]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
file_path = Path("full_numeric_table.csv")
df_diabetes = pd.read_csv(file_path)
df_diabetes.head()

Unnamed: 0,patient_no,gender_numeric,age,age_range,hypertension_numeric,heart_disease_numeric,smoking_history_numeric,bmi,hba1c_level,blood_glucose_level,diabetes_numeric
0,1,0,80.0,5,0,1,0,25.19,6.6,140,0
1,3,1,28.0,2,0,0,0,27.32,5.7,158,0
2,4,0,36.0,3,0,0,1,23.45,5.0,155,0
3,5,1,76.0,5,1,1,1,20.14,4.8,155,0
4,6,0,20.0,1,0,0,0,27.32,6.6,85,0


In [3]:
X = df_diabetes.copy()
X = X.drop("diabetes_numeric", axis=1)
X.head()

Unnamed: 0,patient_no,gender_numeric,age,age_range,hypertension_numeric,heart_disease_numeric,smoking_history_numeric,bmi,hba1c_level,blood_glucose_level
0,1,0,80.0,5,0,1,0,25.19,6.6,140
1,3,1,28.0,2,0,0,0,27.32,5.7,158
2,4,0,36.0,3,0,0,1,23.45,5.0,155
3,5,1,76.0,5,1,1,1,20.14,4.8,155
4,6,0,20.0,1,0,0,0,27.32,6.6,85


In [4]:
y = df_diabetes["diabetes_numeric"].values
y[:5]

array([0, 0, 0, 0, 0])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [6]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10569, 10)
(3523, 10)
(10569,)
(3523,)


In [7]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [8]:
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(11273, 10)
(2819, 10)
(11273,)
(2819,)


In [9]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
model = tree.DecisionTreeClassifier()

In [11]:
model = model.fit(X_train_scaled, y_train)

In [12]:
predictions = model.predict(X_test_scaled)

In [13]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [14]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1695,85
Actual 1,57,1686


Accuracy Score : 0.9596934430882771
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1780
           1       0.95      0.97      0.96      1743

    accuracy                           0.96      3523
   macro avg       0.96      0.96      0.96      3523
weighted avg       0.96      0.96      0.96      3523



In [15]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [16]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [17]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1700,80
Actual 1,56,1687


In [18]:
acc_score = accuracy_score(y_test, predictions)

In [19]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1700,80
Actual 1,56,1687


Accuracy Score : 0.9613965370422936
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1780
           1       0.95      0.97      0.96      1743

    accuracy                           0.96      3523
   macro avg       0.96      0.96      0.96      3523
weighted avg       0.96      0.96      0.96      3523



In [20]:
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [21]:
rf_model = rf_model.fit(X_train_scaled, y_train)

In [22]:
predictions = rf_model.predict(X_test_scaled)

In [23]:
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1775,5
Actual 1,76,1667


In [24]:
acc_score = accuracy_score(y_test, predictions)

In [25]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1775,5
Actual 1,76,1667


Accuracy Score : 0.9770082316207778
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1780
           1       1.00      0.96      0.98      1743

    accuracy                           0.98      3523
   macro avg       0.98      0.98      0.98      3523
weighted avg       0.98      0.98      0.98      3523



In [26]:
importances = rf_model.feature_importances_
importances

array([0.56786224, 0.00250127, 0.0439268 , 0.02343192, 0.00573179,
       0.0035303 , 0.00265438, 0.02843812, 0.179931  , 0.14199217])

In [27]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5678622399289267, 'patient_no'),
 (0.17993100129318304, 'hba1c_level'),
 (0.14199217459678623, 'blood_glucose_level'),
 (0.043926800088168794, 'age'),
 (0.028438120595941755, 'bmi'),
 (0.02343191872973711, 'age_range'),
 (0.005731794377569304, 'hypertension_numeric'),
 (0.00353030282331363, 'heart_disease_numeric'),
 (0.002654379198199298, 'smoking_history_numeric'),
 (0.0025012683681739727, 'gender_numeric')]

In [28]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
                                            learning_rate=learning_rate,
                                            max_features=5,
                                            max_depth=3,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.979
Accuracy score (validation): 0.978

Learning rate:  0.1
Accuracy score (training): 0.979
Accuracy score (validation): 0.978

Learning rate:  0.25
Accuracy score (training): 0.979
Accuracy score (validation): 0.978

Learning rate:  0.5
Accuracy score (training): 0.981
Accuracy score (validation): 0.978

Learning rate:  0.75
Accuracy score (training): 0.982
Accuracy score (validation): 0.978

Learning rate:  1
Accuracy score (training): 0.982
Accuracy score (validation): 0.977



In [29]:
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.5,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,1,1
3,1,1
4,0,0
5,0,0
6,1,1
7,1,1
8,0,0
9,1,1


In [30]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.9784274765824581


In [31]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1778,2
Actual 1,74,1669


In [32]:
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1780
           1       1.00      0.96      0.98      1743

    accuracy                           0.98      3523
   macro avg       0.98      0.98      0.98      3523
weighted avg       0.98      0.98      0.98      3523

