In [30]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [31]:
file_path = Path("full_numeric_table.csv")
df_diabetes = pd.read_csv(file_path)
df_diabetes.head()

Unnamed: 0,patient_no,gender_numeric,age_range,hypertension_numeric,heart_disease_numeric,smoking_history_numeric,bmi,hba1c_level,blood_glucose_level,diabetes_numeric
0,1,0,5,0,1,0,25.19,6.6,140,0
1,3,1,2,0,0,0,27.32,5.7,158,0
2,4,0,3,0,0,1,23.45,5.0,155,0
3,5,1,5,1,1,1,20.14,4.8,155,0
4,6,0,1,0,0,0,27.32,6.6,85,0


In [32]:
X = df_diabetes.copy()
X = X.drop("diabetes_numeric", axis=1)
X.head()

Unnamed: 0,patient_no,gender_numeric,age_range,hypertension_numeric,heart_disease_numeric,smoking_history_numeric,bmi,hba1c_level,blood_glucose_level
0,1,0,5,0,1,0,25.19,6.6,140
1,3,1,2,0,0,0,27.32,5.7,158
2,4,0,3,0,0,1,23.45,5.0,155
3,5,1,5,1,1,1,20.14,4.8,155
4,6,0,1,0,0,0,27.32,6.6,85


In [33]:
y = df_diabetes["diabetes_numeric"].values
y[:5]

array([0, 0, 0, 0, 0])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [35]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10569, 9)
(3523, 9)
(10569,)
(3523,)


In [36]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [37]:
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(11273, 9)
(2819, 9)
(11273,)
(2819,)


In [38]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [39]:
model = tree.DecisionTreeClassifier()

In [40]:
model = model.fit(X_train_scaled, y_train)

In [41]:
predictions = model.predict(X_test_scaled)

In [42]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [43]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1696,84
Actual 1,55,1688


Accuracy Score : 0.9605449900652853
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1780
           1       0.95      0.97      0.96      1743

    accuracy                           0.96      3523
   macro avg       0.96      0.96      0.96      3523
weighted avg       0.96      0.96      0.96      3523



In [44]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [45]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [46]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1698,82
Actual 1,54,1689


In [47]:
acc_score = accuracy_score(y_test, predictions)

In [48]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1698,82
Actual 1,54,1689


Accuracy Score : 0.9613965370422936
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1780
           1       0.95      0.97      0.96      1743

    accuracy                           0.96      3523
   macro avg       0.96      0.96      0.96      3523
weighted avg       0.96      0.96      0.96      3523



In [49]:
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [50]:
rf_model = rf_model.fit(X_train_scaled, y_train)

In [51]:
predictions = rf_model.predict(X_test_scaled)

In [52]:
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1775,5
Actual 1,73,1670


In [53]:
acc_score = accuracy_score(y_test, predictions)

In [54]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1775,5
Actual 1,73,1670


Accuracy Score : 0.977859778597786
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1780
           1       1.00      0.96      0.98      1743

    accuracy                           0.98      3523
   macro avg       0.98      0.98      0.98      3523
weighted avg       0.98      0.98      0.98      3523



In [55]:
importances = rf_model.feature_importances_
importances

array([0.5709181 , 0.0026532 , 0.03609661, 0.00748086, 0.00400117,
       0.00271139, 0.03472492, 0.20268206, 0.13873169])

In [56]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5709181037327437, 'patient_no'),
 (0.20268206154836022, 'hba1c_level'),
 (0.13873168763568744, 'blood_glucose_level'),
 (0.03609661202185851, 'age_range'),
 (0.03472492226110394, 'bmi'),
 (0.00748086370717771, 'hypertension_numeric'),
 (0.0040011681729291235, 'heart_disease_numeric'),
 (0.0027113854239360588, 'smoking_history_numeric'),
 (0.0026531954962033897, 'gender_numeric')]