In [11]:
# Imports
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Modeling

In this step we'll load the cleaned data set and then perform the modeling steps.

Diabetes_012 class types:
- 0 is for no diabetes or only during pregnancy
- 1 is for prediabetes
- 2 is for diabetes.

In [12]:
# Read the cleaned dataset written out by the EDA step into a DataFrame.
# The path is relative to the notebook's working directory.
file_path = "../data/cleaned_diabetes_health_indicators_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [13]:
# Diabetes_012 is the target; every remaining column is used as a feature
y = df["Diabetes_012"]
X = df.drop(columns="Diabetes_012")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified -- consider stratify=y if the class
# proportions need to be preserved exactly in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Train a decision tree using entropy as the split criterion, capped at a max depth of 5
clf_entropy_5 = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Predict on the held-out test set and report per-class metrics.
# zero_division=0 reports 0.00 for a class that is never predicted (the same
# value the default produces) without emitting UndefinedMetricWarning.
y_entropy_5 = clf_entropy_5.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_entropy_5, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.84      0.98      0.91     38116
         1.0       0.00      0.00      0.00       906
         2.0       0.55      0.13      0.21      6935

    accuracy                           0.83     45957
   macro avg       0.47      0.37      0.37     45957
weighted avg       0.78      0.83      0.79     45957



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Train a decision tree using gini as the split criterion, capped at a max depth of 5
clf_gini_5 = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Predict on the held-out test set and report per-class metrics.
# zero_division=0 reports 0.00 for a class that is never predicted (the same
# value the default produces) without emitting UndefinedMetricWarning.
y_gini_5 = clf_gini_5.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_gini_5, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.84      0.99      0.91     38116
         1.0       0.00      0.00      0.00       906
         2.0       0.57      0.12      0.19      6935

    accuracy                           0.83     45957
   macro avg       0.47      0.37      0.37     45957
weighted avg       0.78      0.83      0.78     45957



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


With a max tree depth of 5, neither criterion (entropy or gini) predicts any class 1 (prediabetes) samples: precision and recall for that class are both 0.00. Let's try the following:
1. No max depth set
2. Max depth of 50

In [18]:
# Fit an entropy-based decision tree with no max_depth set.
# NOTE(review): this rebinds clf_entropy_5 / y_entropy_5, so the "_5" suffix
# no longer describes the depth of the model held in these variables.
clf_entropy_5 = DecisionTreeClassifier(
    criterion="entropy",
    random_state=42,
).fit(X_train, y_train)

# Score the held-out test set and print per-class metrics
y_entropy_5 = clf_entropy_5.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_entropy_5))

              precision    recall  f1-score   support

         0.0       0.86      0.84      0.85     38116
         1.0       0.03      0.04      0.04       906
         2.0       0.29      0.31      0.30      6935

    accuracy                           0.75     45957
   macro avg       0.39      0.40      0.40     45957
weighted avg       0.76      0.75      0.75     45957



In [19]:
# Fit a gini-based decision tree with no max_depth set.
# NOTE(review): this rebinds clf_gini_5 / y_gini_5, so the "_5" suffix
# no longer describes the depth of the model held in these variables.
clf_gini_5 = DecisionTreeClassifier(
    criterion="gini",
    random_state=42,
).fit(X_train, y_train)

# Score the held-out test set and print per-class metrics
y_gini_5 = clf_gini_5.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_gini_5))

              precision    recall  f1-score   support

         0.0       0.86      0.83      0.85     38116
         1.0       0.02      0.02      0.02       906
         2.0       0.28      0.32      0.30      6935

    accuracy                           0.74     45957
   macro avg       0.39      0.39      0.39     45957
weighted avg       0.76      0.74      0.75     45957



In [22]:
# Fit an entropy-based decision tree capped at a max depth of 50.
# NOTE(review): clf_entropy_5 / y_entropy_5 are rebound here; the "_5" suffix
# does not reflect the max_depth=50 used for this model.
clf_entropy_5 = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=50,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out test set and print per-class metrics
y_entropy_5 = clf_entropy_5.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_entropy_5))

              precision    recall  f1-score   support

         0.0       0.86      0.84      0.85     38116
         1.0       0.03      0.04      0.04       906
         2.0       0.29      0.31      0.30      6935

    accuracy                           0.75     45957
   macro avg       0.39      0.40      0.40     45957
weighted avg       0.76      0.75      0.75     45957



In [23]:
# Fit a gini-based decision tree capped at a max depth of 50.
# NOTE(review): clf_gini_5 / y_gini_5 are rebound here; the "_5" suffix
# does not reflect the max_depth=50 used for this model.
clf_gini_5 = DecisionTreeClassifier(
    criterion="gini",
    max_depth=50,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out test set and print per-class metrics
y_gini_5 = clf_gini_5.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_gini_5))

              precision    recall  f1-score   support

         0.0       0.86      0.83      0.85     38116
         1.0       0.02      0.02      0.02       906
         2.0       0.28      0.32      0.30      6935

    accuracy                           0.74     45957
   macro avg       0.39      0.39      0.39     45957
weighted avg       0.76      0.74      0.75     45957



Because this data set is so imbalanced, the precision, recall, and f1-score are very low for class 1 and class 2. We can try to improve these results with the following:
1. Set class_weight="balanced". This gives more weight to the under-represented (minority) classes.
2. Try controlling the complexity of the decision tree by providing a maximum depth.
3. Set min_samples_leaf, which will help the Decision Tree not overfit to the majority class (class 0, no diabetes).

In [31]:
# Fit an entropy-based decision tree with balanced class weights,
# at least 8 samples per leaf, and a max depth of 50.
# NOTE(review): clf_entropy_5 / y_entropy_5 are rebound here; the "_5" suffix
# does not reflect the max_depth=50 used for this model.
clf_entropy_5 = DecisionTreeClassifier(
    criterion="entropy",
    class_weight="balanced",
    min_samples_leaf=8,
    max_depth=50,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out test set and print per-class metrics
y_entropy_5 = clf_entropy_5.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_entropy_5))

              precision    recall  f1-score   support

         0.0       0.91      0.65      0.75     38116
         1.0       0.03      0.17      0.05       906
         2.0       0.27      0.52      0.36      6935

    accuracy                           0.62     45957
   macro avg       0.40      0.45      0.39     45957
weighted avg       0.79      0.62      0.68     45957



In [30]:
# Fit a gini-based decision tree with balanced class weights,
# at least 8 samples per leaf, and a max depth of 15.
# NOTE(review): the matching entropy run uses max_depth=50 -- confirm the
# different depth here is intentional before comparing the two reports.
clf_gini_5 = DecisionTreeClassifier(
    criterion="gini",
    class_weight="balanced",
    min_samples_leaf=8,
    max_depth=15,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out test set and print per-class metrics
y_gini_5 = clf_gini_5.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_gini_5))

              precision    recall  f1-score   support

         0.0       0.93      0.59      0.72     38116
         1.0       0.03      0.31      0.06       906
         2.0       0.30      0.54      0.38      6935

    accuracy                           0.57     45957
   macro avg       0.42      0.48      0.39     45957
weighted avg       0.82      0.57      0.66     45957

