# Imports

In [1]:
# Imports
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

# Load
This is where the prepared data is loaded in for use in training and testing of the model

In [2]:
# Load cleaned and preprocessed data
X_train = pd.read_csv("data/X_train.csv", index_col=0).astype(float)
y_train = pd.read_csv("data/y_train.csv", index_col=0).astype(int)
y_train = pd.Series(y_train.values.ravel())
y_train.index = X_train.index
y_train_binary = pd.read_csv("data/y_train_binary.csv", index_col=0).astype(int)
y_train_binary = pd.Series(y_train_binary.values.ravel())
y_train_binary.index = X_train.index
X_test = pd.read_csv("data/X_test.csv", index_col=0).astype(float)
y_test = pd.read_csv("data/y_test.csv", index_col=0).astype(int)
y_test = pd.Series(y_test.values.ravel())
y_test.index = X_test.index
y_test_binary = pd.read_csv("data/y_test_binary.csv", index_col=0).astype(int)
y_test_binary = pd.Series(y_test_binary.values.ravel())
y_test_binary.index = X_test.index

# Two Stage Decision Tree Model
This section is where the two stage decision tree model is trained and tested. First class 1 and class 2 are merged into a single class and a decision tree model is trained to predict class 0 against the combined class 1 and 2. A second model is then trained to predict class 1 against class 2. The output of these two models is then combined to create a prediction of all classes

In [3]:
#Model parameters
desired_max_depth = 5

# Train first Model: 0 vs (1+2)
first_stage_tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=desired_max_depth,
    random_state=42,
    class_weight='balanced'
)
first_stage_tree.fit(X_train, y_train_binary)

# Predict first stage
y_pred_binary = first_stage_tree.predict(X_test)

print("\nFirst Stage Classification Report (class 0 vs class 1+2):")
print(classification_report(y_test_binary, y_pred_binary))

# Stage 2: 1 vs 2
# Get original test labels with 0/1/2
y_test_full = y_test.loc[y_test_binary.index]

# Find samples predicted as diabetic
indices_pred_diabetes = np.where(y_pred_binary == 1)[0]
X_test_diabetes = X_test.iloc[indices_pred_diabetes]
y_test_diabetes = y_test_full.iloc[indices_pred_diabetes]

# Keep only class 1 and 2
# Pull records where y_test_diabetes is class 1 or class 2, along with their indexes
test_diabetes = y_test_diabetes.loc[(y_test_diabetes == 1) | (y_test_diabetes == 2)].index
X_test_diabetes = X_test_diabetes.loc[test_diabetes]
y_test_diabetes = y_test_diabetes.loc[test_diabetes]

# Prepare second-stage training data
# Get the indexes where y_train_binary == 1
diabetes_indexes = y_train_binary.loc[y_train_binary == 1].index

# Use these indexes to select records from X_train_diabetes and y_train_diabetes
X_train_diabetes = X_train.loc[diabetes_indexes]
y_train_diabetes = y_train.loc[diabetes_indexes]

second_stage_tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=desired_max_depth,
    random_state=42,
    class_weight='balanced'
)
second_stage_tree.fit(X_train_diabetes, y_train_diabetes)

# Predict second stage
y_pred_second_stage = second_stage_tree.predict(X_test_diabetes)

print("\nSecond Stage Classification Report (class 1 vs class 2):")
print(classification_report(y_test_diabetes, y_pred_second_stage))

# Reconstruct final prediction array
y_pred_final = y_pred_binary.copy()
pred_diabetes_indices = np.where(y_pred_binary == 1)[0]

# Reindex and assign
y_test_full_diabetes = y_test_full.iloc[pred_diabetes_indices]
valid_indices = pred_diabetes_indices[(y_test_full_diabetes == 1) | (y_test_full_diabetes == 2)]

for i, idx in enumerate(valid_indices):
    y_pred_final[idx] = y_pred_second_stage[i]

print("\nFinal SVM Classification Report:")
print(classification_report(y_test_full, y_pred_final))


First Stage Classification Report (class 0 vs class 1+2):
              precision    recall  f1-score   support

           0       0.93      0.62      0.74     38012
           1       0.30      0.77      0.43      7945

    accuracy                           0.64     45957
   macro avg       0.61      0.69      0.58     45957
weighted avg       0.82      0.64      0.69     45957


Second Stage Classification Report (class 1 vs class 2):
              precision    recall  f1-score   support

           1       0.13      0.35      0.19       619
           2       0.91      0.75      0.82      5517

    accuracy                           0.71      6136
   macro avg       0.52      0.55      0.51      6136
weighted avg       0.83      0.71      0.76      6136


Final SVM Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.62      0.74     38012
           1       0.01      0.23      0.03       926
           2       0.91      0.5

# Export
This section is where the final predictions of the two stage decision tree model are exported for use by the ensemble

In [4]:
# Save predictions
# --------------------------------------------
import os
os.makedirs("results", exist_ok=True)
np.save("results/y_pred_dt.npy", y_pred_final)