In [4]:
import time
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier as skverdtc
from pmlb import fetch_data

In [12]:
class DecisionTreeClassifier:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        n_samples, n_features = X.shape
        unique_classes, class_counts = np.unique(y, return_counts=True)

        if (len(unique_classes) == 1) or (depth == self.max_depth) or (n_samples < self.min_samples_split):
            # If all samples have the same class or the tree depth limit is reached,
            # create a leaf node with the most common class
            return unique_classes[np.argmax(class_counts)]

        # Find the best split based on information gain
        best_split = self._find_best_split(X, y)

        if best_split is None:
            # If no split improves information gain, create a leaf node
            return unique_classes[np.argmax(class_counts)]

        # Create a decision node based on the best split
        feature_index, threshold, gini = best_split
        node = {}
        node["feature_index"] = feature_index
        node["threshold"] = threshold
        node["left"] = self._build_tree(X[X[:, feature_index] <= threshold], y[X[:, feature_index] <= threshold], depth + 1)
        node["right"] = self._build_tree(X[X[:, feature_index] > threshold], y[X[:, feature_index] > threshold], depth + 1)
        node["gini"] = gini
        node["num_samples"] = class_counts
        node["depth"] = depth
        return node

    def _find_best_split(self, X, y):
        n_samples, n_features = X.shape

        gini = self._calculate_gini(y) 

        best_info_gain = 0
        best_split = None

        for feature_index in range(n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                y_left = y[X[:, feature_index] <= threshold]
                y_right = y[X[:, feature_index] > threshold]

                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                p_left = len(y_left) / n_samples
                p_right = len(y_right) / n_samples
                gain = gini - (p_left * self._calculate_gini(y_left) + p_right * self._calculate_gini(y_right))

                if gain > best_info_gain:
                    best_info_gain = gain
                    best_split = (feature_index, threshold, gini)

        return best_split

    def _calculate_gini(self, y):
        _, class_counts = np.unique(y, return_counts=True)
        return 1.0 - sum((count / len(y)) ** 2 for count in class_counts)

    def predict(self, X):
        return np.array([self._predict_tree(x, self.tree) for x in X])

    def _predict_tree(self, x, tree):
        if isinstance(tree, np.int64) or isinstance(tree, np.int32):
            return tree
        feature_index, threshold, left_tree, right_tree = tree["feature_index"], tree["threshold"], tree["left"], tree["right"]
        if x[feature_index] <= threshold:
            return self._predict_tree(x, left_tree)
        else:
            return self._predict_tree(x, right_tree)

## Iris Dataset

In [13]:
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier(max_depth=2)
start_time = time.perf_counter()
clf.fit(X_train, y_train)
end_time = time.perf_counter()
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

elapsed_time_microsec = (end_time - start_time) * 1000

print("Accuracy:", accuracy)
print(f"Training Time: {elapsed_time_microsec:.2f} milliseconds")

clf2 = skverdtc()
start_time = time.perf_counter()
clf2.fit(X_train, y_train)
end_time = time.perf_counter()
y_pred = clf2.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

elapsed_time_microsec = (end_time - start_time) * 1000

print("Accuracy:", accuracy)
print(f"Training Time: {elapsed_time_microsec:.2f} milliseconds")

Accuracy: 0.9777777777777777
Training Time: 39.29 milliseconds
Accuracy: 1.0
Training Time: 2.76 milliseconds


## Breast Cancer Dataset

In [16]:
X2, y2 = fetch_data('breast_cancer', return_X_y=True, local_cache_dir='./')

X_train, X_test, y_train, y_test = train_test_split(X2, y2, random_state=10, test_size=0.2, shuffle=True)

model = DecisionTreeClassifier(max_depth=3)
start_time = time.perf_counter()
model.fit(X_train, y_train)
end_time = time.perf_counter()
predictions = model.predict(X_test)

elapsed_time_microsec = (end_time - start_time) * 1000

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print(f"Training Time: {elapsed_time_microsec:.2f} milliseconds")

clf = skverdtc()
start_time = time.perf_counter()
clf.fit(X_train, y_train)
end_time = time.perf_counter()

elapsed_time_microsec = (end_time - start_time) * 1000

# Calculate the accuracy of the model
print("Accuracy:", clf.score(X_test, y_test))
print(f"Training Time: {elapsed_time_microsec:.2f} milliseconds")

Accuracy: 0.7758620689655172
Training Time: 49.05 milliseconds
Accuracy: 0.6379310344827587
Training Time: 4.65 milliseconds


## Ann Thyroid

In [19]:
X3, y3 = fetch_data('ann_thyroid', return_X_y=True, local_cache_dir='./')

X_train, X_test, y_train, y_test = train_test_split(X3, y3, random_state=10, test_size=0.2, shuffle=True)

model = DecisionTreeClassifier(max_depth=4)
start_time = time.perf_counter()
model.fit(X_train, y_train)
end_time = time.perf_counter()
predictions = model.predict(X_test)

elapsed_time_microsec = (end_time - start_time) * 1000

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print(f"Training Time: {elapsed_time_microsec:.2f} milliseconds")

clf = skverdtc()
start_time = time.perf_counter()
clf.fit(X_train, y_train)
end_time = time.perf_counter()

elapsed_time_microsec = (end_time - start_time) * 1000

# Calculate the accuracy of the model
print("Accuracy:", clf.score(X_test, y_test))
print(f"Training Time: {elapsed_time_microsec:.2f} milliseconds")

Accuracy: 0.99375
Training Time: 806.40 milliseconds
Accuracy: 0.9972222222222222
Training Time: 10.54 milliseconds


## XGBoost (Ann Thyroid)

In [24]:
import xgboost as xgb

X4, y4 = fetch_data('ann_thyroid', return_X_y=True, local_cache_dir='./')

conditions = [y4 == 1, y4 == 2, y4 == 3]
values = [0, 1, 2]

y4 = np.select(conditions, values, default=y4)

X_train, X_test, y_train, y_test = train_test_split(X4, y4, random_state=10, test_size=0.2, shuffle=True)


dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


params = {
    "objective": "multi:softmax",  
    "num_class": len(np.unique(y4)),       
    "max_depth": 3,                  
    "learning_rate": 0.1,            
    "n_estimators": 500              
}


start_time = time.perf_counter()
model = xgb.train(params, dtrain)
end_time = time.perf_counter()

elapsed_time_millisec = (end_time - start_time) * 1000

y_pred = model.predict(dtest)
y_pred_int = y_pred.astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred_int))
print(f"Training Time: {elapsed_time_millisec:.2f} milliseconds")

Accuracy: 0.9909722222222223
Training Time: 25.43 milliseconds


Parameters: { "n_estimators" } are not used.

