In [1]:
import random

import numpy as np

from dt_class import DecisionTree

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
# Shared seed: passed as `random_state` to make_classification and train_test_split below.
RANDOM_STATE = 14

In [3]:
def insert_null_values(X, feat_count, null_ratio, random_state=None):
    """Overwrite randomly chosen rows with NaN in the first ``feat_count`` columns of ``X``.

    ``X`` is modified in place and nothing is returned. Row indexes are drawn
    independently for every column, so different columns generally get
    different NaN positions.

    Parameters
    ----------
    X : numpy.ndarray
        Floating-point array indexed as ``X[row, column]``.
    feat_count : int
        Number of leading columns (``0 .. feat_count - 1``) to receive NaNs.
    null_ratio : float
        Fraction of rows to blank in each of those columns; the per-column
        count is ``round(X.shape[0] * null_ratio)``.
    random_state : int or None, optional
        Seed for a private ``random.Random`` generator. With the default
        ``None`` the global ``random`` module state is used, exactly as
        before this parameter existed.

    Raises
    ------
    IndexError
        If ``feat_count`` exceeds the number of columns in ``X``.
    ValueError
        If the resulting per-column NaN count is negative or larger than the
        number of rows.
    """
    n_rows = X.shape[0]
    null_count = round(n_rows * null_ratio)

    # Validate before touching X so a bad argument never leaves it half-modified.
    if feat_count > X.shape[1]:
        raise IndexError(
            f"feat_count={feat_count} exceeds the number of columns ({X.shape[1]})"
        )
    if not 0 <= null_count <= n_rows:
        raise ValueError(
            f"null_ratio={null_ratio} gives {null_count} rows, outside [0, {n_rows}]"
        )

    # A dedicated generator keeps seeded calls reproducible without
    # disturbing the global `random` state used elsewhere.
    rng = random if random_state is None else random.Random(random_state)

    for feat in range(feat_count):
        null_indexes = rng.sample(range(n_rows), null_count)
        X[null_indexes, feat] = np.nan

In [4]:
# Entropy-criterion tree capped at depth 5; verbose=True makes the class print its progress.
tree = DecisionTree(
    criterion='entropy',
    max_depth=5,
    verbose=True,
)

DecisionTree class set params:
{'max_depth': 5, 'min_samples_split': 2, 'criterion': 'entropy', 'verbose': True}


In [5]:
# Synthetic dataset: 1000 samples, 30 features (5 of them redundant), seeded by RANDOM_STATE.
X, y = make_classification(n_features=30, n_redundant=5, n_samples=1000, random_state=RANDOM_STATE)

# insert null values
# insert_null_values draws row indexes from the stdlib `random` module, which the
# `random_state=` arguments in this cell do not control. Seed it here so the NaN
# positions (and every result derived from them) survive Restart & Run All.
random.seed(RANDOM_STATE)
insert_null_values(X, 5, 0.2)

# Hold out 5% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=RANDOM_STATE)

In [6]:
%%time

# Train on the training split. The tree was built with verbose=True, so the
# node/leaf trace is printed below; the value displayed at the end is what fit() returns.
tree.fit(X_train, y_train)

node (depth=1 n_objects=950) division: feature_idx = 21 threshold = 0.11
node (depth=2 n_objects=546) division: feature_idx = 16 threshold = 0.11
node (depth=3 n_objects=511) division: feature_idx = 27 threshold = 0.12
node (depth=4 n_objects=78) division: feature_idx = 23 threshold = -1.63
create leaf (depth=5 n_objects=43): value=0 labels_ratio=[0.79 0.21]
create leaf (depth=5 n_objects=35): value=1 labels_ratio=[0.34 0.66]
node (depth=4 n_objects=433) division: feature_idx = 16 threshold = -0.36
create leaf (depth=5 n_objects=183): value=0 labels_ratio=[0.98 0.02]
create leaf (depth=5 n_objects=250): value=0 labels_ratio=[0.84 0.16]
create leaf (depth=3 n_objects=35): value=1 labels_ratio=[0. 1.]
node (depth=2 n_objects=404) division: feature_idx = 16 threshold = 0.21
node (depth=3 n_objects=167) division: feature_idx = 16 threshold = -0.28
node (depth=4 n_objects=14) division: feature_idx = 21 threshold = 0.12
create leaf (depth=5 n_objects=3): value=1 labels_ratio=[0. 1.]
create l

DecisionTree(criterion='entropy', max_depth=5, min_samples_split=2,
       verbose=True)

In [7]:
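# Optional profiling of the fit step, left disabled; uncomment one line to run it.
# First variant limits the report to entries matching dt_class.py, second prints the full report.
# %prun -l dt_class.py tree.fit(X_train, y_train)
# %prun tree.fit(X_train, y_train)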

In [8]:
# Predicted labels for the held-out split; show only the first ten.
test_labels = tree.predict(X_test)
test_labels[:10]

array([1, 0, 0, 0, 0, 0, 1, 1, 0, 1])

In [9]:
# Per-class probabilities for the held-out split; show only the first ten rows.
test_proba = tree.predict_proba(X_test)
test_proba[:10]

array([[0.01075269, 0.98924731],
       [0.98360656, 0.01639344],
       [0.98360656, 0.01639344],
       [0.98360656, 0.01639344],
       [0.98360656, 0.01639344],
       [0.844     , 0.156     ],
       [0.        , 1.        ],
       [0.38333333, 0.61666667],
       [0.844     , 0.156     ],
       [0.        , 1.        ]])

In [10]:
# ROC AUC on the held-out split, scored on the second probability column.
test_scores = tree.predict_proba(X_test)[:, 1]
round(roc_auc_score(y_test, test_scores), 2)

0.9

In [11]:
# ROC AUC on the training split, for comparison with the held-out score.
train_scores = tree.predict_proba(X_train)[:, 1]
round(roc_auc_score(y_train, train_scores), 2)

0.96