In [3]:
import numpy as np
from mpitree.decision_tree import DecisionTreeClassifier

# Seed NumPy's legacy global RNG so the np.random.choice / np.random.randint
# draws used to build the synthetic dataset below are reproducible.
np.random.seed(42)

In [4]:
# Synthetic dataset dimensions: m samples, n categorical features,
# and the number of distinct target labels.
n_classes = 2
m = 10
n = 3

In [5]:
# Draw an (m, n) matrix of categorical levels "f0".."f{n-1}", then one
# integer label in [0, n_classes) per row. The order of the two draws matters
# because both consume the same seeded global RNG stream.
X = np.random.choice(["f" + str(i) for i in range(n)], size=(m, n))
y = np.random.randint(0, n_classes, size=m)

In [6]:
# Display the generated feature matrix.
X

array([['f2', 'f0', 'f2'],
       ['f2', 'f0', 'f0'],
       ['f2', 'f1', 'f2'],
       ['f2', 'f2', 'f2'],
       ['f0', 'f2', 'f1'],
       ['f0', 'f1', 'f1'],
       ['f1', 'f1', 'f0'],
       ['f0', 'f1', 'f1'],
       ['f0', 'f0', 'f0'],
       ['f2', 'f2', 'f2']], dtype='<U2')

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [8]:
# Train on the training split and print the text rendering of the learned tree.
clf = DecisionTreeClassifier().fit(X_train, y_train)
print(clf)

┌── feature_0
│  ├── feature_1 [f0]
│  │  └── class: 1 [f2]
│  │  └── class: 0 [f1]
│  │  └── class: 1 [f0]
│  └── class: 1 [f2]
│  └── class: 1 [f1]


In [9]:
# Sanity-check the output of the `predict_proba` method on the held-out rows.
proba = clf.predict_proba(X_test)

# One column per class.
assert proba.shape[1] == n_classes
# Every entry is a valid probability ...
assert (proba >= 0.0).all() and (proba <= 1.0).all()
# ... and each row sums to 1 (up to floating-point tolerance).
assert np.isclose(np.sum(proba, axis=1), 1.0).all()

In [10]:
# Score the fitted classifier on the held-out split.
# NOTE(review): presumably mean accuracy, following the scikit-learn
# convention — confirm against mpitree's implementation.
clf.score(X_test, y_test)

0.5

In [11]:
def cmp(a, b):
    """Print ``a`` and assert that its text rendering matches ``b``.

    ``a`` is rendered with ``str`` and broken into lines with
    ``splitlines``; the expected text ``b`` is split on newline characters.
    Raises ``AssertionError`` when the two line lists differ.
    """
    print(a)
    actual_lines = str(a).splitlines()
    expected_lines = b.split("\n")
    assert actual_lines == expected_lines

In [12]:
# check not fitted
from sklearn.exceptions import NotFittedError

# Rendering an unfitted estimator must raise NotFittedError. The `else`
# branch makes the cell fail loudly if no exception is raised, instead of
# passing vacuously.
try:
    a = DecisionTreeClassifier()
    str(a)
except NotFittedError as e:
    print(e)
else:
    raise AssertionError(
        "str() on an unfitted DecisionTreeClassifier did not raise NotFittedError"
    )

This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.


In [13]:
# empty dataset: fitting on a single row with zero features must be rejected
# with a ValueError. The `else` branch turns a silently successful fit into a
# test failure rather than letting the cell pass vacuously.

try:
    X, y = [[]], []
    DecisionTreeClassifier().fit(X, y)
except ValueError as e:
    print(e)
else:
    raise AssertionError("fit() accepted a dataset with zero features")

Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required.


In [14]:
# A lone training sample: the expected tree is a single leaf carrying that
# sample's label.

X = [["f0"]]
y = np.zeros(1, dtype=int)

cmp(DecisionTreeClassifier().fit(X, y), "└── class: 0")

└── class: 0


In [15]:
# Several samples that all share one label: the expected tree is again a
# single leaf for that label.

X = np.full((5, 1), "f0")
y = np.zeros(5, dtype=int)

cmp(DecisionTreeClassifier().fit(X, y), "└── class: 0")

└── class: 0


In [16]:
# Identical feature rows with conflicting labels: no split can separate them,
# so the expected tree is a single leaf predicting class 1, the label held by
# two of the three rows.

X = np.full((3, 2), ["f0", "f1"])
y = [1, 1, 0]

cmp(DecisionTreeClassifier().fit(X, y), "└── class: 1")

└── class: 1


In [17]:
# Empty partition: no training row combines feature_0 == "f0" with
# feature_1 == "f2", yet the expected tree still has a leaf for that branch.

X = [["f0", "f0"], ["f0", "f1"], ["f1", "f2"], ["f2", "f1"]]
y = [1, 0, 1, 1]

cmp(
    DecisionTreeClassifier().fit(X, y),
    "\n".join(
        [
            "┌── feature_0",
            "│  ├── feature_1 [f0]",
            "│  │  └── class: 1 [f2]",
            "│  │  └── class: 0 [f1]",
            "│  │  └── class: 1 [f0]",
            "│  └── class: 1 [f2]",
            "│  └── class: 1 [f1]",
        ]
    ),
)

┌── feature_0
│  ├── feature_1 [f0]
│  │  └── class: 1 [f2]
│  │  └── class: 0 [f1]
│  │  └── class: 1 [f0]
│  └── class: 1 [f2]
│  └── class: 1 [f1]


In [18]:
# Dataset for the depth / is_leaf consistency checks below: four rows, two
# categorical features.

X = [["f0", "f0"], ["f0", "f1"], ["f1", "f2"], ["f2", "f1"]]
y = [1, 0, 1, 1]

from collections import deque


def bfs(source):
    """Yield the nodes reachable from ``source`` in breadth-first order.

    Each node must expose a ``children`` mapping. A node's children are
    enqueued only after the node itself has been yielded and the generator is
    resumed.
    """
    pending = deque()
    pending.append(source)
    while len(pending) > 0:
        current = pending.popleft()
        yield current
        for child in current.children.values():
            pending.append(child)


clf = DecisionTreeClassifier().fit(X, y)

# Expected depth of every node seen so far, seeded with the root at depth 0.
# Keys are object identities so nodes need not be hashable. The BFS visit
# index is NOT a depth (siblings share one), so the depth is propagated from
# parent to child explicitly rather than taken from `enumerate`.
expected_depth = {id(clf.tree_): 0}

for node in bfs(clf.tree_):
    child_depth = expected_depth[id(node)] + 1
    for child in node.children.values():
        # Every child sits exactly one level below its parent.
        assert child.depth == child_depth
        expected_depth[id(child)] = child_depth
    # A node is a leaf exactly when it has no children.
    assert not node.is_leaf if node.children else node.is_leaf

In [19]:
# Effect of `max_depth` on the same four-row dataset.
X = [["f0", "f0"], ["f0", "f1"], ["f1", "f2"], ["f2", "f1"]]
y = [1, 0, 1, 1]

# max_depth=0: the expected tree is a single leaf.
cmp(DecisionTreeClassifier(max_depth=0).fit(X, y), "└── class: 1")

# max_depth=1: the expected tree has one split on feature_0 and leaves below.
cmp(
    DecisionTreeClassifier(max_depth=1).fit(X, y),
    "\n".join(
        [
            "┌── feature_0",
            "│  └── class: 1 [f2]",
            "│  └── class: 1 [f1]",
            "│  └── class: 1 [f0]",
        ]
    ),
)

└── class: 1
┌── feature_0
│  └── class: 1 [f2]
│  └── class: 1 [f1]
│  └── class: 1 [f0]


In [44]:
def split_on_level(X, y, feature_idx, level):
    """Select the rows where one feature equals ``level`` and drop that feature.

    Parameters
    ----------
    X : np.ndarray
        2-D feature matrix.
    y : np.ndarray
        Labels, one per row of ``X``.
    feature_idx : int
        Column of ``X`` to test and then remove.
    level : str
        Value the selected rows must have in column ``feature_idx``.

    Returns
    -------
    tuple of np.ndarray
        The matching rows of ``X`` without column ``feature_idx``, and the
        matching entries of ``y``.
    """
    mask = X[:, feature_idx] == level
    return np.delete(X[mask], feature_idx, axis=1), y[mask]


test_cases = [
    (np.array([["0"]]), np.array([0])),
    (np.array([["0", "1"]]), np.array([0])),
]

# The loop variables are deliberately not named `X` / `y`: rebinding those
# globals here would change the data seen by later cells that reuse them.
for X_case, y_case in test_cases:
    X_new, y_new = split_on_level(X_case, y_case, feature_idx=0, level="0")
    print(X_new, y_new)

[] [0]
[['1']] [0]


In [20]:
# Effect of `min_samples_split`. The dataset is defined here rather than
# inherited from an earlier cell, so the cell gives the same result under
# Restart & Run All regardless of what previous cells bound to `X` / `y`.
X, y = [["f0", "f0"], ["f0", "f1"], ["f1", "f2"], ["f2", "f1"]], [1, 0, 1, 1]

# min_samples_split=0: the expected tree is the fully grown one.
cmp(
    DecisionTreeClassifier(min_samples_split=0).fit(X, y),
    """\
┌── feature_0
│  ├── feature_1 [f0]
│  │  └── class: 1 [f2]
│  │  └── class: 0 [f1]
│  │  └── class: 1 [f0]
│  └── class: 1 [f2]
│  └── class: 1 [f1]""",
)

# min_samples_split=4: the expected tree has only the root split on feature_0.
cmp(
    DecisionTreeClassifier(min_samples_split=4).fit(X, y),
    """\
┌── feature_0
│  └── class: 1 [f2]
│  └── class: 1 [f1]
│  └── class: 1 [f0]""",
)

┌── feature_0
│  ├── feature_1 [f0]
│  │  └── class: 1 [f2]
│  │  └── class: 0 [f1]
│  │  └── class: 1 [f0]
│  └── class: 1 [f2]
│  └── class: 1 [f1]
┌── feature_0
│  └── class: 1 [f2]
│  └── class: 1 [f1]
│  └── class: 1 [f0]
