In [1]:
import numpy as np
import pandas as pd

from sktree.decision_tree import DecisionTreeClassifier

# from sklearn.tree import DecisionTreeClassifier as DTC

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset as pandas objects (as_frame=True) and split it into
# train and test partitions, holding out 20% of the rows (test_size=0.2).
# Only the two sepal measurements are kept as features; random_state pins the
# shuffle so the split is the same on every run.
# NOTE(review): in the visible part of this notebook X_train/X_test/y_train/
# y_test are only referenced from commented-out cells further down.
iris = load_iris(as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    iris.data[["sepal length (cm)", "sepal width (cm)"]],
    iris.target,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Toy vegetation dataset in which every feature is categorical. Each tuple is
# one sample, written in column order (Stream, Slope, Elevation, Vegetation).
data = pd.DataFrame(
    [
        ("false", "steep", "high", "chapparal"),
        ("true", "moderate", "low", "riparian"),
        ("true", "steep", "medium", "riparian"),
        ("false", "steep", "medium", "chapparal"),
        ("false", "flat", "high", "conifer"),
        ("true", "steep", "highest", "conifer"),
        ("true", "steep", "high", "chapparal"),
    ],
    columns=["Stream", "Slope", "Elevation", "Vegetation"],
)

# Every column except the last is a feature; the last column is the target.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [4]:
# Fit the project's DecisionTreeClassifier (default arguments) on the
# all-categorical toy data and print the fitted estimator. The recorded output
# below shows that printing it renders the learned tree as text.
clf = DecisionTreeClassifier().fit(X, y)
print(clf)

┌── Elevation
│  ├── Stream [medium]
│  │  └── class: chapparal [false]
│  │  └── class: riparian [true]
│  ├── Slope [high]
│  │  └── class: conifer [flat]
│  │  └── class: chapparal [moderate]
│  │  └── class: chapparal [steep]
│  └── class: conifer [highest]
│  └── class: riparian [low]


In [5]:
# Sanity check on the training rows: print the predicted labels, then let the
# cell display the predict_proba result rounded to 3 decimals.
# NOTE(review): the recorded output is a flat array with one value per row
# rather than one row of per-class probabilities — confirm this is the
# intended predict_proba contract.
print(clf.predict(X))
clf.predict_proba(X).round(3)

['chapparal' 'riparian' 'riparian' 'chapparal' 'conifer' 'conifer'
 'chapparal']


array([1., 1., 1., 1., 1., 1., 1.])

In [6]:
# Same toy vegetation dataset, but with Elevation given as an integer so the
# classifier has one numeric feature to threshold on. Each tuple is one
# sample, written in column order (Stream, Slope, Elevation, Vegetation).
data = pd.DataFrame(
    [
        ("false", "steep", 3900, "chapparal"),
        ("true", "moderate", 300, "riparian"),
        ("true", "steep", 1500, "riparian"),
        ("false", "steep", 1200, "chapparal"),
        ("false", "flat", 4450, "conifer"),
        ("true", "steep", 5000, "conifer"),
        ("true", "steep", 3000, "chapparal"),
    ],
    columns=["Stream", "Slope", "Elevation", "Vegetation"],
)

# Every column except the last is a feature; the last column is the target.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [7]:
# Refit the classifier on the frame whose Elevation column is numeric and
# print the resulting tree.
# NOTE(review): the recorded output below begins with raw tuple/array dumps
# before the tree rendering; these look like leftover debug prints inside the
# classifier — confirm and remove them at the source.
clf = DecisionTreeClassifier().fit(X, y)
print(clf)

[(array([['false', 'steep'],
       ['true', 'moderate'],
       ['true', 'steep'],
       ['false', 'steep'],
       ['true', 'steep']], dtype=object), array(['chapparal', 'riparian', 'riparian', 'chapparal', 'chapparal'],
      dtype=object), 'True'), (array([['false', 'flat'],
       ['true', 'steep']], dtype=object), array(['chapparal', 'riparian', 'riparian', 'chapparal', 'chapparal'],
      dtype=object), 'False')]
[(array([['false', 'steep'],
       ['true', 'moderate'],
       ['true', 'steep'],
       ['false', 'steep'],
       ['true', 'steep']], dtype=object), array(['chapparal', 'riparian', 'riparian', 'chapparal', 'chapparal'],
      dtype=object), 'True'), (array([['false', 'flat'],
       ['true', 'steep']], dtype=object), array(['chapparal', 'riparian', 'riparian', 'chapparal', 'chapparal'],
      dtype=object), 'False')]
┌── Elevation
│  ├── Stream [< 4450.00]
│  │  ├── Stream [true]
│  │  │  └── class: riparian [false]
│  │  │  └── class: riparian [true]
│  │  └── cla

In [8]:
# Convert the pandas objects to plain NumPy arrays for the manual split search
# below. np.asarray is used instead of .to_numpy() so that re-running this
# cell, when X and y are already ndarrays, is a no-op instead of raising
# AttributeError ('numpy.ndarray' object has no attribute 'to_numpy').
X = np.asarray(X)
y = np.asarray(y)


def is_numeric_dtype(arr):
    try:
        arr.astype(np.float64)
        return True
    except ValueError:
        return False


# Column of X whose values are tried as split thresholds below. With the frame
# built earlier in this notebook (Stream, Slope, Elevation), index 2 is the
# numeric "Elevation" column.
split_feature_idx = 2


def proba(X):
    return np.unique(X, return_counts=True)[1] / len(X)


def _entropy(y):
    proba = np.proba(y)
    return -np.sum(proba * np.log2(proba))


def split_mask(X, mask):
    """Partition ``X`` with ``mask`` into the selected and the remaining part.

    Returns a two-element list: ``X`` indexed by ``mask`` first, then ``X``
    indexed by the inverted mask.
    """
    selected = X[mask]
    remaining = X[~mask]
    return [selected, remaining]


# Attach split_mask to the imported numpy module object so that it can be
# reached as np.split_mask.
# NOTE(review): this monkey-patches a third-party module for the whole kernel
# session; calling split_mask directly avoids the global side effect.
np.split_mask = split_mask


def cost(x):
    """Return the weighted entropy of splitting the data at row ``x``'s value.

    The candidate threshold is the value of feature ``split_feature_idx`` in
    ``x``. Samples whose value for that feature is strictly below the
    threshold go to one side and the rest to the other; the result is the
    size-weighted average of the two sides' label entropy.

    Reads the module-level ``X``, ``y`` and ``split_feature_idx``.

    Parameters
    ----------
    x : numpy.ndarray
        One row of ``X``.

    Returns
    -------
    numpy.float64
        Weighted impurity of the split (lower is better).
    """
    # Read the threshold from the same column the mask is built on. The
    # previous `x[-1]` was only right while the split feature happened to be
    # the last column.
    threshold = x[split_feature_idx]
    mask = X[:, split_feature_idx] < threshold
    # Only the labels on each side are needed. split_mask is called directly
    # rather than through the np.split_mask monkey-patch.
    levels = split_mask(y, mask)
    weights = [len(level) / len(y) for level in levels]
    impurity = [_entropy(level) for level in levels]
    return np.dot(weights, impurity)


# Exhaustive threshold search on the chosen feature: every row's value is
# tried as a threshold via cost(), and the one with the lowest weighted
# entropy wins. Non-numeric columns are skipped.
if is_numeric_dtype(X[:, split_feature_idx]):
    costs = np.apply_along_axis(cost, axis=1, arr=X)
    idx = np.argmin(costs)
    # Reuse the argmin position instead of rescanning with builtin min().
    min_cost = costs[idx]
    # Read the winning threshold from the split column itself; the previous
    # `X[idx, -1]` silently assumed the split feature is the last column.
    t_hat = X[idx, split_feature_idx]

    print(t_hat)  # best threshold
    print(min_cost)  # weighted entropy after the split
    print(_entropy(y) - min_cost)  # information gain of the split

4450
0.6935361388961918
0.863120568566631


In [9]:
# from sklearn.utils.estimator_checks import check_estimator
# check_estimator(DecisionTreeClassifier())

In [10]:
# clf = DecisionTreeClassifier(criterion={"max_depth": 2}).fit(X_train, y_train)
# print(clf)

# train_score, test_score = clf.score(X_train, y_train), clf.score(X_test, y_test)

# print(f"\nTrain-Test Accuracy: ({train_score:.2%}, {test_score:.2%})")
# print(f"\nPredict Probability: {clf.predict_prob(X_test.iloc[3]).round(3)}")

In [11]:
# from matplotlib.colors import ListedColormap
# import matplotlib.patches as mpatches
# import matplotlib.pyplot as plt  # required by the plt.figure() / plt.show() calls below

# step size in the mesh
# h = 0.02

# a = "#4993c3"
# b = "#e73031"
# c = "#e3c471"

# aa = "#0000ec"
# bb = "#ff0000"
# cc = "#bfbf00"

# # Create color maps
# cmap_light = ListedColormap([a, b, c])
# cmap_bold = ListedColormap([aa, bb, cc])

# fig = plt.figure()
# gs = fig.add_gridspec(2, 2, hspace=0, wspace=0)
# (ax1, ax2), (ax3, ax4) = gs.subplots(sharex="col", sharey="row")

# depths = [1, 3, 5, np.inf]
# coor = [ax1, ax2, ax3, ax4]

# for depth, ax in zip(depths, coor):
#     clf = DecisionTreeClassifier(criterion={"max_depth": depth}).fit(X_train, y_train)

#     x_min, x_max = (
#         X_train.values[:, 0].min() - 1,
#         X_train.values[:, 0].max() + 1,
#     )
#     y_min, y_max = (
#         X_train.values[:, 1].min() - 1,
#         X_train.values[:, 1].max() + 1,
#     )

#     xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
#     X_t = pd.DataFrame(
#         np.c_[xx.ravel(), yy.ravel()], columns=["sepal length (cm)", "sepal width (cm)"]
#     )

#     Z = np.array([clf.predict(X_t.iloc[x]) for x in range(len(X_t))])
#     Z = Z.reshape(xx.shape)

#     ax.pcolormesh(xx, yy, Z, cmap=cmap_light)

#     ax.scatter(
#         X_train.values[:, 0],
#         X_train.values[:, 1],
#         c=y_train.values,
#         cmap=cmap_bold,
#         s=15,
#         edgecolors="black",
#         label=f"Depth={depth}",
#     )

#     ax.set_xlim(xx.min(), xx.max())
#     ax.set_ylim(yy.min(), yy.max())

#     if depth == 3:
#         patches = [
#             mpatches.Patch(color=aa, label="iris setosa"),
#             mpatches.Patch(color=bb, label="iris versicolor"),
#             mpatches.Patch(color=cc, label="iris virginica"),
#         ]
#         handles, labels = ax.get_legend_handles_labels()
#         ax.legend(handles=patches, prop={"size": 6})

# fig.supxlabel("sepal length (cm)")
# fig.supylabel("sepal width (cm)")

# plt.show()