In [1]:
import numpy as np

def entropy(y):
    """
    Shannon entropy, in bits, of the empirical label distribution of ``y``.

    y: array-like of class labels (np.unique flattens its input, so every
       element is counted as one label)
    returns: entropy H(y) = -sum_k p_k * log2(p_k) as a Python float;
             0.0 for empty input and for input with one distinct label
    """
    y = np.asarray(y)
    if y.size == 0:
        return 0.0

    _, counts = np.unique(y, return_counts=True)
    # np.unique only reports labels that actually occur, so every count is
    # >= 1 and every probability is > 0: log2 never sees a zero here.
    p = counts / counts.sum()

    h = -(p * np.log2(p)).sum()
    # For a pure node the sum is +0.0 and the negation above produces -0.0,
    # which prints as "-0.0". Adding +0.0 turns it into a plain 0.0 and
    # leaves every other value untouched.
    return float(h) + 0.0


In [2]:
def gini(y):
    """
    Gini impurity of the empirical label distribution of ``y``.

    y: array-like of class labels
    returns: 1 - sum_k p_k**2 as a Python float; 0.0 for empty input
    """
    labels = np.asarray(y)
    if labels.size > 0:
        # index 1 of the result is the per-label occurrence count
        class_counts = np.unique(labels, return_counts=True)[1]
        probs = class_counts / class_counts.sum()
        return float(1.0 - np.square(probs).sum())

    # no samples -> nothing to be impure about
    return 0.0


In [3]:
def information_gain(parent_y, left_y, right_y, impurity_fn):
    """
    Impurity reduction achieved by splitting ``parent_y`` into two children.

    parent_y: labels of the node before the split
    left_y, right_y: labels routed to each child
    impurity_fn: function like entropy or gini, called on a label array
    returns: impurity(parent) minus the size-weighted impurity of the
             children (weights are child size / parent size), as a float;
             0.0 when the parent or either child is empty
    """
    parent = np.asarray(parent_y)
    left = np.asarray(left_y)
    right = np.asarray(right_y)

    total = parent.size
    n_left = left.size
    n_right = right.size

    # No data, or a "split" that leaves one side empty, gains nothing.
    if total == 0 or n_left == 0 or n_right == 0:
        return 0.0

    impurity_before = impurity_fn(parent)
    weighted_left = (n_left / total) * impurity_fn(left)
    weighted_right = (n_right / total) * impurity_fn(right)
    return float(impurity_before - (weighted_left + weighted_right))


In [4]:
def best_split(X, y, impurity_fn):
    """
    Exhaustive search for the single-feature threshold split with the
    highest information gain.

    X: (n_samples, n_features)
    y: (n_samples,)
    impurity_fn: function like entropy or gini, forwarded to information_gain
    returns: best_feature, best_threshold, best_gain
             Samples with X[:, best_feature] <= best_threshold go left.
             (None, None, 0.0) when there are fewer than two samples or no
             candidate has strictly positive gain. On ties the first
             candidate found (lowest feature index, then lowest threshold)
             is kept.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples, n_features = X.shape
    if n_samples <= 1:
        return None, None, 0.0

    best_gain = 0.0
    best_feature = None
    best_threshold = None

    for j in range(n_features):
        xj = X[:, j]
        vals = np.unique(xj)
        if vals.size <= 1:
            # constant feature: nothing to split on
            continue

        # candidate thresholds: midpoints between consecutive distinct values.
        # Halve each value *before* adding: adding first is done in the
        # feature's own dtype, so narrow integer columns (e.g. uint8 200 + 250)
        # wrap around silently and produce a threshold outside the data.
        thresholds = vals[:-1] / 2.0 + vals[1:] / 2.0

        for t in thresholds:
            left_mask = xj <= t
            right_mask = ~left_mask

            # A midpoint can round onto one of its neighbours (or be NaN),
            # leaving one side empty; such candidates are not real splits.
            if left_mask.sum() == 0 or right_mask.sum() == 0:
                continue

            gain = information_gain(y, y[left_mask], y[right_mask], impurity_fn)
            if gain > best_gain:
                best_gain = gain
                best_feature = j
                best_threshold = t

    return best_feature, best_threshold, float(best_gain)


In [None]:
import numpy as np
# Sanity checks on a balanced two-class toy problem. The asserts make the
# expected values enforced instead of only printed for eyeballing.
y = np.array([0, 0, 1, 1])
print("entropy:", entropy(y))   # should be 1.0
print("gini:", gini(y))         # should be 0.5
assert np.isclose(entropy(y), 1.0), "entropy of a 50/50 label split should be 1 bit"
assert np.isclose(gini(y), 0.5), "gini of a 50/50 label split should be 0.5"

X = np.array([[0],[1],[2],[3]])
feat, thr, gain = best_split(X, y, entropy)
print("best_split:", feat, thr, gain)
# Labels change between x=1 and x=2, so the midpoint 1.5 separates them perfectly.
assert feat == 0 and np.isclose(thr, 1.5) and np.isclose(gain, 1.0), "expected a perfect split at 1.5"


entropy: 1.0
gini: 0.5
best_split: 0 1.5 1.0
