In [19]:
import numpy as np
from collections import Counter

In [20]:
def gini(y):
    classes = np.unique(y)
    impurity = 1.0

    for c in classes:
        p = np.sum(y == c) / len(y)
        impurity -= p  ** 2
    return impurity

In [21]:
def split_dataset(X, y, feature_idx, threshold):
    left_mask = X[: , feature_idx] < threshold
    right_mask = ~left_mask
    return X[left_mask], y[left_mask], X[right_mask], y[right_mask]

In [22]:
class TreeNode:
    def __init__(self, feature_idx=None, threshold=None, left=None, right=None, *, value=None):
        self.feature_idx = feature_idx
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    

In [31]:
class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, max_features=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.root = None

    def fit(self, X, y):
        self.n_featrues = X.shape[1]
        if not self.max_features:
            self.max_features = int(np.sqrt(self.n_featrues))
        self.root = self._build_tree(X, y)

    def _best_split(self, X, y):
        best_gini = 1.0
        best_featrue, best_threshole = None, None
        feature_idxs = np.random.choice(self.n_featrues, self.max_features, replace=False)


        for feature_idx in feature_idxs:
            thresholds = np.unique(X[:, feature_idx])
            for threshold in thresholds:
                X_left, y_left, X_right, y_right = split_dataset(X, y, feature_idx, threshold)
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gini_left = gini(y_left)
                gini_right = gini(y_right)

                weighted_gini = (len(y_left) * y_left + len(y_right) * y_right ) / len(y)

                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    best_featrue = feature_idx
                    best_thresholed = threshold
            return best_featrue, best_threshole


    def _build_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        num_classes = len(np.unique(y))
        if(depth > self.max_depth) or (num_samples < self.min_samples_split) or (num_classes == 1):
            leaf_value = self.most_common_label(y)
            return TreeNode(value=leaf_value)

        feature_idx, threshold = self._best_split(X, y)

        if feature_idx is None:
            return TreeNode(value=self.most_common_label(y))
        X_left, y_left, X_right, y_right = split_dataset(X, y, feature_idx, threshold)
        left_child = self._build_tree(X_left, y_left, depth+1)
        right_child = self._build_tree(X_right, y_right, depth + 1)
        return TreeNode(feature_idx, threshold, left_child, right_child)

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value

        if x[node.feature_idx] < node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)



        

In [27]:
class RandomForest:
    def __init__(self, n_trees=10, max_depth=None, min_samples_split=2, max_features=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            idxs = np.random.choice(len(X), len(y), replace=True)
            X_sample, y_sample = X[idxs], y[idxs]
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split, max_features=self.max_features)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)


    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.array([Counter(tree_preds[:,i]).most_common(1)[0][0] for i in range(len(X))])

In [32]:
# Sample data
from sklearn.datasets import load_iris
data = load_iris()
X, y = data.data, data.target

# Initialize Random Forest
rf = RandomForest(n_trees=5, max_depth=10, max_features=2)
rf.fit(X, y)

predictions = rf.predict(X)
print("Predictions:", predictions)

ValueError: operands could not be broadcast together with shapes (3,) (147,) 