In [2]:
import numpy as np
import sys
from pathlib import Path
from scipy import stats
import math

# Put the parent of the current working directory on sys.path so that
# imports are also searched there (presumably where the `decision_tree`
# package imported below lives -- confirm against the repo layout).
file = Path.cwd()
package_root_directory = file.parents[0]  # parents[0] is the immediate parent directory
sys.path.append(str(package_root_directory))

from decision_tree.decision_tree_classifier import MyDecisionTreeClassifier

In [4]:
# Scratch check: len() of a 2-D array counts entries along the first axis
# (rows), so this 2x1 array reports 2.
x = np.array([[0.], [1.]])
len(x)

2

In [2]:
def bootstrap_sample(X, y):
    """Draw a bootstrap replicate of a dataset.

    Row indices are drawn with replacement from NumPy's global random state,
    as many as there are rows in ``X``; the same indices are then applied to
    both ``X`` and ``y`` so features and labels stay aligned.

    Parameters
    ----------
    X : array with a ``shape`` attribute, indexable by an integer array
        Samples, one per entry along the first axis.
    y : array indexable by an integer array
        Labels aligned with ``X``.

    Returns
    -------
    tuple
        ``(X[idxs], y[idxs])`` for the drawn indices.
    """
    row_count = X.shape[0]
    drawn = np.random.choice(row_count, size=row_count, replace=True)
    X_boot = X[drawn]
    y_boot = y[drawn]
    return X_boot, y_boot

In [3]:
class MyRandomForestClassifier:
    """Bagging ensemble of ``MyDecisionTreeClassifier`` trees with majority voting.

    Every tree is trained on its own bootstrap sample (rows drawn with
    replacement via ``bootstrap_sample``). Predictions are combined by taking
    the most frequent label per sample across all trees.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to grow.
    min_samples_split : int, default=2
        Forwarded unchanged to every tree.
    max_depth : int, default=100
        Forwarded unchanged to every tree.
    max_features : int, 'auto' or falsy value, default='auto'
        Controls the ``max_features`` value handed to every tree.
        ``'auto'`` resolves to ``ceil(sqrt(n_features))``; a falsy value
        (``None`` or ``0``) resolves to ``n_features``; anything else is
        passed through as given.

    Attributes
    ----------
    estimators_ : list
        The fitted trees; rebuilt from scratch on every call to ``fit``.
    """

    def __init__(self, n_estimators=100, min_samples_split=2,
                 max_depth=100, max_features='auto'):
        self.n_estimators = n_estimators
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.max_features = max_features
        self.estimators_ = []

    def _resolve_max_features(self, n_features):
        """Return the ``max_features`` value to give each tree for this data.

        The constructor argument itself is never modified, so the same
        estimator can be refitted on data with a different number of columns.
        """
        if not self.max_features:
            return n_features
        if self.max_features == 'auto':
            return math.ceil(np.sqrt(n_features))
        return self.max_features

    def fit(self, X, y):
        """Grow ``n_estimators`` trees, each on its own bootstrap sample.

        Parameters
        ----------
        X : array-like, two-dimensional
            Training samples (rows) by features (columns).
        y : array-like
            One label per row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional (shape unpacking fails) or if
            ``X`` and ``y`` disagree on the number of samples.
        """
        # Coerce up front so bootstrap_sample can index with an integer array
        # even when lists (or other array-likes) are passed in.
        X = np.asarray(X)
        y = np.asarray(y)

        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y have inconsistent lengths: {n_samples} != {y.shape[0]}"
            )

        # Resolve into a local: overwriting self.max_features would freeze the
        # value computed for the first dataset and reuse it on later fits.
        max_features = self._resolve_max_features(n_features)

        self.estimators_ = []
        for _ in range(self.n_estimators):
            tree = MyDecisionTreeClassifier(
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth,
                max_features=max_features,
            )
            X_sample, y_sample = bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.estimators_.append(tree)

    def predict(self, X):
        """Predict labels by majority vote over all fitted trees.

        Parameters
        ----------
        X : array-like
            Samples to classify; handed to every tree's ``predict``.

        Returns
        -------
        numpy.ndarray
            One-dimensional array with the most frequent label per sample,
            as computed by ``scipy.stats.mode`` along the tree axis.

        Raises
        ------
        RuntimeError
            If the forest holds no fitted trees.
        """
        if not self.estimators_:
            raise RuntimeError(
                "MyRandomForestClassifier has no fitted trees; call fit() first."
            )

        X = np.asarray(X)
        # Shape (n_trees, n_samples): one row of votes per tree.
        tree_preds = np.array([tree.predict(X) for tree in self.estimators_])
        majority, _ = stats.mode(tree_preds, axis=0)
        # flatten() copes with both the keepdims (1, n) and the reduced (n,)
        # result layouts that different SciPy versions return.
        return np.asarray(majority).flatten()


In [4]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Load the breast-cancer dataset that ships with scikit-learn and pull out
# the feature matrix and target vector.
data = load_breast_cancer()
X, y = data.data, data.target

# Keep 20% of the rows aside for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Shared settings, reused by the scikit-learn comparison further down.
seed, max_depth = 42, 3

# Seed NumPy's global RNG before fitting: the bootstrap resampling draws
# from it, so this makes the run repeatable.
np.random.seed(seed)

clf = MyRandomForestClassifier(max_depth=max_depth, n_estimators=3)
clf.fit(X_train, y_train)

In [6]:
# Score the custom forest on the held-out split.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("My implementation: Accuracy =", acc)

My implementation: Accuracy = 0.9824561403508771


In [7]:
# Reference run: scikit-learn's forest with the same tree count, depth limit
# and seed as the custom model above.
sk_clf = RandomForestClassifier(
    n_estimators=3,
    max_depth=max_depth,
    random_state=seed,
).fit(X_train, y_train)  # fit() returns the estimator itself
sk_y_pred = sk_clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=sk_y_pred)
print("Sklearn implementation: Accuracy =", acc)

Sklearn implementation: Accuracy = 0.956140350877193
