In [1]:
from collections import Counter
import numpy as np
import import_ipynb
from customized_DecisionTree import DecisionTree, accuracy

importing Jupyter notebook from customized_DecisionTree.ipynb
DecisionTree model accuracy = 92.1%


In [2]:
class RandomForest:
    def __init__(self, n_trees=10, min_sample_split=2, max_depth=100, n_random_features=None, mode='gini'):
        self.n_trees = n_trees
        self.min_sample_split = min_sample_split
        self.max_depth = max_depth
        self.n_random_features = n_random_features
        self.mode = mode
        
        self.trees = []
    
    def fit(self, X, y):
        self.trees = []
        
        for _ in range(self.n_trees):
            tree = DecisionTree(min_sample_split=2, max_depth=100, n_random_features=None, mode='gini')
            X_sampled, y_sampled = self._bootstrap_sample(X, y)
            tree.fit(X_sampled, y_sampled)
            self.trees.append(tree)
    
    def predict(self, X):
        trees_predictions = [tree.predict(X) for tree in self.trees]
        predictions_trees_element = np.swapaxes(trees_predictions, 0, 1) #predictions of all trees per each element
        majority_vote = [self._most_common_label(element_predictions) for element_predictions in predictions_trees_element]
        return np.array(majority_vote)
    
    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        sampled_indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[sampled_indices], y[sampled_indices]
    
    def _most_common_label(self, y):
        counter = Counter(y)
        most_common_class = counter.most_common(1)[0][0]
        return most_common_class

In [3]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [4]:
dataset = datasets.load_breast_cancer()
X, y = dataset.data, dataset.target

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [6]:
clf = RandomForest(n_trees=3, max_depth=10)
clf.fit(X_train, y_train)

In [7]:
y_predicted = clf.predict(X_test)

In [8]:
acc = accuracy(y_test, y_predicted)

In [9]:
print(f'accuracy = {acc*100:.1f}%')

accuracy = 93.9%
