In [43]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from collections import Counter
import import_ipynb
from decision_tree import DecisionTree

In [44]:
class RandomForestClassifier:
    """Bagged ensemble of ``DecisionTree`` models combined by majority vote.

    Every tree is trained on its own bootstrap resample of the training rows,
    and the forest predicts the label that most trees agree on.

    Parameters
    ----------
    maxDepth : int, default 10
        Forwarded unchanged to each ``DecisionTree``.
    minSamples : int, default 2
        Forwarded unchanged to each ``DecisionTree``.
    numTrees : int, default 10
        Number of trees built by ``fit``.
    numFeatures : int, default 5
        Forwarded unchanged to each ``DecisionTree``; intended as the size of
        the random feature subset tried at each node creation step (that logic
        lives in ``DecisionTree`` - confirm there).
    """

    def __init__(self,
                 maxDepth = 10,
                 minSamples = 2,
                 numTrees = 10,
                 numFeatures = 5
                ):

        self.maxDepth = maxDepth
        self.minSamples = minSamples
        self.numTrees = numTrees
        self.numFeatures = numFeatures
        # fitted DecisionTree instances; empty until fit() succeeds
        self.trees = []


    def fit(self, X, y):
        """Train ``numTrees`` trees, each on a fresh bootstrap sample of ``(X, y)``.

        Calling ``fit`` again replaces the previous forest instead of adding
        more trees to it.

        Parameters
        ----------
        X : array-like, one row per observation
        y : array-like, one label per row of ``X``

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of rows.
        """
        # integer-array indexing below needs ndarrays (lists / DataFrames would misbehave)
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same number of rows, got %d and %d"
                % (len(X), len(y))
            )

        # build into a local list and publish it at the end, so a refit never
        # mixes trees from an earlier fit and a failed fit leaves the old forest intact
        trees = []
        for _ in range(self.numTrees):
            dTree = DecisionTree(maxDepth = self.maxDepth,
                                 minSamples = self.minSamples,
                                 numFeatures = self.numFeatures
                                )

            # for each tree, use a newly created bootstrapped dataset
            sample = self.__bootstrap(X)

            dTree.fit(X[sample], y[sample])
            trees.append(dTree)

        self.trees = trees

    def predict(self, X):
        """Return the majority-vote label for every row of ``X`` as a list.

        Ties are resolved in favour of the label seen first among the trees'
        votes for that row (the order of ``self.trees``).

        Raises
        ------
        ValueError
            If the forest has no trees, i.e. ``fit`` has not been called.
        """
        if not self.trees:
            raise ValueError(
                "RandomForestClassifier has no fitted trees; call fit() before predict()"
            )

        X = np.asarray(X)
        # stacking the per-tree outputs gives (trees, observations); swap the axes
        # so that each row holds all the votes for a single observation
        votes = np.swapaxes([tree.predict(X) for tree in self.trees], 0, 1)
        return [Counter(rowVotes).most_common(1)[0][0] for rowVotes in votes]


    def __bootstrap(self, X):
        """Return row indices sampled with replacement, as many as ``X`` has rows.

        Draws from NumPy's global random state.
        """
        numRows = X.shape[0]
        return np.random.choice(numRows, numRows, replace=True)

In [45]:
# Load the breast-cancer dataset and hold out 20% of the rows for evaluation.
data = datasets.load_breast_cancer()
X = data.data
y = data.target
# fixed random_state keeps the train/test partition identical between runs
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [46]:
# Train a forest with the default hyper-parameters on the training split.
# NOTE(review): bootstrap sampling draws from NumPy's global RNG and no seed is
# set in this notebook, so the fitted forest can differ from run to run.
model = RandomForestClassifier()
model.fit(X_train, Y_train)

In [47]:
# Majority-vote class predictions for the held-out test rows.
predictions = model.predict(X_test)

In [48]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(Y_test, predictions)

0.9122807017543859