In [4]:
import numpy as np

# Toy dataset: 20 samples (rows) by 4 numeric features (columns).
X = np.array([
    [0.25, 0.75, 0.10, 0.90],
    [0.10, 0.30, 0.80, 0.40],
    [0.70, 0.50, 0.20, 0.60],
    [0.90, 0.20, 0.30, 0.10],
    [0.40, 0.60, 0.70, 0.30],
    [0.60, 0.90, 0.80, 0.20],
    [0.30, 0.40, 0.50, 0.70],
    [0.80, 0.10, 0.60, 0.50],
    [0.20, 0.70, 0.90, 0.40],
    [0.50, 0.80, 0.40, 0.60],
    [0.90, 0.30, 0.10, 0.70],
    [0.10, 0.20, 0.70, 0.80],
    [0.70, 0.60, 0.50, 0.30],
    [0.40, 0.90, 0.30, 0.20],
    [0.80, 0.50, 0.60, 0.10],
    [0.30, 0.70, 0.40, 0.90],
    [0.60, 0.40, 0.90, 0.50],
    [0.20, 0.10, 0.80, 0.70],
    [0.50, 0.30, 0.60, 0.40],
    [0.90, 0.70, 0.20, 0.30],
])

# Binary class label for each row of X, in row order (five labels per line).
y = np.array([
    0, 0, 1, 1, 0,
    1, 1, 0, 1, 0,
    1, 0, 1, 1, 0,
    1, 0, 1, 0, 1,
])


In [49]:
from scipy import stats
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.linear_model import LogisticRegression

class RandomForest:
    """Bagging-style ensemble: each base model sees a random subset of rows and features.

    The row/feature subsets are drawn once, in ``__init__``, from the ``X`` and
    ``y`` passed to the constructor. ``fit`` then trains every model on its own
    stored subset, and ``predict`` combines the per-model predictions by
    majority vote.

    Parameters
    ----------
    n_trees : int
        Number of base models in the ensemble.
    X : np.ndarray
        2-D training matrix (rows are records, columns are features).
    y : np.ndarray
        1-D array of labels aligned with the rows of ``X``.
    base_model : callable
        Zero-argument factory returning an estimator with ``fit``/``predict``
        (defaults to ``DecisionTree``).
    sample_percentage : float
        Fraction of the rows of ``X`` drawn (with replacement) for each model.
    feature_percentage : float
        Fraction of the columns of ``X`` drawn (without replacement) for each
        model. At least one feature is always kept.
    """

    def __init__(self, n_trees, X, y,  base_model = DecisionTree, sample_percentage = 0.4, feature_percentage = 0.5):
        self.trees = [base_model() for _ in range(n_trees)]
        # The percentages must be set before sampling: _bootstrap_sample reads them.
        self.sample_percentage = sample_percentage
        self.feature_percentage = feature_percentage
        # One (sample_X, sample_y, feature_indices) tuple per model, same order as self.trees.
        self.subsets = [self._bootstrap_sample(X, y) for _ in range(n_trees)]

    def _bootstrap_sample(self, X, y):
        """Draw one random training subset.

        Returns
        -------
        tuple
            ``(sample_X, sample_y, feature_indices)`` where ``sample_X`` holds
            the sampled rows restricted to the sampled columns, ``sample_y``
            holds the matching labels, and ``feature_indices`` lists the
            columns of ``X`` that were kept.
        """
        # Rows: sampled with replacement, so a record may appear more than once.
        sample_size = int(len(X) * self.sample_percentage)
        row_indices = np.random.choice(len(X), size=sample_size, replace=True)

        # Columns: shuffle all indices and keep a prefix, i.e. sample without
        # replacement. The count follows feature_percentage (previously it was
        # hard-coded to 2, which silently ignored the parameter) and is clamped
        # to at least one so a model never receives zero features.
        n_features = X.shape[1]
        n_selected = max(1, int(n_features * self.feature_percentage))
        feature_indices = np.arange(n_features)
        np.random.shuffle(feature_indices)
        feature_indices_sample = feature_indices[:n_selected]

        sample_X = X[row_indices][:, feature_indices_sample]
        sample_y = y[row_indices]
        return (sample_X, sample_y, feature_indices_sample)

    def fit(self, X, y):
        """Train every base model on the subset drawn for it in ``__init__``.

        NOTE: the ``X`` and ``y`` arguments are accepted but not used; the
        training data is whatever was passed to the constructor.
        """
        for tree, (sample_X, sample_y, _) in zip(self.trees, self.subsets):
            tree.fit(sample_X, sample_y)

    def predict(self, X): # X must have the same columns as the constructor's X.
        """Predict with every base model and combine the results by majority vote.

        Returns
        -------
        tuple
            ``(predictions, final_prediction)``. ``predictions`` is a list with
            one prediction array per base model. ``final_prediction`` is a list
            with one entry per row of ``X``; each entry is an array holding the
            label(s) that received the most votes. A tie therefore yields more
            than one label (in ascending order), otherwise the array has a
            single element.
        """
        # Step 1: each model predicts using only the columns it was trained on.
        predictions = []
        for tree, (_, _, feature_indices) in zip(self.trees, self.subsets):
            predictions.append(tree.predict(X[:, feature_indices]))

        # Step 2: vote record by record.
        final_prediction = []
        for record_idx in range(X.shape[0]):
            # Collect every model's prediction for this record.
            opinions = [pred[record_idx] for pred in predictions]
            unique_values, counts = np.unique(opinions, return_counts=True)
            # Keep every label that reaches the top vote count (ties keep all).
            final_opinion = unique_values[counts == np.max(counts)]
            final_prediction.append(final_opinion)

        return predictions, final_prediction
        


In [50]:
from sklearn.svm import SVC
# Three 7-member ensembles that differ only in the base estimator.
lr_rf = RandomForest(7, X, y, base_model = LogisticRegression)
# The "real" random forest is the one built from decision trees. It was
# previously constructed with LogisticRegression, which made it an exact
# duplicate of lr_rf.
real_rf = RandomForest(7, X, y, base_model = DecisionTree)
svm_rf = RandomForest(7, X, y, base_model = SVC)