In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
import random
from scipy import stats
from sklearn.metrics import classification_report, accuracy_score

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)

In [3]:
class RandomForest:
    
    def __init__(self, B, boostrap_ratio=1):
        self.B = B
        self.boostrap_ratio = boostrap_ratio
        
    def fit(self, X, y):
        
        m, n = X.shape
        tree_params = {'max_depth' :2, 'criterion':'gini', 'min_samples_split':5}
        self.models = [DecisionTreeClassifier(**tree_params) for _ in range(self.B)]
        
        sample_size = int(self.boostrap_ratio * m)
        
        xsamples = np.zeros((self.B, sample_size, n))
        ysamples = np.zeros((self.B, sample_size))
        
        xsamples_oob = []
        ysamples_oob = []
        
        for i in range(self.B):
            used_idx = []
            oob_idx = []
            for j in range(sample_size):
                idx = random.randrange(m)
                while idx in used_idx:
                    idx = random.randrange(m)
                
                xsamples[i, j, :] = X[idx]
                ysamples[i, j] = y[idx]
                
                used_idx.append(idx)
                oob_idx.append(idx)
                mask = np.zeros((m), dtype=bool)
                mask[oob_idx] = True
                
                xsamples_oob.append(X[~mask])
                ysamples_oob.append(y[~mask])
        
        score = 0
        for i, model in enumerate(self.models):
            _X = xsamples[i, :]
            _y = ysamples[i, :]
            model.fit(_X, _y)
            
            X_oob = np.asarray(xsamples_oob[i])
            y_oob = np.asarray(ysamples_oob[i])
            yhat = model.predict(X_oob)
            score += accuracy_score(y_oob, yhat)
            print(f"Tree {i}", accuracy_score(y_oob, yhat))
        avg_score = score / len(self.models)
        print('avg oob score : ',avg_score)
                
    def predict(self, X):
        predictions = np.zeros((self.B, X.shape[0]))
        for i, model in enumerate(self.models):
            yhat = model.predict(X)
            predictions[i, :] = yhat
        
        yhat = stats.mode(predictions)[0][0]
        return yhat

In [4]:
RF = RandomForest(5, 0.8)
RF.fit(X_train, y_train)
yhat = RF.predict(X_test)
print(classification_report(y_test, yhat))

Tree 0 0.9230769230769231
Tree 1 0.9514563106796117
Tree 2 0.9509803921568627
Tree 3 0.9207920792079208
Tree 4 0.93
avg oob score :  0.9352611410242636
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.93      1.00      0.96        13
           2       1.00      0.92      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45



In [5]:
class RandomForest:
    
    def __init__(self, B, boostrap_ratio=1):
        self.B = B
        self.boostrap_ratio = boostrap_ratio
        
    def fit(self, X, y):
        
        m, n = X.shape
        tree_params = {'max_depth' :2, 'criterion':'gini', 'min_samples_split':5, 'max_features':'sqrt'}
        self.models = [DecisionTreeClassifier(**tree_params) for _ in range(self.B)]
        
        sample_size = int(self.boostrap_ratio * m)
        
        xsamples = np.zeros((self.B, sample_size, n))
        ysamples = np.zeros((self.B, sample_size))
        
        xsamples_oob = []
        ysamples_oob = []
        
        for i in range(self.B):
            used_idx = []
            oob_idx = []
            for j in range(sample_size):
                idx = random.randrange(m)
                while idx in used_idx:
                    idx = random.randrange(m)
                
                xsamples[i, j, :] = X[idx]
                ysamples[i, j] = y[idx]
                
                used_idx.append(idx)
                oob_idx.append(idx)
                mask = np.zeros((m), dtype=bool)
                mask[oob_idx] = True
                
                xsamples_oob.append(X[~mask])
                ysamples_oob.append(y[~mask])
        
        score = 0
        for i, model in enumerate(self.models):
            _X = xsamples[i, :]
            _y = ysamples[i, :]
            model.fit(_X, _y)
            
            X_oob = np.asarray(xsamples_oob[i])
            y_oob = np.asarray(ysamples_oob[i])
            yhat = model.predict(X_oob)
            score += accuracy_score(y_oob, yhat)
            print(f"Tree {i}", accuracy_score(y_oob, yhat))
        avg_score = score / len(self.models)
        print('avg oob score : ',avg_score)
                
    def predict(self, X):
        predictions = np.zeros((self.B, X.shape[0]))
        for i, model in enumerate(self.models):
            yhat = model.predict(X)
            predictions[i, :] = yhat
        
        yhat = stats.mode(predictions)[0][0]
        return yhat

In [6]:
RF = RandomForest(5, 0.8)
RF.fit(X_train, y_train)
yhat = RF.predict(X_test)
print(classification_report(y_test, yhat))

Tree 0 0.9519230769230769
Tree 1 0.9514563106796117
Tree 2 0.9411764705882353
Tree 3 0.9405940594059405
Tree 4 0.92
avg oob score :  0.941029983519373
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

