In [12]:
from tree_based_models.decision_tree import DecisionTree
import numpy as np
from numpy.random import randint
from numpy import sqrt
from scipy.stats import mode
from numpy import log2


In [13]:
def Bootstrap(y, n_estimators):
    """Draw bootstrap row indices for every estimator in one call.

    Parameters
    ----------
    y : array-like exposing ``.shape``
        Only ``y.shape[0]`` (the number of instances) is used.
    n_estimators : int
        Number of index rows to generate.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_estimators, y.shape[0])``. Each row holds
        indices in ``[0, y.shape[0])`` sampled with replacement.
    """
    sample_count = y.shape[0]
    draw_shape = (n_estimators, sample_count)
    # randint's `high` is exclusive, so every value is a valid row index.
    return randint(0, high=sample_count, size=draw_shape)

In [14]:
# We do not explain this in much detail, since the code should be self-explanatory after going through the Decision Tree notebook.
class RandomForest:
    """Bagging ensemble of ``DecisionTree`` models combined by majority vote.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees in the forest.
    criterion : str, default='gini'
        Forwarded to every tree as ``loss_func`` ('gini' or 'entropy').
    max_depth : int or None, default=None
        Forwarded to every tree.
    min_samples_split : int or None, default=5
        Forwarded to every tree.
    max_features : {'sqrt', 'log2'}, int or None, default='sqrt'
        How many features each tree receives as its ``max_features``:
        the rounded square root / base-2 logarithm of the column count,
        an explicit positive integer, or ``None`` for all columns.
    random_state : int or None, default=None
        Seed handed to ``np.random.seed`` right before the bootstrap
        indices are drawn.
    """

    def __init__(self,
                 n_estimators=100,
                 criterion='gini',
                 max_depth=None,
                 min_samples_split=5,
                 max_features='sqrt',
                 random_state=None):
        self.n_estimators = n_estimators  # how many trees in the forest
        self.criterion = criterion  # gini or entropy
        self.max_depth = max_depth  # int or None
        self.min_samples_split = min_samples_split  # int or None
        # 'sqrt', 'log2', a positive int, or None (= all features)
        self.max_features = max_features
        self.random_state = random_state
        # filled by fit() with n_estimators fitted trees
        self.estimators = []

    def _resolve_max_features(self, n_total):
        """Turn ``self.max_features`` into a concrete feature count in [1, n_total]."""
        if n_total < 1:
            raise ValueError("X must have at least one feature column")
        spec = self.max_features
        if spec is None:
            return n_total
        if isinstance(spec, str):
            if spec == 'sqrt':
                count = int(round(sqrt(n_total)))
            elif spec == 'log2':
                count = int(round(log2(n_total)))
            else:
                raise ValueError(
                    "max_features must be 'sqrt', 'log2', a positive int or "
                    "None; got %r" % (spec,))
        elif isinstance(spec, (int, np.integer)) and not isinstance(spec, bool):
            if spec < 1:
                raise ValueError(
                    "max_features must be at least 1; got %r" % (spec,))
            count = int(spec)
        else:
            raise ValueError(
                "max_features must be 'sqrt', 'log2', a positive int or "
                "None; got %r" % (spec,))
        # log2(1) rounds to 0, which would leave a tree with no features to
        # split on; never exceed the number of available columns either.
        return min(max(count, 1), n_total)

    def fit(self, X, y):
        """Fit ``n_estimators`` trees, each on its own bootstrap sample.

        Parameters
        ----------
        X : 2-D array supporting integer-array row indexing
            Training features, one row per instance.
        y : array-like
            Training labels, one per row of ``X``.

        Returns
        -------
        RandomForest
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``max_features`` is not a supported value, or ``X`` and ``y``
            have different numbers of rows.
        """
        self.estimators = []
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y must have the same number of rows; got %d and %d"
                % (X.shape[0], y.shape[0]))
        n_features = self._resolve_max_features(X.shape[1])

        # Seeds NumPy's global RNG; Bootstrap draws from that global state.
        np.random.seed(self.random_state)
        # dim: n_estimators * n
        indices_array = Bootstrap(y, self.n_estimators)
        for i, rows in enumerate(indices_array):
            # Each tree gets its position in the forest as its own seed.
            tree = DecisionTree(loss_func=self.criterion,
                                max_depth=self.max_depth,
                                min_samples_split=self.min_samples_split,
                                max_features=n_features,
                                random_state=i)
            # Index one bootstrap sample at a time instead of materialising
            # an (n_estimators * n * k) copy of the whole training set.
            tree.fit(X[rows], y[rows])
            self.estimators.append(tree)
        return self

    def predict(self, X):
        """Predict one label per row of ``X`` by majority vote over the trees.

        Raises
        ------
        RuntimeError
            If the forest holds no fitted estimators.
        """
        if not self.estimators:
            raise RuntimeError(
                "RandomForest has no fitted estimators; call fit() first")
        # dim: n_estimators * len(X)
        votes = np.array([tree.predict(X) for tree in self.estimators])
        # Depending on the SciPy version the mode array comes back as
        # (1, len(X)) or (len(X),); ravel gives len(X) labels in both cases.
        return np.ravel(mode(votes, axis=0)[0])

In [15]:
if __name__ == "__main__":
    import pandas as pd
    from sklearn.compose import make_column_transformer
    from sklearn.impute import SimpleImputer
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder

    df = pd.read_csv('titanic.csv',
                     usecols=[
                         'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
                         'Fare', 'Embarked'
                     ])

    y = df['Survived']
    X = df.drop('Survived', axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)
    # Work on explicit copies so the column assignments below modify these
    # frames directly rather than a slice of `X` (avoids pandas'
    # SettingWithCopyWarning).
    X_train = X_train.copy()
    X_test = X_test.copy()

    ### Do some imputation since algorithms cannot handle missing values
    # Imputers are fitted on the training split only and then reused on the
    # test split, so no test-set statistics leak into training.
    imp = SimpleImputer(strategy='mean')
    X_train[['Age']] = imp.fit_transform(X_train[['Age']])

    imp2 = SimpleImputer(strategy='most_frequent')
    X_train[['Embarked']] = imp2.fit_transform(X_train[['Embarked']])

    # Prepare train and test data
    ohe = OneHotEncoder()
    # sparse_threshold=0 forces a dense array: RandomForest.fit indexes X
    # with NumPy integer arrays.
    ct = make_column_transformer((ohe, ['Sex', 'Embarked']),
                                 remainder='passthrough',
                                 sparse_threshold=0)
    X_train_vect = ct.fit_transform(X_train)
    y_train = y_train.values

    X_test[['Age']] = imp.transform(X_test[['Age']])
    X_test[['Embarked']] = imp2.transform(X_test[['Embarked']])
    X_test_vect = ct.transform(X_test)
    y_test = y_test.values

    model = RandomForest(criterion='gini',
                         min_samples_split=10,
                         max_depth=5,
                         random_state=4,
                         max_features='log2')
    model.fit(X_train_vect, y_train)

    y_pred = model.predict(X_test_vect)

    # Fraction of test rows whose predicted label matches the true label.
    accuracy = np.mean(y_pred == y_test)
    print("Accuracy:", accuracy)

Accuracy: 0.8134328358208955


In [16]:
if __name__ == "__main__":
    # Imports
    from time import time
    # NOTE: the timer starts here, so the total printed at the end covers the
    # imports, data loading, fit and predict below, not just the fit.
    t0 = time()
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    def accuracy(y_true, y_pred):
        """Return the fraction of positions where y_true and y_pred agree."""
        n_correct = np.sum(y_true == y_pred)
        return n_correct / len(y_true)

    data = datasets.load_breast_cancer()
    X, y = data.data, data.target

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1234)

    # Seed the forest so the bootstrap samples, and therefore the reported
    # accuracy, are reproducible from run to run.
    clf = RandomForest(max_depth=10, random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    acc = accuracy(y_test, y_pred)

    print("Accuracy:", acc)

    t1 = time()

    total = t1 - t0
    print(total, 's')

Accuracy: 0.9122807017543859
92.95777940750122 s


In [5]:
from time import time

# Time only the fit of the hand-written forest.
# NOTE(review): relies on X_train / y_train left in the kernel by an earlier
# cell; the saved execution counts are out of order, so confirm this runs
# under Restart & Run All.
t0 = time()
# Seeded so the bootstrap samples are reproducible from run to run.
rf = RandomForest(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
t1 = time()

total = t1 - t0
print(total, 's')

47.12033486366272 s


In [12]:
# Score whichever model is currently bound to `rf` on the held-out split.
# NOTE(review): depends on `rf`, `X_test`, `y_test` and `accuracy` left in the
# kernel by other cells; in file order `rf` is the hand-written RandomForest
# fitted in the cell above - confirm under Restart & Run All.
y_pred = rf.predict(X_test)
acc = accuracy(y_test, y_pred)

print("Accuracy:", acc)

Accuracy: 0.9035087719298246


In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
from time import time

# Time the fit of scikit-learn's reference implementation with the same
# n_estimators / max_depth as the hand-written forest, for comparison.
# NOTE: this rebinds `rf`, which earlier referred to the custom RandomForest.
t0 = time()
# Seeded so the fitted model and its accuracy are reproducible.
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
t1 = time()

total = t1 - t0
print(total, 's')

0.13984298706054688 s


In [15]:
# Score whichever model is currently bound to `rf` on the held-out split.
# NOTE(review): depends on `rf`, `X_test`, `y_test` and `accuracy` left in the
# kernel by other cells; in file order `rf` is the scikit-learn
# RandomForestClassifier fitted in the cell above - confirm under
# Restart & Run All.
y_pred = rf.predict(X_test)
acc = accuracy(y_test, y_pred)

print("Accuracy:", acc)

Accuracy: 0.8947368421052632
