The purpose of these experiments is to evaluate the predictive performance **(test accuracy)** of Mondrian forests (MF) as a function of:

(i) fraction of training data and 

(ii) training time.

- We divide the training data into **100 mini-batches** and we compare the performance of online random forests (MF, ORF-Saffari [20]) to batch random forests (Breiman-RF, ERT-k, ERT-1) which are trained on the same fraction of the training data.
- We evaluate on four of the five datasets (usps, satimages, letter, dna) — we excluded the mushroom dataset as even very simple logical rules achieve > 99% accuracy on this dataset. 
- We re-scaled the datasets such that each feature takes on values in the range [0, 1] (by subtracting the min value along that dimension and dividing by the range along that dimension, where range = max − min).

As is common in the random forest literature [2], we set **the number of trees M = 100.**

- For Mondrian forests, we set the lifetime λ = ∞ and the HNSP discount parameter γ = 10D. 

- For Breiman-RF and ERT, the hyperparameters are set to their default values. 

- We repeat each algorithm with five random initializations and report the mean performance.

# Load data

In [9]:
import h5py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import MinMaxScaler

In [10]:
def rescale(X_train, X_test):
    """Min-max scale both splits using statistics from the training split.

    The scaler is fitted on ``X_train`` only and then applied to both
    arrays, so the test split is scaled with the training minima/ranges
    rather than its own.

    Returns:
        The pair ``(scaled X_train, scaled X_test)``.
    """
    fitted_scaler = MinMaxScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

def print_shapes(X_train, X_test, y_train, y_test):
    """Print the ``.shape`` of each split, one labelled line per array.

    Lines are emitted in the order X_train, y_train, X_test, y_test.
    """
    labelled_arrays = (
        ('X_train.shape:\t', X_train),
        ('y_train.shape:\t', y_train),
        ('X_test.shape:\t', X_test),
        ('y_test.shape:\t', y_test),
    )
    for label, array in labelled_arrays:
        print(label, array.shape)

def load_usps():
    """Load the usps train/test split from ``data/usps.h5``.

    The file is expected to hold ``train`` and ``test`` groups, each with
    ``data`` and ``target`` datasets. Features are rescaled with
    ``rescale`` and the resulting shapes are printed.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    print('Loading usps...\n')

    with h5py.File('data/usps.h5', 'r') as h5_file:
        train_group = h5_file.get('train')
        test_group = h5_file.get('test')

        # ``[:]`` pulls each dataset into memory before the file is closed.
        X_train, y_train = train_group.get('data')[:], train_group.get('target')[:]
        X_test, y_test = test_group.get('data')[:], test_group.get('target')[:]

    X_train, X_test = rescale(X_train, X_test)
    print_shapes(X_train, X_test, y_train, y_test)
    return X_train, X_test, y_train, y_test


def load_letter(random_state=None):
    """Load the letter-recognition dataset and split it into train/test.

    Reads ``data/letter-recognition.data`` (no header row). Column 0 holds
    the class letter, which is mapped to an integer label ('A' -> 0,
    'B' -> 1, ...); all remaining columns are used as features. The data is
    split with ``train_test_split`` using its default split proportions,
    the features are rescaled with ``rescale``, and the shapes are printed.

    Args:
        random_state: Forwarded to ``train_test_split``. The default
            ``None`` keeps the original behaviour (a different random split
            on every call); pass an int to get a reproducible split so that
            repeated runs are evaluated on the same test set.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    print('Loading letter...\n')
    dataset = pd.read_csv('data/letter-recognition.data', header=None)

    data = np.array(dataset.loc[:, 1:])

    # Encode the class letter as its zero-based offset from 'A'.
    target = np.array(dataset[0].apply(lambda letter: ord(letter) - ord('A')))

    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=random_state)

    X_train, X_test = rescale(X_train, X_test)
    print_shapes(X_train, X_test, y_train, y_test)
    return X_train, X_test, y_train, y_test


def load_satim():
    """Load the satimage train/test split from ``data/sat.trn`` / ``data/sat.tst``.

    Both files are read as space-separated tables without a header. The
    first 36 columns are taken as features and column 36 as the label.
    Features are rescaled with ``rescale`` and the shapes are printed.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    print('Loading satim...\n')
    n_features = 36

    train = np.array(pd.read_csv('data/sat.trn', sep=' ', header=None))
    test = np.array(pd.read_csv('data/sat.tst', sep=' ', header=None))

    X_train, y_train = train[:, :n_features], train[:, n_features]
    X_test, y_test = test[:, :n_features], test[:, n_features]

    X_train, X_test = rescale(X_train, X_test)
    print_shapes(X_train, X_test, y_train, y_test)
    return X_train, X_test, y_train, y_test


def load_dna():
    """Load the dna train/test split from svmlight-format files.

    Reads ``data/dna.scale.tr`` and ``data/dna.scale.t``, converts the
    sparse feature matrices to dense arrays, rescales the features with
    ``rescale`` and prints the resulting shapes.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    print('Loading dna...\n')
    X_train, y_train = load_svmlight_file('data/dna.scale.tr')

    # The number of columns is inferred per file, so a test file that never
    # uses the highest-numbered features would come out narrower than the
    # training matrix. Pin it to the training width so the scaler fitted on
    # the training data can always be applied to the test data.
    X_test, y_test = load_svmlight_file(
        'data/dna.scale.t', n_features=X_train.shape[1])

    # ``toarray()`` is the portable way to densify; the ``.A`` shorthand is
    # not available on every scipy sparse container.
    X_train = X_train.toarray()
    X_test = X_test.toarray()

    X_train, X_test = rescale(X_train, X_test)
    print_shapes(X_train, X_test, y_train, y_test)
    return X_train, X_test, y_train, y_test

In [11]:
# Smoke-test every loader. Each call rebinds the same four names, so once
# this cell has run only the split returned by load_dna() is still bound.
X_train, X_test, y_train, y_test = load_usps()
X_train, X_test, y_train, y_test = load_letter()
X_train, X_test, y_train, y_test = load_satim()
X_train, X_test, y_train, y_test = load_dna()

Loading usps...

X_train.shape:	 (7291, 256)
y_train.shape:	 (7291,)
X_test.shape:	 (2007, 256)
y_test.shape:	 (2007,)
Loading letter...

X_train.shape:	 (15000, 16)
y_train.shape:	 (15000,)
X_test.shape:	 (5000, 16)
y_test.shape:	 (5000,)
Loading satim...

X_train.shape:	 (4435, 36)
y_train.shape:	 (4435,)
X_test.shape:	 (2000, 36)
y_test.shape:	 (2000,)
Loading dna...

X_train.shape:	 (1400, 180)
y_train.shape:	 (1400,)
X_test.shape:	 (1186, 180)
y_test.shape:	 (1186,)




# Classical RF

Compare the implemented algorithm, in terms of both time complexity and predictive performance (on 2-3 real-world datasets), with classical random forests used in an online mode, i.e. with forests that are:

(1) completely refitted at each step t on the data from steps [1, t];

(2) partially refitted for every new observation (some new trees are fitted, some old ones are removed);

(3) refitted on a rolling window [t-h, t].

In [12]:
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline

In [34]:
def iterate_batches(n_samples, n_batches=100):
    """Yield ``(start, end)`` index pairs splitting ``n_samples`` into batches.

    The batches are contiguous, non-overlapping half-open ranges
    ``[start, end)`` that together cover ``range(n_samples)`` exactly: the
    first ``start`` is 0 and the last ``end`` is ``n_samples``. When
    ``n_samples`` is not divisible by ``n_batches`` the batch sizes differ by
    at most one, so no trailing samples are dropped. If ``n_samples`` is
    smaller than ``n_batches`` some batches are empty (``start == end``).

    Args:
        n_samples: Total number of samples to split.
        n_batches: Number of batches to produce.

    Yields:
        ``(start, end)`` tuples of ints, ``n_batches`` of them.

    Raises:
        ValueError: If ``n_batches`` is not positive (raised when iteration
            starts, since this is a generator).
    """
    if n_batches < 1:
        raise ValueError(
            'n_batches must be a positive integer, got {!r}'.format(n_batches))

    for i in range(n_batches):
        # Integer-scaled boundaries spread the remainder over the batches
        # instead of silently discarding it after the last one.
        start = i * n_samples // n_batches
        end = (i + 1) * n_samples // n_batches
        yield start, end
        

def classical_rf_refit(clf, dataset, n_batches):
    """Refit ``clf`` from scratch on a growing prefix of the training data.

    For every batch boundary produced by ``iterate_batches`` the classifier
    is fitted on all training samples seen so far (``[0, end)``), and its
    accuracy is measured on that prefix and on the full test split.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        dataset: ``(X_train, X_test, y_train, y_test)`` tuple.
        n_batches: Number of refit steps.

    Returns:
        ``(fit_time, train_accuracy, test_accuracy)``: three lists with one
        entry per step; ``fit_time`` is in seconds.
    """
    X_train, X_test, y_train, y_test = dataset

    fit_time = []
    train_accuracy = []
    test_accuracy = []

    n_samples = X_train.shape[0]

    # Only the right edge of each batch is needed: the model is retrained on
    # the whole prefix at every step.
    for _, end in iterate_batches(n_samples, n_batches):
        X_seen = X_train[:end]
        y_seen = y_train[:end]

        # perf_counter is monotonic and high-resolution, so the measurement
        # is not disturbed by system clock adjustments.
        t0 = time.perf_counter()
        clf.fit(X_seen, y_seen)
        elapsed = time.perf_counter() - t0

        tr_acc = accuracy_score(y_seen, clf.predict(X_seen))
        test_acc = accuracy_score(y_test, clf.predict(X_test))

        fit_time.append(elapsed)
        train_accuracy.append(tr_acc)
        test_accuracy.append(test_acc)

    return fit_time, train_accuracy, test_accuracy


def run_method_on_dataset(method, dataset, n_iter, n_batches, n_estimators, max_depth):
    """Run one benchmark method ``n_iter`` times and average the curves.

    Each repetition builds a fresh classifier, so the runs differ only in
    the estimator's random initialisation.

    Args:
        method: Name of the method to run. Only ``'classical_rf_refit'`` is
            supported.
        dataset: ``(X_train, X_test, y_train, y_test)`` tuple.
        n_iter: Number of independent repetitions to average over.
        n_batches: Number of refit steps per repetition.
        n_estimators: Number of trees in the random forest.
        max_depth: Maximum depth of each tree.

    Returns:
        ``(mean_fit_time, mean_train_acc, mean_test_acc)``: per-step means
        over the repetitions.

    Raises:
        ValueError: If ``method`` is not a supported method name.
    """
    # Reject unknown names up front; otherwise the loop below would fail
    # later with an unrelated unbound-variable error.
    if method != 'classical_rf_refit':
        raise ValueError('Unknown method: {!r}'.format(method))

    fit_time_runs = []
    train_acc_runs = []
    test_acc_runs = []

    for _ in range(n_iter):
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
        fit_time, train_acc, test_acc = classical_rf_refit(clf, dataset, n_batches)

        fit_time_runs.append(fit_time)
        train_acc_runs.append(train_acc)
        test_acc_runs.append(test_acc)

    # Average across repetitions (axis 0), keeping one value per step.
    mean_fit_time = np.mean(fit_time_runs, axis=0)
    mean_train_acc = np.mean(train_acc_runs, axis=0)
    mean_test_acc = np.mean(test_acc_runs, axis=0)

    return mean_fit_time, mean_train_acc, mean_test_acc


def run_all_methods_dataset(dataset, n_iter, n_batches, n_estimators, max_depth):
    """Benchmark every method on ``dataset`` and plot the averaged curves.

    Creates a two-panel figure: the top panel shows mean test accuracy per
    batch, the bottom panel mean fit time per batch, one line per method.
    Nothing is returned; the figure is left to the active matplotlib backend.
    """
    _fig, axes = plt.subplots(2, 1, figsize=(5, 10))

    for method in ['classical_rf_refit']:
        fit_curve, _train_curve, test_curve = run_method_on_dataset(
            method, dataset, n_iter, n_batches, n_estimators, max_depth)

        # (axis, series, y-label) for the accuracy panel, then the timing panel.
        panels = (
            (axes[0], test_curve, 'test accuracy'),
            (axes[1], fit_curve, 'fit time'),
        )
        for axis, curve, y_label in panels:
            axis.plot(np.arange(n_batches), curve, label=method)
            axis.set(xlabel='batch num', ylabel=y_label)
            axis.legend()

In [35]:
# Experiment settings passed to run_all_methods_dataset.
n_batches = 100  # number of refit steps the training data is split into
n_estimators = 100  # trees per random forest
max_depth = 3  # maximum depth of each tree
n_iter = 5  # independent repetitions averaged per method

In [None]:
# load_dna() returns the (X_train, X_test, y_train, y_test) tuple that the
# benchmark runner unpacks.
dataset = load_dna()

# Plot test accuracy and fit time per batch, averaged over n_iter runs.
run_all_methods_dataset(dataset, n_iter, n_batches, n_estimators, max_depth)

Loading dna...

X_train.shape:	 (1400, 180)
y_train.shape:	 (1400,)
X_test.shape:	 (1186, 180)
y_test.shape:	 (1186,)
