In [2]:
%matplotlib inline

import numpy as np

import pickle, os, math
import multiprocessing as mp
import matplotlib.pyplot as plt
from timeit import default_timer as timer

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn import svm
from sklearn import gaussian_process
from sklearn import preprocessing
from sklearn import neural_network

from scipy import stats
import gzip

class LeafNode:
    """Training samples that landed in one leaf of one tree.

    Attributes:
        X: list of feature samples, in insertion order.
        y: list of target values, parallel to ``X``.
    """

    def __init__(self):
        # Per-instance lists. Declaring them on the class body would make
        # every LeafNode share (and mutate) the same two list objects.
        self.X = []
        self.y = []

    def append(self, X, y):
        """Record one sample ``X`` together with its target value ``y``."""
        self.X.append(X)
        self.y.append(y)

    def __str__(self):
        return 'X' + str(self.X) + 'y' + str(self.y)

    def lin_reg(self, n_jobs=None):
        """Fit an ordinary least-squares model on the stored samples.

        Args:
            n_jobs: forwarded to ``LinearRegression``; ``None`` lets
                scikit-learn use its default.

        Returns:
            The fitted ``linear_model.LinearRegression`` instance.

        Raises:
            ValueError: if no samples have been appended yet.
        """
        if not self.X:
            raise ValueError('cannot fit a linear model on an empty leaf')

        # Samples may have been stored as 1-D vectors or as (1, n_features)
        # rows; flatten each one so the estimator always gets a 2-D matrix.
        features = np.asarray(self.X).reshape(len(self.X), -1)

        # ``normalize`` is omitted on purpose: False was its default and the
        # keyword no longer exists in scikit-learn >= 1.2.
        linreg = linear_model.LinearRegression(fit_intercept=True, copy_X=True, n_jobs=n_jobs)
        linreg.fit(features, self.y)
        return linreg
    
def test_erf():
    """Smoke-test grouping training samples by the leaf they reach in each tree.

    Fits a small ``ExtraTreesRegressor`` on random data, then builds, for
    every tree in the ensemble, a dictionary mapping leaf index to a
    ``LeafNode`` holding the training samples that ended up in that leaf.
    Progress is printed for every sample and tree; nothing is returned.
    """
    n_estimators = 10
    # cpu_count() - 1 is 0 on a single-core host, and joblib rejects n_jobs=0.
    n_jobs = max(1, mp.cpu_count() - 1)

    rf = ExtraTreesRegressor(n_estimators=n_estimators, n_jobs=n_jobs, max_depth=3)

    X_train = np.random.rand(100, 10)
    y_train = np.random.rand(100)

    X_test = np.random.rand(10, 10)

    rf.fit(X_train, y_train)

    # One independent dictionary per tree in the ensemble. ``[{}] * k`` would
    # repeat a single dict object, merging the leaves of all trees together.
    ensemble_data = [{} for _ in range(n_estimators)]

    # Leaf index of every training sample in every tree, computed in one call
    # instead of once per sample; row i matches rf.apply(X_train[i:i+1])[0].
    leaf_matrix = rf.apply(X_train)

    # for each sample
    for row, value, leaf_indices in zip(X_train, y_train, leaf_matrix):

        sample = row.reshape(1, -1)
        print('sample shape:', sample.shape)

        print('current sample:', sample, 'value:', value)
        print('leafs:', leaf_indices)

        # put the sample into the matching leaf of every tree
        for tree_num, leaf in enumerate(leaf_indices):

            print('current tree:', tree_num, 'leaf:', leaf)

            tree_leaves = ensemble_data[tree_num]
            if leaf not in tree_leaves:
                tree_leaves[leaf] = LeafNode()
            tree_leaves[leaf].append(sample, value)

    # Predict on the held-out features (not on a targets array).
    rf.predict(X_test)

    print(ensemble_data[0])
    
# Run the smoke test when the cell executes; its printed trace follows below.
test_erf()

sample shape: (1, 10)
current sample: [[ 0.78304749  0.84526354  0.92929571  0.32846204  0.53615941  0.87010897
   0.85831445  0.01706353  0.21751915  0.08927815]] value: 0.169795174773
leafs: [14  6  3 10  7  4 14  5 10 13]
current tree: 0 leaf: 14
current tree: 1 leaf: 6
current tree: 2 leaf: 3
current tree: 3 leaf: 10
current tree: 4 leaf: 7
current tree: 5 leaf: 4
current tree: 6 leaf: 14
current tree: 7 leaf: 5
current tree: 8 leaf: 10
current tree: 9 leaf: 13
sample shape: (1, 10)
current sample: [[ 0.95694083  0.40615184  0.56044673  0.28089793  0.2000293   0.52836874
   0.07226974  0.14210083  0.48381405  0.9332184 ]] value: 0.131940728608
leafs: [10  4 11  3  3  4  3  4 11 13]
current tree: 0 leaf: 10
current tree: 1 leaf: 4
current tree: 2 leaf: 11
current tree: 3 leaf: 3
current tree: 4 leaf: 3
current tree: 5 leaf: 4
current tree: 6 leaf: 3
current tree: 7 leaf: 4
current tree: 8 leaf: 11
current tree: 9 leaf: 13
sample shape: (1, 10)
current sample: [[ 0.50223066  0.486026