In [10]:
import numpy as np
import time
from collections import deque
from Dynaforest import Dynatree
from sklearn.utils.estimator_checks import check_estimator
from sklearn.datasets import make_friedman1, make_friedman2, make_friedman3
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import inspect

"""
We want to create a series of graphs. 
1. Compare the performance of the Dynaforest algorithm with the Random Forest algorithm.
2. Compare the performance of the Dynaforest algorithm with the XGBoost algorithm.
3. Compare time-complexity of the Dynaforest algorithm with the Random Forest algorithm.
4. Compare average tree bias of Dynaforest with Random Forest.
"""

'\nWe want to create a series of graphs. \n1. Compare the performance of the Dynaforest algorithm with the Random Forest algorithm.\n2. Compare the performance of the Dynaforest algorithm with the XGBoost algorithm.\n3. Compare time-complexity of the Dynaforest algorithm with the Random Forest algorithm.\n4. Compare average tree bias of Dynaforest with Random Forest.\n'

**Comparing Performance of Algorithms on Various Training Datasets**

In [11]:
def get_average_loss(
    model,
    dataset_creator,
    param_dict,
    n_repeats=1,
    n_train_samples=200,
    n_test_samples=1000,
    noise=1,
    cv=2,
    n_iter=3,
    verbose=2,
):
    """Tune ``model`` by randomized search and measure its test-set MSE.

    Each repetition draws a fresh training set and a fresh test set from
    ``dataset_creator``, fits a ``RandomizedSearchCV`` over ``param_dict`` on
    the training set, and records the mean squared error of the fitted
    search object's predictions on the test set.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``RandomizedSearchCV``.
    dataset_creator : callable
        Called as ``dataset_creator(n_samples=..., noise=...)`` and expected
        to return an ``(X, y)`` pair.
    param_dict : dict
        Parameter distributions handed to ``RandomizedSearchCV``.
    n_repeats : int, default 1
        Number of independent train/test draws to evaluate.
    n_train_samples : int, default 200
        ``n_samples`` used for each training draw.
    n_test_samples : int, default 1000
        ``n_samples`` used for each test draw.
    noise : float, default 1
        ``noise`` forwarded to ``dataset_creator`` for both draws.
    cv, n_iter, verbose : int
        Forwarded unchanged to ``RandomizedSearchCV``.

    Returns
    -------
    list
        One test MSE per repetition. The values are NOT averaged here;
        callers are expected to reduce them (e.g. with ``np.mean``).

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1, which would yield an empty list
        and a meaningless (NaN) average downstream.
    """
    if n_repeats < 1:
        raise ValueError(f"n_repeats must be >= 1, got {n_repeats}")

    test_losses = []
    for i in range(n_repeats):
        # Train and test sets are separate draws from the same generator.
        X_train, y_train = dataset_creator(n_samples=n_train_samples, noise=noise)
        X_test, y_test = dataset_creator(n_samples=n_test_samples, noise=noise)

        search = RandomizedSearchCV(
            model, param_dict, cv=cv, n_iter=n_iter, verbose=verbose
        )
        search.fit(X_train, y_train)

        # Mean squared error on the held-out draw; already a scalar.
        y_pred = search.predict(X_test)
        test_loss = np.mean((y_pred - y_test) ** 2)
        test_losses.append(test_loss)
        print(f"iteration {i} loss: {test_loss}")
    return test_losses
    
    

In [12]:
# Evaluate Dynatree on the Friedman #1 regression problem.
# NOTE: this cell used to label its model and result as "XGBoost", but the
# estimator it fits is a Dynatree tuned over the Dynatree search space below.

# Randomized-search space for Dynatree.
param_dist = {
    'n_trees': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'max_depth': [5, 6, 7],
    'feature_subsampling_pct': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'window': [3, 5, 10]
}

# Randomized-search space for XGBoost. Defined here but not used in this cell.
param_dist_xgb = {
    'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'max_depth': [3, 4, 5, 6],
    'subsample': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}


dynatree_model = Dynatree()
dynatree_losses = get_average_loss(dynatree_model, make_friedman1, param_dist)
print("Dynatree average loss: ", np.mean(dynatree_losses))

# Backward-compatible aliases: other cells may still refer to the old,
# misleading names. TODO(review): drop once no cell uses them.
xgb_model = dynatree_model
xgb_losses = dynatree_losses

Fitting 2 folds for each of 3 candidates, totalling 6 fits
This forest has a total of 842 splits
This forest has a total of 30 trees
[CV] END feature_subsampling_pct=0.2, max_depth=6, n_trees=30, window=10; total time=   6.7s
This forest has a total of 854 splits
This forest has a total of 30 trees
[CV] END feature_subsampling_pct=0.2, max_depth=6, n_trees=30, window=10; total time=   6.9s
This forest has a total of 1177 splits
This forest has a total of 60 trees
[CV] END feature_subsampling_pct=0.4, max_depth=5, n_trees=60, window=5; total time=   6.6s
This forest has a total of 1301 splits
This forest has a total of 60 trees
[CV] END feature_subsampling_pct=0.4, max_depth=5, n_trees=60, window=5; total time=   7.8s
This forest has a total of 1687 splits
This forest has a total of 90 trees
[CV] END feature_subsampling_pct=0.5, max_depth=5, n_trees=90, window=3; total time=   7.3s
This forest has a total of 1773 splits
This forest has a total of 90 trees
[CV] END feature_subsampling_pc

In [13]:
# Scratch check: how np.unique and argmin behave when one entry is masked.
masked_array = np.ma.masked_array(
    data=[1, 2, -1, -2],
    mask=[False, False, True, False],  # only the -1 entry is masked
)

# With return_index=True np.unique returns (values, indices); keep the indices.
unique_idx = np.unique(masked_array, return_index=True)[-1]
print(unique_idx)

# Position, within the re-indexed array, of the smallest unmasked value.
masked_array[unique_idx].argmin()

[3 0 1 2]


0