import numpy as np
import logging
from pyrfr import regression
from smac.configspace import CategoricalHyperparameter
from smac.epm.base_epm import AbstractEPM
# Module metadata: authorship, license, maintainer contact and version.
__author__ = "Aaron Klein"
__copyright__ = "Copyright 2015, ML4AAD"
__license__ = "3-clause BSD"
__maintainer__ = "Aaron Klein"
__email__ = ""
__version__ = "0.0.1"
class RandomForestWithInstances(AbstractEPM):
    """Interface to the random forest that takes instance features
    into account.

    Attributes
    ----------
    rf_opts : regression.forest_opts
        Random forest hyperparameter
    n_points_per_tree : int
        Requested number of points per tree (<= 0 means "use all rows
        of the training data", resolved in ``_train``).
    rf : regression.binary_rss_forest
        Only available after training
    hypers : list
        List of random forest hyperparameters
    seed : int
    types : np.ndarray
    bounds : np.ndarray
    rng : regression.default_random_engine
    logger : logging.Logger
    """

    def __init__(self, types: np.ndarray,
                 bounds: np.ndarray,
                 num_trees: int = 10,
                 do_bootstrapping: bool = True,
                 n_points_per_tree: int = -1,
                 ratio_features: float = 5. / 6.,
                 min_samples_split: int = 3,
                 min_samples_leaf: int = 3,
                 max_depth: int = 20,
                 eps_purity: float = 1e-8,
                 max_num_nodes: int = 2**20,
                 seed: int = 42,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        types : np.ndarray (D)
            Specifies the number of categorical values of an input dimension
            where the i-th entry corresponds to the i-th input dimension.
            Let's say we have 2 dimensions where the first dimension consists
            of 3 different categorical choices and the second dimension is
            continuous, then we have to pass np.array([2, 0]). Note that we
            count starting from 0.
        bounds : np.ndarray (D, 2)
            Specifies the bounds for continuous features. An entry whose
            second value is NaN is not treated as bounds; its first value is
            handed to the data container as the feature type instead.
        num_trees : int
            The number of trees in the random forest.
        do_bootstrapping : bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree : int
            Number of points per tree. If <= 0 X.shape[0] will be used
            in _train(X, y) instead
        ratio_features : float
            The ratio of features that are considered for splitting.
        min_samples_split : int
            The minimum number of data points to perform a split.
        min_samples_leaf : int
            The minimum number of data points in a leaf.
        max_depth : int
            The maximum depth of a single tree.
        eps_purity : float
            The minimum difference between two target values to be considered
            different
        max_num_nodes : int
            The maximum total number of nodes in a tree
        seed : int
            The seed that is passed to the random_forest_run library.
        **kwargs
            Forwarded unchanged to the base class constructor.
        """
        # NOTE(review): forwarding **kwargs to AbstractEPM is assumed from
        # the base-class relationship -- confirm against AbstractEPM.__init__.
        super().__init__(**kwargs)

        self.types = types
        self.bounds = bounds
        self.rng = regression.default_random_engine(seed)

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = num_trees
        self.rf_opts.do_bootstrapping = do_bootstrapping
        # A ratio above 1.0 is mapped to 0 (presumably pyrfr's "use all
        # features" value -- confirm); otherwise at least one feature is used.
        max_features = 0 if ratio_features > 1.0 else \
            max(1, int(types.shape[0] * ratio_features))
        self.rf_opts.tree_opts.max_features = max_features
        self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
        self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
        self.rf_opts.tree_opts.max_depth = max_depth
        self.rf_opts.tree_opts.epsilon_purity = eps_purity
        self.rf_opts.tree_opts.max_num_nodes = max_num_nodes

        self.n_points_per_tree = n_points_per_tree
        self.rf = None  # type: regression.binary_rss_forest

        # This list will be read out by save_iteration() in the solver
        self.hypers = [num_trees, max_num_nodes, do_bootstrapping,
                       n_points_per_tree, ratio_features, min_samples_split,
                       min_samples_leaf, max_depth, eps_purity, seed]
        self.seed = seed

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)

    def _train(self, X: np.ndarray, y: np.ndarray, **kwargs):
        """Trains the random forest on X and y.

        Parameters
        ----------
        X : np.ndarray [n_samples, n_features (config + instance features)]
            Input data points.
        y : np.ndarray [n_samples, ]
            The corresponding target values.

        Returns
        -------
        self
        """
        self.X = X
        self.y = y.flatten()

        # Fall back to the full data set size only when no positive number
        # of points per tree was requested.
        if self.n_points_per_tree <= 0:
            self.rf_opts.num_data_points_per_tree = self.X.shape[0]
        else:
            self.rf_opts.num_data_points_per_tree = self.n_points_per_tree

        self.rf = regression.binary_rss_forest()
        self.rf.options = self.rf_opts
        data = self.__init_data_container(self.X, self.y)
        self.rf.fit(data, rng=self.rng)
        return self

    def __init_data_container(self, X: np.ndarray, y: np.ndarray):
        """Fills a pyrfr default data container, s.t. the forest knows
        categoricals and bounds for continuous data

        Parameters
        ----------
        X : np.ndarray [n_samples, n_features]
            Input data points
        y : np.ndarray [n_samples, ]
            Corresponding target values

        Returns
        -------
        data : regression.default_data_container
            The filled data container that pyrfr can interpret
        """
        data = regression.default_data_container(X.shape[1])

        # Each entry of self.bounds describes one feature: a NaN upper value
        # marks a feature whose type is given by the lower value (presumably
        # the number of categories -- confirm); otherwise the pair is used as
        # the bounds of a continuous feature.
        for i, (mn, mx) in enumerate(self.bounds):
            if np.isnan(mx):
                data.set_type_of_feature(i, mn)
            else:
                data.set_bounds_of_feature(i, mn, mx)

        for row_X, row_y in zip(X, y):
            data.add_data_point(row_X, row_y)
        return data

    def _predict(self, X: np.ndarray):
        """Predict means and variances for given X.

        Parameters
        ----------
        X : np.ndarray of shape = [n_samples,
                                   n_features (config + instance features)]

        Returns
        -------
        means : np.ndarray of shape = [n_samples, 1]
            Predictive mean
        vars : np.ndarray of shape = [n_samples, 1]
            Predictive variance

        Raises
        ------
        ValueError
            If X is not two-dimensional or its number of columns does not
            match the number of entries in ``self.types``.
        """
        if len(X.shape) != 2:
            raise ValueError(
                'Expected 2d array, got %dd array!' % len(X.shape))
        if X.shape[1] != self.types.shape[0]:
            raise ValueError('Rows in X should have %d entries but have %d!' %
                             (self.types.shape[0], X.shape[1]))

        # The forest is queried one row at a time; collect every result so
        # the returned arrays have one entry per input row.
        means, vars_ = [], []
        for row_X in X:
            mean, var = self.rf.predict_mean_var(row_X)
            means.append(mean)
            vars_.append(var)

        means = np.array(means)
        vars_ = np.array(vars_)
        return means.reshape((-1, 1)), vars_.reshape((-1, 1))