/
AutoMLBase.R
274 lines (257 loc) · 11.6 KB
/
AutoMLBase.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#' @title AutoML
#' @format [`R6Class`].
#' @usage NULL
#' @description
#' base class for AutoML in mlr3automl. Has subclasses for Classification and Regression.
#' @section Construction:
#' ```
#' AutoMLBase$new(task)
#' ```
#' @section Internals:
#' The AutoML class uses `mlr3pipelines` to create a machine learning pipeline.
#' This pipeline contains multiple models (decision tree, random forest, XGBoost),
#' which are wrapped in a GraphLearner. This GraphLearner is wrapped in an
#' AutoTuner for Hyperparameter Optimization and during training or resampling.
#' @section Fields:
#' * `task` :: `Task` object from `mlr3` \cr
#' Contains the data and some meta-features (like the target variable)
#' * `learner_list` :: `List` of names for `mlr3 Learners` \cr
#' Can be used to customize the learners to be tuned over. If no parameter space
#' is defined for the selected learner, it will be run with default parameters.
#' Might break mlr3automl if the learner is incompatible with the provided task
#' * `learner_timeout` :: `Integer` \cr
#' Budget (in seconds) for a single learner during training of the pipeline
#' * `resampling` :: `Resampling` object from `mlr3tuning` \cr
#' Contains the resampling method to be used for hyper-parameter optimization
#' * `measure` :: `Measure` object from `mlr_measures` \cr
#' Contains the performance measure, for which we optimize during training
#' * `tuning_terminator` :: `Terminator` object from `mlr3tuning` \cr
#' Contains the termination criterion for model tuning
#' @section Methods:
#' * `train()` \cr
#' Trains the AutoML system.
#' * `predict(data = NULL, row_ids = NULL)` \cr
#' `data.frame | data.table | Task -> PredictionClassif or PredictionRegr`
#' Returns a Prediction object for the given data based on the trained model.
#' If data is NULL, defaults to the task used for training
#' * `resample()` \cr
#'   `() -> ResampleResult` \cr
#'   Performs nested resampling with a train/test split as the outer resampling
#' @rawNamespace import(mlr3, except = c(lrn, lrns))
#' @import mlr3learners
#' @import mlr3extralearners
#' @import mlr3hyperband
#' @import mlr3oml
#' @import mlr3pipelines
#' @import mlr3tuning
#' @import paradox
#' @import checkmate
#' @import testthat
#' @import glmnet
#' @import xgboost
#' @importFrom R6 R6Class
#' @name AutoMLBase
#' @examples
#' "add later"
AutoMLBase = R6Class("AutoMLBase",
  public = list(
    task = NULL,
    learner_list = NULL,
    learner_timeout = NULL,
    learner = NULL,
    resampling = NULL,
    measure = NULL,
    tuning_terminator = NULL,
    tuner = NULL,
    #' @description
    #' Creates a new AutoMLBase object
    #' @param task
    #' * `task` :: `Task` object from `mlr3` \cr
    #' Contains the task to be solved.
    #' @param learner_list
    #' * `learner_list` :: `List` of names for `mlr3 Learners` \cr
    #' Can be used to customize the learners to be tuned over. If no parameter space
    #' is defined for the selected learner, it will be run with default parameters.
    #' Default learners for classification: `c("classif.ranger", "classif.xgboost", "classif.liblinear")`,
    #' default learners for regression: `c("regr.ranger", "regr.xgboost", "regr.svm", "regr.liblinear", "regr.cv_glmnet")`.
    #' Might break mlr3automl if the learner is incompatible with the provided task.
    #' @param learner_timeout
    #' * `learner_timeout` :: `Integer` \cr
    #' Budget (in seconds) for a single learner during training of the pipeline.
    #' If this budget is exceeded, the learner is replaced with the fallback
    #' learner (`lrn("classif.featureless")` or `lrn("regr.featureless")`).
    #' @param resampling
    #' * `resampling` :: `Resampling` object from `mlr3tuning` \cr
    #' Contains the resampling method to be used for hyper-parameter optimization.
    #' Defaults to `rsmp("holdout")`.
    #' @param measure
    #' * `measure` :: `Measure` object from `mlr_measures` \cr
    #' Contains the performance measure, for which we optimize during training.
    #' Defaults to `msr("classif.acc")` for classification and `msr("regr.rmse")`
    #' for regression.
    #' @param terminator
    #' * `terminator` :: `Terminator` object from `mlr3tuning` \cr
    #' Contains the termination criterion for model tuning. Note that the Hyperband
    #' tuner might stop training before the budget is exhausted.
    #' Defaults to `trm("none")`
    initialize = function(task, learner_list = NULL, learner_timeout = NULL,
      resampling = NULL, measure = NULL, terminator = NULL) {
      assert_task(task)
      # validate learner names against the mlr3 learner registry;
      # checkmate::assert_choice (already imported) is the right tool for
      # runtime argument checking — testthat expectations belong in tests
      for (learner in learner_list) {
        assert_choice(learner, mlr_learners$keys())
      }
      if (!is.null(resampling)) assert_resampling(resampling)
      if (!is.null(measure)) assert_measure(measure)
      self$task = task
      # store constructor arguments that .get_default_learner() (called at the
      # end of this method) reads from self. Conditional assignment keeps
      # subclasses working that set task-type specific defaults beforehand.
      if (!is.null(learner_list)) self$learner_list = learner_list
      if (!is.null(measure)) self$measure = measure
      self$resampling = resampling %??% rsmp("holdout")
      if (is.null(learner_timeout)) {
        # heuristic: give a single learner a fifth of the total tuning budget
        # (when a run-time terminator was provided), otherwise no limit
        if (!is.null(terminator$param_set$values$secs)) {
          learner_timeout = as.integer(terminator$param_set$values$secs / 5)
        } else {
          learner_timeout = Inf
        }
      }
      self$learner_timeout = learner_timeout
      self$tuning_terminator = terminator %??% trm("none")
      self$tuner = tnr("hyperband", eta = 3)
      self$learner = private$.get_default_learner()
    },
    #' @description
    #' Train AutoML learner. Calls the `train` method of the associated `AutoTuner`
    #' with the training instances in the given task.
    #' @param row_ids
    #' IDs of observations to be used for training. If no `row_ids` are provided,
    #' trains on the entire data set.
    train = function(row_ids = NULL) {
      self$learner$train(self$task, row_ids)
      # errors recorded on the wrapped GraphLearner mean the fallback
      # (featureless) learner was substituted during tuning
      if (length(self$learner$learner$errors) > 0) {
        warning("An error occurred during training. Fallback learner was used!")
      }
    },
    #' @description
    #' Make predictions for new observations
    #' @param data
    #' Optional. If provided, predictions are made on this dataset. Needs to have
    #' the same format as data used for training.
    #' @param row_ids
    #' IDs of observations to be used for predictions. If no `row_ids` are provided,
    #' predictions are made for the entire dataset.
    predict = function(data = NULL, row_ids = NULL) {
      if (is.null(data)) {
        # default to the training task when no new data is supplied
        return(self$learner$predict(self$task, row_ids))
      } else {
        return(self$learner$predict(data, row_ids))
      }
    },
    #' @description
    #' Convenience function for resampling with an AutoML Object. Performs nested
    #' resampling with `$resampling` as inner resampling and `rsmp("holdout")`
    #' as outer resampling.
    resample = function() {
      outer_resampling = rsmp("holdout")
      # explicit mlr3:: prefix: this method shadows mlr3::resample
      resample_result = mlr3::resample(self$task, self$learner, outer_resampling)
      # keep the trained AutoTuner from the (single) outer fold so that
      # predict() / tuned_params() work afterwards
      self$learner = resample_result$learners[[1]]
      if (length(self$learner$learner$errors) > 0) {
        warning("An error occurred during training. Fallback learner was used!")
      }
      return(resample_result)
    },
    #' @description
    #' Convenience function for trained AutoML objects. Extracts the best
    #' performing hyperparameters.
    tuned_params = function() {
      if (is.null(self$learner$state)) {
        warning("Model has not been trained. Run the $train() method first.")
      } else {
        return(self$learner$tuning_instance$archive$best())
      }
    }
  ),
  private = list(
    # Builds the tunable pipeline: subsampling -> (branched) robustified
    # learners, wrapped in a GraphLearner inside an AutoTuner.
    .get_default_learner = function() {
      learners = list()
      for (learner in self$learner_list) {
        learners = append(learners, private$.create_robust_learner(learner))
      }
      names(learners) = self$learner_list
      if (length(self$learner_list) > 1) {
        # multiple candidate learners: tune over a branch pipeop
        pipeline = po("subsample") %>>% ppl("branch", graphs = learners)
      } else {
        pipeline = po("subsample") %>>% learners[[1]]
      }
      graph_learner = GraphLearner$new(pipeline)
      # only pay for callr subprocess encapsulation when a finite timeout was
      # actually requested (`&&`: with `||` this branch would always run,
      # because learner_timeout is never NULL after initialize)
      if (!is.null(self$learner_timeout) && !is.infinite(self$learner_timeout)) {
        # fallback learner is featureless learner for classification / regression
        graph_learner$fallback = lrn(paste0(self$task$task_type, ".featureless"))
        # use callr encapsulation so we are able to kill model training, if it
        # takes too long
        graph_learner$encapsulate = c(train = "callr", predict = "callr")
        graph_learner$timeout = c(train = self$learner_timeout, predict = self$learner_timeout)
      }
      if (any(grepl("ranger", self$learner_list))) {
        # ranger's mtry upper bound depends on the post-preprocessing
        # feature count, so compute it up front
        num_effective_vars = private$.compute_num_effective_vars()
      } else {
        num_effective_vars = NULL
      }
      param_set = default_params(self$learner_list, self$task$task_type, num_effective_vars)
      return(AutoTuner$new(
        learner = graph_learner,
        resampling = self$resampling,
        measure = self$measure,
        search_space = param_set,
        terminator = self$tuning_terminator,
        tuner = self$tuner))
    },
    # Wraps a single learner in a robustified preprocessing pipeline
    # (imputation, factor encoding, type conversion) with unique pipeop ids.
    .create_robust_learner = function(learner_name) {
      # temporary workaround, see https://github.com/mlr-org/mlr3pipelines/issues/519
      pipeline = po("nop")
      # robustify_pipeline takes care of imputation, factor encoding etc.
      # we always need imputation, because earlier preprocessing pipeops may introduce missing values
      pipeline = pipeline %>>%
        pipeline_robustify(task = self$task, learner = lrn(learner_name),
          impute_missings = TRUE)
      # liblinear only works with columns of type double. Convert ints / bools -> dbl
      if (grepl("liblinear", learner_name)) {
        pipeline = pipeline %>>%
          po("colapply", applicator = as.numeric,
            param_vals = list(affect_columns = selector_type(c("logical", "integer"))))
      }
      # avoid name conflicts in pipeline
      pipeline$update_ids(prefix = paste0(learner_name, "."))
      # liblinear learners offer logistic/linear regression as well as SVMs.
      # SVMs do not offer probability predictions and can not be tuned for AUC,
      # thus only logistic regression is used for now (branching over both
      # variants kept below for reference):
      # if (grepl('liblinear', learner_name) && self$task$task_type == "classif") {
      #   liblinear_learners = list(
      #     po("learner", lrn(learner_name, predict_type = "prob"), id = paste(learner_name, "logreg", sep = ".")),
      #     po("learner", lrn(learner_name, predict_type = "response"), id = paste(learner_name, "svm", sep = ".")))
      #   choices = c("classif.liblinear.logreg", "classif.liblinear.svm")
      #   return(
      #     pipeline %>>%
      #       po("branch", choices, id = "classif.liblinear.branch") %>>%
      #       gunion(graphs = liblinear_learners) %>>%
      #       po("unbranch", choices, id = "classif.liblinear.unbranch"))
      # }
      # predict probabilities for classification if possible
      if (self$task$task_type == "classif" && ("prob" %in% lrn(learner_name)$predict_types)) {
        return(pipeline %>>% po("learner", lrn(learner_name, predict_type = "prob")))
      }
      # default: predict with type response
      return(pipeline %>>% po("learner", lrn(learner_name)))
    },
    # Trains a throwaway featureless pipeline to count how many features
    # survive preprocessing (needed to bound ranger's mtry parameter).
    .compute_num_effective_vars = function() {
      rf_learner = lrn(paste0(self$task$task_type, ".ranger"))
      pipeline =
        po("nop") %>>%
        pipeline_robustify(task = self$task, learner = rf_learner, impute_missings = TRUE) %>>%
        lrn(paste0(self$task$task_type, ".featureless"))
      pipeline$train(self$task)
      # get number of variables after preprocessing (-1 for the target column)
      last_pipeop = paste0(self$task$task_type, ".featureless")
      num_effective_vars = get(last_pipeop, pipeline$state)$train_task$ncol - 1
      return(num_effective_vars)
    }
  )
)