/
tune-ranger-models.Rmd
271 lines (209 loc) · 8.68 KB
/
tune-ranger-models.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
---
title: "Tune Corn Yield Random Forest: Biophysical & Farm(er)"
author: "Britta Schumacher"
date: "Last updated: `r format(Sys.time(), '%d %B, %Y')`"
output:
html_document:
theme: yeti
toc: yes
toc_float: true
code_folding: hide
---
# Bring in the data, clean it up, and run the model on the training set
```{r packages and data, message = FALSE}
# Modeling + utility libraries ----
library(randomForest) # basic RF implementation
library(ranger)       # a faster implementation of randomForest
library(caret)        # aggregator package for many ML models
library(h2o)          # an extremely fast java-based platform
library(tidyverse)    # data tidying & munging
library(ggplot2)      # data visualization
library(MLmetrics)    # mean absolute percent error / mean-absolute error
#library(ie2misc)

# TRUE train data: every observation with a recorded yield
corn_yield <- readRDS("./data/data-out/true-train.RDS")

# Drop the ID variable not used by the model, and coerce the
# qualitative predictors (YEAR, FRR) to factors in one pipeline
yield_RF <- corn_yield %>%
  dplyr::select(-GEOID) %>%
  dplyr::mutate(YEAR = as.factor(YEAR),
                FRR = as.factor(FRR))
```
# Build training and test sets for: all data, pruned biophysical data, and pruned farm(er) data
```{r train and test}
# Create training (75%) and test (25%) sets for our corn yield data.
# Use set.seed for reproducibility (sampling lines unchanged, so the
# split is identical to previous runs)
set.seed(1234)
random_rn <- sample(nrow(yield_RF), ceiling(nrow(yield_RF)*.25))
train_fullset <- yield_RF[-random_rn,]
test_fullset <- yield_RF[random_rn,]

# Shared biophysical predictor set (plus YEAR and the response YIELD),
# defined ONCE so the train and test subsets cannot drift apart
bio_vars <- c("YEAR","YIELD","PERC_IRR","FRR","GDD","BV2","BV18","SDI_CDL_AG",
              "BV9","BV4","TP","ELEVATION","S_PH_H2O","SLOPE","BV8","BV19",
              "T_CEC_SOIL","BV15","T_REF_BULK_DENSITY","T_OC")

# Farm(er) set = biophysical set + management/operator variables
farmer_vars <- c(bio_vars,
                 "perc_corn","chem","fert","labor_expense","machinery",
                 "insur_acres","gvt_prog","exp","occup","tenant","acres_per_op")

# subset for only biophysical variables
train <- train_fullset %>% select(all_of(bio_vars))
test <- test_fullset %>% select(all_of(bio_vars))
saveRDS(train, "./data/data-out/for-RF-analysis/bio_train.RDS")
saveRDS(test, "./data/data-out/for-RF-analysis/bio_test.RDS")

# subset for biophysical AND farm(er) variables
farmer_train <- train_fullset %>% select(all_of(farmer_vars))
farmer_test <- test_fullset %>% select(all_of(farmer_vars))
saveRDS(farmer_train, "./data/data-out/for-RF-analysis/farmer_train.RDS")
saveRDS(farmer_test, "./data/data-out/for-RF-analysis/farmer_test.RDS")
```
### RF is one of the best "out-of-the-box" ML algorithms
It typically performs remarkably well with very little tuning. There are only a few tuning parameters, all of which should be considered and adjusted for accuracy:
- `ntree` : the number of trees; we want enough trees to stabilize our error, but don't want so many trees that our RF is unnecessarily inefficient.
- `mtry` : the number of variables to randomly sample as candidates at each split; in RF regression, the default is `mtry` = *p* / 3.
- `sampsize` : the number of samples to train on; the default value is 63.25% of the training set (this is the expected value of unique observations in each bootstrap sample). Increasing sample size can increase performance, but also lead to overfitting; when tuned this value typically lies within the 60-80% range.
- `nodesize` : minimum number of samples within the terminal nodes, which controls the complexity of the trees. The deeper the trees, the higher the risk of overfitting; the shallower the trees, the higher the risk of introducing bias and not fully capturing unique patterns in the data.
- `maxnodes` : maximum number of terminal nodes. More nodes build deeper, more complex trees.
# Tune Biophysical RF, using ranger() implementation
```{r}
# hyperparameter grid search: every combination of mtry, minimum node
# size, and sampling fraction is fit once and scored by OOB RMSE.
# NOTE(review): "sampe_size" is a typo for "sample_size"; kept as-is
# because the column name lands in the saved RDS and may be read
# downstream — confirm before renaming.
hyper_grid <- expand.grid(
  mtry = seq(1, 15, by = 2),
  node_size = seq(4, 20, by = 2),
  sampe_size = c(.55, .632, .70, .80),
  OOB_RMSE = 0  # placeholder column, filled in by the loop below
)

# total number of combinations
nrow(hyper_grid)

for (i in seq_len(nrow(hyper_grid))) {  # seq_len() is safe if grid is empty
  # train model on the biophysical training set with this combination;
  # a fixed seed makes each fit reproducible and combinations comparable
  model <- ranger(
    formula = YIELD ~ .,
    data = train,
    num.trees = 2000,
    mtry = hyper_grid$mtry[i],
    min.node.size = hyper_grid$node_size[i],
    sample.fraction = hyper_grid$sampe_size[i],
    seed = 123
  )
  # record out-of-bag RMSE for this combination
  hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error)
}

saveRDS(hyper_grid, "./data/data-out/tune-bioph-ranger.RDS")
```
* From the previous run — @Brennan, use the parameters that minimize the OOB_RMSE:
ntree = 2000
mtry = 7
node size = 4
sample size = 0.800
OOB RMSE = 17.54877
# Biophysical RF variable importance with various mtry levels, using the ranger() implementation
* @Brennan, we take these results and publish them in SI Table 3
```{r}
# Fit a biophysical RF at a given mtry, report test-set skill, and save
# the permutation variable importance to disk (for SI Table 3).
#
# Args:
#   mtry:       number of candidate variables sampled at each split
#   train_data: training set (defaults to the global `train`)
#   test_data:  held-out set  (defaults to the global `test`)
#   out_prefix: label used in the saved importance filename; the default
#               reproduces the original "mtry_bioRF_<mtry>.RDS" paths
# Returns: list with test RMSE, test R2, and the fitted ranger object
rf_mtry <- function(mtry, train_data = train, test_data = test,
                    out_prefix = "bioRF") {
  # build RF on training data; min.node.size / sample.fraction come from
  # the tuning run above; permutation importance is computed during the fit
  set.seed(1234)
  rf <- ranger(formula = YIELD ~ .,
               data = train_data,
               num.trees = 2000,
               mtry = mtry,
               min.node.size = 4,
               sample.fraction = 0.8,
               importance = "permutation")

  # predict on held-out data and bind predictions to the test frame
  preds <- as.data.frame(predict(rf, data = test_data)$predictions)
  colnames(preds) <- "PREDS"
  scored <- cbind(test_data, preds)

  # performance on test set: RMSE and R2 (1 - SSE/SST)
  RMSE <- caret::RMSE(scored$PREDS, scored$YIELD)
  R2 <- 1 - (sum((scored$YIELD - scored$PREDS)^2) /
               sum((scored$YIELD - mean(scored$YIELD))^2))

  # bundle results; printed for the knitted report, returned to the caller
  out <- list("RMSE" = RMSE, "R2" = R2, "var_exp" = rf)
  print(out)

  # permutation variable importance, one row per predictor
  imp <- as.data.frame(rf$variable.importance)
  imp$Variable <- row.names(imp)
  colnames(imp) <- c("VI", "VAR")

  # persist importance for the SI tables
  saveRDS(imp, file = paste0('./results/ranger-imp/mtry_', out_prefix, '_', mtry, '.RDS'))
  return(out)
}
# Fit/evaluate at each odd mtry value covered by the tuning grid above;
# each call prints RMSE/R2 and writes its importance table to
# ./results/ranger-imp/ (mtry = 7 minimized OOB RMSE in the tuning run)
rf_mtry_1 <- rf_mtry(1)
rf_mtry_3 <- rf_mtry(3)
rf_mtry_5 <- rf_mtry(5)
rf_mtry_7 <- rf_mtry(7)
rf_mtry_9 <- rf_mtry(9)
rf_mtry_11 <- rf_mtry(11)
rf_mtry_13 <- rf_mtry(13)
rf_mtry_15 <- rf_mtry(15)
```
# Tune Farm(er) RF, using ranger() implementation
```{r}
# hyperparameter grid search: every combination of mtry, minimum node
# size, and sampling fraction is fit once and scored by OOB RMSE.
# NOTE(review): "sampe_size" is a typo for "sample_size"; kept as-is
# because the column name lands in the saved RDS and may be read
# downstream — confirm before renaming.
hyper_grid <- expand.grid(
  mtry = seq(5, 20, by = 2),
  node_size = seq(4, 20, by = 2),
  sampe_size = c(.55, .632, .70, .80),
  OOB_RMSE = 0  # placeholder column, filled in by the loop below
)

# total number of combinations
nrow(hyper_grid)

for (i in seq_len(nrow(hyper_grid))) {  # seq_len() is safe if grid is empty
  # train model on the farm(er) training set with this combination;
  # a fixed seed makes each fit reproducible and combinations comparable
  model <- ranger(
    formula = YIELD ~ .,
    data = farmer_train,
    num.trees = 2000,
    mtry = hyper_grid$mtry[i],
    min.node.size = hyper_grid$node_size[i],
    sample.fraction = hyper_grid$sampe_size[i],
    seed = 123
  )
  # record out-of-bag RMSE for this combination
  hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error)
}

saveRDS(hyper_grid, "./data/data-out/tune-farmer-ranger.RDS")
```
* From the previous run — @Brennan, use the parameters that minimize the OOB_RMSE:
ntree = 2000
mtry = 19
node size = 4
sample size = 0.800
OOB RMSE = 17.53468
# Farm(er) RF variable importance with various mtry levels, using the ranger() implementation
* @Brennan, we take these results and publish them in SI Table 4
```{r}
# Fit a farm(er) RF at a given mtry, report test-set skill, and save
# the permutation variable importance to disk (for SI Table 4).
# NOTE: redefines rf_mtry from the biophysical section — intentional in
# this sequentially-knit document.
#
# Args:
#   mtry:       number of candidate variables sampled at each split
#   train_data: training set (defaults to the global `farmer_train`)
#   test_data:  held-out set  (defaults to the global `farmer_test`)
#   out_prefix: label used in the saved importance filename; the default
#               reproduces the original "mtry_farmRF_<mtry>.RDS" paths
# Returns: list with test RMSE, test R2, and the fitted ranger object
rf_mtry <- function(mtry, train_data = farmer_train, test_data = farmer_test,
                    out_prefix = "farmRF") {
  # build RF on training data; min.node.size / sample.fraction come from
  # the tuning run above; permutation importance is computed during the fit
  set.seed(1234)
  rf <- ranger(formula = YIELD ~ .,
               data = train_data,
               num.trees = 2000,
               mtry = mtry,
               min.node.size = 4,
               sample.fraction = 0.8,
               importance = "permutation")

  # predict on held-out data and bind predictions to the test frame
  preds <- as.data.frame(predict(rf, data = test_data)$predictions)
  colnames(preds) <- "PREDS"
  scored <- cbind(test_data, preds)

  # performance on test set: RMSE and R2 (1 - SSE/SST)
  RMSE <- caret::RMSE(scored$PREDS, scored$YIELD)
  R2 <- 1 - (sum((scored$YIELD - scored$PREDS)^2) /
               sum((scored$YIELD - mean(scored$YIELD))^2))

  # bundle results; printed for the knitted report, returned to the caller
  out <- list("RMSE" = RMSE, "R2" = R2, "var_exp" = rf)
  print(out)

  # permutation variable importance, one row per predictor
  imp <- as.data.frame(rf$variable.importance)
  imp$Variable <- row.names(imp)
  colnames(imp) <- c("VI", "VAR")

  # persist importance for the SI tables
  saveRDS(imp, file = paste0('./results/ranger-imp/mtry_', out_prefix, '_', mtry, '.RDS'))
  return(out)
}
# Fit/evaluate at each odd mtry value covered by the tuning grid above;
# each call prints RMSE/R2 and writes its importance table to
# ./results/ranger-imp/ (mtry = 19 minimized OOB RMSE in the tuning run)
rf_mtry_5 <- rf_mtry(5)
rf_mtry_7 <- rf_mtry(7)
rf_mtry_9 <- rf_mtry(9)
rf_mtry_11 <- rf_mtry(11)
rf_mtry_13 <- rf_mtry(13)
rf_mtry_15 <- rf_mtry(15)
rf_mtry_17 <- rf_mtry(17)
rf_mtry_19 <- rf_mtry(19)
```