/
tune-ranger-models.Rmd
271 lines (209 loc) · 8.68 KB
/
tune-ranger-models.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
---
title: "Tune Corn Yield Random Forest: Biophysical & Farm(er)"
author: "Britta Schumacher"
date: "Last updated: `r format(Sys.time(), '%d %B, %Y')`"
output:
html_document:
theme: yeti
toc: yes
toc_float: true
code_folding: hide
---
# Bring in the data, clean it up, and run the model on the training set
```{r packages and data, message = FALSE}
# Modeling + utility libraries ----
library(randomForest) # basic RF implementation
library(ranger)       # a faster implementation of randomForest
library(caret)        # aggregator package for many ML models
library(h2o)          # an extremely fast java-based platform
library(tidyverse)    # data tidying & munging
library(ggplot2)      # data visualization
library(MLmetrics)    # mean absolute percent error / mean-absolute error
#library(ie2misc)

# TRUE train data: every observation with a recorded yield
corn_yield <- readRDS("./data/data-out/true-train.RDS")

# Drop the ID variable not used by the model, and coerce the
# qualitative predictors (YEAR, FRR) to factors in one pipeline
yield_RF <- corn_yield %>%
  dplyr::select(-GEOID) %>%
  dplyr::mutate(YEAR = as.factor(YEAR),
                FRR = as.factor(FRR))
```
# Build training and test sets for: all data, pruned biophysical data, and pruned farm(er) data
```{r train and test}
# Create training (75%) and test (25%) sets for our corn yield data.
# Use set.seed for reproducibility (sampling lines unchanged, so the
# split is identical to previous runs)
set.seed(1234)
random_rn <- sample(nrow(yield_RF), ceiling(nrow(yield_RF)*.25))
train_fullset <- yield_RF[-random_rn,]
test_fullset <- yield_RF[random_rn,]

# Shared biophysical predictor set (plus YEAR and the response YIELD),
# defined ONCE so the train and test subsets cannot drift apart
bio_vars <- c("YEAR","YIELD","PERC_IRR","FRR","GDD","BV2","BV18","SDI_CDL_AG",
              "BV9","BV4","TP","ELEVATION","S_PH_H2O","SLOPE","BV8","BV19",
              "T_CEC_SOIL","BV15","T_REF_BULK_DENSITY","T_OC")

# Farm(er) set = biophysical set + management/operator variables
farmer_vars <- c(bio_vars,
                 "perc_corn","chem","fert","labor_expense","machinery",
                 "insur_acres","gvt_prog","exp","occup","tenant","acres_per_op")

# subset for only biophysical variables
train <- train_fullset %>% select(all_of(bio_vars))
test <- test_fullset %>% select(all_of(bio_vars))
saveRDS(train, "./data/data-out/for-RF-analysis/bio_train.RDS")
saveRDS(test, "./data/data-out/for-RF-analysis/bio_test.RDS")

# subset for biophysical AND farm(er) variables
farmer_train <- train_fullset %>% select(all_of(farmer_vars))
farmer_test <- test_fullset %>% select(all_of(farmer_vars))
saveRDS(farmer_train, "./data/data-out/for-RF-analysis/farmer_train.RDS")
saveRDS(farmer_test, "./data/data-out/for-RF-analysis/farmer_test.RDS")
```
### RF is one of the best "out-of-the-box" ML algorithms
It typically performs remarkably well with very little tuning. There are only a few tuning parameters, all of which should be considered and adjusted for accuracy:
- `ntree` : the number of trees; we want enough trees to stabilize our error, but don't want so many trees that our RF is unnecessarily inefficient.
- `mtry` : the number of variables to randomly sample as candidates at each split; in RF regression, the default is `mtry` = *p* / 3.
- `sampsize` : the number of samples to train on; the default value is 63.25% of the training set (this is the expected value of unique observations in each bootstrap sample). Increasing sample size can increase performance, but also lead to overfitting; when tuned this value typically lies within the 60-80% range.
- `nodesize` : minimum number of samples within the terminal nodes, which controls the complexity of the trees. The deeper the trees, the higher the risk of overfitting; the shallower the trees, the higher the risk of introducing bias and not fully capturing unique patterns in the data.
- `maxnodes` : maximum number of terminal nodes. More nodes build deeper, more complex trees.
# Tune Biophysical RF, using ranger() implementation
```{r}
# hyperparameter grid search: every combination of mtry, minimum node
# size, and sampling fraction is fit once and scored by OOB RMSE.
# NOTE(review): "sampe_size" is a typo for "sample_size"; kept as-is
# because the column name lands in the saved RDS and may be read
# downstream — confirm before renaming.
hyper_grid <- expand.grid(
  mtry = seq(1, 15, by = 2),
  node_size = seq(4, 20, by = 2),
  sampe_size = c(.55, .632, .70, .80),
  OOB_RMSE = 0  # placeholder column, filled in by the loop below
)

# total number of combinations
nrow(hyper_grid)

for (i in seq_len(nrow(hyper_grid))) {  # seq_len() is safe if grid is empty
  # train model on the biophysical training set with this combination;
  # a fixed seed makes each fit reproducible and combinations comparable
  model <- ranger(
    formula = YIELD ~ .,
    data = train,
    num.trees = 2000,
    mtry = hyper_grid$mtry[i],
    min.node.size = hyper_grid$node_size[i],
    sample.fraction = hyper_grid$sampe_size[i],
    seed = 123
  )
  # record out-of-bag RMSE for this combination
  hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error)
}

saveRDS(hyper_grid, "./data/data-out/tune-bioph-ranger.RDS")
```
* From the previous run — @Brennan, use the parameters that minimize the OOB_RMSE:
ntree = 2000
mtry = 7
node size = 4
sample size = 0.800
OOB RMSE = 17.54877
# Biophysical RF variable importance with various mtry levels, using the ranger() implementation
* @Brennan, we take these results and publish them in SI Table 3
```{r}
# Fit a biophysical RF at a given mtry, report test-set skill, and save
# the permutation variable importance to disk (for SI Table 3).
#
# Args:
#   mtry:       number of candidate variables sampled at each split
#   train_data: training set (defaults to the global `train`)
#   test_data:  held-out set  (defaults to the global `test`)
#   out_prefix: label used in the saved importance filename; the default
#               reproduces the original "mtry_bioRF_<mtry>.RDS" paths
# Returns: list with test RMSE, test R2, and the fitted ranger object
rf_mtry <- function(mtry, train_data = train, test_data = test,
                    out_prefix = "bioRF") {
  # build RF on training data; min.node.size / sample.fraction come from
  # the tuning run above; permutation importance is computed during the fit
  set.seed(1234)
  rf <- ranger(formula = YIELD ~ .,
               data = train_data,
               num.trees = 2000,
               mtry = mtry,
               min.node.size = 4,
               sample.fraction = 0.8,
               importance = "permutation")

  # predict on held-out data and bind predictions to the test frame
  preds <- as.data.frame(predict(rf, data = test_data)$predictions)
  colnames(preds) <- "PREDS"
  scored <- cbind(test_data, preds)

  # performance on test set: RMSE and R2 (1 - SSE/SST)
  RMSE <- caret::RMSE(scored$PREDS, scored$YIELD)
  R2 <- 1 - (sum((scored$YIELD - scored$PREDS)^2) /
               sum((scored$YIELD - mean(scored$YIELD))^2))

  # bundle results; printed for the knitted report, returned to the caller
  out <- list("RMSE" = RMSE, "R2" = R2, "var_exp" = rf)
  print(out)

  # permutation variable importance, one row per predictor
  imp <- as.data.frame(rf$variable.importance)
  imp$Variable <- row.names(imp)
  colnames(imp) <- c("VI", "VAR")

  # persist importance for the SI tables
  saveRDS(imp, file = paste0('./results/ranger-imp/mtry_', out_prefix, '_', mtry, '.RDS'))
  return(out)
}
# Fit/evaluate at each odd mtry value covered by the tuning grid above;
# each call prints RMSE/R2 and writes its importance table to
# ./results/ranger-imp/ (mtry = 7 minimized OOB RMSE in the tuning run)
rf_mtry_1 <- rf_mtry(1)
rf_mtry_3 <- rf_mtry(3)
rf_mtry_5 <- rf_mtry(5)
rf_mtry_7 <- rf_mtry(7)
rf_mtry_9 <- rf_mtry(9)
rf_mtry_11 <- rf_mtry(11)
rf_mtry_13 <- rf_mtry(13)
rf_mtry_15 <- rf_mtry(15)
```
# Tune Farm(er) RF, using ranger() implementation
```{r}
# hyperparameter grid search: every combination of mtry, minimum node
# size, and sampling fraction is fit once and scored by OOB RMSE.
# NOTE(review): "sampe_size" is a typo for "sample_size"; kept as-is
# because the column name lands in the saved RDS and may be read
# downstream — confirm before renaming.
hyper_grid <- expand.grid(
  mtry = seq(5, 20, by = 2),
  node_size = seq(4, 20, by = 2),
  sampe_size = c(.55, .632, .70, .80),
  OOB_RMSE = 0  # placeholder column, filled in by the loop below
)

# total number of combinations
nrow(hyper_grid)

for (i in seq_len(nrow(hyper_grid))) {  # seq_len() is safe if grid is empty
  # train model on the farm(er) training set with this combination;
  # a fixed seed makes each fit reproducible and combinations comparable
  model <- ranger(
    formula = YIELD ~ .,
    data = farmer_train,
    num.trees = 2000,
    mtry = hyper_grid$mtry[i],
    min.node.size = hyper_grid$node_size[i],
    sample.fraction = hyper_grid$sampe_size[i],
    seed = 123
  )
  # record out-of-bag RMSE for this combination
  hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error)
}

saveRDS(hyper_grid, "./data/data-out/tune-farmer-ranger.RDS")
```
* From the previous run — @Brennan, use the parameters that minimize the OOB_RMSE:
ntree = 2000
mtry = 19
node size = 4
sample size = 0.800
OOB RMSE = 17.53468
# Farm(er) RF variable importance with various mtry levels, using the ranger() implementation
* @Brennan, we take these results and publish them in SI Table 4
```{r}
# Fit a farm(er) RF at a given mtry, report test-set skill, and save
# the permutation variable importance to disk (for SI Table 4).
# NOTE: redefines rf_mtry from the biophysical section — intentional in
# this sequentially-knit document.
#
# Args:
#   mtry:       number of candidate variables sampled at each split
#   train_data: training set (defaults to the global `farmer_train`)
#   test_data:  held-out set  (defaults to the global `farmer_test`)
#   out_prefix: label used in the saved importance filename; the default
#               reproduces the original "mtry_farmRF_<mtry>.RDS" paths
# Returns: list with test RMSE, test R2, and the fitted ranger object
rf_mtry <- function(mtry, train_data = farmer_train, test_data = farmer_test,
                    out_prefix = "farmRF") {
  # build RF on training data; min.node.size / sample.fraction come from
  # the tuning run above; permutation importance is computed during the fit
  set.seed(1234)
  rf <- ranger(formula = YIELD ~ .,
               data = train_data,
               num.trees = 2000,
               mtry = mtry,
               min.node.size = 4,
               sample.fraction = 0.8,
               importance = "permutation")

  # predict on held-out data and bind predictions to the test frame
  preds <- as.data.frame(predict(rf, data = test_data)$predictions)
  colnames(preds) <- "PREDS"
  scored <- cbind(test_data, preds)

  # performance on test set: RMSE and R2 (1 - SSE/SST)
  RMSE <- caret::RMSE(scored$PREDS, scored$YIELD)
  R2 <- 1 - (sum((scored$YIELD - scored$PREDS)^2) /
               sum((scored$YIELD - mean(scored$YIELD))^2))

  # bundle results; printed for the knitted report, returned to the caller
  out <- list("RMSE" = RMSE, "R2" = R2, "var_exp" = rf)
  print(out)

  # permutation variable importance, one row per predictor
  imp <- as.data.frame(rf$variable.importance)
  imp$Variable <- row.names(imp)
  colnames(imp) <- c("VI", "VAR")

  # persist importance for the SI tables
  saveRDS(imp, file = paste0('./results/ranger-imp/mtry_', out_prefix, '_', mtry, '.RDS'))
  return(out)
}
# Fit/evaluate at each odd mtry value covered by the tuning grid above;
# each call prints RMSE/R2 and writes its importance table to
# ./results/ranger-imp/ (mtry = 19 minimized OOB RMSE in the tuning run)
rf_mtry_5 <- rf_mtry(5)
rf_mtry_7 <- rf_mtry(7)
rf_mtry_9 <- rf_mtry(9)
rf_mtry_11 <- rf_mtry(11)
rf_mtry_13 <- rf_mtry(13)
rf_mtry_15 <- rf_mtry(15)
rf_mtry_17 <- rf_mtry(17)
rf_mtry_19 <- rf_mtry(19)
```