algorithms/reg_tree_imp.R

# Regression Tree ---------------------------------------------------------

# This is the splitting criterion we minimize (SSE [Sum Of Squared Errors]):
# $SSE = \sum_{i \in S_1} (y_i - \bar(y)1)^2 + \sum_{i \in S_2} (y_i - \bar(y)2)^2$
sse_var <- function(x, y) {
  splits <- sort(unique(x))
  sse <- c()
  for (i in seq_along(splits)) {
    sp <- splits[i]
    sse[i] <- sum((y[x < sp] - mean(y[x < sp]))^2) + sum((y[x >= sp] - mean(y[x >= sp]))^2) 
  }
  split_at <- splits[which.min(sse)]
  return(c(sse = min(sse), split = split_at))
}


#' reg_tree
#' Fits a simple regression tree with SSE splitting criterion. The estimator function
#' is the mean.
#' 
#' @param formula an object of class formula
#' @param data a data.frame or matrix
#' @param minsize a numeric value indicating the minimum size of observations
#'                in a leaf
#'
#' @return \itemize{
#' \item tree - the tree object containing all splitting rules and observations
#' \item imp - returns the feature importance
#' \item fit - our fitted values, i.e. X %*% theta
#' \item formula - the underlying formula
#' \item data - the underlying data
#' }
#' @export
#'
#' @examples # Complete runthrough see: www.github.com/andrebleier/cheapml
reg_tree_imp <- function(formula, data, minsize) {
  
  # coerce to data.frame
  data <- as.data.frame(data)
  
  # handle formula
  formula <- terms.formula(formula)
  
  # get the design matrix
  X <- model.matrix(formula, data)
  
  # extract target
  y <- data[, as.character(formula)[2]]
  
  # initialize while loop
  do_splits <- TRUE
  
  # create output data.frame with splitting rules and observations
  tree_info <- data.frame(NODE = 1, NOBS = nrow(data), FILTER = NA, TERMINAL = "SPLIT",
                          IMP_GINI = NA, SPLIT = NA, stringsAsFactors = FALSE)
  
  # keep splitting until there are only leafs left
  while(do_splits) {
    
    # which parents have to be splitted
    to_calculate <- which(tree_info$TERMINAL == "SPLIT")
    
    for (j in to_calculate) {
      
      # handle root node
      if (!is.na(tree_info[j, "FILTER"])) {
        # subset data according to the filter
        this_data <- subset(data, eval(parse(text = tree_info[j, "FILTER"])))
        # get the design matrix
        X <- model.matrix(formula, this_data)
      } else {
        this_data <- data
      }
      
      # estimate splitting criteria
      splitting <- apply(X,  MARGIN = 2, FUN = sse_var, y = this_data[, all.vars(formula)[1]])
      
      # get the min SSE
      tmp_splitter <- which.min(splitting[1,])
      
      # define maxnode
      mn <- max(tree_info$NODE)
      
      # paste filter rules
      current_filter <- c(paste(names(tmp_splitter), ">=", 
                            splitting[2,tmp_splitter]),
                      paste(names(tmp_splitter), "<", 
                            splitting[2,tmp_splitter]))
      
      # Error handling! check if the splitting rule has already been invoked
      split_here  <- !sapply(current_filter,
                             FUN = function(x,y) any(grepl(x, x = y)),
                             y = tree_info$FILTER)
      
      # append the splitting rules
      if (!is.na(tree_info[j, "FILTER"])) {
        current_filter  <- paste(tree_info[j, "FILTER"], 
                             current_filter, sep = " & ")
      } 
      
      # calculate metrics within the children
      metr <- lapply(current_filter,
                         FUN = function(i, x, data, formula) {
                          df <- subset(x = x, subset = eval(parse(text = i)))
                          nobs <- nrow(df)
                          w <- nobs/nrow(data)
                          y <- df[, all.vars(formula)[1]]
                          imp <- mean((y - mean(y, na.rm = TRUE))^2)
                          return(c(nobs, w*imp))
                         },
                         x = this_data, data = data, formula = formula)  
      
      # extract relevant information
      current_nobs <- sapply(metr, function(x) x[[1]])
      imp_sum_child <- sum(sapply(metr, function(x) x[[2]]))
      current_y <- this_data[, all.vars(formula)[1]]
      imp_parent <- nrow(this_data)/nrow(data) * mean((current_y-mean(current_y))^2)
      imp_gini <- imp_parent - imp_sum_child
      
      # insufficient minsize for split
      if (any(current_nobs <= minsize)) {
        split_here <- rep(FALSE, 2)
      }
      
      # create children data frame
      children <- data.frame(NODE = c(mn+1, mn+2),
                             NOBS = current_nobs,
                             FILTER = current_filter,
                             TERMINAL = rep("SPLIT", 2),
                             IMP_GINI = NA,
                             SPLIT = NA,
                             row.names = NULL)[split_here,]
      
      # overwrite state of current node, add gini importance and split variable
      tree_info[j, "TERMINAL"] <- ifelse(all(!split_here), "LEAF", "PARENT")
      tree_info[j, "IMP_GINI"] <- imp_gini
      if (tree_info[j, "TERMINAL"] == "PARENT") {
      tree_info[j, "SPLIT"] <- names(tmp_splitter)
      }
      
      # bind everything
      tree_info <- rbind(tree_info, children)
      
      # check if there are any open splits left
      do_splits <- !all(tree_info$TERMINAL != "SPLIT")
    } # end for
  } # end while
  
  # calculate fitted values
  leafs <- tree_info[tree_info$TERMINAL == "LEAF", ]
  fitted <- c()
  for (i in seq_len(nrow(leafs))) {
    # extract index
    ind <- as.numeric(rownames(subset(data, eval(parse(text = leafs[i, "FILTER"])))))
    # estimator is the mean y value of the leaf
    fitted[ind] <- mean(y[ind])
  }
  
  # calculate feature importance
  imp <- tree_info[, c("SPLIT", "IMP_GINI")]
  
  if (!all(is.na(imp$SPLIT))) {
  imp <- aggregate(IMP_GINI ~ SPLIT, FUN = function(x, all) sum(x, na.rm = T)/sum(all, na.rm = T), 
                    data = imp, all = imp$IMP_GINI)
  }
  
  # rename to importance
  names(imp) <- c("FEATURES", "IMPORTANCE")
  imp <- imp[order(imp$IMPORTANCE, decreasing = TRUE),]
  
  # return everything
  return(list(tree = tree_info, fit = fitted, formula = formula, 
              importance = imp, data = data))
}