Add Shapley Value option #52

andykrause · 2019-09-16T04:06:40Z

Re-add the shapley value option. See forest branch prior to 84fc71d

The text was updated successfully, but these errors were encountered:

andykrause · 2019-09-18T14:21:49Z

'
#' Hedonic model approach with base estimator
#'
#' Use of base estimator in hedonic model approach
#'
#' @section Further Details:
#' See ?rfModel for more information
#' @inherit rfModel params
#' @method rfModel shap
#' @importFrom ranger ranger
#' @importFrom pdp partial
#' @export

rfModel.shap <- function(estimator,
rf_df,
rf_spec,
ntrees = 200,
seed = 1,
shap_k = 10,
...){

n <- 1

Estimate model

mod_df <- rf_df[, unique(c(list(...)$ind_var, 'trans_period', 'price'))]
mod_df$price <- log(mod_df$price)

regr.task = makeRegrTask(id = "aa", data = mod_df, target = "price")
regr.lrn = mlr::makeLearner("regr.ranger", par.vals = list(num.trees = ntrees))
rf_model = mlr::train(regr.lrn, regr.task)

shap_df <- mod_df %>%
dplyr::mutate(row_id = 1:nrow(.)) %>%
dplyr::group_by(trans_period) %>%
dplyr::slice(1:shap_k) %>%
dplyr::arrange(row_id)

shapvalue_df <- shapleyR::getShapleyValues(
shapley(shap_df$row_id,
task = regr.task,
model = rf_model)) %>%
dplyr::mutate(period = shap_df$trans_period) %>%
dplyr::group_by(period) %>%
dplyr::summarize(value = mean(trans_period)) %>%
dplyr::filter(period %in% rf_df$trans_period)

rf_model$coefficients <- data.frame(time = 1:max(rf_df$trans_period)) %>%
dplyr::left_join(shapvalue_df %>%
dplyr::select(time = period,
coefficient = value),
by = 'time') %>%
dplyr::mutate(coefficient = coefficient - coefficient[1])

a <- as.data.frame(cbind(X$trans_period[kk], x$trans_period))

# Add 'coefficients'

log_dep <- ifelse(grepl('log', rf_spec[2]), TRUE, FALSE)

if(log_dep){

coefs <- pdp_df$yhat - pdp_df$yhat[1]

} else {

coefs <- pdp_df$yhat / pdp_df$yhat[1]

}

rf_model$coefficients <- data.frame(time = 1:max(rf_df$trans_period),

coefficient = coefs)

Structure and return

structure(rf_model, class = c('rfmodel', class(rf_model)))

}

#'
#' Hedonic model approach with base estimator
#'
#' Use of base estimator in hedonic model approach
#'
#' @section Further Details:
#' See ?rfModel for more information
#' @inherit rfModel params
#' @method rfModel sim
#' @importFrom ranger ranger
#' @export

rfModel.sim <- function(estimator,
rf_df,
rf_spec,
ntrees = 200,
seed = 1,
...){

set.seed(seed)

Estimate model

rf_model <- ranger::ranger(rf_spec,
data = rf_df,
num.tree = ntrees,
seed = seed)

Add class

class(rf_model) <- c('rfmodel', class(rf_model))

log_dep <- ifelse(grepl('log', rf_spec[2]), TRUE, FALSE)

rfSimulate(rf_obj = rf_model,
rf_df = rf_df,
log_dep = log_dep,
...)
}

#'
#' Simulate selected properties
#'
#' Handle simulation of all chosen properties
#' '
#' @param rf_obj A ranger random forest object
#' @param rf_df Full data.frame used to build the random forest
#' @param sim_type ['random'] Sampling type to use
#' @param sim_per [0.1] Percentage of the total set to simulate
#' @param sim_count [NULL] If not giving a percentage, the total number of properties to simulate
#' @param seed [1] Seed for reproducibility
#' @param ... Additional arguments
#' @importFrom purrr map
#' @export

rfSimulate <- function(rf_obj,
rf_df,
sim_type = 'random',
sim_per = .1,
sim_count = NULL,
seed = 1,
...){

if no count of simulation is given

if (is.null(sim_count)) sim_count <- floor(sim_per * nrow(rf_df))

Get simulation observations

set.seed(seed)
sim_df <- rf_df[sample(1:nrow(rf_df), sim_count, replace = TRUE), ]

Calculate individual price movements

sim_coefs <- purrr::map(.x = sim_df %>% split(., sim_df$trans_id),
.f = rfSim,
rf_obj = rf_obj,
periods = 1:max(rf_df$trans_period),
...)

rf_obj$coefficients <- data.frame(time = 1:max(rf_df$trans_period),
coefficient = Reduce('+', sim_coefs) / length(sim_coefs) - 1)

rf_obj

}

#'
#' Simulation engine
#'
#' Helper function to simulate each example proeprty over the time period(s)
#' '
#' @param rf_obj A ranger random forest object
#' @param sim_df Single property to simulate over time
#' @param periods Time periods to simulate over
#' @param log_dep [fALSE] Is the dependent variables in log form?
#' @param ... Additional arguments
#' @importFrom dplyr mutate
#' @importFrom stats predict
#' @export

rfSim <- function(rf_obj,
sim_df,
periods,
log_dep = FALSE,
...){

new_data <- sim_df[rep(1,length(periods)), ] %>%
dplyr::mutate(trans_period = periods)

pred <- stats::predict(rf_obj, new_data)$prediction
if (log_dep) pred <- exp(pred)
pred / pred[1]

}

andykrause self-assigned this Sep 16, 2019

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add Shapley Value option #52

Add Shapley Value option #52

andykrause commented Sep 16, 2019 •

edited

Loading

andykrause commented Sep 18, 2019

Add Shapley Value option #52

Add Shapley Value option #52

Comments

andykrause commented Sep 16, 2019 • edited Loading

andykrause commented Sep 18, 2019

Estimate model

a <- as.data.frame(cbind(X$trans_period[kk], x$trans_period))

# Add 'coefficients'

log_dep <- ifelse(grepl('log', rf_spec[2]), TRUE, FALSE)

if(log_dep){

coefs <- pdp_df$yhat - pdp_df$yhat[1]

} else {

coefs <- pdp_df$yhat / pdp_df$yhat[1]

}

rf_model$coefficients <- data.frame(time = 1:max(rf_df$trans_period),

coefficient = coefs)

Structure and return

Estimate model

Add class

if no count of simulation is given

Get simulation observations

Calculate individual price movements

andykrause commented Sep 16, 2019 •

edited

Loading