Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Shapley Value option #52

Open
andykrause opened this issue Sep 16, 2019 · 1 comment
Open

Add Shapley Value option #52

andykrause opened this issue Sep 16, 2019 · 1 comment
Assignees

Comments

@andykrause
Copy link
Owner

andykrause commented Sep 16, 2019

Re-add the shapley value option. See forest branch prior to 84fc71d

@andykrause andykrause self-assigned this Sep 16, 2019
@andykrause
Copy link
Owner Author

'
#' Hedonic model approach with base estimator
#'
#' Use of base estimator in hedonic model approach
#'
#' @section Further Details:
#' See ?rfModel for more information
#' @inherit rfModel params
#' @method rfModel shap
#' @importFrom ranger ranger
#' @importFrom pdp partial
#' @export

rfModel.shap <- function(estimator,
rf_df,
rf_spec,
ntrees = 200,
seed = 1,
shap_k = 10,
...){

n <- 1

Estimate model

mod_df <- rf_df[, unique(c(list(...)$ind_var, 'trans_period', 'price'))]
mod_df$price <- log(mod_df$price)

regr.task = makeRegrTask(id = "aa", data = mod_df, target = "price")
regr.lrn = mlr::makeLearner("regr.ranger", par.vals = list(num.trees = ntrees))
rf_model = mlr::train(regr.lrn, regr.task)

shap_df <- mod_df %>%
dplyr::mutate(row_id = 1:nrow(.)) %>%
dplyr::group_by(trans_period) %>%
dplyr::slice(1:shap_k) %>%
dplyr::arrange(row_id)

shapvalue_df <- shapleyR::getShapleyValues(
shapley(shap_df$row_id,
task = regr.task,
model = rf_model)) %>%
dplyr::mutate(period = shap_df$trans_period) %>%
dplyr::group_by(period) %>%
dplyr::summarize(value = mean(trans_period)) %>%
dplyr::filter(period %in% rf_df$trans_period)

rf_model$coefficients <- data.frame(time = 1:max(rf_df$trans_period)) %>%
dplyr::left_join(shapvalue_df %>%
dplyr::select(time = period,
coefficient = value),
by = 'time') %>%
dplyr::mutate(coefficient = coefficient - coefficient[1])

a <- as.data.frame(cbind(X$trans_period[kk], x$trans_period))

# Add 'coefficients'

log_dep <- ifelse(grepl('log', rf_spec[2]), TRUE, FALSE)

if(log_dep){

coefs <- pdp_df$yhat - pdp_df$yhat[1]

} else {

coefs <- pdp_df$yhat / pdp_df$yhat[1]

}

rf_model$coefficients <- data.frame(time = 1:max(rf_df$trans_period),

coefficient = coefs)

Structure and return

structure(rf_model, class = c('rfmodel', class(rf_model)))

}

#'
#' Hedonic model approach with base estimator
#'
#' Use of base estimator in hedonic model approach
#'
#' @section Further Details:
#' See ?rfModel for more information
#' @inherit rfModel params
#' @method rfModel sim
#' @importFrom ranger ranger
#' @export

rfModel.sim <- function(estimator,
rf_df,
rf_spec,
ntrees = 200,
seed = 1,
...){

set.seed(seed)

Estimate model

rf_model <- ranger::ranger(rf_spec,
data = rf_df,
num.tree = ntrees,
seed = seed)

Add class

class(rf_model) <- c('rfmodel', class(rf_model))

log_dep <- ifelse(grepl('log', rf_spec[2]), TRUE, FALSE)

rfSimulate(rf_obj = rf_model,
rf_df = rf_df,
log_dep = log_dep,
...)
}

#'
#' Simulate selected properties
#'
#' Handle simulation of all chosen properties
#' '
#' @param rf_obj A ranger random forest object
#' @param rf_df Full data.frame used to build the random forest
#' @param sim_type ['random'] Sampling type to use
#' @param sim_per [0.1] Percentage of the total set to simulate
#' @param sim_count [NULL] If not giving a percentage, the total number of properties to simulate
#' @param seed [1] Seed for reproducibility
#' @param ... Additional arguments
#' @importFrom purrr map
#' @export

rfSimulate <- function(rf_obj,
rf_df,
sim_type = 'random',
sim_per = .1,
sim_count = NULL,
seed = 1,
...){

if no count of simulation is given

if (is.null(sim_count)) sim_count <- floor(sim_per * nrow(rf_df))

Get simulation observations

set.seed(seed)
sim_df <- rf_df[sample(1:nrow(rf_df), sim_count, replace = TRUE), ]

Calculate individual price movements

sim_coefs <- purrr::map(.x = sim_df %>% split(., sim_df$trans_id),
.f = rfSim,
rf_obj = rf_obj,
periods = 1:max(rf_df$trans_period),
...)

rf_obj$coefficients <- data.frame(time = 1:max(rf_df$trans_period),
coefficient = Reduce('+', sim_coefs) / length(sim_coefs) - 1)

rf_obj

}

#'
#' Simulation engine
#'
#' Helper function to simulate each example proeprty over the time period(s)
#' '
#' @param rf_obj A ranger random forest object
#' @param sim_df Single property to simulate over time
#' @param periods Time periods to simulate over
#' @param log_dep [fALSE] Is the dependent variables in log form?
#' @param ... Additional arguments
#' @importFrom dplyr mutate
#' @importFrom stats predict
#' @export

rfSim <- function(rf_obj,
sim_df,
periods,
log_dep = FALSE,
...){

new_data <- sim_df[rep(1,length(periods)), ] %>%
dplyr::mutate(trans_period = periods)

pred <- stats::predict(rf_obj, new_data)$prediction
if (log_dep) pred <- exp(pred)
pred / pred[1]

}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant