diff --git a/.gitignore b/.gitignore index 2d20778..82a4aa5 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ public/ renv/ renv.lock +vignettes/*_files +vignettes/*.html diff --git a/DESCRIPTION b/DESCRIPTION index 4c5becd..6c558a1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,19 +22,25 @@ Imports: tibble, zip Suggests: + AmesHousing, covr, + data.table, + DiagrammeR, hardhat, - testthat (>= 2.1.0), + jsonlite, + kableExtra, knitr, - modeldata, - rmarkdown, lintr, + modeldata, pkgdown, + purrr, readr, - jsonlite, recipes, + rmarkdown, rsample, scales, + snakecase, + testthat (>= 2.1.0), tidyr, tune, workflows, diff --git a/README.Rmd b/README.Rmd index 39ca435..da9f54b 100644 --- a/README.Rmd +++ b/README.Rmd @@ -68,7 +68,9 @@ Here is a quick example using `lightsnip` with a Tidymodels cross-validation wor library(dplyr) library(lightgbm) library(lightsnip) -library(tidymodels) +library(parsnip) +library(recipes) +library(workflows) # Create a dataset for training mtcars_train <- mtcars %>% diff --git a/README.md b/README.md index f8e3d28..6315530 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,9 @@ cross-validation workflow: library(dplyr) library(lightgbm) library(lightsnip) -library(tidymodels) +library(parsnip) +library(recipes) +library(workflows) # Create a dataset for training mtcars_train <- mtcars %>% @@ -134,7 +136,7 @@ mtcars_test %>% | | actual_mpg | pred_mpg | |:---------------|-----------:|---------:| -| Ford Pantera L | 15.8 | 13.72 | -| Ferrari Dino | 19.7 | 21.29 | -| Maserati Bora | 15.0 | 13.65 | -| Volvo 142E | 21.4 | 22.04 | +| Ford Pantera L | 15.8 | 14.08 | +| Ferrari Dino | 19.7 | 21.20 | +| Maserati Bora | 15.0 | 13.72 | +| Volvo 142E | 21.4 | 22.12 | diff --git a/vignettes/finding-comps.rmd b/vignettes/finding-comps.rmd new file mode 100644 index 0000000..d93893a --- /dev/null +++ b/vignettes/finding-comps.rmd @@ -0,0 +1,787 @@ +--- +title: "Finding comparables with LightGBM" +format: html +--- + +```{css, echo=FALSE} +table { + display: table !important; +} +``` + +```{r opts, include=FALSE} +knitr::opts_chunk$set(echo = FALSE, out.width = "100%") +``` + +```{r setup, message=FALSE, warning=FALSE} +library(AmesHousing) +library(data.table) +library(DiagrammeR) +library(dplyr) +library(kableExtra) +library(lightgbm) +library(lightsnip) +library(parsnip) +library(purrr) +library(recipes) +library(scales) +library(snakecase) +library(tidyr) +library(workflows) + +# Function to plot LightGBM trees. 
Sourced originally from: +# https://github.com/microsoft/LightGBM/issues/1222#issuecomment-1134401696 +lgb.plot.tree <- function(model = NULL, tree = NULL, rules = NULL) { + dt <- lgb.model.dt.tree(model) + dt <- dt[tree_index == tree, ] + data.table::setnames( + dt, + old = c("tree_index", "split_feature", "threshold", "split_gain"), + new = c("Tree", "Feature", "Split", "Gain") + ) + dt[, Value := 0.0] + dt[, Value := leaf_value] + dt[is.na(Value), Value := internal_value] + dt[is.na(Gain), Gain := leaf_value] + dt[is.na(Feature), Feature := "Leaf"] + dt[, Node := split_index] + max_node <- max(dt[["Node"]], na.rm = TRUE) + dt[is.na(Node), Node := max_node + leaf_index + 1] + dt[, ID := paste(Tree, Node, sep = "-")] + dt[, parent := node_parent][is.na(parent), parent := leaf_parent] + dt[, Yes := dt$ID[match(dt$Node, dt$parent)]] + dt <- dt[nrow(dt):1, ] # nolint + dt[, No := dt$ID[match(dt$Node, dt$parent)]] + dt[default_left == TRUE, Missing := Yes] + dt[default_left == FALSE, Missing := Yes] + zero_present <- function(x) { + sapply(strsplit(as.character(x), "||", fixed = TRUE), function(el) { + any(el == "0") + }) + } + dt[zero_present(Split), Missing := Yes] + dt[Feature == "Leaf", label := paste0( + Feature, " ", leaf_index, + "\nValue: ", scales::dollar(Value, accuracy = 1) + )] + dt[Feature != "Leaf", label := paste0( + "Split ", split_index, "\n", Feature, + "\nValue: ", scales::dollar(Value, accuracy = 1) + )] + dt[Node == 0, label := paste0("Tree ", Tree, "\n", label)] + dt[, shape := "rectangle"][Feature == "Leaf", shape := "oval"] + dt[, filledcolor := "Beige"][Feature == "Leaf", filledcolor := "Khaki"] + + dt <- dt[order(-Tree)] + nodes <- DiagrammeR::create_node_df( + n = nrow(dt), + ID = dt$ID, + label = dt$label, + fillcolor = dt$filledcolor, + shape = dt$shape, + data = dt$Feature, + fontcolor = "black" + ) + numeric_idx <- suppressWarnings(!is.na(as.numeric(dt[["Split"]]))) + dt[numeric_idx, Split := round(as.numeric(Split), 4)] + levels.to.names <- function(x, feature_name, rules) { + lvls <- sort(rules[[feature_name]]) + result <- strsplit(x, "||", fixed = TRUE) + result <- lapply(result, as.numeric) + levels_to_names <- function(x) { + names(lvls)[as.numeric(x)] + } + result <- lapply(result, levels_to_names) + result <- lapply(result, paste, collapse = "\n") + result <- as.character(result) + } + if (!is.null(rules)) { + for (f in names(rules)) { + dt[ + Feature == f & decision_type == "==", + Split := levels.to.names(Split, f, rules) + ] + } + } + dt[nchar(Split) > 500, Split := "Split too long to render"] + + edges <- DiagrammeR::create_edge_df( + from = match(dt[Feature != "Leaf", c(ID)] %>% rep(2), dt$ID), + to = match(dt[Feature != "Leaf", c(Yes, No)], dt$ID), + label = dt[Feature != "Leaf", paste(decision_type, Split)] %>% + c(rep("", nrow(dt[Feature != "Leaf"]))), + style = dt[Feature != "Leaf", ifelse(Missing == Yes, "bold", "solid")] %>% + c(dt[Feature != "Leaf", ifelse(Missing == No, "bold", "solid")]), + rel = "leading_to" + ) + + graph <- DiagrammeR::create_graph( + nodes_df = nodes, + edges_df = edges, + attr_theme = NULL + ) %>% + DiagrammeR::add_global_graph_attrs( + attr_type = "graph", + attr = c("layout", "rankdir"), + value = c("dot", "LR") + ) %>% + DiagrammeR::add_global_graph_attrs( + attr_type = "node", + attr = c("color", "style", "fontname"), + value = c("DimGray", "filled", "Helvetica") + ) %>% + DiagrammeR::add_global_graph_attrs( + attr_type = "edge", + attr = c("color", "arrowsize", "arrowhead", "fontname"), + value = c("DimGray", "1.5", 
"vee", "Helvetica") + ) + + return(graph) +} +``` + +```{r train_model, warning=FALSE, message=FALSE, results='hide'} +# Create a standard tidymodels workflow using LightGBM +qmd_seed <- 2021 +set.seed(qmd_seed) + +# Train/test split +ames <- AmesHousing::make_ames() %>% + rename_with(~ snakecase::to_snake_case(.x)) %>% + rsample::initial_split(prop = 0.8) +ames_train <- rsample::training(ames) +ames_test <- rsample::testing(ames) + +# Tell model which vars to use and convert factors to int +ames_recp <- recipes::recipe( + sale_price ~ lot_area + overall_cond + + year_built + gr_liv_area + neighborhood, + data = ames_train +) %>% + step_integer(all_nominal(), zero_based = TRUE) + +# Add the model specification and workflow +ames_model <- parsnip::boost_tree(trees = 2) %>% + set_mode("regression") %>% + set_engine( + engine = "lightgbm", + seed = qmd_seed, + deterministic = TRUE, + categorical_feature = c("overall_cond", "neighborhood"), + num_leaves = 11 + ) + +ames_wflow <- workflow() %>% + add_model(ames_model) %>% + add_recipe( + recipe = ames_recp, + blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE) + ) + +# Fit the model, then extract the fitted engine object +ames_fit <- parsnip::fit(ames_wflow, ames_train) +ames_fit_eng <- parsnip::extract_fit_engine(ames_fit) + +# Extract the tree structure from the fitted engine object +ames_tree_0 <- lgb.plot.tree(ames_fit_eng, 0) +``` + +The [Cook County Assessor's Office (CCAO)](https://www.cookcountyassessor.com/) uses LightGBM for its [residential](https://github.com/ccao-data/model-res-avm) and [condominium](https://github.com/ccao-data/model-condo-avm) valuation models. [LightGBM](https://github.com/microsoft/LightGBM) is a machine learning framework that works by iteratively growing many [decision trees](https://en.wikipedia.org/wiki/Decision_tree). It is especially suited to data with many categorical variables and complex interactions, such as housing data. + +However, LightGBM is also complicated. Its outputs can be difficult to explain and interpret, especially as model complexity grows. Techniques such as [SHAP values](https://github.com/shap/shap) can help, but aren't inuitive to the average person. + +This vignette outlines a novel technique to explain LightGBM outputs specifically for housing data. The technique finds [comparable properties](https://www.investopedia.com/terms/c/comparables.asp) by exploiting the tree structure of a trained LightGBM model. Its goal is to help diagnose model issues and answer a common question from property owners, "What comparables did you use to value my property?" + +## How Decision Trees Work + +To understand the comparable (comp) finding technique, you must first understand how decision trees determine a property's value. Below is a simple decision tree trained on the [Ames Housing Dataset](https://www.tmwr.org/ames.html). It is made up of rectangular splits and oval leaves. + +#### Ames Housing Data: Example Decision Tree + +```{r render_tree, message=FALSE} +ames_tree_0 %>% + render_graph(as_svg = TRUE) +``` + +
Each split has a _property variable_ (e.g. `gr_liv_area`) and a _rule_ (`<= 1224.5`). The **bold** arrow points to the node you should go to when the rule is true for a given property. For example, at Split 4 in the tree above, properties with a living area of `1224.5` or less should proceed to Split 9.

Each leaf shows the value a property will get from the tree after going through all the splits. For example, the Ames property below (ID = 2) follows the purple path through the tree based on its characteristics. Its predicted value from the tree is $184,306, as shown at the terminal leaf node (Leaf 4).

```{r}
ames_test_prep <- recipes::bake(
  prep(ames_recp),
  new_data = ames_test,
  all_predictors()
)

example_prop_1 <- 2
example_prop_2 <- 75
ames_example_props <- ames_test_prep %>%
  dplyr::slice(c(example_prop_1, example_prop_2)) %>%
  mutate(
    id = c(example_prop_1, example_prop_2),
    across(c(gr_liv_area, lot_area), scales::comma)
  ) %>%
  select(
    ID = id, `Livable Area` = gr_liv_area, `Lot Area` = lot_area,
    `Year Built` = year_built, Condition = overall_cond,
    Neighborhood = neighborhood
  ) %>%
  mutate(
    `Pred. Value` = scales::dollar(predict(
      ames_fit_eng,
      data = as.matrix(ames_test_prep[c(example_prop_1, example_prop_2), ]),
      num_iteration = 1
    ), accuracy = 1)
  )

ames_example_props[1, 7] <- paste0(
  "",
  ames_example_props[1, 7],
  ""
)

ames_example_props %>%
  dplyr::slice(1) %>%
  kbl("html", align = "c", escape = FALSE) %>%
  kable_classic(html_font = "sans-serif") %>%
  row_spec(1, background = "thistle", align = "right")
```
+ +```{r} +ames_tree_0 %>% + set_node_attrs(fillcolor, "thistle", c(21, 13, 12, 10)) %>% + set_node_attrs(fillcolor, "plum", c(9)) %>% + render_graph(as_svg = TRUE) +``` +
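The diagrams above are rendered from the model's internal representation. If you prefer a tabular view, a minimal sketch (assuming the `ames_fit_eng` object fitted earlier) is to query `lgb.model.dt.tree()`, the same accessor that `lgb.plot.tree()` uses:

```{r, eval=FALSE}
# Minimal sketch: inspect Tree 0 as a table of splits and leaves instead of a
# diagram. lgb.model.dt.tree() is the accessor used by lgb.plot.tree() above;
# split rows have a split_feature/threshold, leaf rows have a leaf_value.
lightgbm::lgb.model.dt.tree(ames_fit_eng) %>%
  dplyr::filter(tree_index == 0) %>%
  dplyr::select(
    split_index, split_feature, threshold, decision_type,
    leaf_index, leaf_value
  ) %>%
  head()
```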

+ +Other properties may have a different path through the tree. Below, property number 75 takes the green path through the tree and receives a predicted value of $195,367. + +```{r} +ames_example_props[2, 7] <- paste0( + "", + ames_example_props[2, 7], + "" +) + +ames_example_props %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, background = "thistle", align = "right") %>% + row_spec(2, background = "palegreen", align = "right") +``` + +
+ +```{r} +ames_tree_0 %>% + set_node_attrs(fillcolor, "grey85", c(21)) %>% + set_node_attrs(fillcolor, "thistle", c(13, 12, 10)) %>% + set_node_attrs(fillcolor, "plum", c(9)) %>% + set_node_attrs(fillcolor, "palegreen", c(20, 16)) %>% + set_node_attrs(fillcolor, "limegreen", c(15)) %>% + render_graph(as_svg = TRUE) +``` + +
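As a quick programmatic check, you can also ask the model directly which leaf each example property lands in. This is a minimal sketch assuming the objects created above; the `predleaf` argument follows the lightgbm version used elsewhere in this vignette (depending on your version it may instead be spelled `type = "leaf"`).

```{r, eval=FALSE}
# Minimal sketch: leaf assignments for the two example properties in Tree 0.
# predleaf = TRUE returns one column of leaf indices per tree; with
# num_iteration = 1 only Tree 0 is used.
predict(
  ames_fit_eng,
  data = as.matrix(ames_test_prep[c(example_prop_1, example_prop_2), ]),
  num_iteration = 1,
  predleaf = TRUE
)
# Row 1 (ID = 2) should show Leaf 4; row 2 (ID = 75) should show a different
# leaf, matching the two highlighted paths above
```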
+ +The process of "predicting" with a decision tree is just running each property through the rules established by the splits. Properties with similar split outcomes will also have similar characteristics and will end up in the same leaf node of the tree. As such, extracting comparables from a single tree is simple: just find all the properties that share a leaf node with your target property. + +Let's use the property with ID = 2 as our target. As we saw above, it ends up in Leaf 4. Here are some comparables from the same leaf node: + +```{r, warning=FALSE} +leaf_4_ids <- predict( + ames_fit_eng, + data = as.matrix(ames_test_prep), + num_iteration = 1, + predleaf = TRUE +) %>% + as_tibble() %>% + mutate(id = row_number()) %>% + filter(V1 == 4) %>% + dplyr::slice(2:11) %>% + pull(id) + +ames_example_comps1 <- ames_test_prep %>% + dplyr::slice(leaf_4_ids) %>% + mutate( + id = leaf_4_ids, + across(c(gr_liv_area, lot_area), scales::comma) + ) %>% + select( + ID = id, `Livable Area` = gr_liv_area, `Lot Area` = lot_area, + `Year Built` = year_built, Condition = overall_cond, + Neighborhood = neighborhood + ) %>% + mutate( + `Pred. Value` = scales::dollar(predict( + ames_fit_eng, + data = as.matrix(ames_test_prep[leaf_4_ids, ]), + num_iteration = 1 + ), accuracy = 1) + ) + +for (i in seq_len(10)) { + ames_example_comps1[i, 7] <- paste0( + "", + ames_example_comps1[i, 7], + "" + ) +} + +ames_example_comps1 %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, background = "thistle", align = "right") +``` + +All of these properties follow the same purple path through the tree and end up receiving the same predicted value from Leaf 4. However, their characteristics aren't actually very comparable. They have vastly different lot areas and ages. Any appraiser seeing these comps would probably laugh at you. + +So what's the issue? Why aren't our properties in the same leaf node actually comparable? It's because our model is too simple. It doesn't yet have enough rules to distinguish a house built in 1959 from one built in 2008. We can solve this by adding more splits to our single tree, by adding more trees, or by doing both. + +## How LightGBM Works + +In practice, frameworks like LightGBM don't use just one decision tree. Instead, they combine many decision trees together, either by taking the average or adding their results. This can create incredibly complex rulesets that are difficult for humans to follow, but which lead to much better predictions. Let's extend our decision tree from before with an additional tree. + +**Tree 0** is the same as before. and the purple path shows the rules followed by our target property (ID = 2). + +#### Ames Housing Data: Tree 0 + +```{r} +ames_tree_0 %>% + set_node_attrs(fillcolor, "thistle", c(21, 13, 12, 10)) %>% + set_node_attrs(fillcolor, "plum", c(9)) %>% + render_graph(as_svg = TRUE) +``` + +
+ +**Tree 1** is a new tree. The target property still follows the purple path, but notice the new values at each split and leaf. These values are substantially less than the values from **Tree 0**. That's because they are _added_ to **Tree 0**'s results. Using LightGBM, the final predicted value for a given property is the _sum of predicted values from all trees_. + +In the case of our target property, the final predicted value is $187,627, which comes from adding **Tree 0:** Leaf 4 and **Tree 1:** Leaf 4. + +#### Ames Housing Data: Tree 1 + +```{r} +ames_tree_1 <- lgb.plot.tree(ames_fit_eng, 1) +ames_tree_1 %>% + set_node_attrs(fillcolor, "thistle", c(21, 11, 10, 8)) %>% + set_node_attrs(fillcolor, "lightpink", c(7)) %>% + set_node_attrs(fillcolor, "sandybrown", c(6)) %>% + set_node_attrs(fillcolor, "paleturquoise", c(9)) %>% + render_graph(as_svg = TRUE) +``` + +
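The table below derives **Tree 1**'s contribution by differencing cumulative predictions. As a standalone illustration of that arithmetic, here is a minimal sketch (assuming the objects created above) for the target property:

```{r, eval=FALSE}
# Minimal sketch: the final prediction is the running sum of per-tree
# contributions. num_iteration = 1 uses only Tree 0; num_iteration = 2 adds
# Tree 1 and should reproduce the $187,627 figure quoted above.
target_row <- as.matrix(ames_test_prep[example_prop_1, ])
pred_tree_0 <- predict(ames_fit_eng, data = target_row, num_iteration = 1)
pred_final <- predict(ames_fit_eng, data = target_row, num_iteration = 2)
pred_tree_1 <- pred_final - pred_tree_0

scales::dollar(c(pred_tree_0, pred_tree_1, pred_final), accuracy = 1)
```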
+ +```{r} +ames_example_lgb <- ames_test_prep %>% + dplyr::slice(c(example_prop_1, example_prop_2)) %>% + mutate( + id = c(example_prop_1, example_prop_2), + across(c(gr_liv_area, lot_area), scales::comma) + ) %>% + select( + ID = id, `Livable Area` = gr_liv_area, `Lot Area` = lot_area, + `Year Built` = year_built, Condition = overall_cond, + Neighborhood = neighborhood + ) %>% + mutate( + `Pred. Value (Tree 0)` = predict( + ames_fit_eng, + data = as.matrix(ames_test_prep[c(example_prop_1, example_prop_2), ]), + num_iteration = 1 + ), + `Pred. Value (Final)` = predict( + ames_fit_eng, + data = as.matrix(ames_test_prep[c(example_prop_1, example_prop_2), ]), + num_iteration = 2 + ), + `Pred. Value (Tree 1)` = + `Pred. Value (Final)` - `Pred. Value (Tree 0)` + ) %>% + dplyr::relocate(`Pred. Value (Final)`, .after = everything()) %>% + mutate(across(starts_with("Pred."), ~ scales::dollar(.x, accuracy = 1))) + +ames_example_lgb[1, 7] <- paste0( + "", + ames_example_lgb[1, 7], + "" +) +ames_example_lgb[1, 8] <- paste0( + "", + ames_example_lgb[1, 8], + "" +) + +ames_example_lgb %>% + dplyr::slice(1) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, background = "thistle", align = "right") %>% + column_spec(9, background = "#d9d9d9") +``` + +What about our comparables from earlier? Like before, they all share the same first leaf node (in **Tree 0**), but now they start to differ in the second tree. The newest property (ID = 55) receives a higher final value and the property with the smallest livable area (ID = 84) receives a lower final value. + +```{r} +ames_example_comps2 <- ames_test_prep %>% + dplyr::slice(leaf_4_ids) %>% + mutate( + id = leaf_4_ids, + across(c(gr_liv_area, lot_area), scales::comma) + ) %>% + select( + ID = id, `Livable Area` = gr_liv_area, `Lot Area` = lot_area, + `Year Built` = year_built, Condition = overall_cond, + Neighborhood = neighborhood + ) %>% + mutate( + `Pred. Value (Tree 0)` = predict( + ames_fit_eng, + data = as.matrix(ames_test_prep[leaf_4_ids, ]), + num_iteration = 1 + ), + `Pred. Value (Final)` = predict( + ames_fit_eng, + data = as.matrix(ames_test_prep[leaf_4_ids, ]), + num_iteration = 2 + ), + `Pred. Value (Tree 1)` = + `Pred. Value (Final)` - `Pred. Value (Tree 0)` + ) %>% + dplyr::relocate(`Pred. Value (Final)`, .after = everything()) %>% + mutate(across(starts_with("Pred."), ~ scales::dollar(.x, accuracy = 1))) + +for (i in seq_len(10)) { + ames_example_comps2[i, 7] <- paste0( + "", + ames_example_comps2[i, 7], + "" + ) +} +for (i in c(1:4, 6:9)) { + ames_example_comps2[i, 8] <- paste0( + "", + ames_example_comps2[i, 8], + "" + ) +} +ames_example_comps2[5, 8] <- paste0( + "", + ames_example_comps2[5, 8], + "" +) +ames_example_comps2[10, 8] <- paste0( + "", + ames_example_comps2[10, 8], + "" +) + +ames_example_comps2 %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, background = "thistle", align = "right") %>% + column_spec(9, background = "#d9d9d9") +``` + +Properties that match the target property in _both trees_ will be more similar than ones that match in just one tree. In this case, all properties that landed in Leaf 4 will be most comparable to our target (ID = 2). + +However, as more trees are added, the number of properties that share all their leaf nodes with the target will shrink rapidly. Here's what happens to our target and comparable properties if we add 10 trees to the model. 
+ +The green cells below show leaf nodes shared with the target property. Note that by the final tree, there are _no comparable properties_ that share all the target's leaf nodes. + +```{r, warning=FALSE, message=FALSE, results='hide'} +ames_model_10 <- parsnip::boost_tree(trees = 10) %>% + set_mode("regression") %>% + set_engine( + engine = "lightgbm", + seed = qmd_seed, + deterministic = TRUE, + categorical_feature = c("overall_cond", "neighborhood"), + num_leaves = 11 + ) + +ames_wflow_10 <- workflow() %>% + add_model(ames_model_10) %>% + add_recipe( + recipe = ames_recp, + blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE) + ) + +ames_fit_10 <- parsnip::fit(ames_wflow_10, ames_train) +ames_fit_eng_10 <- parsnip::extract_fit_engine(ames_fit_10) +``` + +```{r} +predict( + ames_fit_eng_10, + data = as.matrix(ames_test_prep[2, ]), + predleaf = TRUE +) %>% + as_tibble() %>% + mutate(ID = 2) %>% + pivot_longer(starts_with("V")) %>% + mutate(tree = readr::parse_number(name) - 1) %>% + left_join( + lgb.model.dt.tree(ames_fit_eng_10) %>% + select(tree_index, leaf_index, leaf_value), + by = c("tree" = "tree_index", "value" = "leaf_index") + ) %>% + select(-name, -value) %>% + mutate(leaf_value = scales::dollar(leaf_value, accuracy = 1)) %>% + pivot_wider( + id_cols = "ID", + names_from = "tree", + values_from = "leaf_value", + names_prefix = "Tree " + ) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, background = "lightgreen", align = "right") +``` + +```{r} +ames_comps_values <- predict( + ames_fit_eng_10, + data = as.matrix(ames_test_prep[leaf_4_ids, ]), + predleaf = TRUE +) %>% + as_tibble() %>% + mutate(ID = leaf_4_ids) %>% + pivot_longer(starts_with("V")) %>% + mutate(tree = readr::parse_number(name) - 1) %>% + left_join( + lgb.model.dt.tree(ames_fit_eng_10) %>% + select(tree_index, leaf_index, leaf_value), + by = c("tree" = "tree_index", "value" = "leaf_index") + ) %>% + select(-name, -value) %>% + mutate(leaf_value = scales::dollar(leaf_value, accuracy = 1)) %>% + pivot_wider( + id_cols = "ID", + names_from = "tree", + values_from = "leaf_value", + names_prefix = "Tree " + ) + +shared_nodes <- tibble( + y = c(rep(1, 10), rep(2, 8), rep(3, 4), rep(4, 4), rep(5, 4), rep(6, 3)) + 1, + x = c(1:10, c(1:4, 6:9), c(1, 8:10), c(1, 8:10), c(1, 8:10), c(1, 8, 10)) +) + +for (i in seq_len(nrow(shared_nodes))) { + ames_comps_values[shared_nodes$x[[i]], shared_nodes$y[[i]]] <- paste0( + "", + ames_comps_values[shared_nodes$x[[i]], shared_nodes$y[[i]]], + "" + ) +} + +ames_comps_values %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, align = "right") +``` + +Given that individual properties are unlikely to share _all_ of their leaf nodes with a target property, we need a way to measure comparability that doesn't rely on perfect matching. + +## Quantifying Comparables + +Now that we've seen how LightGBM trees are structured, we can use them to create a _similarity score_. This score is simply the percentage of shared leaf nodes between a target and comparable property, weighted by the importance of each tree. + +Let's see it in action. Here is the same green table from above, but with predicted values replaced by a boolean (T) when the property shares the same leaf node as the target within each tree. The rightmost columns show the number and percentage of leaf nodes shared with the target. 
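In symbols (using notation introduced here purely for exposition), the score built up over the rest of this section is:

$$
\mathrm{sim}(i, j) = \sum_{t=0}^{T-1} w_t \, \mathbf{1}\!\left[\ell_t(i) = \ell_t(j)\right],
\qquad
w_t = \frac{\Delta \mathrm{err}_t}{\sum_{s=0}^{T-1} \Delta \mathrm{err}_s}
$$

where $\ell_t(\cdot)$ is the leaf index assigned by tree $t$, $\mathbf{1}[\cdot]$ indicates a shared leaf node, and $\Delta \mathrm{err}_t$ is tree $t$'s reduction in training error (the quantity computed by `get_weights()` below). Because the weights sum to one, a score of 100% means two properties share every leaf node.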
+ +```{r, message=FALSE} +predict( + ames_fit_eng_10, + data = as.matrix(ames_test_prep[2, ]), + predleaf = TRUE +) %>% + bind_cols( + tibble(ID = 2), + . + ) %>% + set_names(c("ID", paste0("Tree ", 0:9))) %>% + mutate(`Number` = "", `Percent` = "") %>% + mutate(across(everything(), as.character)) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, background = "lightgreen", align = "right") %>% + column_spec(12:13, background = "#d9d9d9") + +ames_comps_ids <- bind_cols( + tibble(ID = leaf_4_ids), + matrix("F", nrow = 10, ncol = 10) +) %>% + set_names(c("ID", paste0("Tree ", 0:9))) %>% + mutate(across(everything(), as.character)) %>% + mutate( + `Number` = c(6, 2, 2, 2, 1, 2, 2, 6, 5, 6), + `Percent` = scales::percent(`Number` / 10, accuracy = 1), + `Number` = paste0(`Number`, " / 10") + ) + +shared_nodes <- tibble( + y = c(rep(1, 10), rep(2, 8), rep(3, 4), rep(4, 4), rep(5, 4), rep(6, 3)) + 1, + x = c(1:10, c(1:4, 6:9), c(1, 8:10), c(1, 8:10), c(1, 8:10), c(1, 8, 10)) +) + +for (i in seq_len(nrow(shared_nodes))) { + ames_comps_ids[shared_nodes$x[[i]], shared_nodes$y[[i]]] <- paste0( + "T" + ) +} + +ames_comps_ids %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, align = "right") %>% + column_spec(12:13, background = "#d9d9d9") +``` + +The observations with a high percentage of shared leaf nodes _should_ be most comparable to the target property. However, this assumes that all trees in the model are weighted equally, which is rarely the case. In LightGBM, trees typically have diminishing importance, i.e. each successive tree has less impact on the overall error of the model. To find accurate comparables, we need to quantify each tree's importance and use it to weight each leaf node match. + +Here is a simple function to determine tree importance based on how much each tree contributes to the overall decrease in model error. We can apply it to our training data to get weights for each tree. + +```{r, echo=TRUE, message=FALSE, warning=FALSE, results='hide'} +get_weights <- function(metric, model, train, outcome_col, num_trees) { + model$params$metric <- list(metric) + train_lgb <- lgb.Dataset(as.matrix(train), label = train[[outcome_col]]) + + trained_model <- lgb.train( + params = model$params, + data = train_lgb, + valids = list(test = train_lgb), + nrounds = num_trees + ) + + # Get the initial error for base model before first tree + # this NEEDS to be after the model is trained + # (or else it won't train correctly) + set_field(train_lgb, "init_score", as.matrix(train[[outcome_col]])) + initial_predictions <- get_field(train_lgb, "init_score") + init_score <- mean(initial_predictions) + + # Index into the errors list, and un-list so it is a flat/1dim list + errors <- unlist(trained_model$record_evals$test[[metric]]$eval) + errors <- c(init_score, errors) + diff_in_errors <- diff(errors, 1, 1) + + # Take proportion of diff in errors over total diff in + # errors from all trees + weights <- diff_in_errors / sum(diff_in_errors) + + return(weights) +} + +# Prepare data using a tidymodels recipe +ames_train_prep <- bake(prep(ames_recp), ames_train) + +# Get the decrease in error caused by each successive tree +ames_tree_weights <- get_weights( + metric = "rmse", + model = ames_fit_eng_10, + train = ames_train_prep, + outcome_col = "sale_price", + num_trees = 10 +) +``` + +```{r} +ames_tree_weights %>% + t() %>% + as_tibble() %>% + bind_cols(tibble(Tree = "Weight"), .) 
%>% + set_names(c("-", paste0("Tree ", 0:9))) %>% + mutate(across( + starts_with("Tree "), + ~ scales::percent(.x, accuracy = 0.1) + )) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, align = "right") +``` + +These weights are then multiplied row-wise by the boolean matching matrix from above. This means that for each tree and comparable, the weight value is either kept (when matching / `TRUE`) or zeroed out (when not matching / `FALSE`). The final similarity score for our target property (ID = 2) is the _sum of each comparable row_. + +```{r, message=FALSE} +ames_final_matrix <- matrix(FALSE, nrow = 10, ncol = 10) + +for (i in seq_len(nrow(shared_nodes))) { + ames_final_matrix[shared_nodes$x[[i]], shared_nodes$y[[i]] - 1] <- TRUE +} + +ames_final_matrix_fmt <- (t(ames_final_matrix) * ames_tree_weights) %>% + t() %>% + bind_cols( + tibble(ID = leaf_4_ids), + . + ) %>% + set_names(c("ID", paste0("Tree ", 0:9))) %>% + mutate( + `Sim. Score` = rowSums(across(starts_with("Tree "))), + across(starts_with("Tree "), ~ scales::percent(.x, accuracy = 0.1)), + `Sim. Score` = scales::percent(`Sim. Score`, accuracy = 0.01) + ) + +for (i in seq_len(nrow(shared_nodes))) { + ames_final_matrix_fmt[shared_nodes$x[[i]], shared_nodes$y[[i]]] <- paste0( + "", + ames_final_matrix_fmt[shared_nodes$x[[i]], shared_nodes$y[[i]]], + "" + ) +} + +ames_final_matrix_fmt %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, align = "right") %>% + column_spec(12, background = "#d9d9d9") +``` + +### Final Results + +Now we can simply sort by similarity score to get the observations most similar to our target property (ID = 2). + +```{r} +ames_example_props %>% + dplyr::slice(1) %>% + mutate(`Sim. Score` = "") %>% + select(-`Pred. Value`) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, align = "right") %>% + column_spec(7, background = "#d9d9d9") + +ames_test_prep %>% + dplyr::slice(leaf_4_ids) %>% + mutate( + id = leaf_4_ids, + across(c(gr_liv_area, lot_area), scales::comma) + ) %>% + select( + ID = id, `Livable Area` = gr_liv_area, `Lot Area` = lot_area, + `Year Built` = year_built, Condition = overall_cond, + Neighborhood = neighborhood + ) %>% + mutate(`Sim. Score` = ames_final_matrix_fmt$`Sim. Score`) %>% + arrange(desc(`Sim. Score`)) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, align = "right") %>% + column_spec(7, background = "#d9d9d9") +``` + +Now we're talking! Properties 3 and 62 are nearly identical to our target property, while 60 and 61 aren't too similar. But what about property 55? It looks fairly similar to our target, but has the lowest similarity score. + +Property 55 reveals another advantage of this approach to finding comparables: variables get implicitly weighted by their importance. Variables that aren't predictive in the LightGBM model are _less likely to appear in splits_ and are therefore less likely to determine whether two properties share a terminal leaf node. In the case of property 55, `Year Built` is an important predictor of value and appears in many splits, so it receives a low similarity score due to the large difference in age compared to the target. + +The comparables finding approach does have some disadvantages. Mainly, it requires you to generate a boolean matching matrix for _every target property_ to _every possible comparable_. 
This many-to-many relationship quickly blows up the compute required for comp finding, particularly for large or complex models. However, with some clever coding (one possible approach is sketched below), it isn't too hard to work around.

Overall, this approach is robust, relatively intuitive, and delivers good comparables that will finally let us answer the question, “What comparables did you use to value my property?”
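As a closing illustration of how the many-to-many computation can be kept tractable, here is a rough sketch (not lightsnip's actual implementation) that scores every target against every candidate comparable one tree at a time, using vectorized comparisons of leaf indices. The objects `model`, `targets`, `comps`, and `weights` are hypothetical placeholders for a fitted booster, two baked predictor matrices, and per-tree weights such as those returned by `get_weights()`.

```{r, eval=FALSE}
# Minimal sketch (not lightsnip's implementation): weighted leaf-node
# similarity for many targets and many candidate comparables at once.
# model   - a fitted lgb.Booster
# targets - matrix of baked predictors for the target properties
# comps   - matrix of baked predictors for the candidate comparables
# weights - per-tree weights (e.g. from get_weights() above), summing to 1
leaf_targets <- predict(model, data = as.matrix(targets), predleaf = TRUE)
leaf_comps <- predict(model, data = as.matrix(comps), predleaf = TRUE)

scores <- matrix(0, nrow = nrow(leaf_targets), ncol = nrow(leaf_comps))
for (t in seq_along(weights)) {
  # Outer equality comparison: TRUE where target i and comp j share tree t's
  # leaf node; multiplying by the tree's weight and accumulating gives the
  # weighted share of matching leaf nodes
  scores <- scores + weights[t] * outer(leaf_targets[, t], leaf_comps[, t], `==`)
}

# scores[i, j] is the similarity between target i and comparable j;
# sorting each row in decreasing order ranks the comps for that target
```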