From 6a7fdb6890edf151d699331e84d432257257608a Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Tue, 25 Jul 2023 22:22:31 +0000 Subject: [PATCH 1/6] Checkpoint vignette progress --- .gitignore | 2 + vignettes/finding-comps.qmd | 343 ++++++++++++++++++++++++++++++++++++ 2 files changed, 345 insertions(+) create mode 100644 vignettes/finding-comps.qmd diff --git a/.gitignore b/.gitignore index 2d20778..82a4aa5 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ public/ renv/ renv.lock +vignettes/*_files +vignettes/*.html diff --git a/vignettes/finding-comps.qmd b/vignettes/finding-comps.qmd new file mode 100644 index 0000000..527c5c2 --- /dev/null +++ b/vignettes/finding-comps.qmd @@ -0,0 +1,343 @@ +--- +title: "Finding comparables with LightGBM" +format: html +--- + +```{r opts, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +``` + +```{r setup, message=FALSE, warning=FALSE} +library(AmesHousing) +library(data.table) +library(DiagrammeR) +library(dplyr) +library(kableExtra) +library(lightgbm) +library(lightsnip) +library(scales) +library(snakecase) +library(tidymodels) + +# Function to plot LightGBM trees. Sourced originally from: +# https://github.com/microsoft/LightGBM/issues/1222#issuecomment-1134401696 +lgb.plot.tree <- function(model = NULL, tree = NULL, rules = NULL) { + dt <- lgb.model.dt.tree(model) + dt <- dt[tree_index == tree, ] + data.table::setnames( + dt, + old = c("tree_index", "split_feature", "threshold", "split_gain"), + new = c("Tree", "Feature", "Split", "Gain") + ) + dt[, Value := 0.0] + dt[, Value := leaf_value] + dt[is.na(Value), Value := internal_value] + dt[is.na(Gain), Gain := leaf_value] + dt[is.na(Feature), Feature := "Leaf"] + dt[, Node := split_index] + max_node <- max(dt[["Node"]], na.rm = TRUE) + dt[is.na(Node), Node := max_node + leaf_index + 1] + dt[, ID := paste(Tree, Node, sep = "-")] + dt[, parent := node_parent][is.na(parent), parent := leaf_parent] + dt[, Yes := dt$ID[match(dt$Node, dt$parent)]] + dt <- dt[nrow(dt):1, ] + dt[, No := dt$ID[match(dt$Node, dt$parent)]] + dt[default_left == TRUE, Missing := Yes] + dt[default_left == FALSE, Missing := Yes] + zero_present <- function(x) { + sapply(strsplit(as.character(x), "||", fixed = TRUE), function(el) { + any(el == "0") + }) + } + dt[zero_present(Split), Missing := Yes] + dt[Feature == "Leaf", label := paste0( + Feature, " ", leaf_index, + "\nValue: ", scales::dollar(Value, accuracy = 1) + )] + dt[Feature != "Leaf", label := paste0( + "Split ", split_index, "\n", Feature, + "\nValue: ", scales::dollar(Value, accuracy = 1) + )] + dt[Node == 0, label := paste0("Tree ", Tree, "\n", label)] + dt[, shape := "rectangle"][Feature == "Leaf", shape := "oval"] + dt[, filledcolor := "Beige"][Feature == "Leaf", filledcolor := "Khaki"] + + dt <- dt[order(-Tree)] + nodes <- DiagrammeR::create_node_df( + n = nrow(dt), + ID = dt$ID, + label = dt$label, + fillcolor = dt$filledcolor, + shape = dt$shape, + data = dt$Feature, + fontcolor = "black" + ) + numeric_idx <- suppressWarnings(!is.na(as.numeric(dt[["Split"]]))) + dt[numeric_idx, Split := round(as.numeric(Split), 4)] + levels.to.names <- function(x, feature_name, rules) { + lvls <- sort(rules[[feature_name]]) + result <- strsplit(x, "||", fixed = TRUE) + result <- lapply(result, as.numeric) + levels_to_names <- function(x) { + names(lvls)[as.numeric(x)] + } + result <- lapply(result, levels_to_names) + result <- lapply(result, paste, collapse = "\n") + result <- as.character(result) + } + if (!is.null(rules)) { + for (f in names(rules)) { + dt[Feature == f & 
decision_type == "==", Split := levels.to.names(Split, f, rules)] + } + } + dt[nchar(Split) > 500, Split := "Split too long to render"] + + edges <- DiagrammeR::create_edge_df( + from = match(dt[Feature != "Leaf", c(ID)] %>% rep(2), dt$ID), + to = match(dt[Feature != "Leaf", c(Yes, No)], dt$ID), + label = dt[Feature != "Leaf", paste(decision_type, Split)] %>% + c(rep("", nrow(dt[Feature != "Leaf"]))), + style = dt[Feature != "Leaf", ifelse(Missing == Yes, "bold", "solid")] %>% + c(dt[Feature != "Leaf", ifelse(Missing == No, "bold", "solid")]), + rel = "leading_to" + ) + + graph <- DiagrammeR::create_graph( + nodes_df = nodes, + edges_df = edges, + attr_theme = NULL + ) %>% + DiagrammeR::add_global_graph_attrs( + attr_type = "graph", + attr = c("layout", "rankdir"), + value = c("dot", "LR") + ) %>% + DiagrammeR::add_global_graph_attrs( + attr_type = "node", + attr = c("color", "style", "fontname"), + value = c("DimGray", "filled", "Helvetica") + ) %>% + DiagrammeR::add_global_graph_attrs( + attr_type = "edge", + attr = c("color", "arrowsize", "arrowhead", "fontname"), + value = c("DimGray", "1.5", "vee", "Helvetica") + ) + + return(graph) +} +``` + +```{r train_model, warning=FALSE, message=FALSE, results='hide'} +# Create a standard tidymodels workflow using LightGBM +qmd_seed <- 2021 +set.seed(qmd_seed) + +# Train/test split +ames <- AmesHousing::make_ames() %>% + rename_with(~ snakecase::to_snake_case(.x)) %>% + rsample::initial_split(prop = 0.8) +ames_train <- rsample::training(ames) +ames_test <- rsample::testing(ames) + +# Tell model which vars to use and convert factors to int +ames_recp <- recipes::recipe( + sale_price ~ lot_area + overall_cond + + year_built + gr_liv_area + neighborhood, + data = ames_train +) %>% + step_integer(all_nominal(), zero_based = TRUE) + +# Add the model specification and workflow +ames_model <- parsnip::boost_tree(trees = 2) %>% + set_mode("regression") %>% + set_engine( + engine = "lightgbm", + seed = qmd_seed, + deterministic = TRUE, + categorical_feature = c("overall_cond", "neighborhood"), + num_leaves = 11 + ) + +ames_wflow <- workflow() %>% + add_model(ames_model) %>% + add_recipe( + recipe = ames_recp, + blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE) + ) + +# Fit the model, then extract the fitted engine object +ames_fit <- parsnip::fit(ames_wflow, ames_train) +ames_fit_eng <- parsnip::extract_fit_engine(ames_fit) + +# Extract the tree structure from the fitted engine object +ames_tree_0 <- lgb.plot.tree(ames_fit_eng, 0) +``` + +The [Cook County Assessor's Office (CCAO)](https://www.cookcountyassessor.com/) uses LightGBM for its [residential](https://github.com/ccao-data/model-res-avm) and [condominium](https://github.com/ccao-data/model-condo-avm) valuation models. [LightGBM](https://github.com/microsoft/LightGBM) is a machine learning framework that works by iteratively growing many [decision trees](https://en.wikipedia.org/wiki/Decision_tree). It is especially suited to data with many categorical variables and complex interactions, such as housing data. + +However, LightGBM is also complicated. Its outputs can be difficult to explain and interpret, especially as model complexity grows. Techniques such as [SHAP values](https://github.com/shap/shap) can help, but aren't inuitive to the average person. + +This vignette outlines a novel technique to explain LightGBM outputs specifically for housing data. 
The technique generates [comparable properties](https://www.investopedia.com/terms/c/comparables.asp) by exploiting the tree structure of a trained LightGBM model. Its goal is to help diagnose model issues and answer a common question from property owners, "What comparables did you use to value my property?" + +## How Decision Trees Work + +To understand the comp finding technique, you must first understand how decision trees are used to determine a property's value. Below is a simple decision tree trained on the [Ames Housing Dataset](https://www.tmwr.org/ames.html). It is made up of rectangular splits and oval leaves. + +#### Ames Housing Data: Example Decision Tree + +```{r render_tree, message=FALSE} +ames_tree_0 %>% + render_graph(as_svg = TRUE) +``` + +
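If it helps to see the same structure as data rather than as a diagram, the fitted engine can be dumped to a flat table of splits and leaves. Below is a minimal sketch that reuses the `ames_fit_eng` object created above and the same `lgb.model.dt.tree()` helper the plotting function relies on; `tree_dt` is just an illustrative name.

```r
# Sketch: the first tree as a table, one row per split or leaf.
# Assumes `ames_fit_eng` from the training chunk above.
tree_dt <- lightgbm::lgb.model.dt.tree(ames_fit_eng)
tree_dt[
  tree_index == 0,
  .(split_index, split_feature, threshold, leaf_index, leaf_value)
]
```

Each row of this table corresponds to one rectangle (split) or oval (leaf) in the diagram above.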
+ +Each split has a _variable_ (`gr_liv_area`) and a _rule_ (`<= 1224.5`), and the **bold** arrow points to the node you should go to when a rule is true. For example, at Split 4 in the tree above, properties with a living area of less than `1224.5` should proceed to Split 9. + +Each leaf shows the value a property will get from the tree after going through all the splits. For example, the Ames property below (ID = 2) follows the purple path through through the tree based on its characteristics. Its predicted value from this tree is $184,306, as shown at the terminal leaf node (Leaf 4). + +```{r} +ames_test_prep <- recipes::bake( + prep(ames_recp), + new_data = ames_test, + all_predictors() +) + +example_prop_1 <- 2 +example_prop_2 <- 75 +ames_example_props <- ames_test_prep %>% + dplyr::slice(c(example_prop_1, example_prop_2)) %>% + mutate( + id = c(example_prop_1, example_prop_2), + across(c(gr_liv_area, lot_area), scales::comma) + ) %>% + select( + ID = id, `Livable Area` = gr_liv_area, `Lot Area` = lot_area, + `Year Built` = year_built, Condition = overall_cond, + Neighborhood = neighborhood + ) %>% + mutate( + `Pred. Value` = scales::dollar(predict( + ames_fit_eng, + data = as.matrix(ames_test_prep[c(example_prop_1, example_prop_2), ]), + num_iteration = 1 + ), accuracy = 1) + ) + +ames_example_props[1, 7] <- paste0("", ames_example_props[1, 7], "") +ames_example_props %>% + dplyr::slice(1) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, background = "thistle", align = "right") +``` + +
+ +```{r} +ames_tree_0 %>% + set_node_attrs(fillcolor, "thistle", c(21, 13, 12, 10)) %>% + set_node_attrs(fillcolor, "plum", c(9)) %>% + render_graph(as_svg = TRUE) +``` +
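You can also trace this path programmatically rather than by eye. A small sketch, assuming the `ames_fit_eng` and `ames_test_prep` objects from the chunks above; `predleaf = TRUE` asks LightGBM for the leaf a row lands in instead of its predicted value, and `row_2` is just an illustrative name.

```r
# Sketch: which leaf of the first tree does property 2 land in, and what
# value does that single tree assign?
row_2 <- as.matrix(ames_test_prep[2, ])
predict(ames_fit_eng, data = row_2, num_iteration = 1, predleaf = TRUE)
predict(ames_fit_eng, data = row_2, num_iteration = 1)
```

The first call should return leaf index 4 and the second the $184,306 value shown in the table above.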

+ +Other properties may have a different path through the tree. Below, property number 75 takes the green path through the tree and receives a predicted value of $195,367. + +```{r} +ames_example_props[2, 7] <- paste0("", ames_example_props[2, 7], "") +ames_example_props %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, background = "thistle", align = "right") %>% + row_spec(2, background = "palegreen", align = "right") +``` + +
+ +```{r} +ames_tree_0 %>% + set_node_attrs(fillcolor, "grey85", c(21)) %>% + set_node_attrs(fillcolor, "thistle", c(13, 12, 10)) %>% + set_node_attrs(fillcolor, "plum", c(9)) %>% + set_node_attrs(fillcolor, "palegreen", c(20, 16)) %>% + set_node_attrs(fillcolor, "limegreen", c(15)) %>% + render_graph(as_svg = TRUE) +``` + + + +## How LightGBM Works + +#### Ames Housing Data: Tree 0 + +```{r} +ames_tree_0 %>% + set_node_attrs(fillcolor, "thistle", c(21, 13, 12, 10)) %>% + set_node_attrs(fillcolor, "plum", c(9)) %>% + render_graph(as_svg = TRUE) +``` + +#### Ames Housing Data: Tree 1 + +```{r} +ames_tree_1 <- lgb.plot.tree(ames_fit_eng, 1) +ames_tree_1 %>% + set_node_attrs(fillcolor, "thistle", c(21, 11, 10, 8)) %>% + set_node_attrs(fillcolor, "plum", c(7)) %>% + render_graph(as_svg = TRUE) +``` + + + +```{r} +ames_example_lgb <- ames_test_prep %>% + dplyr::slice(c(example_prop_1, example_prop_2)) %>% + mutate( + id = c(example_prop_1, example_prop_2), + across(c(gr_liv_area, lot_area), scales::comma) + ) %>% + select( + ID = id, `Livable Area` = gr_liv_area, `Lot Area` = lot_area, + `Year Built` = year_built, Condition = overall_cond, + Neighborhood = neighborhood + ) %>% + mutate( + `Pred. Value (Tree 0)` = predict( + ames_fit_eng, + data = as.matrix(ames_test_prep[c(example_prop_1, example_prop_2), ]), + num_iteration = 1 + ), + `Pred. Value (Final)` = predict( + ames_fit_eng, + data = as.matrix(ames_test_prep[c(example_prop_1, example_prop_2), ]), + num_iteration = 2 + ), + `Pred. Value (Tree 1)` = + `Pred. Value (Final)` - `Pred. Value (Tree 0)` + ) %>% + dplyr::relocate(`Pred. Value (Final)`, .after = everything()) %>% + mutate( + across(starts_with("Pred."), ~ scales::dollar(.x, accuracy = 1)), + `Pred. Value (Tree 1)` = paste("+", `Pred. Value (Tree 1)`) + ) + +ames_example_lgb[1, 7] <- paste0("", ames_example_lgb[1, 7], "") +ames_example_lgb[1, 8] <- paste0("", ames_example_lgb[1, 8], "") +ames_example_lgb %>% + dplyr::slice(1) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, background = "thistle", align = "right") %>% + column_spec(9, background = "plum") +``` + + +- How decision trees work + +- How the data gets saved (as a matrix) + +## Finding comps + +- Finding the most comparable property +- Doing it at scale From 4666526b16c5814ee9e826e5acf37b3e27096f74 Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Mon, 31 Jul 2023 22:18:48 +0000 Subject: [PATCH 2/6] Checkpoint work --- vignettes/finding-comps.qmd | 385 ++++++++++++++++++++++++++++++++++-- 1 file changed, 368 insertions(+), 17 deletions(-) diff --git a/vignettes/finding-comps.qmd b/vignettes/finding-comps.qmd index 527c5c2..78fec58 100644 --- a/vignettes/finding-comps.qmd +++ b/vignettes/finding-comps.qmd @@ -177,11 +177,11 @@ The [Cook County Assessor's Office (CCAO)](https://www.cookcountyassessor.com/) However, LightGBM is also complicated. Its outputs can be difficult to explain and interpret, especially as model complexity grows. Techniques such as [SHAP values](https://github.com/shap/shap) can help, but aren't inuitive to the average person. -This vignette outlines a novel technique to explain LightGBM outputs specifically for housing data. The technique generates [comparable properties](https://www.investopedia.com/terms/c/comparables.asp) by exploiting the tree structure of a trained LightGBM model. Its goal is to help diagnose model issues and answer a common question from property owners, "What comparables did you use to value my property?" 
+This vignette outlines a novel technique to explain LightGBM outputs specifically for housing data. The technique discovers [comparable properties](https://www.investopedia.com/terms/c/comparables.asp) by exploiting the tree structure of a trained LightGBM model. Its goal is to help diagnose model issues and answer a common question from property owners, "What comparables did you use to value my property?" ## How Decision Trees Work -To understand the comp finding technique, you must first understand how decision trees are used to determine a property's value. Below is a simple decision tree trained on the [Ames Housing Dataset](https://www.tmwr.org/ames.html). It is made up of rectangular splits and oval leaves. +To understand the comparable (comp) finding technique, you must first understand how decision trees work to determine a property's value. Below is a simple decision tree trained on the [Ames Housing Dataset](https://www.tmwr.org/ames.html). It is made up of rectangular splits and oval leaves. #### Ames Housing Data: Example Decision Tree @@ -192,9 +192,9 @@ ames_tree_0 %>%
-Each split has a _variable_ (`gr_liv_area`) and a _rule_ (`<= 1224.5`), and the **bold** arrow points to the node you should go to when a rule is true. For example, at Split 4 in the tree above, properties with a living area of less than `1224.5` should proceed to Split 9. +Each split has a _variable_ (`gr_liv_area`) and a _rule_ (`<= 1224.5`), and the **bold** arrow points to the node you should go to when the rule is true about a given property. For example, at Split 4 in the tree above, properties with a living area of less than `1224.5` should proceed to Split 9. -Each leaf shows the value a property will get from the tree after going through all the splits. For example, the Ames property below (ID = 2) follows the purple path through through the tree based on its characteristics. Its predicted value from this tree is $184,306, as shown at the terminal leaf node (Leaf 4). +Each leaf shows the value a property will get from the tree after going through all the splits. For example, the Ames target property below (ID = 2) follows the purple path through through the tree based on its characteristics. Its predicted value from the tree is $184,306, as shown at the terminal leaf node (Leaf 4). ```{r} ames_test_prep <- recipes::bake( @@ -265,10 +265,66 @@ ames_tree_0 %>% render_graph(as_svg = TRUE) ``` +
+ +The process of "predicting" with a decision tree is just running each property through the rules established by the splits. Properties with similar split outcomes will also be similar in terms of their characteristics and will end up in the same leaf node of the tree. As a result, extracting comparables from a single tree is simple: just find all the properties that share a leaf node with your target. For example, here are a few comparables for our target property (ID = 2), which ends up in Leaf 4. + +```{r, warning=FALSE} +leaf_4_ids <- predict( + ames_fit_eng, + data = as.matrix(ames_test_prep), + num_iteration = 1, + predleaf = TRUE +) %>% + as_tibble() %>% + mutate(id = row_number()) %>% + filter(V1 == 4) %>% + dplyr::slice(2:11) %>% + pull(id) + +ames_example_comps1 <- ames_test_prep %>% + dplyr::slice(leaf_4_ids) %>% + mutate( + id = leaf_4_ids, + across(c(gr_liv_area, lot_area), scales::comma) + ) %>% + select( + ID = id, `Livable Area` = gr_liv_area, `Lot Area` = lot_area, + `Year Built` = year_built, Condition = overall_cond, + Neighborhood = neighborhood + ) %>% + mutate( + `Pred. Value` = scales::dollar(predict( + ames_fit_eng, + data = as.matrix(ames_test_prep[leaf_4_ids, ]), + num_iteration = 1 + ), accuracy = 1) + ) + +for (i in seq_len(10)) { + ames_example_comps1[i, 7] <- paste0( + "", + ames_example_comps1[i, 7], + "" + ) +} + +ames_example_comps1 %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, background = "thistle", align = "right") +``` +All of these properties follow the same purple path through the tree and end up receiving the same predicted value from Leaf 4. However, their characteristics aren't actually very comparable. They have vastly different lot areas and ages. Any appraiser seeing these comps would probably laugh at you. + +So what's the issue? Why aren't our properties in the same leaf node actually comparable? It's because our model is too simple. It doesn't yet have enough rules to distinguish a house built in 1959 from one built in 2008. We can solve this by adding more splits to our single tree, by adding more trees, or by doing both. ## How LightGBM Works +In practice, frameworks like LightGBM don't use just one decision tree. Instead, they combine many decision trees together, either by taking the average or adding their results. This can create incredibly complex rulesets that are difficult for humans to follow, but which lead to much better predictions. Let's extend our decision tree from before with an additional tree. + +**Tree 0** is the same as before, and the purple path shows the rules followed by our target property (ID = 2). + #### Ames Housing Data: Tree 0 ```{r} @@ -278,17 +334,23 @@ ames_tree_0 %>% render_graph(as_svg = TRUE) ``` +
+ +**Tree 1** is a new tree. The target property still follows the purple path, but notice the new values at each split and leaf. These values are substantially less than the values from **Tree 0**. That's because they are _added_ to **Tree 0**'s results. Using LightGBM, the final predicted value for a given property is the _sum of predicted values from all trees_. In the case of our target property, the final predicted value is $187,627. + #### Ames Housing Data: Tree 1 ```{r} ames_tree_1 <- lgb.plot.tree(ames_fit_eng, 1) ames_tree_1 %>% set_node_attrs(fillcolor, "thistle", c(21, 11, 10, 8)) %>% - set_node_attrs(fillcolor, "plum", c(7)) %>% + set_node_attrs(fillcolor, "lightpink", c(7)) %>% + set_node_attrs(fillcolor, "sandybrown", c(6)) %>% + set_node_attrs(fillcolor, "paleturquoise", c(9)) %>% render_graph(as_svg = TRUE) ``` - +
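The additive claim is easy to verify directly. A quick sketch, again assuming the `ames_fit_eng` and `ames_test_prep` objects from earlier chunks; `num_iteration` caps how many trees contribute to a prediction.

```r
# Sketch: the two-tree prediction equals Tree 0's prediction plus the
# contribution from Tree 1.
row_2 <- as.matrix(ames_test_prep[2, ])
pred_tree0 <- predict(ames_fit_eng, data = row_2, num_iteration = 1)
pred_final <- predict(ames_fit_eng, data = row_2, num_iteration = 2)
pred_final - pred_tree0 # the amount added by Tree 1
```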
```{r} ames_example_lgb <- ames_test_prep %>% @@ -317,27 +379,316 @@ ames_example_lgb <- ames_test_prep %>% `Pred. Value (Final)` - `Pred. Value (Tree 0)` ) %>% dplyr::relocate(`Pred. Value (Final)`, .after = everything()) %>% - mutate( - across(starts_with("Pred."), ~ scales::dollar(.x, accuracy = 1)), - `Pred. Value (Tree 1)` = paste("+", `Pred. Value (Tree 1)`) - ) + mutate(across(starts_with("Pred."), ~ scales::dollar(.x, accuracy = 1))) ames_example_lgb[1, 7] <- paste0("", ames_example_lgb[1, 7], "") -ames_example_lgb[1, 8] <- paste0("", ames_example_lgb[1, 8], "") +ames_example_lgb[1, 8] <- paste0("", ames_example_lgb[1, 8], "") ames_example_lgb %>% dplyr::slice(1) %>% kbl("html", align = "c", escape = FALSE) %>% kable_classic(html_font = "sans-serif") %>% row_spec(1, background = "thistle", align = "right") %>% - column_spec(9, background = "plum") + column_spec(9, background = "#d9d9d9") +``` + +What about our comparables from earlier? Like before, they all share the same first leaf node (in **Tree 0**), but now they start to differ in the second tree. The newest property (ID = 55) receives a higher final value and the property with the smallest livable area (ID = 84) receives a lower final value. + +```{r} +ames_example_comps2 <- ames_test_prep %>% + dplyr::slice(leaf_4_ids) %>% + mutate( + id = leaf_4_ids, + across(c(gr_liv_area, lot_area), scales::comma) + ) %>% + select( + ID = id, `Livable Area` = gr_liv_area, `Lot Area` = lot_area, + `Year Built` = year_built, Condition = overall_cond, + Neighborhood = neighborhood + ) %>% + mutate( + `Pred. Value (Tree 0)` = predict( + ames_fit_eng, + data = as.matrix(ames_test_prep[leaf_4_ids, ]), + num_iteration = 1 + ), + `Pred. Value (Final)` = predict( + ames_fit_eng, + data = as.matrix(ames_test_prep[leaf_4_ids, ]), + num_iteration = 2 + ), + `Pred. Value (Tree 1)` = + `Pred. Value (Final)` - `Pred. Value (Tree 0)` + ) %>% + dplyr::relocate(`Pred. Value (Final)`, .after = everything()) %>% + mutate(across(starts_with("Pred."), ~ scales::dollar(.x, accuracy = 1))) + +for (i in seq_len(10)) { + ames_example_comps2[i, 7] <- paste0( + "", + ames_example_comps2[i, 7], + "" + ) +} +for (i in c(1:4, 6:9)) { + ames_example_comps2[i, 8] <- paste0( + "", + ames_example_comps2[i, 8], + "" + ) +} +ames_example_comps2[5, 8] <- paste0("", ames_example_comps2[5, 8], "") +ames_example_comps2[10, 8] <- paste0("", ames_example_comps2[10, 8], "") + +ames_example_comps2 %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, background = "thistle", align = "right") %>% + column_spec(9, background = "#d9d9d9") ``` +Properties that match the target property in _both trees_ will be more similar than ones that match in just one tree. In this case, all properties that landed in Leaf 4 will be most comparable to our target (ID = 2). + +However, as more trees are added, the number of properties that share all their leaf nodes with the target will shrink rapidly. Here's what happens to our target and comparable properties if we add 10 trees to the model. + +The green cells below show leaf nodes shared with the target property. Note that by the final tree, there are _no comparable properties_ that share all the target's leaf nodes. 
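Under the hood, the leaf-sharing tables in this section all come from a single matrix of leaf assignments. A minimal sketch using the two-tree model fitted earlier; with the 10-tree model trained in the next chunk, the same call simply returns ten columns instead of two. `leaf_idx` is an illustrative name; `ames_fit_eng` and `ames_test_prep` come from earlier chunks.

```r
# Sketch: leaf assignments saved as a matrix, one row per property and
# one column per tree.
leaf_idx <- predict(
  ames_fit_eng,
  data = as.matrix(ames_test_prep),
  predleaf = TRUE
)
dim(leaf_idx) # rows = properties, columns = trees
leaf_idx[2, ] # the target property's leaf index in each tree
```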
-- How decision trees work +```{r, warning=FALSE, message=FALSE, results='hide'} +ames_model_10 <- parsnip::boost_tree(trees = 10) %>% + set_mode("regression") %>% + set_engine( + engine = "lightgbm", + seed = qmd_seed, + deterministic = TRUE, + categorical_feature = c("overall_cond", "neighborhood"), + num_leaves = 11 + ) + +ames_wflow_10 <- workflow() %>% + add_model(ames_model_10) %>% + add_recipe( + recipe = ames_recp, + blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE) + ) + +ames_fit_10 <- parsnip::fit(ames_wflow_10, ames_train) +ames_fit_eng_10 <- parsnip::extract_fit_engine(ames_fit_10) +``` + +```{r} +predict( + ames_fit_eng_10, + data = as.matrix(ames_test_prep[2, ]), + predleaf = TRUE +) %>% + as_tibble() %>% + mutate(ID = 2) %>% + pivot_longer(starts_with("V")) %>% + mutate(tree = readr::parse_number(name) - 1) %>% + left_join( + lgb.model.dt.tree(ames_fit_eng_10) %>% + select(tree_index, leaf_index, leaf_value), + by = c("tree" = "tree_index", "value" = "leaf_index") + ) %>% + select(-name, -value) %>% + mutate(leaf_value = scales::dollar(leaf_value, accuracy = 1)) %>% + pivot_wider( + id_cols = "ID", + names_from = "tree", + values_from = "leaf_value", + names_prefix = "Tree " + ) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, background = "lightgreen", align = "right") +``` + +```{r} +ames_comps_values <- predict( + ames_fit_eng_10, + data = as.matrix(ames_test_prep[leaf_4_ids, ]), + predleaf = TRUE +) %>% + as_tibble() %>% + mutate(ID = leaf_4_ids) %>% + pivot_longer(starts_with("V")) %>% + mutate(tree = readr::parse_number(name) - 1) %>% + left_join( + lgb.model.dt.tree(ames_fit_eng_10) %>% + select(tree_index, leaf_index, leaf_value), + by = c("tree" = "tree_index", "value" = "leaf_index") + ) %>% + select(-name, -value) %>% + mutate(leaf_value = scales::dollar(leaf_value, accuracy = 1)) %>% + pivot_wider( + id_cols = "ID", + names_from = "tree", + values_from = "leaf_value", + names_prefix = "Tree " + ) + +shared_nodes <- tibble( + y = c(rep(1, 10), rep(2, 8), rep(3, 4), rep(4, 4), rep(5, 4), rep(6, 3)) + 1, + x = c(1:10, c(1:4, 6:9), c(1, 8:10), c(1, 8:10), c(1, 8:10), c(1, 8, 10)) +) + +for (i in seq_len(nrow(shared_nodes))) { + ames_comps_values[shared_nodes$x[[i]], shared_nodes$y[[i]]] <- paste0( + "", + ames_comps_values[shared_nodes$x[[i]], shared_nodes$y[[i]]], + "" + ) +} + +ames_comps_values %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, align = "right") +``` + +Given that individual properties are unlikely to share _all_ of their leaf nodes with a target property, we need a way to extract comparability that doesn't rely on perfect matching. + +## Quantifying Comparables + +Now that we've seen how LightGBM trees are structured, we can use them to create a "similarity score." This score is simply the percentage of shared leaf nodes between a target and comparable property, weighted by the importance of each tree. + +Let's see it in action. Here is the same green table from above, but with predicted values replaced by a boolean (T) when the property shares the same leaf node as the target within each tree. The rightmost columns show the number and percentage of leaf nodes shared with the target. + +```{r, message=FALSE} +predict( + ames_fit_eng_10, + data = as.matrix(ames_test_prep[2, ]), + predleaf = TRUE +) %>% + bind_cols( + tibble(ID = 2), + . 
+ ) %>% + set_names(c("ID", paste0("Tree ", 0:9))) %>% + mutate(`Number` = " - ", `Percent` = " - ") %>% + mutate(across(everything(), as.character)) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, background = "lightgreen", align = "right") %>% + column_spec(12:13, background = "#d9d9d9") + +ames_comps_ids <- bind_cols( + tibble(ID = leaf_4_ids), + matrix("F", nrow = 10, ncol = 10) +) %>% + set_names(c("ID", paste0("Tree ", 0:9))) %>% + mutate(across(everything(), as.character)) %>% + mutate( + `Number` = c(6, 2, 2, 2, 1, 2, 2, 6, 5, 6), + `Percent` = scales::percent(`Number` / 10, accuracy = 1), + `Number` = paste0(`Number`, " / 10") + ) + +shared_nodes <- tibble( + y = c(rep(1, 10), rep(2, 8), rep(3, 4), rep(4, 4), rep(5, 4), rep(6, 3)) + 1, + x = c(1:10, c(1:4, 6:9), c(1, 8:10), c(1, 8:10), c(1, 8:10), c(1, 8, 10)) +) + +for (i in seq_len(nrow(shared_nodes))) { + ames_comps_ids[shared_nodes$x[[i]], shared_nodes$y[[i]]] <- paste0( + "T" + ) +} + +ames_comps_ids %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, align = "right") %>% + column_spec(12:13, background = "#d9d9d9") +``` + +The observations with a high percentage of shared leaf nodes _should_ be most comparable to the target property. However, this assumes that all trees in the model are weighted equally, which is rarely the case. In LightGBM, trees typically have diminishing importance, i.e. each successive tree has less impact on the overall error of the model. To find accurate comparables, we need to quantify each tree's importance and use it to weight each leaf node match. + +Here is a simple function to determine tree importance based on how much each tree contributes to the overall decrease in model error. We can apply it to our training data to get weights for each tree. + +```{r, echo=TRUE, message=FALSE, warning=FALSE, results='hide'} +get_weights <- function(metric, model, train, outcome_col, num_trees) { + model$params$metric <- list(metric) + train_lgb <- lgb.Dataset(as.matrix(train), label = train[[outcome_col]]) + + trained_model <- lgb.train( + params = model$params, + data = train_lgb, + valids = list(test = train_lgb), + nrounds = num_trees + ) + + # Get the initial error for base model before first tree + # this NEEDS to be after the model is trained (or else it won't train correctly) + set_field(train_lgb, "init_score", as.matrix(train[[outcome_col]])) + initial_predictions <- get_field(train_lgb, "init_score") + init_score <- mean(initial_predictions) + + # Index into the errors list, and un-list so it is a flat/1dim list + errors <- unlist(trained_model$record_evals$test[[metric]]$eval) + errors <- c(init_score, errors) + diff_in_errors <- diff(errors, 1, 1) + + # Take proportion of diff in errors over total diff in errors from all trees + weights <- diff_in_errors / sum(diff_in_errors) + + return(weights) +} + +# Prepare data using a tidymodels recipe +ames_train_prep <- bake(prep(ames_recp), ames_train) + +# Get the decrease in error caused by each successive tree +ames_tree_weights <- get_weights( + metric = "rmse", + model = ames_fit_eng_10, + train = ames_train_prep, + outcome_col = "sale_price", + num_trees = 10 +) +``` + +```{r} +ames_tree_weights %>% + t() %>% + as_tibble() %>% + bind_cols(tibble(Tree = "Weight"), .) 
%>% + set_names(c("-", paste0("Tree ", 0:9))) %>% + mutate(across(starts_with("Tree "), ~ scales::percent(.x, accuracy = 0.1))) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, align = "right") +``` + +These weights are then multiplied row-wise by the boolean matching matrix from above. The final similarity score for our target property (ID = 2) is the _sum of each comparable row_. + +```{r, message=FALSE} +ames_final_matrix <- matrix(FALSE, nrow = 10, ncol = 10) + +for (i in seq_len(nrow(shared_nodes))) { + ames_final_matrix[shared_nodes$x[[i]], shared_nodes$y[[i]] - 1] <- TRUE +} + +(t(ames_final_matrix) * ames_tree_weights) %>% + t() %>% + bind_cols( + tibble(ID = leaf_4_ids), + . + ) %>% + set_names(c("ID", paste0("Tree ", 0:9))) %>% + mutate( + `Sim. Score` = rowSums(across(starts_with("Tree "))), + across(starts_with("Tree "), ~ scales::percent(.x, accuracy = 0.1)), + `Sim. Score` = scales::percent(`Sim. Score`, accuracy = 0.01) + ) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, align = "right") %>% + column_spec(12, background = "#d9d9d9") +``` -- How the data gets saved (as a matrix) -## Finding comps -- Finding the most comparable property -- Doing it at scale +- Transform into matrix +- Add more trees +- Percent matchin = more comparable +- Weight by tree importance From 212333fac2ac7a8e772b084b68581347bca34fbe Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Tue, 1 Aug 2023 16:31:10 +0000 Subject: [PATCH 3/6] Finalize comps vignette --- vignettes/finding-comps.qmd | 135 +++++++++++++++++++++++++++++------- 1 file changed, 109 insertions(+), 26 deletions(-) diff --git a/vignettes/finding-comps.qmd b/vignettes/finding-comps.qmd index 78fec58..7cc1817 100644 --- a/vignettes/finding-comps.qmd +++ b/vignettes/finding-comps.qmd @@ -40,7 +40,7 @@ lgb.plot.tree <- function(model = NULL, tree = NULL, rules = NULL) { dt[, ID := paste(Tree, Node, sep = "-")] dt[, parent := node_parent][is.na(parent), parent := leaf_parent] dt[, Yes := dt$ID[match(dt$Node, dt$parent)]] - dt <- dt[nrow(dt):1, ] + dt <- dt[nrow(dt):1, ] # nolint dt[, No := dt$ID[match(dt$Node, dt$parent)]] dt[default_left == TRUE, Missing := Yes] dt[default_left == FALSE, Missing := Yes] @@ -87,7 +87,10 @@ lgb.plot.tree <- function(model = NULL, tree = NULL, rules = NULL) { } if (!is.null(rules)) { for (f in names(rules)) { - dt[Feature == f & decision_type == "==", Split := levels.to.names(Split, f, rules)] + dt[ + Feature == f & decision_type == "==", + Split := levels.to.names(Split, f, rules) + ] } } dt[nchar(Split) > 500, Split := "Split too long to render"] @@ -177,11 +180,11 @@ The [Cook County Assessor's Office (CCAO)](https://www.cookcountyassessor.com/) However, LightGBM is also complicated. Its outputs can be difficult to explain and interpret, especially as model complexity grows. Techniques such as [SHAP values](https://github.com/shap/shap) can help, but aren't inuitive to the average person. -This vignette outlines a novel technique to explain LightGBM outputs specifically for housing data. The technique discovers [comparable properties](https://www.investopedia.com/terms/c/comparables.asp) by exploiting the tree structure of a trained LightGBM model. Its goal is to help diagnose model issues and answer a common question from property owners, "What comparables did you use to value my property?" 
+This vignette outlines a novel technique to explain LightGBM outputs specifically for housing data. The technique finds [comparable properties](https://www.investopedia.com/terms/c/comparables.asp) by exploiting the tree structure of a trained LightGBM model. Its goal is to help diagnose model issues and answer a common question from property owners, "What comparables did you use to value my property?" ## How Decision Trees Work -To understand the comparable (comp) finding technique, you must first understand how decision trees work to determine a property's value. Below is a simple decision tree trained on the [Ames Housing Dataset](https://www.tmwr.org/ames.html). It is made up of rectangular splits and oval leaves. +To understand the comparable (comp) finding technique, you must first understand how decision trees determine a property's value. Below is a simple decision tree trained on the [Ames Housing Dataset](https://www.tmwr.org/ames.html). It is made up of rectangular splits and oval leaves. #### Ames Housing Data: Example Decision Tree @@ -192,9 +195,9 @@ ames_tree_0 %>%
-Each split has a _variable_ (`gr_liv_area`) and a _rule_ (`<= 1224.5`), and the **bold** arrow points to the node you should go to when the rule is true about a given property. For example, at Split 4 in the tree above, properties with a living area of less than `1224.5` should proceed to Split 9. +Each split has a _property variable_ (`gr_liv_area`) and a _rule_ (`<= 1224.5`). The **bold** arrow points to the node you should go to when the rule is true about a given property. For example, at Split 4 in the tree above, properties with a living area of less than `1224.5` should proceed to Split 9. -Each leaf shows the value a property will get from the tree after going through all the splits. For example, the Ames target property below (ID = 2) follows the purple path through through the tree based on its characteristics. Its predicted value from the tree is $184,306, as shown at the terminal leaf node (Leaf 4). +Each leaf shows the value a property will get from the tree after going through all the splits. For example, the Ames property below (ID = 2) follows the purple path through through the tree based on its characteristics. Its predicted value from the tree is $184,306, as shown at the terminal leaf node (Leaf 4). ```{r} ames_test_prep <- recipes::bake( @@ -224,7 +227,12 @@ ames_example_props <- ames_test_prep %>% ), accuracy = 1) ) -ames_example_props[1, 7] <- paste0("", ames_example_props[1, 7], "") +ames_example_props[1, 7] <- paste0( + "", + ames_example_props[1, 7], + "" +) + ames_example_props %>% dplyr::slice(1) %>% kbl("html", align = "c", escape = FALSE) %>% @@ -245,7 +253,12 @@ ames_tree_0 %>% Other properties may have a different path through the tree. Below, property number 75 takes the green path through the tree and receives a predicted value of $195,367. ```{r} -ames_example_props[2, 7] <- paste0("", ames_example_props[2, 7], "") +ames_example_props[2, 7] <- paste0( + "", + ames_example_props[2, 7], + "" +) + ames_example_props %>% kbl("html", align = "c", escape = FALSE) %>% kable_classic(html_font = "sans-serif") %>% @@ -267,7 +280,9 @@ ames_tree_0 %>%
-The process of "predicting" with a decision tree is just running each property through the rules established by the splits. Properties with similar split outcomes will also be similar in terms of their characteristics and will end up in the same leaf node of the tree. As a result, extracting comparables from a single tree is simple: just find all the properties that share a leaf node with your target. For example, here are a few comparables for our target property (ID = 2), which ends up in Leaf 4.
+The process of "predicting" with a decision tree is just running each property through the rules established by the splits. Properties with similar split outcomes will also have similar characteristics and will end up in the same leaf node of the tree. As such, extracting comparables from a single tree is simple: just find all the properties that share a leaf node with your target property.
+
+Let's use the property with ID = 2 as our target. As we saw above, it ends up in Leaf 4. Here are some comparables from the same leaf node:
 
 ```{r, warning=FALSE}
 leaf_4_ids <- predict(
@@ -323,7 +338,7 @@ So what's the issue? Why aren't our properties in the same leaf node actually co
 
 In practice, frameworks like LightGBM don't use just one decision tree. Instead, they combine many decision trees together, either by taking the average or adding their results. This can create incredibly complex rulesets that are difficult for humans to follow, but which lead to much better predictions. Let's extend our decision tree from before with an additional tree.
 
-**Tree 0** is the same as before, and the purple path shows the rules followed by our target property (ID = 2).
+**Tree 0** is the same as before. The purple path shows the rules followed by our target property (ID = 2).
 
 #### Ames Housing Data: Tree 0
 
@@ -336,7 +351,9 @@ ames_tree_0 %>%
   render_graph(as_svg = TRUE)
 ```
 
-**Tree 1** is a new tree. The target property still follows the purple path, but notice the new values at each split and leaf. These values are substantially less than the values from **Tree 0**. That's because they are _added_ to **Tree 0**'s results. Using LightGBM, the final predicted value for a given property is the _sum of predicted values from all trees_. In the case of our target property, the final predicted value is $187,627. +**Tree 1** is a new tree. The target property still follows the purple path, but notice the new values at each split and leaf. These values are substantially less than the values from **Tree 0**. That's because they are _added_ to **Tree 0**'s results. Using LightGBM, the final predicted value for a given property is the _sum of predicted values from all trees_. + +In the case of our target property, the final predicted value is $187,627, which comes from adding **Tree 0:** Leaf 4 and **Tree 1:** Leaf 4. #### Ames Housing Data: Tree 1 @@ -381,8 +398,17 @@ ames_example_lgb <- ames_test_prep %>% dplyr::relocate(`Pred. Value (Final)`, .after = everything()) %>% mutate(across(starts_with("Pred."), ~ scales::dollar(.x, accuracy = 1))) -ames_example_lgb[1, 7] <- paste0("", ames_example_lgb[1, 7], "") -ames_example_lgb[1, 8] <- paste0("", ames_example_lgb[1, 8], "") +ames_example_lgb[1, 7] <- paste0( + "", + ames_example_lgb[1, 7], + "" +) +ames_example_lgb[1, 8] <- paste0( + "", + ames_example_lgb[1, 8], + "" +) + ames_example_lgb %>% dplyr::slice(1) %>% kbl("html", align = "c", escape = FALSE) %>% @@ -436,8 +462,16 @@ for (i in c(1:4, 6:9)) { "" ) } -ames_example_comps2[5, 8] <- paste0("", ames_example_comps2[5, 8], "") -ames_example_comps2[10, 8] <- paste0("", ames_example_comps2[10, 8], "") +ames_example_comps2[5, 8] <- paste0( + "", + ames_example_comps2[5, 8], + "" +) +ames_example_comps2[10, 8] <- paste0( + "", + ames_example_comps2[10, 8], + "" +) ames_example_comps2 %>% kbl("html", align = "c", escape = FALSE) %>% @@ -545,11 +579,11 @@ ames_comps_values %>% row_spec(1:10, align = "right") ``` -Given that individual properties are unlikely to share _all_ of their leaf nodes with a target property, we need a way to extract comparability that doesn't rely on perfect matching. +Given that individual properties are unlikely to share _all_ of their leaf nodes with a target property, we need a way to measure comparability that doesn't rely on perfect matching. ## Quantifying Comparables -Now that we've seen how LightGBM trees are structured, we can use them to create a "similarity score." This score is simply the percentage of shared leaf nodes between a target and comparable property, weighted by the importance of each tree. +Now that we've seen how LightGBM trees are structured, we can use them to create a _similarity score_. This score is simply the percentage of shared leaf nodes between a target and comparable property, weighted by the importance of each tree. Let's see it in action. Here is the same green table from above, but with predicted values replaced by a boolean (T) when the property shares the same leaf node as the target within each tree. The rightmost columns show the number and percentage of leaf nodes shared with the target. 
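For readers who prefer code to tables, here is a compact sketch of the whole scoring step. It reuses the `ames_fit_eng_10` and `ames_test_prep` objects from the chunks above; equal tree weights stand in for the error-based weights derived below, and `match_matrix`, `tree_weights`, and `sim_scores` are illustrative names rather than objects used elsewhere in this vignette.

```r
# Sketch: similarity score = weighted share of trees in which a property
# lands in the same leaf as the target (ID = 2).
target_leaves <- predict(
  ames_fit_eng_10,
  data = as.matrix(ames_test_prep[2, ]),
  predleaf = TRUE
)
all_leaves <- predict(
  ames_fit_eng_10,
  data = as.matrix(ames_test_prep),
  predleaf = TRUE
)

# TRUE wherever a property shares the target's leaf in a given tree
match_matrix <- sweep(all_leaves, 2, as.vector(target_leaves), "==")

# Placeholder weights: every tree counts equally
tree_weights <- rep(1 / ncol(match_matrix), ncol(match_matrix))

sim_scores <- as.vector(match_matrix %*% tree_weights)
head(order(sim_scores, decreasing = TRUE)) # most comparable rows first
```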
@@ -618,7 +652,8 @@ get_weights <- function(metric, model, train, outcome_col, num_trees) { ) # Get the initial error for base model before first tree - # this NEEDS to be after the model is trained (or else it won't train correctly) + # this NEEDS to be after the model is trained + # (or else it won't train correctly) set_field(train_lgb, "init_score", as.matrix(train[[outcome_col]])) initial_predictions <- get_field(train_lgb, "init_score") init_score <- mean(initial_predictions) @@ -628,7 +663,8 @@ get_weights <- function(metric, model, train, outcome_col, num_trees) { errors <- c(init_score, errors) diff_in_errors <- diff(errors, 1, 1) - # Take proportion of diff in errors over total diff in errors from all trees + # Take proportion of diff in errors over total diff in + # errors from all trees weights <- diff_in_errors / sum(diff_in_errors) return(weights) @@ -653,7 +689,10 @@ ames_tree_weights %>% as_tibble() %>% bind_cols(tibble(Tree = "Weight"), .) %>% set_names(c("-", paste0("Tree ", 0:9))) %>% - mutate(across(starts_with("Tree "), ~ scales::percent(.x, accuracy = 0.1))) %>% + mutate(across( + starts_with("Tree "), + ~ scales::percent(.x, accuracy = 0.1) + )) %>% kbl("html", align = "c", escape = FALSE) %>% kable_classic(html_font = "sans-serif") %>% row_spec(1, align = "right") @@ -668,7 +707,7 @@ for (i in seq_len(nrow(shared_nodes))) { ames_final_matrix[shared_nodes$x[[i]], shared_nodes$y[[i]] - 1] <- TRUE } -(t(ames_final_matrix) * ames_tree_weights) %>% +ames_final_matrix_fmt <- (t(ames_final_matrix) * ames_tree_weights) %>% t() %>% bind_cols( tibble(ID = leaf_4_ids), @@ -679,16 +718,60 @@ for (i in seq_len(nrow(shared_nodes))) { `Sim. Score` = rowSums(across(starts_with("Tree "))), across(starts_with("Tree "), ~ scales::percent(.x, accuracy = 0.1)), `Sim. Score` = scales::percent(`Sim. Score`, accuracy = 0.01) - ) %>% + ) + +for (i in seq_len(nrow(shared_nodes))) { + ames_final_matrix_fmt[shared_nodes$x[[i]], shared_nodes$y[[i]]] <- paste0( + "", + ames_final_matrix_fmt[shared_nodes$x[[i]], shared_nodes$y[[i]]], + "" + ) +} + +ames_final_matrix_fmt %>% kbl("html", align = "c", escape = FALSE) %>% kable_classic(html_font = "sans-serif") %>% row_spec(1:10, align = "right") %>% column_spec(12, background = "#d9d9d9") ``` +### Final Results + +Now we can simply sort by similarity score to get the observations most similar to our target property (ID = 2). + +```{r} +ames_example_props %>% + dplyr::slice(1) %>% + mutate(`Sim. Score` = " - ") %>% + select(-`Pred. Value`) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1, align = "right") %>% + column_spec(7, background = "#d9d9d9") + +ames_test_prep %>% + dplyr::slice(leaf_4_ids) %>% + mutate( + id = leaf_4_ids, + across(c(gr_liv_area, lot_area), scales::comma) + ) %>% + select( + ID = id, `Livable Area` = gr_liv_area, `Lot Area` = lot_area, + `Year Built` = year_built, Condition = overall_cond, + Neighborhood = neighborhood + ) %>% + mutate(`Sim. Score` = ames_final_matrix_fmt$`Sim. Score`) %>% + arrange(desc(`Sim. Score`)) %>% + kbl("html", align = "c", escape = FALSE) %>% + kable_classic(html_font = "sans-serif") %>% + row_spec(1:10, align = "right") %>% + column_spec(7, background = "#d9d9d9") +``` + +Now we're talking! Properties 3 and 62 are nearly identical to our target property, while 60 and 61 aren't too similar. But what about property 55? It looks fairly similar to our target, but has the lowest similarity score. 
+ +Property 55 reveals another advantage of this approach to finding comparables: variables get implicitly weighted by their importance. Variables that aren't predictive in the LightGBM model are _less likely to appear in splits_ and are therefore less likely to determine whether two properties share a terminal leaf node. In the case of property 55, `Year Built` is an important predictor of value and appears in many splits, so it receives a low similarity score due to the large difference in age compared to the target. +The comparables finding approach does have some disadvantages. Mainly, it requires you to generate a boolean matching matrix for _every target property_ to _every possible comparable_. This many-to-many relationship quickly blows up the compute required for comp finding, particularly for large/complex models. However, with some clever coding, this isn't too hard to work around. -- Transform into matrix -- Add more trees -- Percent matchin = more comparable -- Weight by tree importance +Overall, this approach is robust, relatively intuitive, and delivers good comparables that will finally let us answer the question, “What comparables did you use to value my property?” From 598559ee201cb5d2ac0675259aa0ccbb159b9aae Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Tue, 1 Aug 2023 17:16:59 +0000 Subject: [PATCH 4/6] Move to Rmd for pkgdown article --- DESCRIPTION | 12 ++++++++---- vignettes/{finding-comps.qmd => finding-comps.rmd} | 12 +++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) rename vignettes/{finding-comps.qmd => finding-comps.rmd} (99%) diff --git a/DESCRIPTION b/DESCRIPTION index 4c5becd..6dd299f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,19 +22,23 @@ Imports: tibble, zip Suggests: + AmesHousing, covr, + data.table, + DiagrammeR, hardhat, - testthat (>= 2.1.0), + jsonlite, knitr, - modeldata, - rmarkdown, lintr, + modeldata, pkgdown, readr, - jsonlite, recipes, + rmarkdown, rsample, scales, + snakecase, + testthat (>= 2.1.0), tidyr, tune, workflows, diff --git a/vignettes/finding-comps.qmd b/vignettes/finding-comps.rmd similarity index 99% rename from vignettes/finding-comps.qmd rename to vignettes/finding-comps.rmd index 7cc1817..8be163c 100644 --- a/vignettes/finding-comps.qmd +++ b/vignettes/finding-comps.rmd @@ -3,8 +3,14 @@ title: "Finding comparables with LightGBM" format: html --- +```{css, echo=FALSE} +table { + display: table !important; +} +``` + ```{r opts, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) +knitr::opts_chunk$set(echo = FALSE, out.width = "100%") ``` ```{r setup, message=FALSE, warning=FALSE} @@ -598,7 +604,7 @@ predict( . ) %>% set_names(c("ID", paste0("Tree ", 0:9))) %>% - mutate(`Number` = " - ", `Percent` = " - ") %>% + mutate(`Number` = "", `Percent` = "") %>% mutate(across(everything(), as.character)) %>% kbl("html", align = "c", escape = FALSE) %>% kable_classic(html_font = "sans-serif") %>% @@ -742,7 +748,7 @@ Now we can simply sort by similarity score to get the observations most similar ```{r} ames_example_props %>% dplyr::slice(1) %>% - mutate(`Sim. Score` = " - ") %>% + mutate(`Sim. Score` = "") %>% select(-`Pred. 
Value`) %>% kbl("html", align = "c", escape = FALSE) %>% kable_classic(html_font = "sans-serif") %>% From de4166484ad9306d9216eef0f655a2c92d28a9e5 Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Tue, 1 Aug 2023 17:28:42 +0000 Subject: [PATCH 5/6] Add dependencies for vignette --- DESCRIPTION | 2 ++ README.Rmd | 4 +++- README.md | 12 +++++++----- vignettes/finding-comps.rmd | 6 +++++- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6dd299f..6c558a1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,10 +28,12 @@ Suggests: DiagrammeR, hardhat, jsonlite, + kableExtra, knitr, lintr, modeldata, pkgdown, + purrr, readr, recipes, rmarkdown, diff --git a/README.Rmd b/README.Rmd index 39ca435..da9f54b 100644 --- a/README.Rmd +++ b/README.Rmd @@ -68,7 +68,9 @@ Here is a quick example using `lightsnip` with a Tidymodels cross-validation wor library(dplyr) library(lightgbm) library(lightsnip) -library(tidymodels) +library(parsnip) +library(recipes) +library(workflows) # Create a dataset for training mtcars_train <- mtcars %>% diff --git a/README.md b/README.md index f8e3d28..6315530 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,9 @@ cross-validation workflow: library(dplyr) library(lightgbm) library(lightsnip) -library(tidymodels) +library(parsnip) +library(recipes) +library(workflows) # Create a dataset for training mtcars_train <- mtcars %>% @@ -134,7 +136,7 @@ mtcars_test %>% | | actual_mpg | pred_mpg | |:---------------|-----------:|---------:| -| Ford Pantera L | 15.8 | 13.72 | -| Ferrari Dino | 19.7 | 21.29 | -| Maserati Bora | 15.0 | 13.65 | -| Volvo 142E | 21.4 | 22.04 | +| Ford Pantera L | 15.8 | 14.08 | +| Ferrari Dino | 19.7 | 21.20 | +| Maserati Bora | 15.0 | 13.72 | +| Volvo 142E | 21.4 | 22.12 | diff --git a/vignettes/finding-comps.rmd b/vignettes/finding-comps.rmd index 8be163c..516f24a 100644 --- a/vignettes/finding-comps.rmd +++ b/vignettes/finding-comps.rmd @@ -21,9 +21,13 @@ library(dplyr) library(kableExtra) library(lightgbm) library(lightsnip) +library(parsnip) +library(purrr) +library(recipes) library(scales) library(snakecase) -library(tidymodels) +library(tidyr) +library(workflows) # Function to plot LightGBM trees. Sourced originally from: # https://github.com/microsoft/LightGBM/issues/1222#issuecomment-1134401696 From 4dfe751b6266d3ca63cdf7c8cbb048a3864dbac4 Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Tue, 1 Aug 2023 17:58:46 +0000 Subject: [PATCH 6/6] Add PR suggestions --- vignettes/finding-comps.rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/finding-comps.rmd b/vignettes/finding-comps.rmd index 516f24a..d93893a 100644 --- a/vignettes/finding-comps.rmd +++ b/vignettes/finding-comps.rmd @@ -205,7 +205,7 @@ ames_tree_0 %>%
-Each split has a _property variable_ (`gr_liv_area`) and a _rule_ (`<= 1224.5`). The **bold** arrow points to the node you should go to when the rule is true about a given property. For example, at Split 4 in the tree above, properties with a living area of less than `1224.5` should proceed to Split 9.
+Each split has a _property variable_ (e.g., `gr_liv_area`) and a _rule_ (`<= 1224.5`). The **bold** arrow points to the node you should go to when the rule is true about a given property. For example, at Split 4 in the tree above, properties with a living area of less than `1224.5` should proceed to Split 9.
 
 Each leaf shows the value a property will get from the tree after going through all the splits. For example, the Ames property below (ID = 2) follows the purple path through through the tree based on its characteristics. Its predicted value from the tree is $184,306, as shown at the terminal leaf node (Leaf 4).
 
@@ -708,7 +708,7 @@ ames_tree_weights %>%
   row_spec(1, align = "right")
 ```
 
-These weights are then multiplied row-wise by the boolean matching matrix from above. The final similarity score for our target property (ID = 2) is the _sum of each comparable row_.
+These weights are then multiplied row-wise by the boolean matching matrix from above. This means that for each tree and comparable, the weight value is either kept (when matching / `TRUE`) or zeroed out (when not matching / `FALSE`). The final similarity score for our target property (ID = 2) is the _sum of each comparable row_.
 
 ```{r, message=FALSE}
 ames_final_matrix <- matrix(FALSE, nrow = 10, ncol = 10)
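# A standalone sketch of the "doing it at scale" idea raised in the
# conclusion above: one leaf matrix plus a weighted comparison per tree
# gives every property's similarity to every other property in one pass.
# Assumes `ames_fit_eng_10`, `ames_test_prep`, and `ames_tree_weights`
# from earlier chunks; `pairwise_sim` is an illustrative name.
leaf_mat <- predict(
  ames_fit_eng_10,
  data = as.matrix(ames_test_prep),
  predleaf = TRUE
)
n_props <- nrow(leaf_mat)
pairwise_sim <- matrix(0, n_props, n_props)
for (k in seq_len(ncol(leaf_mat))) {
  # 1 where properties i and j share a leaf in tree k, scaled by tree weight
  pairwise_sim <- pairwise_sim +
    ames_tree_weights[k] * outer(leaf_mat[, k], leaf_mat[, k], "==")
}
# pairwise_sim[i, j] is the similarity score between properties i and j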