Andrew Gelman, Jennifer Hill, Aki Vehtari 2021-06-23
Tidyverse version by Bill Behrman.
Ordered categorical data analysis with a study from experimental economics, on the topic of “storable votes”. See Chapter 15 in Regression and Other Stories.
# Packages
library(tidyverse)
library(rstanarm)
# Parameters
# Data directory
dir_data <- here::here("Storable/data")
# People to model
people <-
tribble(
~person, ~behavior,
101, "Perfectly monotonic",
303, "One fuzzy and one sharp cutpoint",
409, "Monotonic with one outlier",
405, "Only 1's and 3's",
504, "Almost only 3's",
112, "Erratic"
)
# Common code
file_common <- here::here("_common.R")
#===============================================================================
# Run common code
source(file_common)
Data
data <-
dir_data %>%
fs::dir_ls(regexp = "/\\dplayergames.csv") %>%
map_dfr(read_csv) %>%
select(person, value, vote) %>%
mutate(vote_ord = ordered(vote))
summary(data)
#> person value vote vote_ord
#> Min. :101 Min. : 1.0 Min. :1.00 1:940
#> 1st Qu.:309 1st Qu.: 25.0 1st Qu.:1.00 2:810
#> Median :514 Median : 49.0 Median :2.00 3:759
#> Mean :479 Mean : 50.1 Mean :1.93
#> 3rd Qu.:614 3rd Qu.: 76.0 3rd Qu.:3.00
#> Max. :716 Max. :100.0 Max. :3.00
EDA
data %>%
count(person, name = "n_rows") %>%
count(n_rows)
#> # A tibble: 3 x 2
#> n_rows n
#> <int> <int>
#> 1 20 10
#> 2 29 1
#> 3 30 76
There are 87 people in the data, 76 have 30 observations each, 1 has 29, and 10 have 20.
Votes cast by value.
data %>%
ggplot(aes(value)) +
stat_ydensity(
aes(y = vote_ord),
draw_quantiles = c(0.25, 0.5, 0.75),
scale = "count"
) +
geom_smooth(aes(y = vote)) +
labs(
title = "Votes cast by value",
x = "Value",
y = "Votes cast"
)
Low values are associated with one vote, middle values are associated with two votes, and high values are associated with three votes.
Fit model for person 401.
set.seed(768)
fit_401 <-
stan_polr(
vote_ord ~ value,
data = data %>% filter(person == 401),
refresh = 0,
prior = R2(location = 0.3, what = "mean")
)
print(fit_401, digits = 2)
#> stan_polr
#> family: ordered [logistic]
#> formula: vote_ord ~ value
#> observations: 20
#> ------
#> Median MAD_SD
#> value 0.09 0.03
#>
#> Cutpoints:
#> Median MAD_SD
#> 1|2 2.74 1.42
#> 2|3 5.86 2.23
#>
#> ------
#> * For help interpreting the printed output see ?print.stanreg
#> * For info on the priors used see ?prior_summary.stanreg
Fit model to people in people
.
set.seed(768)
fit_people <-
data %>%
filter(person %in% people$person) %>%
nest(data = !person) %>%
rowwise() %>%
mutate(
fit =
list(
stan_polr(
vote_ord ~ value,
data = data,
refresh = 0,
prior = R2(location = 0.3, what = "mean"),
adapt_delta = 0.9999
)
)
)
Calculate cutpoints and sigma for each person.
data_people <-
fit_people %>%
mutate(
sims = list(as_tibble(fit)),
c_1.5 = median(sims$`1|2` / sims$value),
c_2.5 = median(sims$`2|3` / sims$value),
sigma = median(1 / sims$value)
) %>%
ungroup() %>%
select(!c(fit, sims)) %>%
mutate(person = factor(person, levels = people$person))
Calculate expected response curves.
expected <- function(x, c_1.5, c_2.5, sigma) {
p_1.5 <- plogis((x - c_1.5) / sigma)
p_2.5 <- plogis((x - c_2.5) / sigma)
1 * (1 - p_1.5) + 2 * (p_1.5 - p_2.5) + 3 * p_2.5
}
curves <-
data_people %>%
select(!data) %>%
mutate(
x = list(seq_range(range(data$value))),
y = pmap(list(x, c_1.5, c_2.5, sigma), expected)
) %>%
select(person, x, y) %>%
unnest(cols = c(x, y))
Cutpoint segments.
segments <-
data_people %>%
select(person, c_1.5, c_2.5) %>%
pivot_longer(cols = !person, names_to = "cutpoint", values_to = "x") %>%
mutate(
y = str_extract(cutpoint, pattern = "[12]") %>% as.double(),
yend = y + 1
) %>%
filter(x >= 1, x <= 100)
Example individuals from storable votes study.
person_label <- function(x) {
people %>%
filter(person == x) %>%
pull(behavior)
}
data_people %>%
unnest(data) %>%
ggplot() +
geom_segment(
aes(x = x, xend = x, y = y, yend = yend),
data = segments,
color = "grey60"
) +
geom_line(aes(x, y), data = curves) +
geom_count(aes(value, vote)) +
facet_wrap(
facets = vars(person),
ncol = 2,
labeller = labeller(person = person_label)
) +
scale_y_continuous(breaks = 1:3, minor_breaks = NULL) +
scale_size(range = c(1, 3), guide = "none") +
labs(
title = "Example individuals from storable votes study",
x = "Value",
y = "Votes cast"
)