-
Notifications
You must be signed in to change notification settings - Fork 1
/
pem_vs_pam.Rmd
172 lines (135 loc) · 4.87 KB
/
pem_vs_pam.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
---
title: "PEM vs. PAM split point sensitivity"
site: workflowr::wflow_site
output:
workflowr::wflow_html:
toc: true
---
```{r setup, message = FALSE}
library(ggplot2)
theme_set(theme_bw())
library(batchtools)
```
## Motivation
This is a light-weight simulation study to investigate how sensitive the different
approaches (PEM vs. PAM) to the estimation of the baseline-hazard function are
to the placement of the interval split points.
## Setup
The setup is as follows:
- $n=250$ survival times are simulated from a distribution with log-hazard
$-3.5 + f(8,2)*6$, where $f(8,2)$ is the density function of the Gamma distribution
with respective parameters.
- The baseline hazard is estimated by a PEM and PAM respectively
- Three different settings are used for the interval split point definition
1. "default": Unique event times from each simulated data set is used
2. "fine": A fine, equidistant grid with interval lengths $0.2$
3. "rough": A rough, equidistant grid with interval lengths $0.5$
- For each setting, 20 replications are run
Function for data simulation (using `pammtools::sim_pexp`):
```{r}
## simulation function
sim_wrapper <- function(data, job, n = 250, time_grid = seq(0, 10, by = 0.05)) {
# create data set with covariates
df <- tibble::tibble(x1 = runif(n, -3, 3), x2 = runif(n, 0, 6))
# baseline hazard
f0 <- function(t) {dgamma(t, 8, 2) * 6}
# define function that generates nz exposures z(t_{z,1}), ..., z(t_{z,Q})
sim_pexp(formula = ~ -3.5 + f0(t), data = df, cut = time_grid)
}
```
Function to estimate hazard from simulated data, either by a PEM or PAM
```{r}
## estimation function
pam_wrapper <- function(data, job, instance,
cut = NA,
bs = "ps",
mod_type = c("pem", "pam") ,
max_time = 10) {
if(is.na(cut)) {
cut <- NULL
} else {
if(cut == "rough") {
cut <- seq(0, max_time, by = 0.5)
} else {
if(cut == "fine") {
cut <- seq(0, max_time, by = 0.2)
}
}
}
ped <- as_ped(data = instance, formula = Surv(time, status) ~ ., cut = cut, id="id")
form <- "ped_status ~ s(tend) + s(x1) + s(x2)"
if(mod_type == "pem") {
form <- ped_status ~ interval
time_var <- "interval"
} else {
form <- ped_status ~ s(tend, bs = bs, k = 10)
time_var <- "tend"
}
mod <- gam(formula = form, data = ped, family = poisson(), offset = offset, method = "REML")
# summary(mod)
make_newdata(ped, tend=unique(tend)) %>%
add_hazard(mod, type="link", se_mult = qnorm(0.975), time_var = time_var) %>%
mutate(truth = -3.5 + dgamma(tend, 8, 2) * 6)
}
```
## Setup simulation registry
Setup simulation using `batchtools`:
```{r, cache = TRUE, message = FALSE}
if(!checkmate::test_directory_exists("output/sim-pem-vs-pam-registry")) {
reg <- makeExperimentRegistry("output/sim-pem-vs-pam-registry",
packages = c("mgcv", "dplyr", "tidyr", "pammtools"),
seed = 20052018)
reg$cluster.functions = makeClusterFunctionsMulticore(ncpus = 2)
addProblem(name = "pem-vs-pam", fun = sim_wrapper)
addAlgorithm(name = "pem-vs-pam", fun = pam_wrapper)
algo_df <- tidyr::crossing(
cut = c(NA, "fine", "rough"),
mod_type = c("pem", "pam"))
addExperiments(algo.design = list("pem-vs-pam" = algo_df), repls = 20)
submitJobs()
waitForJobs()
}
```
## Evaluate Simulation
```{r, message = FALSE, warning=FALSE}
reg <- loadRegistry("output/sim-pem-vs-pam-registry", writeable = TRUE)
ids_pam <- findExperiments(prob.name="pem-vs-pam", algo.name="pem-vs-pam")
pars <- unwrap(getJobPars()) %>% as_tibble()
res <- reduceResultsDataTable(ids=findDone(ids_pam)) %>%
as_tibble() %>%
tidyr::unnest() %>%
left_join(pars) %>%
mutate(cut = case_when(is.na(cut) ~ "default", TRUE ~ cut))
res %>%
mutate(
sq_error = (truth - hazard)^2,
covered = (truth >= ci_lower) & (truth <= ci_upper)) %>%
group_by(job.id, mod_type, cut) %>%
summarize(
RMSE = sqrt(mean(sq_error)),
coverage = mean(covered)) %>%
group_by(mod_type, cut) %>%
summarize(
RMSE = mean(RMSE),
coverage = mean(coverage))
```
## Visualize Estimations
```{r}
ggplot(res, aes(x=tend, y = hazard)) +
geom_step(aes(group = job.id), alpha = 0.3) +
geom_line(aes(y = truth, col = "truth"), lwd = 2) +
facet_grid(cut ~ mod_type) +
coord_cartesian(ylim=c(-5, 0)) +
geom_smooth(aes(col="average estimate"), method="gam", formula = y ~ s(x),
se=FALSE) +
scale_color_brewer("", palette = "Dark2") +
xlab("time")
```
## Conclusion
- For the PAM, the RMSE has about the same magnitude for all three split point
settings
- For the PEM, the RMSE is highly dependent on the RMSE, partly because
even for the "rough" split point setting, in some simulations some intervals
have no events and the hazard is estimated close to zero (very small log-hazard values)
and for the "default" setting, where each interval contains at least one
event, appears to overestimate the hazard on average