-
Notifications
You must be signed in to change notification settings - Fork 2
/
gompertztrunc_simu.R
158 lines (137 loc) · 5.14 KB
/
gompertztrunc_simu.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#' Simulate Gompertzian death distribution
#'
#'
#'
#' @param n sample size
#' @param formula estimation formula
#' @param coefs named vectors of coefficients and corresponding true values
#' @param dummy vector flags for each coefficient
#' @param sigma standard deviation for each variable
#' @param seed random seed to duplicate data
#' @param a0 Gompertz alpha parameter
#' @param b Gompertz b parameter
#' @param verbose print internal check if true
#'
#' @return dataframe of simulated death ages and covariate values
#'
#' @examples
#' gompertztrunc_simu(n=1000, formula = death_age ~ sex + ambient_temp,
#' coefs = c('sex'=-0.8, 'ambient_temp'=0.3), dummy=c(TRUE,FALSE))
#'
#' @importFrom rlang .data
#' @export
gompertztrunc_simu <- function(n, ## sample size
formula, ## formula "death_age ~ sex + educ"
coefs, ## "true" values for coefficients
dummy = NULL, ## dummy flag for each var (optional)
sigma = NULL, ## sd for each var (optional)
seed = NULL, ## random seed to duplicate data
a0 = 10^-4, ## gompertz "alpha"
b = 1 / 10, ## gompertz "b"
verbose = FALSE) ## print internal check
{
## proportional hazards model of form
## h(x) = exp(b*x) * a0 * exp(beta1*x1 + beta2*x2 + ...)
## = exp(b*x) * exp(beta0 + beta1*x1 + beta2*x2 + ...)
## = exp( b * x) * exp(X %*% Beta)
## with a0 = exp(b0)
## X = design matrix of vars
## Beta = vector of coefs
## 0. Preliminaries
coefs <- c("b0" = log(a0), coefs) ## include log(a0) as intercept
## check to see if right number of coefs with correct labels
myterms <- stats::terms(formula) ## complex object with lots of info from formula
coef.names.from.formula <- attr(myterms, "term.labels")
##
if (!identical(names(coefs)[-1], coef.names.from.formula)) {
stop(c(
"term labels not correct. should be: ",
paste(coef.names.from.formula, collapse = " ")
))
}
## check to see if "dummy" has right number of terms
## get names of vars to simulate
vars.to.sim <- all.vars(formula[[3]])
k <- length(vars.to.sim)
if (!is.null(dummy) & length(dummy) != k) {
warning("incorrect length of dummy argument:\n
should equal number of vars in formula.\n
E.g., in y ~ x1 + x2 + x1:x2, there are 2 vars")
}
## 1. simulate covariates (e.g., x1, x2, and x3)
## initialize data frame
data <- data.frame(matrix(NA, nrow = n, ncol = k))
names(data) <- vars.to.sim
if (!is.null(seed)) {
set.seed(seed)
}
for (i in 1:k)
{
if (is.null(sigma[i])) {
data[, i] <- stats::rnorm(n, mean = 0, sd = 1)
} ## standardized
if (!is.null(sigma[i])) {
data[, i] <- stats::rnorm(n, mean = 0, sd = sigma[i])
} ## own variance
}
## create dummy vars (for now just random 0,1 coding)
if (!is.null(dummy)) {
for (i in 1:k)
{
if (dummy[i] == TRUE) {
data[, i] <- ifelse(data[, i] > 0, 1, 0)
}
}
}
## 2. get design matrix
form.without.y <- paste(formula[[1]], format(formula[[3]]))
form.without.y <- stats::as.formula(form.without.y)
mat <- stats::model.matrix(form.without.y, data)
## head(mat)
## (Intercept) x1 x2 x3 x2:x3
## 1 1 -0.96193342 1 -1.2913493 -1.2913493
## 2 1 -0.29252572 1 2.6350454 2.6350454
## 3 1 0.25878822 1 0.4870723 0.4870723
## 4 1 -1.15213189 0 0.8538923 0.0000000
## 5 1 0.19578283 0 1.0884427 0.0000000
## 6 1 0.03012394 1 0.2260140 0.2260140
## label design matrix "X"
X <- mat
## 3. multiply out to get individual effects
Beta <- cbind(coefs)
log.effects <- X %*% Beta
effects <- exp(log.effects)
## 4. generate gompertz draws
y <- flexsurv::rgompertz(n, shape = b, rate = effects) ## ages of death
data_with_y <- cbind(y, data)
## re-label "y" with name used on left hand side of formula
left.string <- deparse(formula[[2]])
colnames(data_with_y)[1] <- left.string
right.string <- format(formula[[3]])
## 5. check with lm using -10*B as entropy approximation
## Since data is untruncated, we can use result that
## (1) d e0 / d haz \approx -1/b
## A coefficient value of b3 = 0.20, increases hazards by a factor
## exp(b3) = exp(.2) \approx 1.2,
## which is like increasing hazards (d haz) by +20%.
## From (1), this means life expectancy should go down
## by about -0.2/b \approx -.2 * 10, or about 2 years.
## (Note: this approximation will not work for truncated data!)
lm.form <- stats::update(formula, paste(left.string, " ~ ."))
m <- stats::lm(lm.form, data_with_y)
Beta.vec <- as.vector(Beta) ## true coefs as vector
names(Beta.vec) <- rownames(Beta) ## add rownames
coef.m.hat <- Beta.vec / (-b)
check.dt <- data.table::data.table(
"coef.name" = names(stats::coef(m)),
"coef(m)" = round(stats::coef(m), 3),
"coef.m.hat" = round(coef.m.hat, 3),
"beta.name" = names(Beta.vec),
Beta = round(Beta.vec, 3)
)
if (verbose) {
print(check.dt)
}
simu_data <- data.table::as.data.table(data_with_y)
return(simu_data)
}