/
make-newdata.R
379 lines (315 loc) · 12.1 KB
/
make-newdata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
#' Extract information of the sample contained in a data set
#'
#' Given a data set and grouping variables, this function returns mean values
#' for numeric variables and modus for characters and factors. Usually
#' this function should not be called directly but will rather be called
#' as part of a call to \code{make_newdata}.
#'
#' @rdname sample_info
#' @param x A data frame (or object that inherits from \code{data.frame}).
#' @importFrom stats median
#' @return A data frame containing sample information (for each group).
#' If applied to an object of class \code{ped}, the sample means of the
#' original data is returned.
#' Note: When applied to a \code{ped} object, that doesn't contain covariates
#' (only interval information), returns data frame with 0 columns.
#'
#' @export
#' @keywords internal
sample_info <- function(x) {
UseMethod("sample_info", x)
}
#' @import checkmate dplyr
#' @importFrom purrr compose
#' @export
#' @rdname sample_info
sample_info.data.frame <- function(x) {
cn <- colnames(x)
num <- summarize_if (x, .predicate = is.numeric, ~mean(., na.rm = TRUE))
fac <- summarize_if (x, .predicate = compose("!", is.numeric), modus)
nnames <- intersect(names(num), names(fac))
if (length(nnames) != 0) {
suppressMessages(
x <- left_join(num, fac) %>% group_by(!!!lapply(nnames, as.name))
)
} else {
x <- bind_cols(num, fac)
}
return(select(x, one_of(cn)))
}
#' @rdname sample_info
#' @import checkmate dplyr
#' @importFrom rlang sym
#' @export
sample_info.ped <- function(x) {
# is.grouped_df
# remove "noise" information on interval variables
grps <- group_vars(x)
iv <- attr(x, "intvars")
id_var <- attr(x, "id_var")
x <- x %>%
group_by(!!sym(id_var)) %>%
slice(1) %>%
ungroup() %>%
grouped_df(grps) %>%
select(-one_of(iv))
if (test_data_frame(x, min.rows = 1, min.cols = 1)) {
sample_info.data.frame(x)
} else {
NULL
}
}
#' @rdname sample_info
#' @export
sample_info.fped <- function(x) {
x %>% select_if(~!is.matrix(.x)) %>% sample_info.ped()
}
#' Create a data frame from all combinations of data frames
#'
#' Works like \code{\link[base]{expand.grid}} but for data frames.
#'
#' @importFrom dplyr slice bind_cols
#' @importFrom vctrs vec_c
#' @importFrom purrr map map_lgl
#' @importFrom checkmate test_data_frame
#' @param ... Data frames that should be combined to one data frame.
#' Elements of first df vary fastest, elements of last df vary slowest.
#' @examples
#' combine_df(
#' data.frame(x=1:3, y=3:1),
#' data.frame(x1=c("a", "b"), x2=c("c", "d")),
#' data.frame(z=c(0, 1)))
#' @export
#' @keywords internal
combine_df <- function(...) {
dots <- list(...)
if (!all(sapply(dots, test_data_frame))) {
stop("All elements in ... must inherit from data.frame!")
}
ind_seq <- map(dots, ~ seq_len(nrow(.x)))
not_empty <- map_lgl(ind_seq, ~ length(.x) > 0)
ord <- lapply(dots[not_empty], function(z) colnames(z)) |> unlist()
out <- do.call(expand_grid, rev(dots[not_empty]))
out <- out[, ord]
}
#' Construct a data frame suitable for prediction
#'
#' This functions provides a flexible interface to create a data set that
#' can be plugged in as \code{newdata} argument to a suitable \code{predict}
#' function (or similar).
#' The function is particularly useful in combination with one of the
#' \code{add_*} functions, e.g., \code{\link[pammtools]{add_term}},
#' \code{\link[pammtools]{add_hazard}}, etc.
#'
#' @rdname newdata
#' @aliases make_newdata
#' @importFrom tidyr expand_grid
#' @inheritParams sample_info
#' @param ... Covariate specifications (expressions) that will be evaluated
#' by looking for variables in \code{x}. Must be of the form \code{z = f(z)}
#' where \code{z} is a variable in the data set and \code{f} a known
#' function that can be usefully applied to \code{z}. Note that this is also
#' necessary for single value specifications (e.g. \code{age = c(50)}).
#' For data in PED (piece-wise exponential data) format, one can also specify
#' the time argument, but see "Details" an "Examples" below.
#' @import dplyr
#' @importFrom checkmate assert_data_frame assert_character
#' @importFrom purrr map
#' @importFrom tidyr expand_grid
#' @details Depending on the type of variables in \code{x}, mean or modus values
#' will be used for variables not specified in ellipsis
#' (see also \code{\link[pammtools]{sample_info}}). If \code{x} is an object
#' that inherits from class \code{ped}, useful data set completion will be
#' attempted depending on variables specified in ellipsis. This is especially
#' useful, when creating a data set with different time points, e.g. to
#' calculate survival probabilities over time (\code{\link[pammtools]{add_surv_prob}})
#' or to calculate a time-varying covariate effects (\code{\link[pammtools]{add_term}}).
#' To do so, the time variable has to be specified in \code{...}, e.g.,
#' \code{tend = seq_range(tend, 20)}. The problem with this specification is that
#' not all values produced by \code{seq_range(tend, 20)} will be actual values
#' of \code{tend} used at the stage of estimation (and in general, it will
#' often be tedious to specify exact \code{tend} values). \code{make_newdata}
#' therefore finds the correct interval and sets \code{tend} to the respective
#' interval endpoint. For example, if the intervals of the PED object are
#' \eqn{(0,1], (1,2]} then \code{tend = 1.5} will be set to \code{2} and the
#' remaining time-varying information (e.g. offset) completed accordingly.
#' See examples below.
#' @examples
#' # General functionality
#' tumor %>% make_newdata()
#' tumor %>% make_newdata(age=c(50))
#' tumor %>% make_newdata(days=seq_range(days, 3), age=c(50, 55))
#' tumor %>% make_newdata(days=seq_range(days, 3), status=unique(status), age=c(50, 55))
#' # mean/modus values of unspecified variables are calculated over whole data
#' tumor %>% make_newdata(sex=unique(sex))
#' tumor %>% group_by(sex) %>% make_newdata()
#'
#' # Examples for PED data
#' ped <- tumor %>% slice(1:3) %>% as_ped(Surv(days, status)~., cut = c(0, 500, 1000))
#' ped %>% make_newdata(age=c(50, 55))
#'
#' # if time information is specified, other time variables will be specified
#' # accordingly and offset calculated correctly
#' ped %>% make_newdata(tend = c(1000), age = c(50, 55))
#' ped %>% make_newdata(tend = unique(tend))
#' ped %>% group_by(sex) %>% make_newdata(tend = unique(tend))
#'
#' # tend is set to the end point of respective interval:
#' ped <- tumor %>% as_ped(Surv(days, status)~.)
#' seq_range(ped$tend, 3)
#' make_newdata(ped, tend = seq_range(tend, 3))
#' @export
make_newdata <- function(x, ...) {
UseMethod("make_newdata", x)
}
#' @inherit make_newdata
#' @rdname newdata
#' @export
make_newdata.default <- function(x, ...) {
assert_data_frame(x, all.missing = FALSE, min.rows = 2, min.cols = 1)
orig_names <- names(x)
expressions <- quos(...)
expr_evaluated <- map(expressions, lazyeval::f_eval, data = x) |>
map(c)
# construct data parts depending on input type
lgl_atomic <- map_lgl(expr_evaluated, is_atomic)
# part1 <- expr_evaluated[lgl_atomic] |> cross_df()
part1 <- do.call(tidyr::expand_grid, rev(expr_evaluated[lgl_atomic]))
part2 <- do.call(combine_df, expr_evaluated[!lgl_atomic])
ndf <- combine_df(part1, part2)
rest <- x %>% select(-one_of(c(colnames(ndf))))
if (ncol(rest) > 0) {
si <- sample_info(rest) %>% ungroup()
ndf <- combine_df(si, ndf)
}
ndf %>% select(one_of(orig_names))
}
#' @rdname newdata
#' @inherit make_newdata.default
#' @export
make_newdata.ped <- function(x, ...) {
assert_data_frame(x, all.missing = FALSE, min.rows = 2, min.cols = 1)
# prediction time points have to be interval end points so that piece-wise
# constancy of predicted hazards is respected. If user overrides this, warn.
orig_vars <- names(x)
int_df <- int_info(x)
expressions <- quos(...)
dot_names <- names(expressions)
int_names <- names(int_df)
# x <- select(x, -one_of(setdiff(int_names, c(dot_names, "intlen", "intmid"))))
ndf <- x %>%
select(-one_of(setdiff(int_names, c(dot_names, "intlen", "intmid")))) %>%
unped() %>%
make_newdata(...)
if (any(names(int_df) %in% names(ndf))) {
int_tend <- get_intervals(x, ndf$tend)$tend
if (!all(ndf$tend == int_tend)) {
message("Some values of 'tend' have been set to the respective interval end-points")
}
ndf$tend <- int_tend
suppressMessages(
ndf <- ndf %>% left_join(int_df)
)
} else {
ndf <- combine_df(int_df[1, ], ndf)
}
int_names <- intersect(int_names, c("intlen", orig_vars))
ndf %>% select(one_of(c(int_names, setdiff(orig_vars, int_names)))) %>%
mutate(
intlen = .data$tend - .data$tstart,
offset = log(.data$tend - .data$tstart),
ped_status = 0)
}
#' @rdname newdata
#' @inherit make_newdata.ped
#' @importFrom rlang quos
#' @export
make_newdata.fped <- function(x, ...) {
assert_data_frame(x, all.missing = FALSE, min.rows = 2, min.cols = 1)
# prediction time points have to be interval end points so that piece-wise
# constancy of predicted hazards is respected. If user overrides this, warn.
expressions <- quos(...)
dot_names <- names(expressions)
orig_vars <- names(x)
cumu_vars <- setdiff(unlist(attr(x, "func_mat_names")), dot_names)
cumu_smry <- smry_cumu_vars(x, attr(x, "time_var")) %>%
select(one_of(cumu_vars))
int_names <- attr(x, "intvars")
ndf <- x %>%
select(one_of(setdiff(names(x), cumu_vars))) %>%
unfped() %>%
make_newdata(...)
out_df <- do.call(combine_df, compact(list(cumu_smry, ndf)))
int_df <- int_info(attr(x, "breaks"))
suppressMessages(
out_df <- out_df %>% left_join(int_df) %>%
select(-one_of(c("intmid"))) %>% as_tibble()
)
# adjust lag-lead indicator
out_df <- adjust_ll(out_df, x)
out_df
}
smry_cumu_vars <- function(data, time_var) {
cumu_vars <- unlist(attr(data, "func_mat_names"))
func_list <- attr(data, "func")
z_vars <- map(func_list, ~get_zvars(.x, time_var, length(func_list))) %>%
unlist()
smry_z <- select(data, one_of(z_vars)) %>%
map(~ .x[1, ]) %>% map(~mean(unlist(.x))) %>% bind_cols()
smry_time <- select(data, setdiff(cumu_vars, z_vars)) %>% map(~.x[1, 1])
bind_cols(smry_z, smry_time)
}
get_zvars <- function(func, time_var, n_func) {
col_vars <- func$col_vars
all_vars <- make_mat_names(c(col_vars, "LL"), func$latency_var, func$tz_var,
func$suffix, n_func)
time_vars <- make_mat_names(c(time_var, func$tz_var, "LL"),
func$latency_var, func$tz_var, func$suffix, n_func)
setdiff(all_vars, time_vars)
}
## apply ll_fun to newly created data
adjust_ll <- function(out_df, data) {
func_list <- attr(data, "func")
n_func <- length(func_list)
LL_names <- grep("LL", unlist(attr(data, "func_mat_names")), value = TRUE)
for (i in LL_names) {
ind_ll <- map_lgl(names(attr(data, "ll_funs")), ~grepl(.x, i))
if (any(ind_ll)) {
ind_ll <- which(ind_ll)
} else {
ind_ll <- 1
}
func <- func_list[[ind_ll]]
ll_i <- attr(data, "ll_funs")[[ind_ll]]
tz_var <- attr(data, "tz_vars")[[ind_ll]]
tz_var <- make_mat_names(tz_var, func$latency_var, func$tz_var, func$suffix,
n_func)
if (func$latency_var == "") {
out_df[[i]] <- ll_i(out_df[["tend"]], out_df[[tz_var]]) * 1L
} else {
out_df[[i]] <- ll_i(out_df[["tend"]], out_df[["tend"]] -
out_df[[tz_var]]) * 1L
}
}
out_df
}
# All variables that represent follow-up time should have the same values
# adjust_time_vars <- function(out_df, data, dot_names) {
# time_vars <- c("tend",
# grep(attr(data, "time_var"), unlist(attr(data, "func_mat_names")), value=TRUE))
# time_vars_dots <- c(grep("tend", dot_names, value=TRUE),
# grep(attr(data, "time_var"), dot_names, value=TRUE))
# if (length(time_vars_dots) == 0) {
# time_vars_dots <- "tend"
# } else {
# if (length(time_vars_dots) > 1) {
# warning(paste0("Only one of ", paste0(time_vars_dots, collapse=", "),
# "must be specified. Only the first one will be used!"))
# time_vars_dots <- time_vars_dots[1]
# }
# }
# for (i in setdiff(time_vars, time_vars_dots)) {
# out_df[[i]] <- out_df[[time_vars_dots]]
# }
# out_df
# }