/
mcar.R
518 lines (501 loc) · 19.1 KB
/
mcar.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
#' Jamshidian and Jalal's Non-Parametric MCAR Test
#'
#' Test whether missingness is contingent upon the observed variables,
#' according to the methodology developed by Jamshidian and Jalal (2010) (see
#' Details).
#' @param x An object for which a method exists; usually a `data.frame`.
#' @param imputed Either an object of class `mids`, as returned by
#' [mice::mice()], or a list of `data.frame`s.
#' @param min_n Atomic numeric, must be greater than 1. When there are missing
#' data patterns with fewer than `min_n` cases, all cases with that pattern will
#' be removed from `x` and `imputed`.
#' @param method Atomic character. If it is known (or assumed) that data are
#' either multivariate normally distributed or not, then use either
#' `method = "hawkins"` or `method = "nonparametric"`, respectively.
#' The default argument `method = "auto"` follows the procedure outlined in the
#' Details section, and in Figure 7 of Jamshidian and Jalal (2010).
#' @param replications Number of replications used to simulate the Neyman
#' distribution when performing Hawkins' test. As this method is based on random
#' sampling, use a high number of `replications` (and optionally,
#' [set.seed()]) to minimize Monte Carlo error and ensure reproducibility.
#' @param use_chisq Atomic integer, indicating the minimum number of cases
#' within a group *k* that triggers the use of asymptotic Chi-square
#' distribution instead of the emprical distribution in the Neyman uniformity
#' test, which is performed as part of Hawkins' test.
#' @param alpha Atomic numeric, indicating the significance level of tests.
#' @details Three types of missingness have been distinguished in the literature
#' (Rubin, 1976):
#' Missing completely at random (MCAR), which means that missingness is random;
#' missing at random (MAR), which means that missingness is contingent on the
#' *observed*;
#' and missing not at random (MNAR), which means that missingness is related to
#' unobserved data.
#'
#' Jamshidian and Jalal's non-parametric MCAR test assumes that the missing data
#' are either MCAR or MAR, and tests whether the missingness is independent of
#' the observed values. If so, the covariance matrices of the imputed data will
#' be equal accross groups with different patterns of missingness. This test
#' consists of the following procedure:
#' \enumerate{
#' \item Data are imputed.
#' \item The imputed data are split into *k* groups according to the
#' *k* missing data patterns in the original data (see
#' [mice::md.pattern()]).
#' \item Perform Hawkins' test for equality of covariances across the *k*
#' groups.
#' \item If the test is *not significant*, conclude that there is no evidence
#' against multivariate normality of the data, nor against MCAR.
#' \item If the test *is significant*, and multivariate normality of the data
#' can be assumed, then it can be concluded that missingness is MAR.
#' \item If multivariate normality cannot be assumed, then perform the
#' Anderson-Darling non-parametric test for equality of covariances across the
#' *k* groups.
#' \item If the Anderson-Darling test is *not significant*, this is evidence
#' against multivariate normality - but no evidence against MCAR.
#' \item If the Anderson-Darling test *is significant*, this is evidence
#' it can be concluded that missingness is MAR.
#' }
#'
#' Note that, despite its name in common parlance, an MCAR test can only
#' indicate whether missingness is MCAR or MAR. The procedure cannot distinguish
#' MCAR from MNAR, so a non-significant result does not rule out MNAR.
#'
#' This is a re-implementation of the function `TestMCARNormality`, which was
#' originally published in the R-packgage `MissMech`, which has been removed
#' from CRAN. This new implementation is faster, as its backend is written in
#' C++. It also enhances the functionality of the original:
#' \itemize{
#' \item Multiply imputed data can now be used; the median p-value and test
#' statistic across replications is then reported, as suggested by
#' Eekhout, Wiel, and Heymans (2017).
#' \item The printing method for an `mcar_object` gives a warning when at
#' least one p-value of either test was significant. In this case, it is
#' recommended to inspect the range of p-values, and consider potential
#' violations of MCAR.
#' \item A plotting method for an `mcar_object` is provided.
#' \item A plotting method for the `$md.pattern` element of an `mcar_object`
#' is provided.
#' }
#'
#' @return An object of class `mcar_object`.
#' @author Caspar J. Van Lissa
#' @references
#' Rubin, D. B. (1976). Inference and Missing Data. Biometrika, Vol. 63, No. 3,
#' pp. 581-592. \doi{10.2307/2335739}
#'
#' Eekhout, I., M. A. Wiel, & M. W. Heymans (2017). Methods for Significance
#' Testing of Categorical Covariates in Logistic Regression Models After
#' Multiple Imputation: Power and Applicability Analysis. BMC Medical Research
#' Methodology 17 (1): 129.
#'
#' Jamshidian, M., & Jalal, S. (2010). Tests of homoscedasticity, normality, and
#' missing completely at random for incomplete multivariate data. Psychometrika,
#' 75(4), 649–674. \doi{10.1007/s11336-010-9175-3}
#' @keywords internal
#' @examples
#' res <- mcar(nhanes)
#' # Examine test results
#' res
#' # Plot p-values across imputed data sets
#' plot(res)
#' # Plot md patterns used for the test
#' plot(res, type = "md.pattern")
#' # Note difference with the raw md.patterns:
#' md.pattern(nhanes)
#' @export
#' @importFrom stats cov pchisq spline
#' @md
mcar <- function(x,
imputed = mice(x, method = "norm"),
min_n = 6,
method = "auto",
replications = 10000,
use_chisq = 30,
alpha = 0.05) {
UseMethod("mcar", x)
}
#' @method mcar data.frame
#' @export
mcar.data.frame <- function(x,
imputed = mice(x, method = "norm"),
min_n = 6,
method = "auto",
replications = 10000,
use_chisq = 30,
alpha = 0.05) {
anyfact <- sapply(x, inherits, what = "factor")
if (any(anyfact)) {
stop("Be advised that this MCAR test has not been validated for categorical variables.")
}
if (min_n < 1) {
stop("Argument 'min_n' must be greater than 1.")
}
out <-
list(
hawk_chisq = NULL,
hawk_df = NULL,
hawk_p = NULL,
ad_value = NULL,
ad_p = NULL,
alpha = alpha,
method = method,
md.pattern = NULL,
removed_rows = NULL,
removed_patterns = NULL
)
if (inherits(imputed, "mids")) {
imputed <- mice::complete(imputed, action = "all")
} else if (inherits(imputed, "list")) {
if (!inherits(imputed[[1]], "data.frame")) {
imputed <- tryCatch(lapply(imputed, data.frame), error = function(e) {
stop("Argument 'imputed' must be a list of data.frames or an object of class 'mids'. Could not coerce argument 'imputed' to class 'data.frame'.")
})
message("Argument 'imputed' must be an object of class 'mids', or a list of 'data.frame's. Coerced argument 'imputed' to data.frame.")
}
}
if (!all(dim(x) == dim(imputed[[1]]))) {
stop("Imputed data must have the same dimensions as the incomplete data.")
}
if (any(is.na(imputed[[1]]))) {
stop("Imputed data contain missing values.")
}
missings <- is.na(x)
rowmis <- rowSums(missings)
colmis <- colSums(missings)
if (any(rowmis == ncol(x))) {
warning("Note that there were some rows with all missing data. Tests may be invalid for these rows. Consider removing them.")
# x <- x[!rowmis == ncol(x), , drop = FALSE]
}
if (any(colmis == nrow(x))) {
stop("Some columns contain all missing data. This will result in invalid results.")
}
univals <- sapply(x, function(i) {
length(unique(i))
})
if (any(univals < 2)) {
stop("Some columns are constant, not variable. This will result in invalid results.")
}
newdata <- x
missings <- is.na(x)
pats <- mice::md.pattern(x, plot = FALSE)
if (nrow(pats) < 4L) {
stop("Two or more missing data patterns are required.")
}
remove_pats <- as.numeric(rownames(pats))[-nrow(pats)] <= min_n
if (any(remove_pats)) {
out$removed_patterns <- pats[which(remove_pats), ]
pats <- pats[-nrow(pats), colnames(missings)]
idmiss <- do.call(paste, as.data.frame(missings))
idpats <- do.call(paste, as.data.frame(pats == 0))
remove_these <- idmiss %in% idpats[remove_pats]
if(all(remove_these)) stop("After dropping missing data patterns with fewer than ", min_n, " cases, there were no remaining valid cases in the dataset. Consider lowering 'min_n', and be cautious about interpreting the results; these data might not be suitable for an MCAR test.")
out$removed_rows <- remove_these
newdata <- x[!remove_these, , drop = FALSE]
imputed <- lapply(imputed, `[`, i = !remove_these, j = colnames(newdata), drop = FALSE)
missings <- is.na(newdata)
pats <- mice::md.pattern(newdata, plot = FALSE)
}
class(pats) <- c("md.pattern", class(pats))
out$md.pattern <- pats
pat_n <- as.numeric(rownames(pats))[-nrow(pats)]
pats <- pats[-nrow(pats), colnames(missings)]
idpats <- idpats[!remove_pats]
idmiss <- do.call(paste, as.data.frame(missings))
which_pat <- apply(sapply(idpats, `==`, idmiss), 1, which)
numpat <- length(pat_n)
if (nrow(newdata) == 0) {
stop("No valid rows of data left.")
}
if (numpat == 1) {
stop("Only one missing data pattern.")
}
if (any(pat_n < 2)) {
stop("At least 2 cases needed in each missing data pattern.")
}
# Perform Hawkins test ----------------------------------------------------
hawklist <- lapply(imputed, function(thisimp) {
hawkins(thisimp, which_pat)
})
if (method %in% c("auto", "hawkins")) {
pvalsn <- sapply(hawklist, function(thisimp) {
sapply(thisimp[["a"]], function(thistail) {
ni <- length(thistail)
pn <- p_neyman(thistail, replications, use_chisq)
pn + (pn == 0) / replications
})
})
out$hawk_chisq <- -2 * colSums(log(pvalsn))
out$hawk_df <- 2 * numpat
out$hawk_p <- pchisq(out$hawk_chisq, out$hawk_df, lower.tail = FALSE)
}
# Perform Anderson-Darling test -------------------------------------------
if ((method == "auto" & any(out$hawk_p < alpha)) | method == "nonparametric") {
adout <- sapply(hawklist, function(thisimp) {
anderson_darling(thisimp[["fij"]]) # First row is p
})
out$ad_value <- colSums(adout[-1, , drop = FALSE])
out$ad_p <- adout[1, ]
}
class(out) <- c("mcar_object", class(out))
out
}
#' @method print mcar_object
#' @export
print.mcar_object <- function(x, ...) {
ni <- x$pat_n
out <- "\nInterpretation of results:\n"
cat("\nMissing data patterns:", (nrow(x$md.pattern) - 1))
if (!is.null(x$removed_patterns)) cat(" used,", nrow(x$removed_patterns), "removed.")
cat("\nCases used:", sum(!x$removed_rows), "\n\n")
if (!is.null(x$hawk_p)) {
if (length(x$hawk_p) > 1) {
hawkp <- median(x$hawk_p)
cat("Hawkins' test: median chi^2 (", x$hawk_df, ") = ", median(x$hawk_chisq), ", median p = ", hawkp, sep = "")
if (any(x$hawk_p < x$alpha) & !hawkp < x$alpha) cat(". Some p-values for Hawkins' test were significant; please inspect their values, e.g., using `plot(", deparse(substitute(x)), ")`", sep = "")
} else {
hawkp <- x$hawk_p
cat("Hawkins' test: chi^2 (", x$hawk_df, ") = ", x$hawk_chisq, ", p = ", x$hawk_p, sep = "")
}
cat("\n\n")
if (x$method == "auto") {
out <- c(
out,
c(
"Hawkins' test is not significant; there is no evidence to reject the assumptions of multivariate normality and MCAR.\n",
"Hawkins' test is significant; if multivariate normality can be assumed, then reject the assumption that missingness is MCAR.\n"
)[(hawkp < x$alpha) + 1]
)
}
}
if (!is.null(x$ad_p)) {
if (length(x$ad_p) > 1) {
adp <- median(x$ad_p)
cat("Anderson-Darling rank test: median T = ", median(x$ad_value), ", median p = ", median(x$ad_p), sep = "")
if (any(x$ad_p < x$alpha) & !median(x$ad_p) < x$alpha) cat(". Some p-values for the Anderson-Darling test were significant; please inspect their values, e.g., using `plot(", deparse(substitute(x)), ")`", sep = "")
} else {
adp <- x$ad_p
cat("Anderson-Darling rank test: T = ", x$ad_value, ", p = ", x$ad_p, sep = "")
}
cat("\n")
if (x$method == "auto") {
out <- c(
out,
c(
"Anderson-Darling test is not significant. There is thus evidence against multivariate normality, but not against MCAR.\n",
"Anderson-Darling test is significant. Reject the assumption that missingness is MCAR."
)[(adp < x$alpha) + 1]
)
}
}
if (x$method == "auto") {
cat(out)
}
}
#' @importFrom graphics hist
#' @importFrom grDevices dev.off
#' @method plot mcar_object
#' @export
plot.mcar_object <- function(x, y, type = NULL, ...) {
if (isTRUE(type == "md.pattern")) {
plot(x[["md.pattern"]])
} else {
op <- par(mar = rep(0, 4))
on.exit(par(op))
dev.off()
if (!is.null(x$hawk_p) & !is.null(x$ad_p)) par(mfrow = c(2, 1))
if (!is.null(x$hawk_p)) {
pct <- sum(x$hawk_p < x$alpha) / length(x$hawk_p)
hist(x$hawk_p, main = NULL, xlab = paste0("Hawkins p-values, ", round(pct * 100), "% significant"), ylab = NULL)
abline(v = x$alpha, col = "red")
}
if (!is.null(x$ad_p)) {
pct <- sum(x$ad_p < x$alpha) / length(x$ad_p)
hist(x$ad_p, main = NULL, xlab = paste0("Anderson-Darling p-values, ", round(pct * 100), "% significant"), ylab = NULL)
abline(v = x$alpha, col = "red")
}
}
}
hawkins <- function(x, grouping) {
p <- ncol(x)
n <- nrow(x)
x <- split(x, factor(grouping))
g <- length(x)
S_pooled <- lapply(x, function(i) {
(nrow(i) - 1) * cov(i, use = "complete.obs")
})
S_pooled <- Reduce("+", S_pooled)
S_pooled <- S_pooled / (n - g)
S_pooled <- solve(S_pooled)
f <- lapply(x, function(i) {
i_centered <- scale(i, center = TRUE, scale = FALSE)
i_centered <- apply(i_centered %*% S_pooled * i_centered, 1, sum)
i_centered <- i_centered * nrow(i)
((n - g - p) * i_centered) / (p * ((nrow(i) - 1) * (n - g) - i_centered))
})
a <- lapply(f, function(thisf) {
pf(thisf, p, (n - g - p), lower.tail = FALSE)
})
list(fij = f, a = a, ni = matrix(sapply(x, nrow), ncol = 1))
}
p_neyman <- function(x, replications = 10000, use_chisq = 30) {
n <- length(x)
n4 <- sum(colSums(legendre(x, 4))^2) / n
if (n < use_chisq) {
sum(sim_neyman(n, replications) > n4) / replications
} else {
pchisq(n4, 4, lower.tail = FALSE)
}
}
sim_neyman <- function(n, replications) {
x <- matrix(runif(replications * n), ncol = replications)
pi <- apply(x, 2, function(i) {
sum(colSums(legendre(i, 4))^2) / n
})
sort(pi)
}
anderson_darling <- function(fij) {
x <- unlist(fij)
ni <- sapply(fij, length)
if (length(ni) < 2) {
stop("At least 2 groups required for Anderson-Darling test.")
}
k <- length(ni)
n <- length(x)
x.sort <- sort(x)[-n]
hj <- rle(x.sort)$lengths
hn <- cumsum(hj)
zj <- x.sort[which(!duplicated(x.sort))]
adk.all <- sapply(fij, function(fi) {
ni <- length(fi)
combs <- expand.grid(zj, fi)
b <- combs[, 1] == combs[, 2]
thisfij <- rowSums(matrix(b, length(zj)))
mij <- cumsum(thisfij)
num <- (n * mij - ni * hn)^2
den <- hn * (n - hn)
(1 / ni * sum(hj * (num / den)))
})
adk <- sum(adk.all) / n
adk.all <- adk.all / n
j <- sum(1 / ni)
h <- sum(1 / seq(1:(n - 1)))
g <- sum(sapply(1:(n - 2), function(i) {
(1 / (n - i)) * sum(1 / seq((i + 1), (n - 1)))
}))
a <- (4 * g - 6) * (k - 1) + (10 - 6 * g) * j
b <- (2 * g - 4) * k^2 + 8 * h * k + (2 * g - 14 * h - 4) *
j - 8 * h + 4 * g - 6
c <- (6 * h + 2 * g - 2) * k^2 + (4 * h - 4 * g + 6) * k +
(2 * h - 6) * j + 4 * h
d <- (2 * h + 6) * k^2 - 4 * h * k
var.adk <- max(((a * n^3) + (b * n^2) + (c * n) + d) / ((n - 1) *
(n - 2) * (n - 3)), 0)
adk.s <- (adk - (k - 1)) / sqrt(var.adk)
b0 <- c(0.675, 1.281, 1.645, 1.96, 2.326)
b1 <- c(-0.245, 0.25, 0.678, 1.149, 1.822)
b2 <- c(-0.105, -0.305, -0.362, -0.391, -0.396)
c0 <- c(
1.09861228866811, 2.19722457733622, 2.94443897916644, 3.66356164612965,
4.59511985013459
)
qnt <- b0 + b1 / sqrt(k - 1) + b2 / (k - 1)
ind <- seq(1:4) + (adk.s <= qnt[3])
yy <- spline(qnt[ind], c0[ind], xout = adk.s)$y
p <- 1 / (1 + exp(yy))
c(p, adk.all)
}
ad <- function(fij) {
x <- unlist(fij)
ni <- sapply(fij, length)
if (length(ni) < 2) {
stop("At least 2 groups required for Anderson-Darling test.")
}
k <- length(ni)
n <- length(x)
x.sort <- sort(x)[-n]
hj <- rle(x.sort)$lengths
hn <- cumsum(hj)
zj <- x.sort[which(!duplicated(x.sort))]
adk.all <- sapply(fij, function(fi) {
ni <- length(fi)
combs <- expand.grid(zj, fi)
b <- combs[, 1] == combs[, 2]
thisfij <- rowSums(matrix(b, length(zj)))
mij <- cumsum(thisfij)
num <- (n * mij - ni * hn)^2
den <- hn * (n - hn)
(1 / ni * sum(hj * (num / den)))
})
adk <- sum(adk.all) / n
adk.all <- adk.all / n
j <- sum(1 / ni)
h <- sum(1 / seq(1:(n - 1)))
g <- sum(sapply(1:(n - 2), function(i) {
(1 / (n - i)) * sum(1 / seq((i + 1), (n - 1)))
}))
a <- (4 * g - 6) * (k - 1) + (10 - 6 * g) * j
b <- (2 * g - 4) * k^2 + 8 * h * k + (2 * g - 14 * h - 4) *
j - 8 * h + 4 * g - 6
c <- (6 * h + 2 * g - 2) * k^2 + (4 * h - 4 * g + 6) * k +
(2 * h - 6) * j + 4 * h
d <- (2 * h + 6) * k^2 - 4 * h * k
var.adk <- max(((a * n^3) + (b * n^2) + (c * n) + d) / ((n - 1) *
(n - 2) * (n - 3)), 0)
adk.s <- (adk - (k - 1)) / sqrt(var.adk)
b0 <- c(0.675, 1.281, 1.645, 1.96, 2.326)
b1 <- c(-0.245, 0.25, 0.678, 1.149, 1.822)
b2 <- c(-0.105, -0.305, -0.362, -0.391, -0.396)
c0 <- c(
1.09861228866811, 2.19722457733622, 2.94443897916644, 3.66356164612965,
4.59511985013459
)
qnt <- b0 + b1 / sqrt(k - 1) + b2 / (k - 1)
ind <- seq(1:4) + (adk.s <= qnt[3])
yy <- spline(qnt[ind], c0[ind], xout = adk.s)$y
p <- 1 / (1 + exp(yy))
list(pn = p, adk.all = adk.all, adk = adk, var.sdk = var.adk)
}
#' @method plot md.pattern
#' @export
plot.md.pattern <- function(x, y, rotate.names = FALSE, ...) {
op <- par(mar = rep(0, 4))
on.exit(par(op))
plot.new()
R <- x[1:nrow(x) - 1, 1:ncol(x) - 1]
nmis <- x[nrow(x), 1:ncol(x) - 1]
if (rotate.names) {
adj <- c(0, 0.5)
srt <- 90
length_of_longest_colname <- max(nchar(colnames(x))) / 2.6
plot.window(
xlim = c(-1, ncol(R) + 1),
ylim = c(-1, nrow(R) + length_of_longest_colname),
asp = 1
)
} else {
adj <- c(0.5, 0)
srt <- 0
plot.window(
xlim = c(-1, ncol(R) + 1),
ylim = c(-1, nrow(R) + 1),
asp = 1
)
}
M <- cbind(c(row(R)), c(col(R))) - 1
shade <- ifelse(R[nrow(R):1, ], mdc(1), mdc(2))
rect(M[, 2], M[, 1], M[, 2] + 1, M[, 1] + 1, col = shade)
for (i in 1:ncol(R)) {
text(i - .5,
nrow(R) + .3,
colnames(x)[i],
adj = adj,
srt = srt
)
text(i - .5, -.3, nmis[order(nmis)][i])
}
for (i in 1:nrow(R)) {
text(ncol(R) + .3, i - .5, x[(nrow(x) - 1):1, ncol(x)][i], adj = 0)
text(-.3, i - .5, rownames(x)[(nrow(x) - 1):1][i], adj = 1)
}
text(ncol(R) + .3, -.3, x[nrow(x), ncol(x)])
}