/
thinning.R
135 lines (129 loc) · 5.72 KB
/
thinning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#' Takes a dataset (scalar, vector, or matrix) and returns a training set and a test set that sum to the original data matrix.
#'
#'
#' @export
#'
#' @param data A scalar, vector, or matrix of data.
#' @param family The distribution of the data. Options are "poisson", "negative binomial", "normal" (or "gaussian"),
#' "normal-variance" (or "gaussian-variance"), "mvnormal" (or "mvgaussian"), "binomial", "multinomial", "exponential",
#' "gamma", "chi-squared", "gamma-weibull", "weibull", "pareto", "shifted-exponential", "scaled-uniform", "scaled-beta".
#' @param K The number of folds. Note that for the "chi-squared" and "gamma-weibull" decompositions, the number of folds implies the
#' degrees of freedom and shape parameters respectively.
#' @param epsilon The tuning parameter for convolution-closed data thinning; must be a simplex vector of length K. Larger values
#' correspond to more information in the respective fold. Available for "poisson", "negative binomial", "normal" (or "gaussian"),
#' "mvnormal" (or "mvgaussian"), "binomial", "multinomial", "exponential", and "gamma" families. If epsilon is not supplied,
#' rep(1/K, K) is used.
#' @param arg The extra parameter that must be known in order to thin. Either a scalar or a matrix with the same dimensions as data
#' (excluding "mvnormal" (or "mvgaussian") and "multinomial" families; see below).
#' Requirements vary by decomposition:
#' * Not needed for "poisson", "exponential", "chi-squared", or "scaled-uniform" distributions.
#' * "negative binomial" requires the size parameter.
#' * "normal" (or "gaussian") requires the variance.
#' * "normal-variance" (or "gaussian-variance") requires the mean.
#' * "mvnormal" (or "mvgaussian") requires the covariance matrix. If the dimensions of data are nxp, arg must be nxpxp.
#' * "binomial" and "multinomial" require the number of trials. For "multinomial", if the dimensions of data are nxp, arg must be a
#' vector of length n.
#' * "gamma", "gamma-weibull", and "weibull" requires the shape parameter.
#' * "pareto" requires the location paramter.
#' * "shifted-exponential" requires the rate parameter.
#' * "scaled-beta" requires the first shape parameter.
#'
#' Please refer to \url{https://arxiv.org/abs/2301.07276} and \url{https://arxiv.org/abs/2303.12931} for further details.
#'
#' @details See \url{https://anna-neufeld.github.io/datathin/articles/introduction_tutorial.html} for examples of each decomposition.
datathin <- function(data, family, K=2, epsilon=NULL, arg=NULL) {
#Convert vectors to matrices for consistent processing later
data <- as.matrix(data)
np <- dim(data)
#arg dimension check
if (family %in% c("poisson", "exponential", "chi-squared", "scaled-uniform")) {
if (!is.null(arg)) {
warning(paste0("Extra parameter provided was not used in ", family, " thinning."))
}
} else {
if (is.null(arg)) {
stop("Extra parameter missing.")
} else if (!is.numeric(arg)) {
stop("Non-numeric parameter provided.")
} else {
if (family %in% c("mvnormal", "mvgaussian")) {
if (length(dim(arg)) == 2) {
if (any(dim(arg) != c(np[2], np[2]))) {
stop("Incorrect dimensions for multivariate normal covariance matrices.")
}
} else if (length(dim(arg)) == 3) {
if (any(dim(arg) != c(np[1], np[2], np[2]))) {
stop("Incorrect dimensions for multivariate normal covariance matrices.")
}
}
} else if (family == "multinomial") {
if (!(length(arg) %in% c(1, np[1]))) {
stop("Incorrect dimensions for multinomial trials parameter.")
}
} else {
if (length(arg) > 1) {
if (any(dim(as.matrix(arg)) != dim(data))) {
stop("If `arg` is not a scalar, its dimensions must match those of `data`.")
}
}
}
}
}
#Set epsilon
if (family %in% c("poisson", "negative binomial", "normal", "gaussian", "mvnormal",
"mvgaussian", "binomial", "multinomial", "exponential", "gamma")) {
if (is.null(epsilon)) {
epsilon <- rep(1/K, K)
} else {
if (sum(epsilon) != 1) {
stop("Epsilon does not sum to 1.")
}
if (length(epsilon) != K) {
warning("K parameter will be ignored in favour of the length of epsilon.")
}
}
}
# Convert scalar arg to matrix
if (length(arg) == 1) {
if (family == "multinomial") {
arg <- rep(arg, np[1])
} else {
arg <- matrix(arg, nrow=np[1], ncol=np[2])
}
}
if (family == "poisson") {
poisthin(data, epsilon)
} else if (family == "negative binomial") {
nbthin(data, epsilon, arg)
} else if (family %in% c("normal", "gaussian")) {
normthin(data, epsilon, arg)
} else if (family %in% c("normal-variance", "gaussian-variance")) {
normvarthin(data, arg, K)
} else if (family %in% c("mvnormal", "mvgaussian")) {
mvnormthin(data, epsilon, arg)
} else if (family == "binomial") {
binomthin(data, epsilon, arg)
} else if (family == "multinomial") {
multinomthin(data, epsilon, arg)
} else if (family == "exponential") {
gammathin(data, epsilon, matrix(1, nrow=np[1], ncol=np[2]))
} else if (family == "gamma") {
gammathin(data, epsilon, arg)
} else if (family == "chi-squared") {
chisqthin(data, K)
} else if (family == "gamma-weibull") {
gammaweibullthin(data, K, arg)
} else if (family == "weibull") {
weibullthin(data, arg, K)
} else if (family == "pareto") {
paretothin(data, arg, K)
} else if (family == "shifted-exponential") {
shiftexpthin(data, arg, K)
} else if (family == "scaled-uniform") {
scaledbetathin(data, matrix(1, nrow=np[1], ncol=np[2]), K)
} else if (family == "scaled-beta") {
scaledbetathin(data, arg, K)
} else {
print("Invalid family argument")
}
}