Skip to content

Commit

Permalink
Implement RCG family and downstream tests; closes #86
Browse files Browse the repository at this point in the history
  • Loading branch information
hofnerb committed May 15, 2018
1 parent fa0d6fe commit 45f6b05
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 7 deletions.
3 changes: 2 additions & 1 deletion NAMESPACE
Expand Up @@ -22,7 +22,7 @@ export(glmboost,
boost_control, mstop, Family,
GaussReg, Gaussian, GaussClass, Laplace, Binomial, Poisson, GammaReg, QuantReg,
ExpectReg, NBinomial, PropOdds, Weibull, Loglog, Lognormal, AUC, mboost_fit,
Huber, AdaExp, Gehan, Cindex, CoxPH, Hurdle, Multinomial, FP, IPCweights,
Huber, AdaExp, Gehan, Cindex, CoxPH, Hurdle, Multinomial, RCG, FP, IPCweights,
cvrisk, cv,
bbs, bols, bspatial, brandom, btree, bss, bns, brad, bkernel, bmono, bmrf, buser,
survFit, selected, selected.mboost,
Expand All @@ -31,6 +31,7 @@ export(glmboost,
predict.mboost, predict.glmboost,
stabsel.mboost, stabsel_parameters.mboost,
confint.mboost, confint.glmboost, plot.mboost.ci, lines.mboost.ci, print.glmboost.ci,
downstream.test,
varimp, plot.varimp, as.data.frame.varimp,
mboost_intern)
###, basesel, fitsel)
Expand Down
87 changes: 87 additions & 0 deletions R/family.R
Expand Up @@ -1140,3 +1140,90 @@ Cindex <- function (sigma = 0.1, ipcw = 1) {
}


# RCG model for bounded response variables, log link
RCG <- function (nuirange = c(0, 1), offrange = c(-5, 5)) {
sigma <- c(1, 0.2)
plloss <- function(sigma, y, f, w = 1) {
alpha <- sigma[1]
rho <- sigma[2]
if (length(f) == 1)
f <- rep(f, length(y))
my <- 1 + (exp(f) - 1) * y
dmy <- my ^ 2 - 4 * rho * exp(f) * y * (1 - y)
fy <- lgamma(2 * alpha) - 2 * lgamma(alpha) + alpha * f +
alpha * log(1 - rho) + log(my) + (alpha - 1) * log(y * (1 - y)) -
(alpha + 0.5) * log(dmy)
return(-fy)
}
riskS <- function(sigma, y, fit, w = 1)
sum(w * plloss(y = y, f = fit, sigma = sigma))
risk <- function(y, f, w = 1)
sum(w * plloss(y = y, f = f, sigma = sigma))
ngradient <- function(y, f, w = 1) {
sigma <<- optim(par = sigma, fn = riskS, y = y, fit = f,
lower = c(0.001, 0.001), upper = c(100, 0.99),
w = w, method = "L-BFGS-B")$par
if (length(f) == 1)
f <- rep(f, length(y))
alpha <- sigma[1]
rho <- sigma[2]
my <- 1 + (exp(f) - 1) * y
dmy <- my ^ 2 - 4 * rho * exp(f) * y * (1 - y)
return(alpha + (y * exp(f)) / my - (2 * alpha + 1) * exp(f) *
y * (my - 2 * rho * (1 - y)) / dmy)
}
offset <- function(y, w = 1)
0
dRCG <- function(f, sigma) {
alpha <- sigma[1]
rho <- sigma[2]
densw <- function(w, alpha, rho, f)
w * gamma(2 * alpha) / (gamma(alpha)) ^ 2 * (1 - rho) ^ alpha *
(exp(f))^alpha * ((exp(f) - 1) * w + 1) * (w * (1 - w))^(alpha - 1) /
((((exp(f) - 1) * w + 1) ^ 2 - 4 * rho * exp(f) * w * (1 - w)) ^ (alpha + 0.5))
res <- rep(0, length(f))
for (k in 1:length(f))
res[k] <- integrate(densw, lower = 0, upper = 1, alpha, rho, f[k])$value
return(res)
}
response <- function(f, sigma)
dRCG(f, sigma)
Fisher <- function(y, alpha, rho, X, coefs) {
X <- data.frame(X)
f.i <- matrix(0, ncol = length(coefs), nrow = length(coefs))
for (i in 1:length(y)) {
wobs <- as.numeric(y[i])
x <- as.matrix(X[i,])
xt <- t(x)
xtx <- xt %*% x
exb <- as.numeric(exp(x %*% coefs))
D1 <- (exb - 1) * wobs + 1
D2 <- D1 ^ 2 - 4 * rho * exb * wobs * (1 - wobs)
f.i <- f.i + (D1 * wobs * exb - (wobs * exb) ^ 2) * xtx / (D1 ^ 2) -
(alpha + 0.5) / (D2 ^ 2) *
(2 * wobs * exb * xtx * (2 * wobs * exb - wobs + 1 - 2 * rho * (1 - wobs))
* D2 - xtx * (2 * wobs * exb * (D1 - 2 * rho * (1 - wobs)))^2)
}
Fisher.information <- -f.i
return(Fisher.information)
}

Family(
ngradient = ngradient,
risk = risk,
offset = offset,
# Fisher = Fisher,
check_y = function(y) {
if(!all(y < 1 & y > 0))
stop("y must be < 1 and > 0")
y
},
nuisance = function()
return(sigma),
response = function(f)
response(f, sigma = sigma),
rclass = function(f)
f,
name = "Ratio of Correlated Gammas (RCG)"
)
}
22 changes: 22 additions & 0 deletions R/methods.R
Expand Up @@ -641,3 +641,25 @@ risk <- function(object, ...)
risk.mboost <- function(object, ...) {
object$risk()
}

downstream.test <- function(object, ...) {
if (object$family@name != "Ratio of Correlated Gammas (RCG)")
stop("downstream tests currently only implemented for RCG family")
if (!inherits(object, "glmboost"))
stop("downstream tests currently only implemented for linear models, i.e., glmboost")

# calculate Fisher information matrix
alpha <- nuisance(object)[1]
rho <- nuisance(object)[2]
coefs <- coef(object, which = "")
y <- object$response
## do we need the original design matrix (X1) or the centered design (X) matrix?
X <- as.data.frame(extract(object, what = "design"))
X1 <- data.frame(model.matrix(~ x1 + x2))

Fisher <- environment(object$family@ngradient)[["Fisher"]](y, alpha, rho, X1, coefs)

# downstream hypothesis tests
p_value <- sapply(1:length(coefs), function(i) (1 - pnorm(abs(coefs[i]) / sqrt(solve(Fisher)[i, i]))) * 2)
return(p_value)
}
61 changes: 57 additions & 4 deletions man/Family.Rd
Expand Up @@ -22,6 +22,8 @@
\alias{Hurdle}
\alias{Multinomial}
\alias{Cindex}
\alias{RCG}

\title{ Gradient Boosting Families }
\description{
\code{boost_family} objects provide a convenient way to specify loss functions
Expand Down Expand Up @@ -62,6 +64,8 @@ Gehan()
Hurdle(nuirange = c(0, 100))
Multinomial()
Cindex(sigma = 0.1, ipcw = 1)
RCG(nuirange = c(0, 1), offrange = c(-5, 5))

}
\arguments{
\item{ngradient}{ a function with arguments \code{y}, \code{f} and \code{w} implementing the
Expand Down Expand Up @@ -98,7 +102,7 @@ Cindex(sigma = 0.1, ipcw = 1)
searched for the minimum risk w.r.t. the nuisance parameter.
In case of \code{PropOdds}, the starting values for
the nuisance parameters. }
\item{offrange}{ interval to search for offset in.}
\item{offrange}{ interval to search in for offset.}
\item{sigma}{smoothness parameter for sigmoid functions inside \code{Cindex}.}
\item{ipcw}{vector containing inverse probability of censoring weights for all observations. If omitted, it is estimated inside \code{Cindex} family.}
\item{\dots}{ additional arguments to link functions.}
Expand Down Expand Up @@ -212,7 +216,17 @@ distribution approximation used in Ma and Huang (2005).
(see \code{\link{Surv}} for details). For details on \code{Gehan()} see
Johnson and Long (2011).

\code{Cindex()} optimizes the concordance-index for survival data (often denoted as Harrell's C or C-index). The concordance index evaluates the rank-based concordance probability between the model and the outcome. The C-index measures whether large values of the model are associated with short survival times and vice versa. The interpretation is similar to the AUC: A C-index of 1 represents a perfect discrimination while a C-index of 0.5 will be achieved by a completely non-informative marker. The \code{Cindex()} family is based on an estimator by Uno et al. (2011), which incorporates inverse probability of censoring weighting \code{ipcw}. To make the estimator differentiable, sigmoid functions are applied; the corresponding smoothness can be controlled via \code{sigma}. For details on \code{Cindex()} see Mayr and Schmid (2014).
\code{Cindex()} optimizes the concordance-index for survival data (often denoted
as Harrell's C or C-index). The concordance index evaluates the rank-based
concordance probability between the model and the outcome. The C-index measures
whether large values of the model are associated with short survival times and
vice versa. The interpretation is similar to the AUC: A C-index of 1 represents a
perfect discrimination while a C-index of 0.5 will be achieved by a completely
non-informative marker. The \code{Cindex()} family is based on an estimator by
Uno et al. (2011), which incorporates inverse probability of censoring weighting
\code{ipcw}. To make the estimator differentiable, sigmoid functions are applied;
the corresponding smoothness can be controlled via \code{sigma}. For details on
\code{Cindex()} see Mayr and Schmid (2014).
Hurdle models for zero-inflated count data can be fitted by using a combination
Expand All @@ -230,7 +244,9 @@ distribution approximation used in Ma and Huang (2005).
moment. The class corresponding to the last level of the factor coding
of the response is used as reference class.
\code{RCG()} implements the ratio of correlated gammas (RCG) model proposed
by Weinhold et al. (2016).
}
\section{Warning}{
The coefficients resulting from boosting with family
Expand Down Expand Up @@ -302,7 +318,10 @@ distribution approximation used in Ma and Huang (2005).
\emph{Annals of Applied Statistics}, \bold{5}, 1081--1101.
}
Weinhold, L., S. Pechlivanis, S. Wahl, P. Hoffmann and M. Schmid (2016) A Statistical Model for the
Analysis of Bounded Response Variables in DNA Methylation Studies.
\emph{BMC Bioinformatics}. 2016; 17: 480. \url{http://dx.doi.org/10.1186/s12859-016-1347-4}
}
\seealso{\code{\link{mboost}} for the usage of \code{Family}s. See
\code{\link{boost_family-class}} for objects resulting from a call to \code{Family}. }
\author{
Expand Down Expand Up @@ -369,6 +388,40 @@ pred2 <- predict(mlm, type = "response", newdata = newdata)
pred[1, ]
pred2
### Example for RCG model
## generate covariate values
set.seed(12345)
x1 <- rnorm(500)
x2 <- rnorm(500)
## generate linear predictors
zetaM <- 0.1 + 0.3 * x1 - 0.5 * x2
zetaU <- 0.1 - 0.1 * x1 + 0.2 * x2
## generate beta values
M <- rgamma(500, shape = 2, rate = exp(zetaM))
U <- rgamma(500, shape = 2, rate = exp(zetaU))
y <- M / (M + U)
## fit RCG model
data <- data.frame(y, x1, x2)
RCGmodel <- glmboost(y ~ x1 + x2, data = data, family = RCG(),
control = boost_control(mstop = 2000,
trace = TRUE, nu = 0.01))
## true coefficients: gamma = (0.0, 0.4, -0.7),
## alpha (= shape) = 2,
## rho = 0
## compare to coefficient estimates
coef(RCGmodel)
nuisance(RCGmodel)
## compute downstream tests
## (only suitable without early stopping, i.e., if likelihood based model converged)
downstream.test(RCGmodel)
## compute conditional expectations
predictions <- predict(RCGmodel, type = "response")
plot(predictions, y)
abline(0,1)
\donttest{############################################################
## Do not run and check these examples automatically as
## they take some time
Expand Down
10 changes: 8 additions & 2 deletions man/methods.Rd
Expand Up @@ -51,8 +51,6 @@
\alias{nuisance}
\alias{nuisance.mboost}



\title{ Methods for Gradient Boosting Objects }
\description{
Methods for models fitted by boosting algorithms.
Expand Down Expand Up @@ -114,6 +112,8 @@ mstop(x) <- value
\method{risk}{mboost}(object, ...)

\method{nuisance}{mboost}(object)

downstream.test(object, ...)
}
\arguments{
\item{object}{ objects of class \code{glmboost}, \code{gamboost},
Expand Down Expand Up @@ -302,6 +302,12 @@ mstop(x) <- value
Note that \code{logLik} and \code{AIC} only make sense when the
corresponding \code{\link{Family}} implements the appropriate loss
function.
\code{downstream.test} computes tests for linear models fitted via \code{\link{glmboost}}
with a likelihood based loss function and only suitable without early stopping, i.e.,
if likelihood based model converged. In order to work, the Fisher matrix must
be implemented in the \code{\link{Family}}; currently this is only the case for
family \code{\link{RCG}}.
}
\note{
Expand Down

0 comments on commit 45f6b05

Please sign in to comment.