Join GitHub today
GitHub is home to over 50 million developers working together to host and review code, manage projects, and build software together.
Sign upFamily for Binomial(n, p) instead of Binomial(1, p) #34
Comments
|
Ok, @sbrockhaus showed me how to use |
|
What follows is a dirty hack for getting fits for binomial data where the number of trials
|
|
As boost-R/gamboostLSS#12 is fixed, Additionally, to fit this without using a library(mboost)
Binomial_SuccessFailures <- function(p = NULL)
{
loss <- function(y, f, w = 1) {
ntrials <- rowSums(y)
y <- y[,1]
f <- pmin(abs(f), 36) * sign(f)
p <- exp(f)/(1 + exp(f))
-dbinom(x = y, size = ntrials, prob = p, log = log)
}
risk <- function(y, f, w = 1) {
sum(w * loss(y = y, f = f))
}
ngradient <- function(y, f, w = 1) {
if (! (is.matrix(y) && ncol(y) ==2)){
stop("response should be a two-column matrix (success and failures) for this family")
}
ntrials <- rowSums(y)
y <- y[,1]
f <- pmin(abs(f), 36) * sign(f)
p <- exp(f)/(1 + exp(f))
ngr <- dlogis(f)*(y - ntrials * p)/(p * (1 - p))
}
offset <-function(y, w = 1) {
if (! (is.matrix(y) && ncol(y)==2)){
stop("response should be a two-column matrix (success and failures) for this family")
}
ntrials <- rowSums(y)
y <- y[,1]
p <- mean((y + 0.5)/(ntrials + 1))
return(log(p/(1-p)))
}
Family(ngradient = ngradient, risk = risk, loss = loss,
response = function(f) exp(f)/(1 + exp(f)), offset = offset,
name = "Binomial Distribution: Success - Failures")
}
set.seed(123)
n <- 100
x <- rnorm(n)
z <- rnorm(n)
data <- data.frame(y = rbinom(n, p = plogis(1+ x + z), size = rep(c(100,20,30,60), each =5)), x = x, z= z)
data$ymat <- with(data, cbind(success = data$y, fail =rep(c(100,20,30,60), each =5) - data$y))
g1 <- glmboost(ymat ~ x + z, family = Binomial_SuccessFailures(), data = data)
coef(g1, off2int = TRUE)
# (Intercept) x z
# 0.9846073 1.0518361 0.9685446
library(gamboostLSS)
g2 <- glmboost(ymat ~ x + z, family = as.families("BI"), data = data)
coef(g2, off2int = TRUE)
# (Intercept) x z
# 0.9846073 1.0518361 0.9685446
library(gamlss)
g3 <- gamlss(ymat ~ x+ z, family = BI(), data = data)
coef(g3)
#(Intercept) x z
# 0.9846073 1.0518361 0.9685446
|
|
Can we (aka you) add this family to mboost, @mayrandy? Perhaps with a more concise name? ;) |
|
Of course. But, as @fabian-s already pointed out, the name somehow is not trivial in this case. In fact, what the family does is fitting a model optimizing the binomial likelihood. However, Btw: When we set failures to set.seed(123)
x1 <- runif(1000)
x2 <- runif(1000)
y <- rbinom(n = 1000, prob = plogis(2*x1 - 2*x2), size = 1)
ymat <- cbind(y, 1 - y)
# new family
glm1 <- glmboost(ymat ~ x1 +x2, family = Binomial_SuccessFailures())
coef(glm1[1000], off2int = TRUE)
#(Intercept) x1 x2
#0.008236774 1.887082157 -1.889001532
# classic glm()
glm2 <- glm(ymat ~ x1 +x2, family = "binomial")
coef(glm2)
#(Intercept) x1 x2
#0.008345434 1.887505727 -1.889425746
# Binomial() family in mboost
glm3 <- glmboost(factor(y) ~ x1 +x2, family = Binomial())
coef(glm3[1000], off2int = TRUE)
#NOTE: Coefficients from a Binomial model are half the size of coefficients
#from a model fitted via glm(... , family = 'binomial').
#See Warning section in ?coef.mboost
# (Intercept) x1 x2
# 0.004172717 0.943752864 -0.944712873 Anyhow, I think what we should do before adding the family is:
|
|
Here comes an updated family, should work now for two-level factors, binary vectors or a two-column matrix. Additionally we now can deal with both "logit" and "probit" links. I talked to @fabian-s during the week and we thought that Binomial_glm <- function(link = "logit")
{
if(! link %in% c("logit", "probit")) stop("link function must be either 'logit' or
'probit'")
link <- make.link(link)
y_check <- function(y) {
if ((is.matrix(y) && NCOL(y)!=2)){
stop("response should be either a two-column matrix (no. successes and
no. failures), a two level factor or a vector of 0 and 1's for this family")
}
if(is.factor(y)){
if (nlevels(y) != 2) stop("response should be either a two-column matrix
(no. successes and no. failures), a two level
factor or a vector of 0 and 1's for this family")
y <- c(0, 1)[as.integer(y)]
}
if(!is.matrix(y)){
if(!all(y %in% c(0,1))) stop("response should be either a two-column matrix
(no. successes and no. failures), a two level
factor or a vector of 0 and 1's for this family")
y <- cbind(y, 1-y)
}
return(y)
}
loss <- function(y, f, w = 1) {
ntrials <- rowSums(y)
y <- y[,1]
p <- link$linkinv(f)
-dbinom(x = y, size = ntrials, prob = p, log = log)
}
risk <- function(y, f, w = 1) {
sum(w * loss(y = y, f = f))
}
ngradient <- function(y, f, w = 1) {
ntrials <- rowSums(y)
y <- y[,1]
p <- link$linkinv(f)
ngr <- link$mu.eta(f)*(y - ntrials * p)/(p * (1 - p))
}
offset <-function(y, w = 1) {
ntrials <- rowSums(y)
y <- y[,1]
p <- mean((y + 0.5)/(ntrials + 1))
return(link$linkfun(p))
}
Family(ngradient = ngradient, risk = risk, loss = loss, check_y = y_check,
response = function(f) link$linkinv(f), offset = offset,
name = "Binomial Distribution (as in glm)")
}
|
|
|
|
@mayrandy: Many thanks. Just some follow up questions:
linktemp <- substitute(link)
if (!is.character(linktemp))
linktemp <- deparse(linktemp)
okLinks <- c("logit", "probit", "cloglog", "cauchit", "log")
if (linktemp %in% okLinks)
stats <- make.link(linktemp)
else if (is.character(link)) {
stats <- make.link(link)
linktemp <- link
}
else {
if (inherits(link, "link-glm")) {
stats <- link
if (!is.null(stats$name))
linktemp <- stats$name
}
else {
stop(gettextf("link \"%s\" not available for binomial family; available links are %s",
linktemp, paste(sQuote(okLinks), collapse = ", ")),
domain = NA)
}
}
|
|
|
@mayrandy can you please check that everything works as expected? I tried to fix all of the above issues. |
We should have a family for binomial responses.
Binomialis a misnomer, should beBernoulliorBinarybecause that's what it actually does (too late to change that now I guess).Transforming binomial responses (x out of y) into a vector of x "sucesses" and y-x "failures" and repeating the covariates y times is an ugly hack.
EDIT: where do I find an explanation of what
fWis supposed to look like -- the examples I can find on how to define aFamilyuse identity link and seem to get away without one.....