Merge pull request #6 from better/weibull-regression
weibull regression (MAP and bayesian version)
erikbern committed Feb 8, 2018
2 parents 62c0e24 + 05bcbf3 commit 896b02c
Showing 3 changed files with 118 additions and 0 deletions.
43 changes: 43 additions & 0 deletions convoys/bayesian.py
@@ -0,0 +1,43 @@
import numpy
import pymc3
import random
from scipy.special import expit
from pymc3.math import dot, sigmoid, log, exp

from convoys import Model

class WeibullRegression(Model):
    # This is a super-experimental model that uses pymc3 to sample from the posterior.
    # Unfortunately, it's way too slow to use for any datasets larger than n=1000.
    # I'm keeping it here as a way to double check the results of other models.
    def fit(self, X, B, T):
        n, k = X.shape
        with pymc3.Model() as m:
            beta_sd = pymc3.Exponential('beta_sd', 1.0)  # Weak prior for the regression coefficients
            beta = pymc3.Normal('beta', mu=0, sd=beta_sd, shape=(k,))  # Regression coefficients
            c = sigmoid(dot(X, beta))  # Conversion rates for each example
            k = pymc3.Lognormal('k', mu=0, sd=1.0)  # Weak prior around k=1
            lambd = pymc3.Exponential('lambd', 0.1)  # Weak prior

            # PDF of Weibull: k * lambda * (t * lambda)^(k-1) * exp(-(t * lambda)^k)
            LL_observed = log(c) + log(k) + log(lambd) + (k-1)*(log(T) + log(lambd)) - (T*lambd)**k
            # CDF of Weibull: 1 - exp(-(t * lambda)^k)
            LL_censored = log((1-c) + c * exp(-(T*lambd)**k))

            # We need to implement the likelihood using pymc3.Potential (custom likelihood)
            # https://github.com/pymc-devs/pymc3/issues/826
            logp = B * LL_observed + (1 - B) * LL_censored
            logpvar = pymc3.Potential('logpvar', logp.sum())

            self.trace = pymc3.sample(draws=500, tune=500, discard_tuned_samples=True, njobs=1)

    def predict(self):
        pass  # TODO: implement

    def predict_final(self, x):
        return numpy.mean(expit(numpy.dot(self.trace['beta'], x)))

    def predict_time(self):
        pass  # TODO: implement
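
For reference, LL_observed and LL_censored above (and their twins in convoys/regression.py below) encode what amounts to a censored mixture likelihood: each example either converts, with a Weibull-distributed conversion time, or never converts and is only observed up to time t_i. With c_i = sigmoid(x_i^T beta) the conversion probability and (k, lambda) the shared Weibull parameters, the per-example log-likelihood works out to:

\log L_i =
\begin{cases}
\log c_i + \log k + \log\lambda + (k-1)\bigl(\log t_i + \log\lambda\bigr) - (t_i\lambda)^k, & B_i = 1 \ \text{(converted at } t_i\text{)} \\
\log\!\bigl((1 - c_i) + c_i\, e^{-(t_i\lambda)^k}\bigr), & B_i = 0 \ \text{(still unconverted at } t_i\text{)}
\end{cases}

The model's total log-probability is the sum over examples, which is what the pymc3.Potential term contributes here and what the MAP version below minimizes (negated).
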
60 changes: 60 additions & 0 deletions convoys/regression.py
@@ -0,0 +1,60 @@
import numpy
import scipy.optimize
import scipy.stats
from autograd import jacobian, hessian, grad
from autograd.scipy.special import expit, gamma, gammainc, gammaincc, gammaln
from autograd.numpy import exp, log, sum, dot

from convoys import Model

class WeibullRegression(Model):
    # This will replace the Weibull model in __init__.py soon.
    def fit(self, X, B, T):
        n, k = X.shape
        X = X.astype(numpy.float32)
        def f(x):
            lambd, k = exp(x[0]), exp(x[1])
            beta = x[2:]
            c = expit(dot(X, beta.T))  # Conversion rates for each example

            # PDF of Weibull: k * lambda * (t * lambda)^(k-1) * exp(-(t * lambda)^k)
            LL_observed = log(c) + log(k) + log(lambd) + (k-1)*(log(T) + log(lambd)) - (T*lambd)**k
            # CDF of Weibull: 1 - exp(-(t * lambda)^k)
            LL_censored = log((1-c) + c * exp(-(T*lambd)**k))

            LL = sum(B * LL_observed + (1 - B) * LL_censored)
            return -LL

        res = scipy.optimize.minimize(
            fun=f,
            jac=jacobian(f),
            hess=hessian(f),
            x0=numpy.zeros(k+2),
            method='trust-ncg')
        log_lambd, log_k = res.x[0], res.x[1]
        beta = res.x[2:]
        # Compute the Hessian of the negative log-likelihood with respect to the betas
        beta_hessian = hessian(f)(res.x)[2:,2:]
        self.params = dict(
            lambd=exp(log_lambd),
            k=exp(log_k),
            beta=beta,
            beta_hessian=beta_hessian,
        )

    def predict(self):
        pass  # TODO: implement

    def predict_final(self, x, confidence_interval=False):
        x = numpy.array(x)
        def f(x, d=0):
            return expit(dot(x, self.params['beta']) + d)
        if confidence_interval:
            # I have no clue if this math is correct, need to double check
            inv_sd = dot(dot(x.T, self.params['beta_hessian']), x)
            lo, hi = (scipy.stats.norm.ppf(p, scale=1./inv_sd) for p in (0.025, 0.975))
            return f(x), f(x, lo), f(x, hi)
        else:
            return f(x)

    def predict_time(self):
        pass  # TODO: implement
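
As a rough usage sketch (not part of this commit), the MAP model can be exercised with the same synthetic-data recipe as the new test in test_convoys.py below: an intercept column plus one indicator column per group, a boolean conversion flag B, and a time T that is either the sampled conversion time or the censoring horizon of 1000.

import random
import numpy
import convoys.regression

# Synthetic cohort mirroring the new test below: example r belongs to group r % len(cs),
# converts with probability cs[group], conversion times are Weibull(k, lambd),
# and non-converters are censored at t=1000.
cs, lambd, k, n = [0.3, 0.5, 0.7], 0.1, 0.5, 10000
X = numpy.array([[1] + [r % len(cs) == j for j in range(len(cs))] for r in range(n)])
B = numpy.array([random.random() < cs[r % len(cs)] for r in range(n)])
T = numpy.array([(-numpy.log(random.random())) ** (1.0 / k) / lambd if b else 1000.0
                 for b in B])

model = convoys.regression.WeibullRegression()
model.fit(X, B, T)
print(model.predict_final([1, 1, 0, 0]))  # should land near cs[0] = 0.3, per the test's assertion
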
15 changes: 15 additions & 0 deletions test_convoys.py
@@ -1,10 +1,12 @@
import datetime
import matplotlib
import numpy
import pytest
import random
import scipy.stats
matplotlib.use('Agg') # Needed for matplotlib to run in Travis
import convoys
import convoys.regression


def test_exponential_model(c=0.3, lambd=0.1, n=100000):
@@ -55,6 +57,19 @@ def sample_weibull():
    assert 0.95*k < model.params['k'] < 1.05*k


def test_weibull_regression_model(cs=[0.3, 0.5, 0.7], lambd=0.1, k=0.5, n=10000):
    def sample_weibull():
        return (-numpy.log(random.random())) ** (1.0/k) / lambd
    X = numpy.array([[1] + [r % len(cs) == j for j in range(len(cs))] for r in range(n)])
    B = numpy.array([bool(random.random() < cs[r % len(cs)]) for r in range(n)])
    T = numpy.array([b and sample_weibull() or 1000 for b in B])
    model = convoys.regression.WeibullRegression()
    model.fit(X, B, T)
    for r, c in enumerate(cs):
        x = [1] + [int(r == j) for j in range(len(cs))]
        assert 0.95 * c < model.predict_final(x) < 1.05 * c


def _get_data(c=0.3, k=10, lambd=0.1, n=1000):
    data = []
    now = datetime.datetime(2000, 7, 1)
