Merge pull request #6 from better/weibull-regression
weibull regression (MAP and bayesian version)
erikbern committed Feb 8, 2018
2 parents 62c0e24 + 05bcbf3 commit 896b02c
Showing 3 changed files with 118 additions and 0 deletions.
43 changes: 43 additions & 0 deletions convoys/bayesian.py
@@ -0,0 +1,43 @@
import numpy
import pymc3
import random
from scipy.special import expit
from pymc3.math import dot, sigmoid, log, exp

from convoys import Model

class WeibullRegression(Model):
    # This is a super-experimental model that uses pymc3 to sample from the posterior.
    # Unfortunately, it's way too slow to use for any datasets larger than n=1000.
    # I'm keeping it here as a way to double check the results of other models.
    def fit(self, X, B, T):
        n, k = X.shape
        with pymc3.Model() as m:
            beta_sd = pymc3.Exponential('beta_sd', 1.0)  # Weak prior for the regression coefficients
            beta = pymc3.Normal('beta', mu=0, sd=beta_sd, shape=(k,))  # Regression coefficients
            c = sigmoid(dot(X, beta))  # Conversion rates for each example
            k = pymc3.Lognormal('k', mu=0, sd=1.0)  # Weak prior around k=1
            lambd = pymc3.Exponential('lambd', 0.1)  # Weak prior

            # PDF of Weibull: k * lambda * (t * lambda)^(k-1) * exp(-(t * lambda)^k)
            LL_observed = log(c) + log(k) + log(lambd) + (k-1)*(log(T) + log(lambd)) - (T*lambd)**k
            # CDF of Weibull: 1 - exp(-(t * lambda)^k)
            LL_censored = log((1-c) + c * exp(-(T*lambd)**k))

            # We need to implement the likelihood using pymc3.Potential (custom likelihood)
            # https://github.com/pymc-devs/pymc3/issues/826
            logp = B * LL_observed + (1 - B) * LL_censored
            logpvar = pymc3.Potential('logpvar', logp.sum())

            self.trace = pymc3.sample(draws=500, tune=500, discard_tuned_samples=True, njobs=1)

    def predict(self):
        pass  # TODO: implement

    def predict_final(self, x):
        return numpy.mean(expit(numpy.dot(self.trace['beta'], x)))

    def predict_time(self):
        pass  # TODO: implement
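
For reference, LL_observed and LL_censored above (and their twins in convoys/regression.py below) encode what amounts to a censored mixture likelihood: each example either converts, with a Weibull-distributed conversion time, or never converts and is only observed up to time t_i. With c_i = sigmoid(x_i^T beta) the conversion probability and (k, lambda) the shared Weibull parameters, the per-example log-likelihood works out to:

\log L_i =
\begin{cases}
\log c_i + \log k + \log\lambda + (k-1)\bigl(\log t_i + \log\lambda\bigr) - (t_i\lambda)^k, & B_i = 1 \ \text{(converted at } t_i\text{)} \\
\log\!\bigl((1 - c_i) + c_i\, e^{-(t_i\lambda)^k}\bigr), & B_i = 0 \ \text{(still unconverted at } t_i\text{)}
\end{cases}

The model's total log-probability is the sum over examples, which is what the pymc3.Potential term contributes here and what the MAP version below minimizes (negated).
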
60 changes: 60 additions & 0 deletions convoys/regression.py
@@ -0,0 +1,60 @@
import numpy
import scipy.optimize
import scipy.stats
from autograd import jacobian, hessian, grad
from autograd.scipy.special import expit, gamma, gammainc, gammaincc, gammaln
from autograd.numpy import exp, log, sum, dot

from convoys import Model

class WeibullRegression(Model):
    # This will replace the Weibull model in __init__.py soon.
    def fit(self, X, B, T):
        n, k = X.shape
        X = X.astype(numpy.float32)
        def f(x):
            lambd, k = exp(x[0]), exp(x[1])
            beta = x[2:]
            c = expit(dot(X, beta.T))  # Conversion rates for each example

            # PDF of Weibull: k * lambda * (t * lambda)^(k-1) * exp(-(t * lambda)^k)
            LL_observed = log(c) + log(k) + log(lambd) + (k-1)*(log(T) + log(lambd)) - (T*lambd)**k
            # CDF of Weibull: 1 - exp(-(t * lambda)^k)
            LL_censored = log((1-c) + c * exp(-(T*lambd)**k))

            LL = sum(B * LL_observed + (1 - B) * LL_censored)
            return -LL

        res = scipy.optimize.minimize(
            fun=f,
            jac=jacobian(f),
            hess=hessian(f),
            x0=numpy.zeros(k+2),
            method='trust-ncg')
        log_lambd, log_k = res.x[0], res.x[1]
        beta = res.x[2:]
        # Compute the Hessian of the negative log-likelihood with respect to the betas
        beta_hessian = hessian(f)(res.x)[2:,2:]
        self.params = dict(
            lambd=exp(log_lambd),
            k=exp(log_k),
            beta=beta,
            beta_hessian=beta_hessian,
        )

    def predict(self):
        pass  # TODO: implement

    def predict_final(self, x, confidence_interval=False):
        x = numpy.array(x)
        def f(x, d=0):
            return expit(dot(x, self.params['beta']) + d)
        if confidence_interval:
            # I have no clue if this math is correct, need to double check
            inv_sd = dot(dot(x.T, self.params['beta_hessian']), x)
            lo, hi = (scipy.stats.norm.ppf(p, scale=1./inv_sd) for p in (0.025, 0.975))
            return f(x), f(x, lo), f(x, hi)
        else:
            return f(x)

    def predict_time(self):
        pass  # TODO: implement
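
As a rough usage sketch (not part of this commit), the MAP model can be exercised with the same synthetic-data recipe as the new test in test_convoys.py below: an intercept column plus one indicator column per group, a boolean conversion flag B, and a time T that is either the sampled conversion time or the censoring horizon of 1000.

import random
import numpy
import convoys.regression

# Synthetic cohort mirroring the new test below: example r belongs to group r % len(cs),
# converts with probability cs[group], conversion times are Weibull(k, lambd),
# and non-converters are censored at t=1000.
cs, lambd, k, n = [0.3, 0.5, 0.7], 0.1, 0.5, 10000
X = numpy.array([[1] + [r % len(cs) == j for j in range(len(cs))] for r in range(n)])
B = numpy.array([random.random() < cs[r % len(cs)] for r in range(n)])
T = numpy.array([(-numpy.log(random.random())) ** (1.0 / k) / lambd if b else 1000.0
                 for b in B])

model = convoys.regression.WeibullRegression()
model.fit(X, B, T)
print(model.predict_final([1, 1, 0, 0]))  # should land near cs[0] = 0.3, per the test's assertion
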
15 changes: 15 additions & 0 deletions test_convoys.py
@@ -1,10 +1,12 @@
import datetime
import matplotlib
import numpy
import pytest
import random
import scipy.stats
matplotlib.use('Agg') # Needed for matplotlib to run in Travis
import convoys
import convoys.regression


def test_exponential_model(c=0.3, lambd=0.1, n=100000):
@@ -55,6 +57,19 @@ def sample_weibull():
    assert 0.95*k < model.params['k'] < 1.05*k


def test_weibull_regression_model(cs=[0.3, 0.5, 0.7], lambd=0.1, k=0.5, n=10000):
    def sample_weibull():
        return (-numpy.log(random.random())) ** (1.0/k) / lambd
    X = numpy.array([[1] + [r % len(cs) == j for j in range(len(cs))] for r in range(n)])
    B = numpy.array([bool(random.random() < cs[r % len(cs)]) for r in range(n)])
    T = numpy.array([b and sample_weibull() or 1000 for b in B])
    model = convoys.regression.WeibullRegression()
    model.fit(X, B, T)
    for r, c in enumerate(cs):
        x = [1] + [int(r == j) for j in range(len(cs))]
        assert 0.95 * c < model.predict_final(x) < 1.05 * c


def _get_data(c=0.3, k=10, lambd=0.1, n=1000):
    data = []
    now = datetime.datetime(2000, 7, 1)
