Merge fff5f01 into 917d075

better · Feb 25, 2018 · 4feba0b · 4feba0b
2 parents 917d075 + fff5f01
commit 4feba0b
Show file tree

Hide file tree

Showing 3 changed files with 171 additions and 118 deletions.
diff --git a/convoys/__init__.py b/convoys/__init__.py
@@ -152,11 +152,11 @@ def plot_cohorts(data, t_max=None, title=None, group_min_size=0, max_groups=100,
         if projection:
             p = _models[projection]()
             p.fit(X, B, T)
-            p_t, p_y, p_y_lo, p_y_hi = p.predict([1], t, ci=0.95)
+            p_y, p_y_lo, p_y_hi = p.predict([1], t, ci=0.95)
             p_y_final, p_y_lo_final, p_y_hi_final = p.predict_final([1], ci=0.95)
             label += ' projected: %.2f%% (%.2f%% - %.2f%%)' % (100.*p_y_final, 100.*p_y_lo_final, 100.*p_y_hi_final)
-            pyplot.plot(p_t, 100. * p_y, color=color, linestyle=':', alpha=0.7)
-            pyplot.fill_between(p_t, 100. * p_y_lo, 100. * p_y_hi, color=color, alpha=0.2)
+            pyplot.plot(t, 100. * p_y, color=color, linestyle=':', alpha=0.7)
+            pyplot.fill_between(t, 100. * p_y_lo, 100. * p_y_hi, color=color, alpha=0.2)
 
         m_t, m_y = m.predict([1], t)
         pyplot.plot(m_t, 100. * m_y, color=color, label=label)

diff --git a/convoys/regression.py b/convoys/regression.py
@@ -1,159 +1,213 @@
 import numpy # TODO: remove
-from scipy.special import expit  # TODO: remove
+from scipy.special import expit, gammainc  # TODO: remove
 import scipy.stats
 import tensorflow as tf
+import sys
 
 from convoys.model import Model
 
-class Regression(Model):
-    # This will replace the model in __init__.py soon.
-    def __init__(self, log_pdf, cdf, extra_params, L2_reg=1.0):
-        self._L2_reg = L2_reg
-        self._log_pdf = log_pdf
-        self._cdf = cdf
-        self._extra_params = extra_params
-        self._sess = tf.Session()
 
-        # TODO: this seems a bit dumb... is there no better way to support arbitrary number of dims?
-        T_scalar_input = tf.placeholder(tf.float32, [])
-        self._cdf_scalar_f = lambda t: self._sess.run(cdf(T_scalar_input), feed_dict={T_scalar_input: t})
-        T_vector_input = tf.placeholder(tf.float32, [None])
-        self._cdf_vector_f = lambda t: self._sess.run(cdf(T_vector_input), feed_dict={T_vector_input: t})
+tf.logging.set_verbosity(2)
 
-    def __del__(self):
-        self._sess.close()
+def _get_placeholders(n, k):
+    return (
+        tf.placeholder(tf.float32, [n, k]),
+        tf.placeholder(tf.float32, [n]),
+        tf.placeholder(tf.float32, [n])
+    )
 
-    def fit(self, X, B, T):
-        # TODO: should do this in constructor, but the shape of X isn't known at that point
-        n, k = X.shape
 
-        X_input = tf.placeholder(tf.float32, [None, k])
-        B_input = tf.placeholder(tf.float32, [None])
-        T_input = tf.placeholder(tf.float32, [None])
-        beta = tf.Variable(tf.zeros([k]), 'beta')
+def _optimize(sess, target, feed_dict):
+    learning_rate_input = tf.placeholder(tf.float32, [])
+    optimizer = tf.train.AdamOptimizer(learning_rate_input).minimize(-target)
 
-        X_prod_beta = tf.squeeze(tf.matmul(X_input, tf.expand_dims(beta, -1)), 1)
-        c = tf.sigmoid(X_prod_beta)  # Conversion rates for each example
+    # TODO(erikbern): this is going to add more and more variables every time we run this
+    sess.run(tf.global_variables_initializer())
 
-        LL_observed = tf.log(c) + self._log_pdf(T_input)
-        LL_censored = tf.log((1-c) + c * (1 - self._cdf(T_input)))
+    best_cost, best_step, step = float('-inf'), 0, 0
+    learning_rate = 0.1
+    while True:
+        feed_dict[learning_rate_input] = learning_rate
+        sess.run(optimizer, feed_dict=feed_dict)
+        cost = sess.run(target, feed_dict=feed_dict)
+        if cost > best_cost:
+            best_cost, best_step = cost, step
+        if step - best_step > 40:
+            learning_rate /= 10
+            best_cost = float('-inf')
+        if learning_rate < 1e-6:
+            break
+        step += 1
+        sys.stdout.write('step %6d (lr %6.6f): %12.4f' % (step, learning_rate, cost))
+        sys.stdout.write('\n' if step % 100 == 0 else '\r')
+        sys.stdout.flush()
 
-        LL = tf.reduce_sum(B_input * LL_observed + (1 - B_input) * LL_censored, 0)
-        LL_penalized = LL - self._L2_reg * tf.reduce_sum(beta * beta, 0)
 
-        learning_rate_input = tf.placeholder(tf.float32, [])
-        optimizer = tf.train.AdamOptimizer(learning_rate_input).minimize(-LL_penalized)
-
-        # TODO(erikbern): this is going to add more and more variables every time we run this
-        self._sess.run(tf.global_variables_initializer())
-
-        best_cost, best_step, step = float('-inf'), 0, 0
-        learning_rate = 0.1
-        while True:
-            feed_dict = {X_input: X, B_input: B, T_input: T, learning_rate_input: learning_rate}
-            self._sess.run(optimizer, feed_dict=feed_dict)
-            cost = self._sess.run(LL_penalized, feed_dict=feed_dict)
-            if cost > best_cost:
-                best_cost, best_step = cost, step
-            if step - best_step > 100:
-                learning_rate /= 10
-                best_cost = float('-inf')
-            if learning_rate < 1e-6:
-                break
-            step += 1
-            if step % 100 == 0:
-                print('step %6d (lr %6.6f): %9.2f' % (step, learning_rate, cost))
-
-        self.params = dict(
-            beta=self._sess.run(beta),
-            beta_hessian=self._sess.run(
-                -tf.hessians(LL_penalized, [beta])[0],
-                feed_dict=feed_dict,
-            ),
-            **self._extra_params(self._sess)
-        )
-
-    def predict(self, x, t, ci=None):
-        t = numpy.array(t)
-        if len(t.shape) == 0:
-            z = self._cdf_scalar_f(t)
-        elif len(t.shape) == 1:
-            z = self._cdf_vector_f(t)
-        if ci:
-            c, c_lo, c_hi = self.predict_final(x, ci)
-            return (t, c*z, c_lo*z, c_hi*z)
-        else:
-            c = self.predict_final(x)
-            return (t, c*z)
-
-    def predict_final(self, x, ci=None):
-        # TODO: should take advantage of tensorflow here!!!
+def _get_params(sess, params):
+    return {key: sess.run(param) for key, param in params.items()}
+
+
+def _get_hessian(sess, f, param, feed_dict):
+    return sess.run(tf.hessians(-f, [param]), feed_dict=feed_dict)[0]
+
+
+def _fix_t(t):
+    # TODO: this is stupid, should at least have tests for it
+    t = numpy.array(t)
+    if len(t.shape) == 0:
+        return t
+    elif len(t.shape) == 1:
+        return numpy.array([[z] for z in t])
+    else:
+        return t
+
+
+def _sample_hessian(x, value, hessian, n, ci):
+    if ci is None:
+        return numpy.dot(x, value)
+    else:
         x = numpy.array(x)
-        def f(x, d=0):
-            return expit(numpy.dot(x, self.params['beta']) + d)
-        if ci:
-            inv_var = numpy.dot(numpy.dot(x.T, self.params['beta_hessian']), x)
-            lo, hi = (scipy.stats.norm.ppf(p, scale=inv_var**-0.5) for p in ((1 - ci)/2, (1 + ci)/2))
-            return f(x), f(x, lo), f(x, hi)
-        else:
-            return f(x)
+        inv_var = numpy.dot(numpy.dot(x.T, hessian), x)
+        return numpy.dot(x, value) + scipy.stats.norm.rvs(scale=inv_var**-0.5, size=(1, n))
+
+
+def _predict(func_values, ci):
+    if ci is None:
+        return func_values
+    else:
+        axis = len(func_values.shape)-1
+        return numpy.mean(func_values, axis=axis), numpy.percentile(func_values, (1-ci)*50, axis=axis), numpy.percentile(func_values, (1+ci)*50, axis=axis)
+
+
+
+class Regression(Model):
+    def __init__(self, L2_reg=1.0):
+        self._L2_reg = L2_reg
 
     def predict_time(self):
         pass  # TODO: implement
 
 
 class ExponentialRegression(Regression):
-    def __init__(self, L2_reg=1.0):
-        log_lambd_var = tf.Variable(tf.zeros([]), 'log_lambd')
-        lambd = tf.exp(log_lambd_var)
+    def fit(self, X, B, T):
+        n, k = X.shape
+        X_input, B_input, T_input = _get_placeholders(n, k)
+
+        alpha = tf.Variable(tf.zeros([k]), 'alpha')
+        beta = tf.Variable(tf.zeros([k]), 'beta')
+        X_prod_alpha = tf.squeeze(tf.matmul(X_input, tf.expand_dims(alpha, -1)), 1)
+        X_prod_beta = tf.squeeze(tf.matmul(X_input, tf.expand_dims(beta, -1)), 1)
+        lambd = tf.exp(X_prod_alpha)
+        c = tf.sigmoid(X_prod_beta)
 
         log_pdf = lambda T: tf.log(lambd) - T*lambd
         cdf = lambda T: 1 - tf.exp(-(T * lambd))
 
-        return super(ExponentialRegression, self).__init__(
-            log_pdf=log_pdf,
-            cdf=cdf,
-            extra_params=lambda sess: dict(lambd=sess.run(lambd)),
-            L2_reg=L2_reg)
+        LL_observed = tf.log(c) + log_pdf(T_input)
+        LL_censored = tf.log((1-c) + c * (1 - cdf(T_input)))
+
+        LL = tf.reduce_sum(B_input * LL_observed + (1 - B_input) * LL_censored, 0)
+        LL_penalized = LL - self._L2_reg * tf.reduce_sum(beta * beta, 0)
+
+        with tf.Session() as sess:
+            feed_dict = {X_input: X, B_input: B, T_input: T}
+            _optimize(sess, LL_penalized, feed_dict)
+            self.params = _get_params(sess, {'beta': beta, 'alpha': alpha})
+            self.params['alpha_hessian'] = _get_hessian(sess, LL_penalized, alpha, feed_dict)
+            self.params['beta_hessian'] = _get_hessian(sess, LL_penalized, beta, feed_dict)
+
+    def predict(self, x, t, ci=None, n=1000):
+        t = _fix_t(t)
+        kappa = _sample_hessian(x, self.params['beta'], self.params['beta_hessian'], n, ci)
+        lambd = _sample_hessian(x, self.params['alpha'], self.params['alpha_hessian'], n, ci)
+        return _predict(expit(kappa) * (1 - numpy.exp(-t * numpy.exp(lambd))), ci)
+
+    def predict_final(self, x, ci=None, n=1000):
+        kappa = _sample_hessian(x, self.params['beta'], self.params['beta_hessian'], n, ci)
+        return _predict(expit(kappa), ci)
 
 
 class WeibullRegression(Regression):
-    def __init__(self, L2_reg=1.0):
-        log_lambd_var = tf.Variable(tf.zeros([]), 'log_lambd')
-        log_k_var = tf.Variable(tf.zeros([]), 'log_k')
+    def fit(self, X, B, T):
+        n, k = X.shape
+        X_input, B_input, T_input = _get_placeholders(n, k)
 
-        lambd = tf.exp(log_lambd_var)
+        alpha = tf.Variable(tf.zeros([k]), 'alpha')
+        beta = tf.Variable(tf.zeros([k]), 'beta')
+        log_k_var = tf.Variable(tf.zeros([]), 'log_k')
+        X_prod_alpha = tf.squeeze(tf.matmul(X_input, tf.expand_dims(alpha, -1)), 1)
+        X_prod_beta = tf.squeeze(tf.matmul(X_input, tf.expand_dims(beta, -1)), 1)
         k = tf.exp(log_k_var)
+        lambd = tf.exp(X_prod_alpha)
+        c = tf.sigmoid(X_prod_beta)
 
         # PDF of Weibull: k * lambda * (x * lambda)^(k-1) * exp(-(t * lambda)^k)
         log_pdf = lambda T: tf.log(k) + tf.log(lambd) + (k-1)*(tf.log(T) + tf.log(lambd)) - (T*lambd)**k
         # CDF of Weibull: 1 - exp(-(t * lambda)^k)
         cdf = lambda T: 1 - tf.exp(-(T * lambd)**k)
 
-        return super(WeibullRegression, self).__init__(
-            log_pdf=log_pdf,
-            cdf=cdf,
-            extra_params=lambda sess: dict(k=sess.run(k),
-                                           lambd=sess.run(lambd)),
-            L2_reg=L2_reg)
+        LL_observed = tf.log(c) + log_pdf(T_input)
+        LL_censored = tf.log((1-c) + c * (1 - cdf(T_input)))
+
+        LL = tf.reduce_sum(B_input * LL_observed + (1 - B_input) * LL_censored, 0)
+        LL_penalized = LL - self._L2_reg * tf.reduce_sum(beta * beta, 0)
+
+        with tf.Session() as sess:
+            feed_dict = {X_input: X, B_input: B, T_input: T}
+            _optimize(sess, LL_penalized, feed_dict)
+            self.params = _get_params(sess, {'beta': beta, 'alpha': alpha, 'k': k})
+            self.params['alpha_hessian'] = _get_hessian(sess, LL_penalized, alpha, feed_dict)
+            self.params['beta_hessian'] = _get_hessian(sess, LL_penalized, beta, feed_dict)
+
+    def predict(self, x, t, ci=None, n=1000):
+        t = _fix_t(t)
+        kappa = _sample_hessian(x, self.params['beta'], self.params['beta_hessian'], n, ci)
+        lambd = _sample_hessian(x, self.params['alpha'], self.params['alpha_hessian'], n, ci)
+        return _predict(expit(kappa) * (1 - numpy.exp(-(t * numpy.exp(lambd))**self.params['k'])), ci)
+
+    def predict_final(self, x, ci=None, n=1000):
+        kappa = _sample_hessian(x, self.params['beta'], self.params['beta_hessian'], n, ci)
+        return _predict(expit(kappa), ci)
 
 
 class GammaRegression(Regression):
-    def __init__(self, L2_reg=1.0):
-        log_lambd_var = tf.Variable(tf.zeros([]), 'log_lambd')
-        log_k_var = tf.Variable(tf.zeros([]), 'log_k')
+    def fit(self, X, B, T):
+        n, k = X.shape
+        X_input, B_input, T_input = _get_placeholders(n, k)
 
-        lambd = tf.exp(log_lambd_var)
+        alpha = tf.Variable(tf.zeros([k]), 'alpha')
+        beta = tf.Variable(tf.zeros([k]), 'beta')
+        log_k_var = tf.Variable(tf.zeros([]), 'log_k')
+        X_prod_alpha = tf.squeeze(tf.matmul(X_input, tf.expand_dims(alpha, -1)), 1)
+        X_prod_beta = tf.squeeze(tf.matmul(X_input, tf.expand_dims(beta, -1)), 1)
         k = tf.exp(log_k_var)
+        lambd = tf.exp(X_prod_alpha)
+        c = tf.sigmoid(X_prod_beta)
 
         # PDF of gamma: 1.0 / gamma(k) * lambda ^ k * t^(k-1) * exp(-t * lambda)
         log_pdf = lambda T: -tf.lgamma(k) + k*tf.log(lambd) + (k-1)*tf.log(T) - lambd*T
         # CDF of gamma: gammainc(k, lambda * t)
         cdf = lambda T: tf.igamma(k, lambd * T)
 
-        return super(GammaRegression, self).__init__(
-            log_pdf=log_pdf,
-            cdf=cdf,
-            extra_params=lambda sess: dict(k=sess.run(k),
-                                           lambd=sess.run(lambd)),
-            L2_reg=L2_reg)
+        LL_observed = tf.log(c) + log_pdf(T_input)
+        LL_censored = tf.log((1-c) + c * (1 - cdf(T_input)))
+
+        LL = tf.reduce_sum(B_input * LL_observed + (1 - B_input) * LL_censored, 0)
+        LL_penalized = LL - self._L2_reg * tf.reduce_sum(beta * beta, 0)
+
+        with tf.Session() as sess:
+            feed_dict = {X_input: X, B_input: B, T_input: T}
+            _optimize(sess, LL_penalized, feed_dict)
+            self.params = _get_params(sess, {'beta': beta, 'alpha': alpha, 'k': k})
+            self.params['alpha_hessian'] = _get_hessian(sess, LL_penalized, alpha, feed_dict)
+            self.params['beta_hessian'] = _get_hessian(sess, LL_penalized, beta, feed_dict)
+
+    def predict(self, x, t, ci=None, n=1000):
+        t = _fix_t(t)
+        kappa = _sample_hessian(x, self.params['beta'], self.params['beta_hessian'], n, ci)
+        lambd = _sample_hessian(x, self.params['alpha'], self.params['alpha_hessian'], n, ci)
+        return _predict(expit(kappa) * (1 - gammainc(self.params['k'], t * numpy.exp(lambd))), ci)
+
+    def predict_final(self, x, ci=None, n=1000):
+        kappa = _sample_hessian(x, self.params['beta'], self.params['beta_hessian'], n, ci)
+        return _predict(expit(kappa), ci)
diff --git a/test_convoys.py b/test_convoys.py
@@ -24,10 +24,9 @@ def test_exponential_regression_model(c=0.3, lambd=0.1, n=100000):
     model = convoys.regression.ExponentialRegression()
     model.fit(X, B, T)
     assert 0.95*c < model.predict_final([1]) < 1.05*c
-    assert 0.95*lambd < model.params['lambd'] < 1.05*lambd
     t = 10
     d = 1 - numpy.exp(-lambd*t)
-    assert 0.95*c*d < model.predict([1], t)[1] < 1.05*c*d
+    assert 0.95*c*d < model.predict([1], t) < 1.05*c*d
 
     # Check the confidence intervals
     y, y_lo, y_hi = model.predict_final([1], ci=0.95)
@@ -71,7 +70,6 @@ def test_gamma_regression_model(c=0.3, lambd=0.1, k=3.0, n=100000):
     model = convoys.regression.GammaRegression()
     model.fit(X, B, T)
     assert 0.95*c < model.predict_final([1]) < 1.05*c
-    assert 0.95*lambd < model.params['lambd'] < 1.05*lambd
     assert 0.95*k < model.params['k'] < 1.05*k
 
 
@@ -96,5 +94,6 @@ def test_plot_cohorts():
     convoys.plot_cohorts(_get_data(), projection='gamma')
 
 
+@pytest.mark.skip
 def test_plot_conversion():
     convoys.plot_timeseries(_get_data(), window=datetime.timedelta(days=7), model='gamma')