nonparametric method, super slow
erikbern committed Mar 19, 2018
1 parent 433d23c commit c1c8eac
Showing 4 changed files with 102 additions and 42 deletions.
46 changes: 4 additions & 42 deletions convoys/regression.py
@@ -2,7 +2,7 @@
from scipy.special import expit, gamma, gammainc # TODO: remove
import scipy.stats
import tensorflow as tf
import sys
from convoys import tf_utils


tf.logging.set_verbosity(3)
@@ -11,44 +11,6 @@ def _get_constants(args):
return (tf.constant(arg.astype(numpy.float32)) for arg in args)


def _optimize(sess, target, variables):
learning_rate_input = tf.placeholder(tf.float32, [])
optimizer = tf.train.AdamOptimizer(learning_rate_input).minimize(-target)

best_state_variables = [tf.Variable(tf.zeros(v.shape)) for v in variables]
store_best_state = [tf.assign(v, u) for (u, v) in zip(variables, best_state_variables)]
restore_best_state = [tf.assign(u, v) for (u, v) in zip(variables, best_state_variables)]
sess.run(tf.global_variables_initializer())

best_step, step = 0, 0
dec_learning_rate = 1.0
best_cost = sess.run(target)
any_var_is_nan = tf.is_nan(tf.add_n([tf.reduce_sum(v) for v in variables]))

while True:
inc_learning_rate = 10**(min(step, 240)//40-6)
learning_rate = min(inc_learning_rate, dec_learning_rate)
sess.run(optimizer, feed_dict={learning_rate_input: learning_rate})
if sess.run(any_var_is_nan):
cost = float('-inf')
else:
cost = sess.run(target)
if cost > best_cost:
best_cost, best_step = cost, step
sess.run(store_best_state)
elif str(cost) in ('-inf', 'nan') or step - best_step > 40:
sess.run(restore_best_state)
dec_learning_rate = learning_rate / 10
best_step = step
if learning_rate < 1e-6:
sys.stdout.write('\n')
break
step += 1
sys.stdout.write('step %6d (lr %6.6f): %14.3f%30s' % (step, learning_rate, cost, ''))
sys.stdout.write('\n' if step % 100 == 0 else '\r')
sys.stdout.flush()


def _get_params(sess, params):
return {key: sess.run(param) for key, param in params.items()}

@@ -112,7 +74,7 @@ def fit(self, X, B, T):
LL_penalized = LL - self._L2_reg * tf.reduce_sum(beta * beta, 0)

with tf.Session() as sess:
_optimize(sess, LL_penalized, (alpha, beta))
tf_utils.optimize(sess, LL_penalized, (alpha, beta))
self.params = _get_params(sess, {'beta': beta, 'alpha': alpha})
self.params['alpha_hessian'] = _get_hessian(sess, LL_penalized, alpha)
self.params['beta_hessian'] = _get_hessian(sess, LL_penalized, beta)
@@ -158,7 +120,7 @@ def fit(self, X, B, T):
LL_penalized = LL - self._L2_reg * tf.reduce_sum(beta * beta, 0)

with tf.Session() as sess:
_optimize(sess, LL_penalized, (alpha, beta, log_k_var))
tf_utils.optimize(sess, LL_penalized, (alpha, beta, log_k_var))
self.params = _get_params(sess, {'beta': beta, 'alpha': alpha, 'k': k})
self.params['alpha_hessian'] = _get_hessian(sess, LL_penalized, alpha)
self.params['beta_hessian'] = _get_hessian(sess, LL_penalized, beta)
@@ -204,7 +166,7 @@ def fit(self, X, B, T):
LL_penalized = LL - self._L2_reg * tf.reduce_sum(beta * beta, 0)

with tf.Session() as sess:
_optimize(sess, LL_penalized, (alpha, beta, log_k_var))
tf_utils.optimize(sess, LL_penalized, (alpha, beta, log_k_var))
self.params = _get_params(sess, {'beta': beta, 'alpha': alpha, 'k': k})
self.params['alpha_hessian'] = _get_hessian(sess, LL_penalized, alpha)
self.params['beta_hessian'] = _get_hessian(sess, LL_penalized, beta)
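The three `fit` methods above now call the shared optimizer that this commit factors out into convoys/tf_utils.py (new file below). A minimal sketch of the call pattern (the toy objective here is a stand-in for the real penalized log-likelihood; a TF 1.x runtime is assumed):

import tensorflow as tf
from convoys import tf_utils

x = tf.Variable(tf.zeros([]))
target = -(x - 2.0)**2  # maximized at x = 2; optimize() minimizes -target

with tf.Session() as sess:
    tf_utils.optimize(sess, target, (x,))
    print(sess.run(x))  # should end up close to 2.0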
47 changes: 47 additions & 0 deletions convoys/single.py
@@ -1,6 +1,8 @@
import bisect
import lifelines
import numpy
import tensorflow as tf
from convoys import tf_utils


class SingleModel:
@@ -41,3 +43,48 @@ def median(ps):
return median(self.ps), median(self.ps_lo), median(self.ps_hi)
else:
return median(self.ps)


class Nonparametric(SingleModel):
def fit(self, B, T, n=100):
# We're going to fit c and p_0, p_1, ...
        # so that the probability of conversion at time i is c * (1 - p_0) * ... * (1 - p_{i-1}) * p_i
        # What's the total likelihood?
# For items that did convert:
# L = c * (1 - p_0) * ... * (1 - p_{i-1}) * p_i
# For items that did not convert:
# L = 1 - c + c * (1 - p_0) * ... * (1 - p_{i-1})
# Need to sum up the log of that
# Let's replace the p_i's with sigmoids
n = min(n, len(B))
js = [int(round(z)) for z in numpy.linspace(0, len(B), n-1, endpoint=False)]
ts = list(sorted(T))
ts = [ts[j] for j in js]
M = numpy.zeros((len(B), n), dtype=numpy.float32)
N = numpy.zeros((len(B), n), dtype=numpy.float32)
for i, (b, t) in enumerate(zip(B, T)):
j = bisect.bisect_left(ts, t)
M[i,0:j] = -1
N[i,0:j] = 1
if b:
M[i,j] = 1
N[i,j] = 1
M = tf.constant(M)
z = tf.Variable(tf.zeros((n,)))
Z = tf.expand_dims(z, 0)
l = tf.reduce_sum(tf.log(tf.sigmoid(M * Z)) * N, 1)
beta = tf.Variable(tf.zeros([]))
c = tf.sigmoid(beta)
B = tf.constant(B.astype(numpy.float32))
LL_observed = (tf.log(c) + l)
LL_unobserved = (tf.log(1 - c + c * tf.exp(l)))
LL = tf.reduce_sum(B * LL_observed + (1 - B) * LL_unobserved, 0)
with tf.Session() as sess:
tf_utils.optimize(sess, LL, (z, beta))

print('c:', sess.run(c))
w = sess.run(c * (1 - tf.exp(tf.cumsum(tf.log(tf.sigmoid(-z))))))
from matplotlib import pyplot
pyplot.plot(w)
pyplot.savefig('nonparametric.png')

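To make the encoding in Nonparametric.fit concrete, here is a tiny worked example (mine, not part of the commit). Row i of M carries -1 for every time bucket the observation survived and, if it converted, +1 in its conversion bucket, so tf.log(tf.sigmoid(M * Z)) * N sums log(1 - p_m) over the survived buckets plus log(p_j) for the conversion bucket:

import bisect
import numpy

B = numpy.array([1, 0, 1])        # did the observation convert?
T = numpy.array([2.5, 1.5, 0.5])  # conversion time if converted, else censoring time
ts = [1.0, 2.0, 3.0]              # time buckets (n = 3)

M = numpy.zeros((len(B), len(ts)), dtype=numpy.float32)
N = numpy.zeros((len(B), len(ts)), dtype=numpy.float32)
for i, (b, t) in enumerate(zip(B, T)):
    j = bisect.bisect_left(ts, t)
    M[i, 0:j] = -1  # survived buckets: log sigmoid(-z_m) = log(1 - p_m)
    N[i, 0:j] = 1
    if b:
        M[i, j] = 1  # conversion bucket: log sigmoid(z_j) = log p_j
        N[i, j] = 1

print(M)
# [[-1. -1.  1.]   converted in bucket 2 after surviving buckets 0 and 1
#  [-1.  0.  0.]   censored after surviving bucket 0
#  [ 1.  0.  0.]]  converted immediately in bucket 0

The plotted curve follows the same logic: since sigmoid(-z_m) = 1 - p_m, the cumsum inside w is the running log-survival, so w[j] = c * (1 - survival) is the fitted probability of having converted by bucket j.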
40 changes: 40 additions & 0 deletions convoys/tf_utils.py
@@ -0,0 +1,40 @@
import sys
import tensorflow as tf

def optimize(sess, target, variables):
learning_rate_input = tf.placeholder(tf.float32, [])
optimizer = tf.train.AdamOptimizer(learning_rate_input).minimize(-target)

best_state_variables = [tf.Variable(tf.zeros(v.shape)) for v in variables]
store_best_state = [tf.assign(v, u) for (u, v) in zip(variables, best_state_variables)]
restore_best_state = [tf.assign(u, v) for (u, v) in zip(variables, best_state_variables)]
sess.run(tf.global_variables_initializer())

best_step, step = 0, 0
dec_learning_rate = 1.0
best_cost = sess.run(target)
any_var_is_nan = tf.is_nan(tf.add_n([tf.reduce_sum(v) for v in variables]))

while True:
inc_learning_rate = 10**(min(step, 240)//40-6)
learning_rate = min(inc_learning_rate, dec_learning_rate)
sess.run(optimizer, feed_dict={learning_rate_input: learning_rate})
if sess.run(any_var_is_nan):
cost = float('-inf')
else:
cost = sess.run(target)
if cost > best_cost:
best_cost, best_step = cost, step
sess.run(store_best_state)
elif str(cost) in ('-inf', 'nan') or step - best_step > 40:
sess.run(restore_best_state)
dec_learning_rate = learning_rate / 10
best_step = step
if learning_rate < 1e-6:
sys.stdout.write('\n')
break
step += 1
sys.stdout.write('step %6d (lr %6.6f): %14.3f%30s' % (step, learning_rate, cost, ''))
sys.stdout.write('\n' if step % 100 == 0 else '\r')
sys.stdout.flush()

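optimize maximizes target with Adam under a simple adaptive schedule: the learning rate warms up by one decade every 40 steps (from 1e-6, capped at 1.0), and whenever the objective goes NaN or fails to improve for 40 steps the best saved state is restored and the rate is capped one decade below the current one; the loop ends once that cap drops below 1e-6. A pure-Python paraphrase (mine) of just the warm-up schedule:

def schedule(step, dec_learning_rate):
    # warm-up: one decade every 40 steps, from 1e-6 up to 1e0
    inc_learning_rate = 10 ** (min(step, 240) // 40 - 6)
    return min(inc_learning_rate, dec_learning_rate)

for step in (0, 40, 80, 240, 280):
    print(step, schedule(step, dec_learning_rate=1.0))
# 0 1e-06, 40 1e-05, 80 0.0001, 240 1, 280 1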
11 changes: 11 additions & 0 deletions test_convoys.py
@@ -8,6 +8,7 @@
matplotlib.use('Agg') # Needed for matplotlib to run in Travis
import convoys
import convoys.regression
import convoys.single


def sample_weibull(k, lambd):
@@ -120,3 +121,13 @@ def test_plot_cohorts(cs=[0.3, 0.5, 0.7], k=2.0, lambd=0.1, n=10000):
c = cs[0]
assert group == 'Group 0'
assert 0.95*c < y < 1.05 * c


def test_nonparametric_model(c=0.3, lambd=0.1, k=0.5, n=100000):
C = scipy.stats.bernoulli.rvs(c, size=(n,))
N = scipy.stats.uniform.rvs(scale=5./lambd, size=(n,))
E = numpy.array([sample_weibull(k, lambd) for r in range(n)])
B, T = generate_censored_data(N, E, C)

m = convoys.single.Nonparametric()
m.fit(B, T)
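The new test draws Bernoulli(c) conversion flags, Weibull(k, lambd) conversion times, and uniform censoring windows, then checks that the model fits without error. sample_weibull and generate_censored_data are defined earlier in test_convoys.py and are not part of this diff; the inverse-CDF sketch below is an assumption about what sample_weibull plausibly looks like under the (k, lambd) parameterization used here:

import numpy

def sample_weibull(k, lambd):
    # assumed body: a Weibull with shape k and rate lambd has
    # F(t) = 1 - exp(-(lambd*t)**k), so a uniform draw u inverts to
    # t = (-log(1 - u))**(1/k) / lambd
    u = numpy.random.uniform()
    return (-numpy.log(1.0 - u)) ** (1.0 / k) / lambd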
