nonparametric method, super slow
erikbern committed Mar 19, 2018
1 parent 433d23c commit c1c8eac
Showing 4 changed files with 102 additions and 42 deletions.
46 changes: 4 additions & 42 deletions convoys/regression.py
@@ -2,7 +2,7 @@
from scipy.special import expit, gamma, gammainc # TODO: remove
import scipy.stats
import tensorflow as tf
import sys
from convoys import tf_utils


tf.logging.set_verbosity(3)
@@ -11,44 +11,6 @@ def _get_constants(args):
return (tf.constant(arg.astype(numpy.float32)) for arg in args)


def _optimize(sess, target, variables):
learning_rate_input = tf.placeholder(tf.float32, [])
optimizer = tf.train.AdamOptimizer(learning_rate_input).minimize(-target)

best_state_variables = [tf.Variable(tf.zeros(v.shape)) for v in variables]
store_best_state = [tf.assign(v, u) for (u, v) in zip(variables, best_state_variables)]
restore_best_state = [tf.assign(u, v) for (u, v) in zip(variables, best_state_variables)]
sess.run(tf.global_variables_initializer())

best_step, step = 0, 0
dec_learning_rate = 1.0
best_cost = sess.run(target)
any_var_is_nan = tf.is_nan(tf.add_n([tf.reduce_sum(v) for v in variables]))

while True:
inc_learning_rate = 10**(min(step, 240)//40-6)
learning_rate = min(inc_learning_rate, dec_learning_rate)
sess.run(optimizer, feed_dict={learning_rate_input: learning_rate})
if sess.run(any_var_is_nan):
cost = float('-inf')
else:
cost = sess.run(target)
if cost > best_cost:
best_cost, best_step = cost, step
sess.run(store_best_state)
elif str(cost) in ('-inf', 'nan') or step - best_step > 40:
sess.run(restore_best_state)
dec_learning_rate = learning_rate / 10
best_step = step
if learning_rate < 1e-6:
sys.stdout.write('\n')
break
step += 1
sys.stdout.write('step %6d (lr %6.6f): %14.3f%30s' % (step, learning_rate, cost, ''))
sys.stdout.write('\n' if step % 100 == 0 else '\r')
sys.stdout.flush()


def _get_params(sess, params):
return {key: sess.run(param) for key, param in params.items()}

@@ -112,7 +74,7 @@ def fit(self, X, B, T):
LL_penalized = LL - self._L2_reg * tf.reduce_sum(beta * beta, 0)

with tf.Session() as sess:
_optimize(sess, LL_penalized, (alpha, beta))
tf_utils.optimize(sess, LL_penalized, (alpha, beta))
self.params = _get_params(sess, {'beta': beta, 'alpha': alpha})
self.params['alpha_hessian'] = _get_hessian(sess, LL_penalized, alpha)
self.params['beta_hessian'] = _get_hessian(sess, LL_penalized, beta)
@@ -158,7 +120,7 @@ def fit(self, X, B, T):
LL_penalized = LL - self._L2_reg * tf.reduce_sum(beta * beta, 0)

with tf.Session() as sess:
_optimize(sess, LL_penalized, (alpha, beta, log_k_var))
tf_utils.optimize(sess, LL_penalized, (alpha, beta, log_k_var))
self.params = _get_params(sess, {'beta': beta, 'alpha': alpha, 'k': k})
self.params['alpha_hessian'] = _get_hessian(sess, LL_penalized, alpha)
self.params['beta_hessian'] = _get_hessian(sess, LL_penalized, beta)
@@ -204,7 +166,7 @@ def fit(self, X, B, T):
LL_penalized = LL - self._L2_reg * tf.reduce_sum(beta * beta, 0)

with tf.Session() as sess:
_optimize(sess, LL_penalized, (alpha, beta, log_k_var))
tf_utils.optimize(sess, LL_penalized, (alpha, beta, log_k_var))
self.params = _get_params(sess, {'beta': beta, 'alpha': alpha, 'k': k})
self.params['alpha_hessian'] = _get_hessian(sess, LL_penalized, alpha)
self.params['beta_hessian'] = _get_hessian(sess, LL_penalized, beta)
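The three `fit` methods above now call the shared optimizer that this commit factors out into convoys/tf_utils.py (new file below). A minimal sketch of the call pattern (the toy objective here is a stand-in for the real penalized log-likelihood; a TF 1.x runtime is assumed):

import tensorflow as tf
from convoys import tf_utils

x = tf.Variable(tf.zeros([]))
target = -(x - 2.0)**2  # maximized at x = 2; optimize() minimizes -target

with tf.Session() as sess:
    tf_utils.optimize(sess, target, (x,))
    print(sess.run(x))  # should end up close to 2.0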
47 changes: 47 additions & 0 deletions convoys/single.py
@@ -1,6 +1,8 @@
import bisect
import lifelines
import numpy
import tensorflow as tf
from convoys import tf_utils


class SingleModel:
@@ -41,3 +43,48 @@ def median(ps):
return median(self.ps), median(self.ps_lo), median(self.ps_hi)
else:
return median(self.ps)


class Nonparametric(SingleModel):
def fit(self, B, T, n=100):
# We're going to fit c and p_0, p_1, ...
        # so that the probability of conversion at time i is c * (1 - p_0) * ... * (1 - p_{i-1}) * p_i
        # What's the total likelihood?
# For items that did convert:
# L = c * (1 - p_0) * ... * (1 - p_{i-1}) * p_i
# For items that did not convert:
# L = 1 - c + c * (1 - p_0) * ... * (1 - p_{i-1})
# Need to sum up the log of that
# Let's replace the p_i's with sigmoids
n = min(n, len(B))
js = [int(round(z)) for z in numpy.linspace(0, len(B), n-1, endpoint=False)]
ts = list(sorted(T))
ts = [ts[j] for j in js]
M = numpy.zeros((len(B), n), dtype=numpy.float32)
N = numpy.zeros((len(B), n), dtype=numpy.float32)
for i, (b, t) in enumerate(zip(B, T)):
j = bisect.bisect_left(ts, t)
M[i,0:j] = -1
N[i,0:j] = 1
if b:
M[i,j] = 1
N[i,j] = 1
M = tf.constant(M)
z = tf.Variable(tf.zeros((n,)))
Z = tf.expand_dims(z, 0)
l = tf.reduce_sum(tf.log(tf.sigmoid(M * Z)) * N, 1)
beta = tf.Variable(tf.zeros([]))
c = tf.sigmoid(beta)
B = tf.constant(B.astype(numpy.float32))
LL_observed = (tf.log(c) + l)
LL_unobserved = (tf.log(1 - c + c * tf.exp(l)))
LL = tf.reduce_sum(B * LL_observed + (1 - B) * LL_unobserved, 0)
with tf.Session() as sess:
tf_utils.optimize(sess, LL, (z, beta))

print('c:', sess.run(c))
w = sess.run(c * (1 - tf.exp(tf.cumsum(tf.log(tf.sigmoid(-z))))))
from matplotlib import pyplot
pyplot.plot(w)
pyplot.savefig('nonparametric.png')

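To make the encoding in Nonparametric.fit concrete, here is a tiny worked example (mine, not part of the commit). Row i of M carries -1 for every time bucket the observation survived and, if it converted, +1 in its conversion bucket, so tf.log(tf.sigmoid(M * Z)) * N sums log(1 - p_m) over the survived buckets plus log(p_j) for the conversion bucket:

import bisect
import numpy

B = numpy.array([1, 0, 1])        # did the observation convert?
T = numpy.array([2.5, 1.5, 0.5])  # conversion time if converted, else censoring time
ts = [1.0, 2.0, 3.0]              # time buckets (n = 3)

M = numpy.zeros((len(B), len(ts)), dtype=numpy.float32)
N = numpy.zeros((len(B), len(ts)), dtype=numpy.float32)
for i, (b, t) in enumerate(zip(B, T)):
    j = bisect.bisect_left(ts, t)
    M[i, 0:j] = -1  # survived buckets: log sigmoid(-z_m) = log(1 - p_m)
    N[i, 0:j] = 1
    if b:
        M[i, j] = 1  # conversion bucket: log sigmoid(z_j) = log p_j
        N[i, j] = 1

print(M)
# [[-1. -1.  1.]   converted in bucket 2 after surviving buckets 0 and 1
#  [-1.  0.  0.]   censored after surviving bucket 0
#  [ 1.  0.  0.]]  converted immediately in bucket 0

The plotted curve follows the same logic: since sigmoid(-z_m) = 1 - p_m, the cumsum inside w is the running log-survival, so w[j] = c * (1 - survival) is the fitted probability of having converted by bucket j.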
40 changes: 40 additions & 0 deletions convoys/tf_utils.py
@@ -0,0 +1,40 @@
import sys
import tensorflow as tf

def optimize(sess, target, variables):
learning_rate_input = tf.placeholder(tf.float32, [])
optimizer = tf.train.AdamOptimizer(learning_rate_input).minimize(-target)

best_state_variables = [tf.Variable(tf.zeros(v.shape)) for v in variables]
store_best_state = [tf.assign(v, u) for (u, v) in zip(variables, best_state_variables)]
restore_best_state = [tf.assign(u, v) for (u, v) in zip(variables, best_state_variables)]
sess.run(tf.global_variables_initializer())

best_step, step = 0, 0
dec_learning_rate = 1.0
best_cost = sess.run(target)
any_var_is_nan = tf.is_nan(tf.add_n([tf.reduce_sum(v) for v in variables]))

while True:
inc_learning_rate = 10**(min(step, 240)//40-6)
learning_rate = min(inc_learning_rate, dec_learning_rate)
sess.run(optimizer, feed_dict={learning_rate_input: learning_rate})
if sess.run(any_var_is_nan):
cost = float('-inf')
else:
cost = sess.run(target)
if cost > best_cost:
best_cost, best_step = cost, step
sess.run(store_best_state)
elif str(cost) in ('-inf', 'nan') or step - best_step > 40:
sess.run(restore_best_state)
dec_learning_rate = learning_rate / 10
best_step = step
if learning_rate < 1e-6:
sys.stdout.write('\n')
break
step += 1
sys.stdout.write('step %6d (lr %6.6f): %14.3f%30s' % (step, learning_rate, cost, ''))
sys.stdout.write('\n' if step % 100 == 0 else '\r')
sys.stdout.flush()

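optimize maximizes target with Adam under a simple adaptive schedule: the learning rate warms up by one decade every 40 steps (from 1e-6, capped at 1.0), and whenever the objective goes NaN or fails to improve for 40 steps the best saved state is restored and the rate is capped one decade below the current one; the loop ends once that cap drops below 1e-6. A pure-Python paraphrase (mine) of just the warm-up schedule:

def schedule(step, dec_learning_rate):
    # warm-up: one decade every 40 steps, from 1e-6 up to 1e0
    inc_learning_rate = 10 ** (min(step, 240) // 40 - 6)
    return min(inc_learning_rate, dec_learning_rate)

for step in (0, 40, 80, 240, 280):
    print(step, schedule(step, dec_learning_rate=1.0))
# 0 1e-06, 40 1e-05, 80 0.0001, 240 1, 280 1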
11 changes: 11 additions & 0 deletions test_convoys.py
@@ -8,6 +8,7 @@
matplotlib.use('Agg') # Needed for matplotlib to run in Travis
import convoys
import convoys.regression
import convoys.single


def sample_weibull(k, lambd):
@@ -120,3 +121,13 @@ def test_plot_cohorts(cs=[0.3, 0.5, 0.7], k=2.0, lambd=0.1, n=10000):
c = cs[0]
assert group == 'Group 0'
assert 0.95*c < y < 1.05 * c


def test_nonparametric_model(c=0.3, lambd=0.1, k=0.5, n=100000):
C = scipy.stats.bernoulli.rvs(c, size=(n,))
N = scipy.stats.uniform.rvs(scale=5./lambd, size=(n,))
E = numpy.array([sample_weibull(k, lambd) for r in range(n)])
B, T = generate_censored_data(N, E, C)

m = convoys.single.Nonparametric()
m.fit(B, T)
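The new test draws Bernoulli(c) conversion flags, Weibull(k, lambd) conversion times, and uniform censoring windows, then checks that the model fits without error. sample_weibull and generate_censored_data are defined earlier in test_convoys.py and are not part of this diff; the inverse-CDF sketch below is an assumption about what sample_weibull plausibly looks like under the (k, lambd) parameterization used here:

import numpy

def sample_weibull(k, lambd):
    # assumed body: a Weibull with shape k and rate lambd has
    # F(t) = 1 - exp(-(lambd*t)**k), so a uniform draw u inverts to
    # t = (-log(1 - u))**(1/k) / lambd
    u = numpy.random.uniform()
    return (-numpy.log(1.0 - u)) ** (1.0 / k) / lambd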
