Skip to content

Commit

Permalink
rename .predict to .cdf, remove sample_event
Browse files Browse the repository at this point in the history
  • Loading branch information
Erik Bernhardsson committed Apr 9, 2018
1 parent 11afc66 commit d7a5469
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 58 deletions.
30 changes: 5 additions & 25 deletions convoys/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,26 +40,6 @@ def get_arrays(groups, data, t_converter):
return numpy.array(G), numpy.array(B), numpy.array(T)


def sample_event(model, x, t, hi=1e3):
    """Draw a random conversion time for a subject not yet converted at time ``t``.

    The model's predicted conversion curve is inverted numerically: a target
    level ``r`` is drawn uniformly between the curve's value at ``t`` and 1,
    and the time at which the curve reaches ``r`` is located by bisection.

    :param model: object exposing ``predict(x, ts)``, where ``ts`` is a numpy
        array of times and the result can be indexed per time point.
    :param x: feature vector passed through to ``model.predict`` unchanged.
    :param t: current time; the search for the event time starts here.
    :param hi: upper bound of the search interval. If the curve is still
        below the drawn level at ``hi`` the subject is treated as never
        converting.
    :returns: ``None`` if no conversion is sampled, otherwise the midpoint of
        the final bisection interval (an approximation of the event time).
    """
    # We are now at time t. Generate a random event whether the user is going
    # to convert or not.
    # TODO: this is a hacky thing until we have an "invert CDF" method on
    # each model
    def pred(at):
        # predict() is expected to return one value per entry of the time
        # array, so a one-element input yields a one-element result; take
        # that single entry. (The previous ``[1][-1]`` indexed past the end
        # of a one-element result.)
        return model.predict(x, numpy.array([at]))[-1]

    y = pred(t)
    # Uniform draw from [y, 1): only levels the curve has not reached yet.
    r = y + random.random() * (1 - y)
    if pred(hi) < r:
        # The curve never reaches the drawn level within [t, hi].
        return None

    # 20 halvings shrink the interval [t, hi] by a factor of 2**20.
    lo = t
    for _ in range(20):
        mid = (lo + hi) / 2
        if pred(mid) < r:
            lo = mid
        else:
            hi = mid
    return (lo + hi) / 2


def get_groups(data, group_min_size, max_groups):
group2count = {}
for group, created_at, converted_at, now in data:
Expand Down Expand Up @@ -111,23 +91,23 @@ def plot_cohorts(data, t_max=None, title=None, group_min_size=0, max_groups=100,
label = '%s (n=%.0f, k=%.0f)' % (group, n, k)

if ci is not None:
p_y, p_y_lo, p_y_hi = m.predict(j, t, ci=ci).T
p_y_final, p_y_lo_final, p_y_hi_final = m.predict(j, float('inf'), ci=0.95)
p_y, p_y_lo, p_y_hi = m.cdf(j, t, ci=ci).T
p_y_final, p_y_lo_final, p_y_hi_final = m.cdf(j, float('inf'), ci=0.95)
if str(p_y_final) != 'nan':
label += ' projected: %.2f%% (%.2f%% - %.2f%%)' % (100.*p_y_final, 100.*p_y_lo_final, 100.*p_y_hi_final)
result.append((group, p_y_final, p_y_lo_final, p_y_hi_final))
pyplot.plot(t, 100. * p_y, color=color, linewidth=1.5, alpha=0.7, label=label)
pyplot.fill_between(t, 100. * p_y_lo, 100. * p_y_hi, color=color, alpha=0.2)
else:
p_y = m.predict(j, t).T
p_y_final = m.predict(j, float('inf'), ci=None)
p_y = m.cdf(j, t).T
p_y_final = m.cdf(j, float('inf'), ci=None)
if str(p_y_final) != 'nan':
label += ' projected: %.2f%%' % (100.*p_y_final,)
result.append((group, p_y_final))
pyplot.plot(t, 100. * p_y, color=color, linewidth=1.5, alpha=0.7, label=label)

if extra_model is not None:
extra_p_y = extra_m.predict(j, t)
extra_p_y = extra_m.cdf(j, t)
pyplot.plot(t, 100. * extra_p_y, color=color, linestyle='--', linewidth=1.5, alpha=0.7)
y_max = max(y_max, 110. * max(p_y))

Expand Down
8 changes: 4 additions & 4 deletions convoys/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ def _get_x(self, group):
x[group] = 1
return x

def predict(self, group, t, *args, **kwargs):
    # Translate the group into a feature vector with self._get_x and delegate
    # to the wrapped base model's predict(); t and any extra positional or
    # keyword arguments are forwarded unchanged.
    return self.base_model.predict(self._get_x(group), t, *args, **kwargs)
def cdf(self, group, t, *args, **kwargs):
    # Translate the group into a feature vector with self._get_x and delegate
    # to the wrapped base model's cdf(); t and any extra positional or
    # keyword arguments are forwarded unchanged.
    return self.base_model.cdf(self._get_x(group), t, *args, **kwargs)


class SingleToMulti(MultiModel):
Expand All @@ -40,8 +40,8 @@ def fit(self, G, B, T):
self._group2model[g] = self.base_model_init()
self._group2model[g].fit([b for b, t in BT], [t for b, t in BT])

def predict(self, group, t, *args, **kwargs):
    # Look up the model stored for this group in self._group2model and
    # delegate to its predict(); t and any extra arguments are forwarded
    # unchanged.
    # NOTE(review): presumably fails for a group that fit() never saw --
    # confirm how _group2model is initialised.
    return self._group2model[group].predict(t, *args, **kwargs)
def cdf(self, group, t, *args, **kwargs):
    # Look up the model stored for this group in self._group2model and
    # delegate to its cdf(); t and any extra arguments are forwarded
    # unchanged.
    # NOTE(review): presumably fails for a group that fit() never saw --
    # confirm how _group2model is initialised.
    return self._group2model[group].cdf(t, *args, **kwargs)


class Exponential(RegressionToMulti):
Expand Down
6 changes: 3 additions & 3 deletions convoys/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def fit(self, X, B, T):
'b': b.params(sess, LL, feed_dict),
}

def predict(self, x, t, ci=None, n=1000):
def cdf(self, x, t, ci=None, n=1000):
t = numpy.array(t)
a = LinearCombination.sample(self.params['a'], x, ci, n)
b = LinearCombination.sample(self.params['b'], x, ci, n)
Expand Down Expand Up @@ -104,7 +104,7 @@ def fit(self, X, B, T):
'k': sess.run(k),
}

def predict(self, x, t, ci=None, n=1000):
def cdf(self, x, t, ci=None, n=1000):
t = numpy.array(t)
a = LinearCombination.sample(self.params['a'], x, ci, n)
b = LinearCombination.sample(self.params['b'], x, ci, n)
Expand Down Expand Up @@ -155,7 +155,7 @@ def update_k(sess):
'k': sess.run(k),
}

def predict(self, x, t, ci=None, n=1000):
def cdf(self, x, t, ci=None, n=1000):
t = numpy.array(t)
a = LinearCombination.sample(self.params['a'], x, ci, n)
b = LinearCombination.sample(self.params['b'], x, ci, n)
Expand Down
4 changes: 2 additions & 2 deletions convoys/single.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def _get_value_at(self, j, ci):
else:
return 1 - self._ss[j]

def predict(self, t, ci=None):
def cdf(self, t, ci=None):
t = numpy.array(t)
res = numpy.zeros(t.shape + (3,) if ci else t.shape)
for indexes, value in numpy.ndenumerate(t):
Expand Down Expand Up @@ -119,7 +119,7 @@ def get_LL(log_survived_until, log_survived_after, log_observed):
}
self.params['z_std'] = 0 # not sure what's up, will revisit this

def predict(self, t, ci=None, n=1000):
def cdf(self, t, ci=None, n=1000):
t = numpy.array(t)
if ci:
betas = numpy.random.normal(self.params['beta'], self.params['beta_std'], n)
Expand Down
48 changes: 24 additions & 24 deletions test_convoys.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,18 @@ def test_exponential_regression_model(c=0.3, lambd=0.1, n=100000):
B, T = generate_censored_data(N, E, C)
model = convoys.regression.Exponential()
model.fit(X, B, T)
assert model.predict([1], float('inf')).shape == ()
assert 0.95*c < model.predict([1], float('inf')) < 1.05*c
assert model.predict([1], 0).shape == ()
assert model.predict([1], [0, 1, 2, 3]).shape == (4,)
assert model.cdf([1], float('inf')).shape == ()
assert 0.95*c < model.cdf([1], float('inf')) < 1.05*c
assert model.cdf([1], 0).shape == ()
assert model.cdf([1], [0, 1, 2, 3]).shape == (4,)
t = 10
d = 1 - numpy.exp(-lambd*t)
assert 0.95*c*d < model.predict([1], t) < 1.05*c*d
assert 0.95*c*d < model.cdf([1], t) < 1.05*c*d

# Check the confidence intervals
assert model.predict([1], float('inf'), ci=0.95).shape == (3,)
assert model.predict([1], [0, 1, 2, 3], ci=0.95).shape == (4, 3)
y, y_lo, y_hi = model.predict([1], float('inf'), ci=0.95)
assert model.cdf([1], float('inf'), ci=0.95).shape == (3,)
assert model.cdf([1], [0, 1, 2, 3], ci=0.95).shape == (4, 3)
y, y_lo, y_hi = model.cdf([1], float('inf'), ci=0.95)
c_lo = scipy.stats.beta.ppf(0.025, n*c, n*(1-c))
c_hi = scipy.stats.beta.ppf(0.975, n*c, n*(1-c))
assert 0.95*c < y < 1.05*c
Expand All @@ -63,17 +63,17 @@ def test_weibull_regression_model(cs=[0.3, 0.5, 0.7], lambd=0.1, k=0.5, n=100000

# Validate shape of results
x = numpy.ones((len(cs),))
assert model.predict(x, float('inf')).shape == ()
assert model.predict(x, float('inf'), ci=0.95).shape == (3,)
assert model.predict(x, 1).shape == ()
assert model.predict(x, 1, ci=True).shape == (3,)
assert model.predict(x, [1, 2, 3, 4]).shape == (4,)
assert model.predict(x, [1, 2, 3, 4], ci=True).shape == (4, 3)
assert model.cdf(x, float('inf')).shape == ()
assert model.cdf(x, float('inf'), ci=0.95).shape == (3,)
assert model.cdf(x, 1).shape == ()
assert model.cdf(x, 1, ci=True).shape == (3,)
assert model.cdf(x, [1, 2, 3, 4]).shape == (4,)
assert model.cdf(x, [1, 2, 3, 4], ci=True).shape == (4, 3)

# Check results
for r, c in enumerate(cs):
x = [int(r == j) for j in range(len(cs))]
assert 0.95 * c < model.predict(x, float('inf')) < 1.05 * c
assert 0.95 * c < model.cdf(x, float('inf')) < 1.05 * c
expected_time = 1./lambd * scipy.special.gamma(1 + 1/k)


Expand All @@ -87,7 +87,7 @@ def test_weibull_regression_model_ci(c=0.3, lambd=0.1, k=0.5, n=100000):

model = convoys.regression.Weibull()
model.fit(X, B, T)
y, y_lo, y_hi = model.predict([1], float('inf'), ci=0.95)
y, y_lo, y_hi = model.cdf([1], float('inf'), ci=0.95)
c_lo = scipy.stats.beta.ppf(0.025, n*c, n*(1-c))
c_hi = scipy.stats.beta.ppf(0.975, n*c, n*(1-c))
assert 0.95*c < y < 1.05 * c
Expand All @@ -105,7 +105,7 @@ def test_gamma_regression_model(c=0.3, lambd=0.1, k=3.0, n=100000):

model = convoys.regression.Gamma()
model.fit(X, B, T)
assert 0.95*c < model.predict([1], float('inf')) < 1.05*c
assert 0.95*c < model.cdf([1], float('inf')) < 1.05*c
assert 0.90*k < model.params['k'] < 1.10*k


Expand All @@ -119,15 +119,15 @@ def test_nonparametric_model(c=0.3, lambd=0.1, k=0.5, n=10000):
m = convoys.single.Nonparametric()
m.fit(B, T)

assert 0.90*c < m.predict(float('inf')) < 1.10*c
assert 0.90*c < m.cdf(float('inf')) < 1.10*c

# Check shapes of results
assert m.predict(float('inf')).shape == ()
assert m.predict(float('inf'), ci=0.95).shape == (3,)
assert m.predict(1).shape == ()
assert m.predict([1, 2, 3, 4]).shape == (4,)
assert m.predict(1, ci=0.95).shape == (3,)
assert m.predict([1, 2, 3, 4], ci=0.95).shape == (4, 3)
assert m.cdf(float('inf')).shape == ()
assert m.cdf(float('inf'), ci=0.95).shape == (3,)
assert m.cdf(1).shape == ()
assert m.cdf([1, 2, 3, 4]).shape == (4,)
assert m.cdf(1, ci=0.95).shape == (3,)
assert m.cdf([1, 2, 3, 4], ci=0.95).shape == (4, 3)


def _test_plot_cohorts(cs=[0.3, 0.5, 0.7], k=0.5, lambd=0.1, n=10000, model='weibull', extra_model=None):
Expand Down

0 comments on commit d7a5469

Please sign in to comment.