rename .predict to .cdf, remove sample_event

better · Apr 9, 2018 · d7a5469
1 parent 11afc66
commit d7a5469
Show file tree

Hide file tree

Showing 5 changed files with 38 additions and 58 deletions.
diff --git a/convoys/__init__.py b/convoys/__init__.py
@@ -40,26 +40,6 @@ def get_arrays(groups, data, t_converter):
     return numpy.array(G), numpy.array(B), numpy.array(T)
 
 
-def sample_event(model, x, t, hi=1e3):
-    # We are now at time t. Generate a random event whether the user is going to convert or not
-    # TODO: this is a hacky thing until we have a "invert CDF" method on each model
-    def pred(t):
-        ts = numpy.array([t])
-        return model.predict(x, ts)[1][-1]
-    y = pred(t)
-    r = y + random.random() * (1 - y)
-    if pred(hi) < r:
-        return None
-    lo = t
-    for j in range(20):
-        mid = (lo + hi) / 2
-        if pred(mid) < r:
-            lo = mid
-        else:
-            hi = mid
-    return (lo + hi)/2
-
-
 def get_groups(data, group_min_size, max_groups):
     group2count = {}
     for group, created_at, converted_at, now in data:
@@ -111,23 +91,23 @@ def plot_cohorts(data, t_max=None, title=None, group_min_size=0, max_groups=100,
         label = '%s (n=%.0f, k=%.0f)' % (group, n, k)
 
         if ci is not None:
-            p_y, p_y_lo, p_y_hi = m.predict(j, t, ci=ci).T
-            p_y_final, p_y_lo_final, p_y_hi_final = m.predict(j, float('inf'), ci=0.95)
+            p_y, p_y_lo, p_y_hi = m.cdf(j, t, ci=ci).T
+            p_y_final, p_y_lo_final, p_y_hi_final = m.cdf(j, float('inf'), ci=0.95)
             if str(p_y_final) != 'nan':
                 label += ' projected: %.2f%% (%.2f%% - %.2f%%)' % (100.*p_y_final, 100.*p_y_lo_final, 100.*p_y_hi_final)
             result.append((group, p_y_final, p_y_lo_final, p_y_hi_final))
             pyplot.plot(t, 100. * p_y, color=color, linewidth=1.5, alpha=0.7, label=label)
             pyplot.fill_between(t, 100. * p_y_lo, 100. * p_y_hi, color=color, alpha=0.2)
         else:
-            p_y = m.predict(j, t).T
-            p_y_final = m.predict(j, float('inf'), ci=None)
+            p_y = m.cdf(j, t).T
+            p_y_final = m.cdf(j, float('inf'), ci=None)
             if str(p_y_final) != 'nan':
                 label += ' projected: %.2f%%' % (100.*p_y_final,)
             result.append((group, p_y_final))
             pyplot.plot(t, 100. * p_y, color=color, linewidth=1.5, alpha=0.7, label=label)
 
         if extra_model is not None:
-            extra_p_y = extra_m.predict(j, t)
+            extra_p_y = extra_m.cdf(j, t)
             pyplot.plot(t, 100. * extra_p_y, color=color, linestyle='--', linewidth=1.5, alpha=0.7)
         y_max = max(y_max, 110. * max(p_y))
 

diff --git a/convoys/multi.py b/convoys/multi.py
@@ -23,8 +23,8 @@ def _get_x(self, group):
         x[group] = 1
         return x
 
-    def predict(self, group, t, *args, **kwargs):
-        return self.base_model.predict(self._get_x(group), t, *args, **kwargs)
+    def cdf(self, group, t, *args, **kwargs):
+        return self.base_model.cdf(self._get_x(group), t, *args, **kwargs)
 
 
 class SingleToMulti(MultiModel):
@@ -40,8 +40,8 @@ def fit(self, G, B, T):
             self._group2model[g] = self.base_model_init()
             self._group2model[g].fit([b for b, t in BT], [t for b, t in BT])
 
-    def predict(self, group, t, *args, **kwargs):
-        return self._group2model[group].predict(t, *args, **kwargs)
+    def cdf(self, group, t, *args, **kwargs):
+        return self._group2model[group].cdf(t, *args, **kwargs)
 
 
 class Exponential(RegressionToMulti):

diff --git a/convoys/regression.py b/convoys/regression.py
@@ -65,7 +65,7 @@ def fit(self, X, B, T):
                 'b': b.params(sess, LL, feed_dict),
             }
 
-    def predict(self, x, t, ci=None, n=1000):
+    def cdf(self, x, t, ci=None, n=1000):
         t = numpy.array(t)
         a = LinearCombination.sample(self.params['a'], x, ci, n)
         b = LinearCombination.sample(self.params['b'], x, ci, n)
@@ -104,7 +104,7 @@ def fit(self, X, B, T):
                 'k': sess.run(k),
             }
 
-    def predict(self, x, t, ci=None, n=1000):
+    def cdf(self, x, t, ci=None, n=1000):
         t = numpy.array(t)
         a = LinearCombination.sample(self.params['a'], x, ci, n)
         b = LinearCombination.sample(self.params['b'], x, ci, n)
@@ -155,7 +155,7 @@ def update_k(sess):
                 'k': sess.run(k),
             }
 
-    def predict(self, x, t, ci=None, n=1000):
+    def cdf(self, x, t, ci=None, n=1000):
         t = numpy.array(t)
         a = LinearCombination.sample(self.params['a'], x, ci, n)
         b = LinearCombination.sample(self.params['b'], x, ci, n)

diff --git a/convoys/single.py b/convoys/single.py
@@ -42,7 +42,7 @@ def _get_value_at(self, j, ci):
         else:
             return 1 - self._ss[j]
 
-    def predict(self, t, ci=None):
+    def cdf(self, t, ci=None):
         t = numpy.array(t)
         res = numpy.zeros(t.shape + (3,) if ci else t.shape)
         for indexes, value in numpy.ndenumerate(t):
@@ -119,7 +119,7 @@ def get_LL(log_survived_until, log_survived_after, log_observed):
             }
             self.params['z_std'] = 0  # not sure what's up, will revisit this
 
-    def predict(self, t, ci=None, n=1000):
+    def cdf(self, t, ci=None, n=1000):
         t = numpy.array(t)
         if ci:
             betas = numpy.random.normal(self.params['beta'], self.params['beta_std'], n)

diff --git a/test_convoys.py b/test_convoys.py
@@ -32,18 +32,18 @@ def test_exponential_regression_model(c=0.3, lambd=0.1, n=100000):
     B, T = generate_censored_data(N, E, C)
     model = convoys.regression.Exponential()
     model.fit(X, B, T)
-    assert model.predict([1], float('inf')).shape == ()
-    assert 0.95*c < model.predict([1], float('inf')) < 1.05*c
-    assert model.predict([1], 0).shape == ()
-    assert model.predict([1], [0, 1, 2, 3]).shape == (4,)
+    assert model.cdf([1], float('inf')).shape == ()
+    assert 0.95*c < model.cdf([1], float('inf')) < 1.05*c
+    assert model.cdf([1], 0).shape == ()
+    assert model.cdf([1], [0, 1, 2, 3]).shape == (4,)
     t = 10
     d = 1 - numpy.exp(-lambd*t)
-    assert 0.95*c*d < model.predict([1], t) < 1.05*c*d
+    assert 0.95*c*d < model.cdf([1], t) < 1.05*c*d
 
     # Check the confidence intervals
-    assert model.predict([1], float('inf'), ci=0.95).shape == (3,)
-    assert model.predict([1], [0, 1, 2, 3], ci=0.95).shape == (4, 3)
-    y, y_lo, y_hi = model.predict([1], float('inf'), ci=0.95)
+    assert model.cdf([1], float('inf'), ci=0.95).shape == (3,)
+    assert model.cdf([1], [0, 1, 2, 3], ci=0.95).shape == (4, 3)
+    y, y_lo, y_hi = model.cdf([1], float('inf'), ci=0.95)
     c_lo = scipy.stats.beta.ppf(0.025, n*c, n*(1-c))
     c_hi = scipy.stats.beta.ppf(0.975, n*c, n*(1-c))
     assert 0.95*c < y < 1.05*c
@@ -63,17 +63,17 @@ def test_weibull_regression_model(cs=[0.3, 0.5, 0.7], lambd=0.1, k=0.5, n=100000
 
     # Validate shape of results
     x = numpy.ones((len(cs),))
-    assert model.predict(x, float('inf')).shape == ()
-    assert model.predict(x, float('inf'), ci=0.95).shape == (3,)
-    assert model.predict(x, 1).shape == ()
-    assert model.predict(x, 1, ci=True).shape == (3,)
-    assert model.predict(x, [1, 2, 3, 4]).shape == (4,)
-    assert model.predict(x, [1, 2, 3, 4], ci=True).shape == (4, 3)
+    assert model.cdf(x, float('inf')).shape == ()
+    assert model.cdf(x, float('inf'), ci=0.95).shape == (3,)
+    assert model.cdf(x, 1).shape == ()
+    assert model.cdf(x, 1, ci=True).shape == (3,)
+    assert model.cdf(x, [1, 2, 3, 4]).shape == (4,)
+    assert model.cdf(x, [1, 2, 3, 4], ci=True).shape == (4, 3)
 
     # Check results
     for r, c in enumerate(cs):
         x = [int(r == j) for j in range(len(cs))]
-        assert 0.95 * c < model.predict(x, float('inf')) < 1.05 * c
+        assert 0.95 * c < model.cdf(x, float('inf')) < 1.05 * c
         expected_time = 1./lambd * scipy.special.gamma(1 + 1/k)
 
 
@@ -87,7 +87,7 @@ def test_weibull_regression_model_ci(c=0.3, lambd=0.1, k=0.5, n=100000):
 
     model = convoys.regression.Weibull()
     model.fit(X, B, T)
-    y, y_lo, y_hi = model.predict([1], float('inf'), ci=0.95)
+    y, y_lo, y_hi = model.cdf([1], float('inf'), ci=0.95)
     c_lo = scipy.stats.beta.ppf(0.025, n*c, n*(1-c))
     c_hi = scipy.stats.beta.ppf(0.975, n*c, n*(1-c))
     assert 0.95*c < y < 1.05 * c
@@ -105,7 +105,7 @@ def test_gamma_regression_model(c=0.3, lambd=0.1, k=3.0, n=100000):
 
     model = convoys.regression.Gamma()
     model.fit(X, B, T)
-    assert 0.95*c < model.predict([1], float('inf')) < 1.05*c
+    assert 0.95*c < model.cdf([1], float('inf')) < 1.05*c
     assert 0.90*k < model.params['k'] < 1.10*k
 
 
@@ -119,15 +119,15 @@ def test_nonparametric_model(c=0.3, lambd=0.1, k=0.5, n=10000):
     m = convoys.single.Nonparametric()
     m.fit(B, T)
 
-    assert 0.90*c < m.predict(float('inf')) < 1.10*c
+    assert 0.90*c < m.cdf(float('inf')) < 1.10*c
 
     # Check shapes of results
-    assert m.predict(float('inf')).shape == ()
-    assert m.predict(float('inf'), ci=0.95).shape == (3,)
-    assert m.predict(1).shape == ()
-    assert m.predict([1, 2, 3, 4]).shape == (4,)
-    assert m.predict(1, ci=0.95).shape == (3,)
-    assert m.predict([1, 2, 3, 4], ci=0.95).shape == (4, 3)
+    assert m.cdf(float('inf')).shape == ()
+    assert m.cdf(float('inf'), ci=0.95).shape == (3,)
+    assert m.cdf(1).shape == ()
+    assert m.cdf([1, 2, 3, 4]).shape == (4,)
+    assert m.cdf(1, ci=0.95).shape == (3,)
+    assert m.cdf([1, 2, 3, 4], ci=0.95).shape == (4, 3)
 
 
 def _test_plot_cohorts(cs=[0.3, 0.5, 0.7], k=0.5, lambd=0.1, n=10000, model='weibull', extra_model=None):