diff --git a/convoys/__init__.py b/convoys/__init__.py index 737d746..59342d7 100644 --- a/convoys/__init__.py +++ b/convoys/__init__.py @@ -1,15 +1,13 @@ import abc -import bisect import datetime import lifelines import math import numpy import random import seaborn -import six from matplotlib import pyplot -from convoys.model import Model from convoys.regression import ExponentialRegression, WeibullRegression, GammaRegression +from convoys.single import KaplanMeier def get_timescale(t): @@ -38,41 +36,6 @@ def get_arrays(data, t_converter): return X, numpy.array(B), numpy.array(T) -class KaplanMeier(Model): - def fit(self, X, B, T): - kmf = lifelines.KaplanMeierFitter() - kmf.fit(T, event_observed=B) - self.ts = kmf.survival_function_.index.values - self.ps = 1.0 - kmf.survival_function_['KM_estimate'].values - self.ps_hi = 1.0 - kmf.confidence_interval_['KM_estimate_lower_0.95'].values - self.ps_lo = 1.0 - kmf.confidence_interval_['KM_estimate_upper_0.95'].values - - def predict(self, x, ts, ci=None): - js = [bisect.bisect_left(self.ts, t) for t in ts] - def array_lookup(a): - return numpy.array([a[j] for j in js if j < len(self.ts)]) - if ci is not None: - return (array_lookup(self.ts), array_lookup(self.ps), array_lookup(self.ps_lo), array_lookup(self.ps_hi)) - else: - return (array_lookup(self.ts), array_lookup(self.ps)) - - def predict_final(self, x, ci=None): - if ci is not None: - return (self.ps[-1], self.ps_lo[-1], self.ps_hi[-1]) - else: - return self.ps[-1] - - def predict_time(self, x, ci=None): - # TODO: should not use median here, but mean is no good - def median(ps): - i = bisect.bisect_left(ps, 0.5) - return self.ts[min(i, len(ps)-1)] - if ci is not None: - return median(self.ps), median(self.ps_lo), median(self.ps_hi) - else: - return median(self.ps) - - def sample_event(model, x, t, hi=1e3): # We are now at time t. Generate a random event whether the user is going to convert or not # TODO: this is a hacky thing until we have a "invert CDF" method on each model diff --git a/convoys/model.py b/convoys/model.py deleted file mode 100644 index 0f7eb3f..0000000 --- a/convoys/model.py +++ /dev/null @@ -1,21 +0,0 @@ -import abc -import six - - -@six.add_metaclass(abc.ABCMeta) -class Model(): - @abc.abstractmethod - def fit(self, X, B, T): - pass - - @abc.abstractmethod - def predict(self, x, ts, ci=None): - pass - - @abc.abstractmethod - def predict_final(self, x, ci=None): - pass - - @abc.abstractmethod - def predict_time(self, x, ci=None): - pass diff --git a/convoys/regression.py b/convoys/regression.py index 39813c7..bf9cb52 100644 --- a/convoys/regression.py +++ b/convoys/regression.py @@ -4,8 +4,6 @@ import tensorflow as tf import sys -from convoys.model import Model - tf.logging.set_verbosity(3) @@ -87,12 +85,12 @@ def _predict(func_values, ci): return numpy.mean(func_values, axis=axis), numpy.percentile(func_values, (1-ci)*50, axis=axis), numpy.percentile(func_values, (1+ci)*50, axis=axis) -class Regression(Model): +class RegressionModel: def __init__(self, L2_reg=1.0): self._L2_reg = L2_reg -class ExponentialRegression(Regression): +class ExponentialRegression(RegressionModel): def fit(self, X, B, T): n, k = X.shape X_input, B_input, T_input = _get_constants((X, B, T)) @@ -134,7 +132,7 @@ def predict_time(self, x, ci=None, n=1000): return _predict(1./numpy.exp(x_prod_alpha), ci) -class WeibullRegression(Regression): +class WeibullRegression(RegressionModel): def fit(self, X, B, T): n, k = X.shape X_input, B_input, T_input = _get_constants((X, B, T)) @@ -180,7 +178,7 @@ def predict_time(self, x, ci=None, n=1000): return _predict(1./numpy.exp(x_prod_alpha) * gamma(1 + 1./self.params['k']), ci) -class GammaRegression(Regression): +class GammaRegression(RegressionModel): def fit(self, X, B, T): n, k = X.shape X_input, B_input, T_input = _get_constants((X, B, T)) diff --git a/convoys/single.py b/convoys/single.py new file mode 100644 index 0000000..ad7f642 --- /dev/null +++ b/convoys/single.py @@ -0,0 +1,37 @@ +import bisect +import lifelines +import numpy + +class KaplanMeier: + def fit(self, X, B, T): + kmf = lifelines.KaplanMeierFitter() + kmf.fit(T, event_observed=B) + self.ts = kmf.survival_function_.index.values + self.ps = 1.0 - kmf.survival_function_['KM_estimate'].values + self.ps_hi = 1.0 - kmf.confidence_interval_['KM_estimate_lower_0.95'].values + self.ps_lo = 1.0 - kmf.confidence_interval_['KM_estimate_upper_0.95'].values + + def predict(self, x, ts, ci=None): + js = [bisect.bisect_left(self.ts, t) for t in ts] + def array_lookup(a): + return numpy.array([a[j] for j in js if j < len(self.ts)]) + if ci is not None: + return (array_lookup(self.ts), array_lookup(self.ps), array_lookup(self.ps_lo), array_lookup(self.ps_hi)) + else: + return (array_lookup(self.ts), array_lookup(self.ps)) + + def predict_final(self, x, ci=None): + if ci is not None: + return (self.ps[-1], self.ps_lo[-1], self.ps_hi[-1]) + else: + return self.ps[-1] + + def predict_time(self, x, ci=None): + # TODO: should not use median here, but mean is no good + def median(ps): + i = bisect.bisect_left(ps, 0.5) + return self.ts[min(i, len(ps)-1)] + if ci is not None: + return median(self.ps), median(self.ps_lo), median(self.ps_hi) + else: + return median(self.ps) diff --git a/requirements.txt b/requirements.txt index 4d5f04f..49aabb2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,4 @@ matplotlib>=2.0.0 numpy scipy seaborn==0.8.1 -six==1.11.0 tensorflow==1.6.0rc1 diff --git a/test_convoys.py b/test_convoys.py index ae6420f..14620e2 100644 --- a/test_convoys.py +++ b/test_convoys.py @@ -113,3 +113,6 @@ def test_plot_cohorts(cs=[0.3, 0.5, 0.7], k=2.0, lambd=0.1, n=100000): assert group == 'Group 0' assert 0.95*c < y < 1.05 * c assert 0.70*(c_hi-c_lo) < (y_hi-y_lo) < 1.30*(c_hi-c_lo) + + # Also plot with default arguments (TODO: add assertions) + convoys.plot_cohorts(data)