Commit

Merge pull request #11 from better/use-regression-models
Use regression models for plotting etc
erikbern committed Feb 20, 2018
2 parents b22b03c + c152e19 commit 917d075
Showing 5 changed files with 74 additions and 336 deletions.
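At its core this is an interface change: the hand-rolled Exponential, Weibull, and Gamma maximum-likelihood models move out of convoys/__init__.py into convoys.regression, and every model is now fit on a feature matrix. Roughly, the calling convention shifts as follows (a sketch assembled from the hunks below; the regression class internals live in convoys/regression.py, which this page has not loaded):

# Before: fit on (C, N, B) = time-to-conversion, time-under-observation,
# converted-or-not; predict from timestamps alone.
m = Weibull()
m.fit(C, N, B)
ts, ys = m.predict(t)

# After: fit on (X, B, T) = features, converted-or-not, duration; every
# predict-style method takes a feature vector x as its first argument.
m = WeibullRegression()
m.fit(X, B, T)
ts, ys = m.predict([1], t)  # [1] = intercept-only features, as in plot_cohorts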
295 changes: 25 additions & 270 deletions convoys/__init__.py
@@ -6,21 +6,10 @@
import numpy
import random
import seaborn
import scipy.optimize
import six
from autograd import jacobian, hessian, grad
from autograd.scipy.special import expit, gamma, gammainc, gammaincc, gammaln
from autograd.numpy import exp, log, sum
from matplotlib import pyplot

LOG_EPS = 1e-12 # Used for log likelihood


def log1pexp(x):
# returns log(1 + exp(x))
p, q = min(x, 0), max(x, 0)
return q + log(exp(p-q) + 1)
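This deleted helper is the standard numerically stable form of log(1 + exp(x)): for x > 0 it evaluates x + log(1 + exp(-x)), so the exponent never overflows. A quick illustration (hypothetical, not part of the commit):

log1pexp(1000.)      # 1000.0
log(exp(1000.) + 1)  # inf: exp overflows before log can tame it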

from convoys.model import Model
from convoys.regression import ExponentialRegression, WeibullRegression, GammaRegression


def get_timescale(t):
@@ -41,101 +30,24 @@ def get_timedelta_converter(t_factor):


def get_arrays(data, t_converter):
C = [t_converter(converted_at - created_at) if converted_at is not None else 1.0
for created_at, converted_at, now in data]
N = [t_converter(now - created_at)
for created_at, converted_at, now in data]
X = numpy.ones((len(data), 1))
B = [bool(converted_at is not None)
for created_at, converted_at, now in data]
return numpy.array(C), numpy.array(N), numpy.array(B)


@six.add_metaclass(abc.ABCMeta)
class Model():
def __init__(self, params={}):
self.params = params

@abc.abstractmethod
def fit(self, C, N, B):
pass

@abc.abstractmethod
def predict(self, ts, ci=None):
pass

@abc.abstractmethod
def predict_final(self, ci=None):
pass

@abc.abstractmethod
def predict_time(self, ci=None):
pass


class SimpleBinomial(Model):
def fit(self, C, N, B):
self.params['a'] = len(B)
self.params['b'] = sum(B)

def predict(self, ts, ci=None):
a, b = self.params['a'], self.params['b']
def rep(x):
return numpy.ones(len(ts)) * x
if ci is not None:
return ts, rep(b/a), rep(scipy.stats.beta.ppf((1 - ci)/2, b, a-b)), rep(scipy.stats.beta.ppf((1 + ci)/2, b, a-b))
else:
return ts, rep(b/a)

def predict_final(self, ci=None):
a, b = self.params['a'], self.params['b']
if ci is not None:
return b/a, scipy.stats.beta.ppf((1 - ci)/2, b, a-b), scipy.stats.beta.ppf((1 + ci)/2, b, a-b)
else:
return b/a

def predict_time(self):
return 0.0


class Basic(Model):
def fit(self, C, N, B, n_limit=30):
n, k = len(C), 0
self.ts = [0]
self.ns = [n]
self.ks = [k]
events = [(c, 1, 0) for c, n, b in zip(C, N, B) if b] + \
[(n, -int(b), -1) for c, n, b in zip(C, N, B)]
for t, k_delta, n_delta in sorted(events):
k += k_delta
n += n_delta
self.ts.append(t)
self.ks.append(k)
self.ns.append(n)
if n < n_limit:
break

def predict(self, ts, ci=None):
js = [bisect.bisect_left(self.ts, t) for t in ts]
ks = numpy.array([self.ks[j] for j in js if j < len(self.ks)])
ns = numpy.array([self.ns[j] for j in js if j < len(self.ns)])
ts = numpy.array([ts[j] for j in js if j < len(self.ns)])
if ci is not None:
return ts, ks / ns, scipy.stats.beta.ppf((1 - ci)/2, ks, ns-ks), scipy.stats.beta.ppf((1 + ci)/2, ks, ns-ks)
else:
return ts, ks / ns
T = [t_converter(converted_at - created_at) if converted_at is not None else t_converter(now - created_at)
for created_at, converted_at, now in data]
return X, numpy.array(B), numpy.array(T)
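To make the new array layout concrete, here is a hypothetical two-user dataset pushed through the rewritten get_arrays (the day-based t_converter is a stand-in, since get_timedelta_converter is collapsed above):

import datetime

now = datetime.datetime(2018, 2, 20)
data = [
    (datetime.datetime(2018, 2, 1), datetime.datetime(2018, 2, 5), now),  # converted after 4 days
    (datetime.datetime(2018, 2, 10), None, now),                          # still unconverted
]
t_converter = lambda td: td.total_seconds() / 86400.  # stand-in: timedelta -> days
X, B, T = get_arrays(data, t_converter)
# X: 2x1 matrix of ones (intercept-only features)
# B: [True, False]  -- did the user convert?
# T: [4., 10.]      -- time to conversion, or time observed so far if censored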


class KaplanMeier(Model):
def fit(self, C, N, B):
T = [c if b else n for c, n, b in zip(C, N, B)]
def fit(self, X, B, T):
kmf = lifelines.KaplanMeierFitter()
kmf.fit(T, event_observed=B)
self.ts = kmf.survival_function_.index.values
self.ps = 1.0 - kmf.survival_function_['KM_estimate'].values
self.ps_hi = 1.0 - kmf.confidence_interval_['KM_estimate_lower_0.95'].values
self.ps_lo = 1.0 - kmf.confidence_interval_['KM_estimate_upper_0.95'].values

def predict(self, ts, ci=None):
def predict(self, x, ts, ci=None):
js = [bisect.bisect_left(self.ts, t) for t in ts]
def array_lookup(a):
return numpy.array([a[j] for j in js if j < len(self.ts)])
@@ -144,13 +56,13 @@ def array_lookup(a):
else:
return (array_lookup(self.ts), array_lookup(self.ps))

def predict_final(self, ci=None):
def predict_final(self, x, ci=None):
if ci is not None:
return (self.ps[-1], self.ps_lo[-1], self.ps_hi[-1])
else:
return self.ps[-1]

def predict_time(self, ci=None):
def predict_time(self, x, ci=None):
# TODO: should not use median here, but mean is no good
def median(ps):
i = bisect.bisect_left(ps, 0.5)
@@ -161,167 +73,12 @@ def median(ps):
return median(self.ps)
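Note that KaplanMeier accepts the new x argument purely for interface symmetry with the regression models and ignores it. A hypothetical call (the time grid is arbitrary; with ci set, predict presumably returns the curve plus its confidence band, mirroring the regression models' predict further down):

km = KaplanMeier()
km.fit(X, B, T)
ts, ps, ps_lo, ps_hi = km.predict([1], numpy.linspace(0, 30, 100), ci=0.95)
print(km.predict_final([1]))  # final conversion fraction under the KM estimate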


def fit_beta(c, fc):
# Approximate a Beta distribution for c by fitting two things
# 1. second derivative wrt c should match the second derivative of a beta
# LL = (a-1)log(c) + (b-1)log(1-c)
# dLL = (a-1)/c - (b-1)/(1-c)
# ddLL = -(a-1)/c^2 - (b-1)/(1-c)^2 = -h
# a(-1/c^2) + b(-1/(1-c)^2) = -h - 1/c^2 - 1/(1-c)^2
# 2. mode should match c, i.e. (a-1)/(a+b-2) = c <=> a-1 = c(a+b-2)
# a(1-c) - bc = 1-2c
h = grad(grad(fc))(c)
M = numpy.array([[-1/c**2, -1/(1-c)**2],
[(1-c), -c]])
q = numpy.array([-h - 1/c**2 - 1/(1-c)**2, 1 - 2*c])
try:
a, b = numpy.linalg.solve(M, q)
except numpy.linalg.linalg.LinAlgError:
a, b = 1, 1
return a, b
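Although fit_beta is deleted in this commit, its curvature-and-mode matching trick has a clean sanity check: for a pure binomial negative log-likelihood with k conversions out of n, both conditions are satisfied exactly by Beta(k+1, n-k+1). A hypothetical toy, not part of the commit:

from autograd import grad
from autograd.numpy import log

n, k = 20, 5
fc = lambda c: -(k * log(c) + (n - k) * log(1 - c))  # binomial negative log-likelihood
a, b = fit_beta(k / n, fc)  # evaluate at the mode c = k/n
# a, b == 6.0, 16.0 up to float error, i.e. Beta(k+1, n-k+1), whose mode is k/n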


class Exponential(Model):
def fit(self, C, N, B):
def transform(x):
p, q = x
return (expit(p), log1pexp(q))
def f(x):
c, lambd = x
LL_observed = log(c) + log(lambd) - lambd*C
LL_censored = log((1 - c) + c * exp(-lambd*N))
neg_LL = -sum(B * LL_observed + (1 - B) * LL_censored)
return neg_LL

g = lambda x: f(transform(x))
res = scipy.optimize.minimize(
fun=g,
jac=jacobian(g),
hess=hessian(g),
x0=(0, 0),
method='trust-ncg')
c, lambd = transform(res.x)
fc = lambda c: f((c, lambd))
a, b = fit_beta(c, fc)
self.params = dict(a=a, b=b, lambd=lambd)

def predict(self, ts, ci=None):
a, b, lambd = self.params['a'], self.params['b'], self.params['lambd']
y = 1 - exp(-ts * lambd)
if ci is not None:
return ts, a / (a + b) * y, scipy.stats.beta.ppf((1 - ci)/2, a, b) * y, scipy.stats.beta.ppf((1 + ci)/2, a, b) * y
else:
return ts, y

def predict_final(self, ci=None):
a, b = self.params['a'], self.params['b']
if not ci:
return a / (a + b)
else:
return (a / (a + b),
scipy.stats.beta.ppf((1 - ci)/2, a, b),
scipy.stats.beta.ppf((1 + ci)/2, a, b))

def predict_time(self):
return 1.0 / self.params['lambd']


class Weibull(Model):
def fit(self, C, N, B):
def transform(x):
p, q, r = x
return (expit(p), log1pexp(q), log1pexp(r))
def f(x):
c, lambd, k = x
# PDF of Weibull: k * lambda * (t * lambda)^(k-1) * exp(-(t * lambda)^k)
LL_observed = log(c) + log(k) + log(lambd) + (k-1)*(log(C) + log(lambd)) - (C*lambd)**k
# CDF of Weibull: 1 - exp(-(t * lambda)^k)
LL_censored = log((1-c) + c * exp(-(N*lambd)**k))
neg_LL = -sum(B * LL_observed + (1 - B) * LL_censored)
return neg_LL

g = lambda x: f(transform(x))
res = scipy.optimize.minimize(
fun=g,
jac=jacobian(g),
hess=hessian(g),
x0=(0, 0, 0),
method='trust-ncg')
c, lambd, k = transform(res.x)
fc = lambda c: f((c, lambd, k))
a, b = fit_beta(c, fc)
self.params = dict(a=a, b=b, lambd=lambd, k=k)

def predict(self, ts, ci=None):
a, b, lambd, k = self.params['a'], self.params['b'], self.params['lambd'], self.params['k']
y = 1 - exp(-(ts*lambd)**k)
if ci is not None:
return ts, a / (a + b) * y, scipy.stats.beta.ppf((1 - ci)/2, a, b) * y, scipy.stats.beta.ppf((1 + ci)/2, a, b) * y
else:
return ts, y

def predict_final(self, ci=None):
a, b = self.params['a'], self.params['b']
if ci is not None:
return a / (a + b), scipy.stats.beta.ppf((1 - ci)/2, a, b), scipy.stats.beta.ppf((1 + ci)/2, a, b)
else:
return a / (a + b)

def predict_time(self):
return gamma(1 + 1./self.params['k']) / self.params['lambd']


class Gamma(Model):
def fit(self, C, N, B):
# TODO(erikbern): should compute Jacobian of this one
def transform(x):
p, q, r = x
return (expit(p), log1pexp(q), log1pexp(r))
def f(x):
c, lambd, k = x
neg_LL = 0
# PDF of gamma: 1.0 / gamma(k) * lambda ^ k * t^(k-1) * exp(-t * lambda)
LL_observed = log(c) - gammaln(k) + k*log(lambd) + (k-1)*log(C + LOG_EPS) - lambd*C
# CDF of gamma: gammainc(k, lambda * t)
LL_censored = log((1-c) + c * gammaincc(k, lambd*N) + LOG_EPS)
neg_LL = -sum(B * LL_observed + (1 - B) * LL_censored)
return neg_LL

g = lambda x: f(transform(x))
res = scipy.optimize.minimize(
fun=g,
x0=(0, 0, 0),
method='Nelder-Mead')
c, lambd, k = transform(res.x)
fc = lambda c: f((c, lambd, k))
a, b = fit_beta(c, fc)
self.params = dict(a=a, b=b, lambd=lambd, k=k)

def predict(self, ts, ci=None):
a, b, lambd, k = self.params['a'], self.params['b'], self.params['lambd'], self.params['k']
y = gammainc(k, lambd*ts)
if ci is not None:
return ts, a / (a + b) * y, scipy.stats.beta.ppf((1 - ci)/2, a, b) * y, scipy.stats.beta.ppf((1 + ci)/2, a, b) * y
else:
return ts, y

def predict_final(self, ci=None):
a, b = self.params['a'], self.params['b']
if ci is not None:
return a / (a + b), scipy.stats.beta.ppf((1 - ci)/2, a, b), scipy.stats.beta.ppf((1 + ci)/2, a, b)
else:
return a / (a + b)

def predict_time(self):
return self.params['k'] / self.params['lambd']


def sample_event(model, t, hi=1e3):
def sample_event(model, x, t, hi=1e3):
# We are now at time t. Generate a random event for whether the user is going to convert or not
# TODO: this is a hacky thing until we have an "invert CDF" method on each model
def pred(t):
ts = numpy.array([t])
return model.predict(ts)[1][-1]
return model.predict(x, ts)[1][-1]
y = pred(t)
r = y + random.random() * (1 - y)
if pred(hi) < r:
@@ -364,12 +121,10 @@ def split_by_group(data, group_min_size, max_groups):


_models = {
'basic': Basic,
'kaplan-meier': KaplanMeier,
'simple-binomial': SimpleBinomial,
'exponential': Exponential,
'weibull': Weibull,
'gamma': Gamma,
'exponential': ExponentialRegression,
'weibull': WeibullRegression,
'gamma': GammaRegression,
}
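Putting the registry together with the new protocol, the per-group work in plot_cohorts below reduces to roughly this (a sketch of the calls visible in the diff, not a verbatim excerpt; WeibullRegression's internals are in convoys/regression.py):

X, B, T = get_arrays(js[group], t_converter)
m = _models['weibull']()  # -> WeibullRegression
m.fit(X, B, T)
t = numpy.linspace(0, t_max, 1000)
t, y, y_lo, y_hi = m.predict([1], t, ci=0.95)                    # curve + 95% band
y_final, y_lo_final, y_hi_final = m.predict_final([1], ci=0.95)  # asymptotic rate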

def plot_cohorts(data, t_max=None, title=None, group_min_size=0, max_groups=100, model='kaplan-meier', projection=None):
@@ -386,24 +141,24 @@ def plot_cohorts(data, t_max=None, title=None, group_min_size=0, max_groups=100,
colors = seaborn.color_palette('hls', len(groups))
y_max = 0
for group, color in zip(sorted(groups), colors):
C, N, B = get_arrays(js[group], t_converter)
X, B, T = get_arrays(js[group], t_converter)
t = numpy.linspace(0, t_max, 1000)

m = _models[model]()
m.fit(C, N, B)
m.fit(X, B, T)

label = '%s (n=%.0f, k=%.0f)' % (group, len(B), sum(B))

if projection:
p = _models[projection]()
p.fit(C, N, B)
p_t, p_y, p_y_lo, p_y_hi = p.predict(t, ci=0.95)
p_y_final, p_y_lo_final, p_y_hi_final = p.predict_final(ci=0.95)
p.fit(X, B, T)
p_t, p_y, p_y_lo, p_y_hi = p.predict([1], t, ci=0.95)
p_y_final, p_y_lo_final, p_y_hi_final = p.predict_final([1], ci=0.95)
label += ' projected: %.2f%% (%.2f%% - %.2f%%)' % (100.*p_y_final, 100.*p_y_lo_final, 100.*p_y_hi_final)
pyplot.plot(p_t, 100. * p_y, color=color, linestyle=':', alpha=0.7)
pyplot.fill_between(p_t, 100. * p_y_lo, 100. * p_y_hi, color=color, alpha=0.2)

m_t, m_y = m.predict(t)
m_t, m_y = m.predict([1], t)
pyplot.plot(m_t, 100. * m_y, color=color, label=label)
y_max = max(y_max, 110. * max(m_y))

@@ -446,17 +201,17 @@ def plot_timeseries(data, window, model='kaplan-meier', group_min_size=0, max_gr
data = js[group][i1:i2]
t1 += stride

C, N, B = get_arrays(data, t_converter)
X, B, T = get_arrays(data, t_converter)
if sum(B) < window_min_size:
continue

p = _models[model]()
p.fit(C, N, B)
p.fit(X, B, T)

if time:
y, y_lo, y_hi = p.predict_time(ci=0.95)
y, y_lo, y_hi = p.predict_time([1], ci=0.95)
else:
y, y_lo, y_hi = p.predict_final(ci=0.95)
y, y_lo, y_hi = p.predict_final([1], ci=0.95)
print('%30s %40s %.4f %.4f %.4f' % (group, t1, y, y_lo, y_hi))
ts.append(t2)
ys.append(y)