-
Notifications
You must be signed in to change notification settings - Fork 41
/
__init__.py
126 lines (108 loc) · 4.28 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import abc
import datetime
import math
import numpy
import random
import seaborn
from matplotlib import pyplot
from convoys.multi import Exponential, Weibull, Gamma, Nonparametric
def get_timescale(t):
def get_timedelta_converter(t_factor):
return lambda td: td.total_seconds() * t_factor
if type(t) != datetime.timedelta:
# Assume numeric type
return '', lambda x: x
elif t >= datetime.timedelta(days=1):
return 'Days', get_timedelta_converter(1./(24*60*60))
elif t >= datetime.timedelta(hours=1):
return 'Hours', get_timedelta_converter(1./(60*60))
elif t >= datetime.timedelta(minutes=1):
return 'Minutes', get_timedelta_converter(1./60)
else:
return 'Minutes', get_timedelta_converter(1)
def get_arrays(groups, data, t_converter):
G, B, T = [], [], []
group2j = dict((group, j) for j, group in enumerate(groups))
for group, created_at, converted_at, now in data:
if converted_at is not None and converted_at < created_at:
print('created at', created_at, 'but converted at', converted_at)
continue
if now < created_at:
print('created at', created_at, 'but now is', now)
continue
if group in group2j:
G.append(group2j[group])
B.append(converted_at is not None)
T.append(t_converter(converted_at - created_at) if converted_at is not None else t_converter(now - created_at))
return numpy.array(G), numpy.array(B), numpy.array(T)
def sample_event(model, x, t, hi=1e3):
# We are now at time t. Generate a random event whether the user is going to convert or not
# TODO: this is a hacky thing until we have a "invert CDF" method on each model
def pred(t):
ts = numpy.array([t])
return model.predict(x, ts)[1][-1]
y = pred(t)
r = y + random.random() * (1 - y)
if pred(hi) < r:
return None
lo = t
for j in range(20):
mid = (lo + hi) / 2
if pred(mid) < r:
lo = mid
else:
hi = mid
return (lo + hi)/2
def get_groups(data, group_min_size, max_groups):
group2count = {}
for group, created_at, converted_at, now in data:
group2count[group] = group2count.get(group, 0) + 1
# Remove groups with too few data points
# Pick the top groups
# Sort groups lexicographically
groups = [group for group, count in group2count.items() if count >= group_min_size]
groups = sorted(groups, key=group2count.get, reverse=True)[:max_groups]
return sorted(groups)
_models = {
'nonparametric': Nonparametric,
'exponential': Exponential,
'weibull': Weibull,
'gamma': Gamma,
}
def plot_cohorts(data, t_max=None, title=None, group_min_size=0, max_groups=100, model='nonparametric'):
# Set x scale
if t_max is None:
t_max = max(now - created_at for group, created_at, converted_at, now in data)
t_unit, t_converter = get_timescale(t_max)
t_max = t_converter(t_max)
# Split data by group and get data
groups = get_groups(data, group_min_size, max_groups)
G, B, T = get_arrays(groups, data, t_converter)
# Fit model
m = _models[model]()
m.fit(G, B, T)
# Plot
colors = seaborn.color_palette('hls', len(groups))
t = numpy.linspace(0, t_max, 1000)
y_max = 0
result = []
for j, (group, color) in enumerate(zip(groups, colors)):
n = sum(1 for g in G if g == j) # TODO: slow
k = sum(1 for g, b in zip(G, B) if g == j and b) # TODO: slow
label = '%s (n=%.0f, k=%.0f)' % (group, n, k)
p_y, p_y_lo, p_y_hi = m.predict(j, t, ci=0.95).T
p_y_final, p_y_lo_final, p_y_hi_final = m.predict_final(j, ci=0.95)
label += ' projected: %.2f%% (%.2f%% - %.2f%%)' % (100.*p_y_final, 100.*p_y_lo_final, 100.*p_y_hi_final)
pyplot.plot(t, 100. * p_y, color=color, linestyle=':', alpha=0.7, label=label)
pyplot.fill_between(t, 100. * p_y_lo, 100. * p_y_hi, color=color, alpha=0.2)
result.append((group, p_y_final, p_y_lo_final, p_y_hi_final))
y_max = max(y_max, 110. * max(p_y))
if title:
pyplot.title(title)
pyplot.xlim([0, t_max])
pyplot.ylim([0, y_max])
pyplot.xlabel(t_unit)
pyplot.ylabel('Conversion rate %')
pyplot.legend()
pyplot.gca().grid(True)
return result