In [3]:
import composeml as cp

def total_spent(df):
    """Return the sum of the ``amount`` column of ``df``.

    Passed to the label maker as the labeling function; the resulting label
    column in the outputs below carries this function's name.
    """
    amounts = df['amount']
    return amounts.sum()

# Configure label generation: one set of labels per customer_id, computed with
# total_spent, using transaction_time as the time column.
# NOTE(review): window_size='1h' is presumably the span of data handed to each
# total_spent call — confirm against the composeml LabelMaker docs.
label_maker = cp.LabelMaker(
    labeling_function=total_spent,
    target_entity='customer_id',
    time_index='transaction_time',
    window_size='1h',
)

# Load the demo transactions up front so the search call only carries settings.
transactions = cp.demos.load_transactions()

# Search the transactions for labels; verbose=True displays the progress bar.
labels = label_maker.search(
    transactions,
    num_examples_per_instance=10,
    minimum_data='2h',
    gap='2min',
    verbose=True,
)


Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|██████████| customer_id: 50/50 


In [4]:
# Preview the first rows of the label table: customer_id, cutoff time, and the
# total_spent label. Consecutive times are 2 minutes apart, matching gap='2min'.
labels.head()

Unnamed: 0,customer_id,time,total_spent
0,1,2014-01-01 02:45:30,217.94
1,1,2014-01-01 02:47:30,217.94
2,1,2014-01-01 02:49:30,217.94
3,1,2014-01-01 02:51:30,217.94
4,1,2014-01-01 02:53:30,217.94


In [5]:
# Turn the numeric label into a boolean by comparing it against 100
# (217.94 becomes True). `labels` itself is not modified: later cells still
# show the numeric values.
# NOTE(review): whether the comparison is strict (>) or inclusive (>=) is not
# visible here — confirm against the composeml docs.
labels.threshold(100).head()

Unnamed: 0,customer_id,time,total_spent
0,1,2014-01-01 02:45:30,True
1,1,2014-01-01 02:47:30,True
2,1,2014-01-01 02:49:30,True
3,1,2014-01-01 02:51:30,True
4,1,2014-01-01 02:53:30,True


In [6]:
# Shift every cutoff time one hour earlier (02:45:30 -> 01:45:30) while leaving
# the label values unchanged.
labels.apply_lead('1h').head()

Unnamed: 0,customer_id,time,total_spent
0,1,2014-01-01 01:45:30,217.94
1,1,2014-01-01 01:47:30,217.94
2,1,2014-01-01 01:49:30,217.94
3,1,2014-01-01 01:51:30,217.94
4,1,2014-01-01 01:53:30,217.94


In [7]:
# Cut total_spent into 4 bins without using quantiles; each label is replaced by
# the interval it falls in, e.g. 217.94 -> "(198.455, 271.072]".
labels.bin(4, quantiles=False).head()

Unnamed: 0,customer_id,time,total_spent
0,1,2014-01-01 02:45:30,"(198.455, 271.072]"
1,1,2014-01-01 02:47:30,"(198.455, 271.072]"
2,1,2014-01-01 02:49:30,"(198.455, 271.072]"
3,1,2014-01-01 02:51:30,"(198.455, 271.072]"
4,1,2014-01-01 02:53:30,"(198.455, 271.072]"


In [8]:
# Bin on hand-picked edges; the infinite outer edges keep the first and last
# bins open-ended, so 217.94 lands in "(67.0, inf]".
inf = float('inf')
edges = [-inf, 34, 50, 67, inf]
custom_bins = labels.bin(edges, quantiles=False)
custom_bins.head()

Unnamed: 0,customer_id,time,total_spent
0,1,2014-01-01 02:45:30,"(67.0, inf]"
1,1,2014-01-01 02:47:30,"(67.0, inf]"
2,1,2014-01-01 02:49:30,"(67.0, inf]"
3,1,2014-01-01 02:51:30,"(67.0, inf]"
4,1,2014-01-01 02:53:30,"(67.0, inf]"


In [9]:
# Cut total_spent into 4 quantile-based bins; the interval shown for 217.94,
# "(196.25, 217.94]", runs from the 25% to the 50% value reported by describe().
labels.bin(4, quantiles=True).head()

Unnamed: 0,customer_id,time,total_spent
0,1,2014-01-01 02:45:30,"(196.25, 217.94]"
1,1,2014-01-01 02:47:30,"(196.25, 217.94]"
2,1,2014-01-01 02:49:30,"(196.25, 217.94]"
3,1,2014-01-01 02:51:30,"(196.25, 217.94]"
4,1,2014-01-01 02:53:30,"(196.25, 217.94]"


In [10]:
# Summary statistics for the numeric label, rounded to three decimals and
# printed as plain text.
stats = labels['total_spent'].describe().round(3).to_string()
print(stats)

count     50.000
mean     215.182
std       90.518
min       53.220
25%      196.250
50%      217.940
75%      290.390
max      343.690


In [11]:
# Bin on a custom list of cut points instead of a bin count.
# NOTE(review): with quantiles=True the list is presumably read as quantile
# fractions (0 = minimum, 1 = maximum) — confirm against the composeml docs.
quantiles = [0, .34, .5, .67, 1]
labels.bin(quantiles, quantiles=True).head()

Unnamed: 0,customer_id,time,total_spent
0,1,2014-01-01 02:45:30,"(196.25, 217.94]"
1,1,2014-01-01 02:47:30,"(196.25, 217.94]"
2,1,2014-01-01 02:49:30,"(196.25, 217.94]"
3,1,2014-01-01 02:51:30,"(196.25, 217.94]"
4,1,2014-01-01 02:53:30,"(196.25, 217.94]"


In [12]:
# Give the 3 bins readable names instead of interval strings; 217.94 is
# reported as "medium".
# NOTE(review): quantiles is left at its default here — presumably equal-width
# bins; confirm against the composeml docs.
values = ['low', 'medium', 'high']
labels.bin(3, labels=values).head()

Unnamed: 0,customer_id,time,total_spent
0,1,2014-01-01 02:45:30,medium
1,1,2014-01-01 02:47:30,medium
2,1,2014-01-01 02:49:30,medium
3,1,2014-01-01 02:51:30,medium
4,1,2014-01-01 02:53:30,medium


In [13]:
# Summarize the thresholded labels: prints the label distribution, the search
# settings, and the list of transforms applied (here a single threshold at 100).
labels.threshold(100).describe()

Label Distribution
------------------
False      8
True      42
Total:    50


Settings
--------
gap                                 2min
minimum_data                          2h
num_examples_per_instance             10
target_column                total_spent
target_entity                customer_id
target_type                     discrete
window_size                           1h


Transforms
----------
1. threshold
  - value:    100



In [14]:
# Draw 10 label rows at random, seeded with random_state=0; the rows are shown
# in index order.
labels.sample(n=10, random_state=0)

Unnamed: 0,customer_id,time,total_spent
2,1,2014-01-01 02:49:30,217.94
4,1,2014-01-01 02:53:30,217.94
10,2,2014-01-01 02:00:00,290.39
11,2,2014-01-01 02:02:00,290.39
22,3,2014-01-01 03:49:05,196.25
27,3,2014-01-01 03:59:05,196.25
28,3,2014-01-01 04:01:05,196.25
31,4,2014-01-01 02:41:00,343.69
38,4,2014-01-01 02:55:00,225.18
41,5,2014-01-01 03:48:25,53.22


In [15]:
# Draw a 10% random sample of the label rows (5 of the 50 rows here), seeded
# with random_state=0.
labels.sample(frac=.1, random_state=0)

Unnamed: 0,customer_id,time,total_spent
2,1,2014-01-01 02:49:30,217.94
10,2,2014-01-01 02:00:00,290.39
11,2,2014-01-01 02:02:00,290.39
28,3,2014-01-01 04:01:05,196.25
41,5,2014-01-01 03:48:25,53.22


In [16]:
# Bin the labels into 4 categories named A-D and keep the result for the
# per-category sampling cells that follow. Nothing is displayed by this cell.
categorical = labels.bin(4, labels=['A', 'B', 'C', 'D'])

In [17]:
# Request the same number of rows (2) from every category; the mapping is
# keyed by category label.
n = {category: 2 for category in ('A', 'B', 'C', 'D')}
categorical.sample(n=n, random_state=0)

Unnamed: 0,customer_id,time,total_spent
6,1,2014-01-01 02:57:30,C
11,2,2014-01-01 02:02:00,D
16,2,2014-01-01 02:12:00,D
26,3,2014-01-01 03:57:05,B
38,4,2014-01-01 02:55:00,C
42,5,2014-01-01 03:50:25,A
46,5,2014-01-01 03:58:25,A
48,5,2014-01-01 04:02:25,B


In [18]:
# Request the same fraction (10%) of rows from every category; the mapping is
# keyed by category label.
frac = {category: .1 for category in ('A', 'B', 'C', 'D')}
categorical.sample(frac=frac, random_state=0)

Unnamed: 0,customer_id,time,total_spent
6,1,2014-01-01 02:57:30,C
11,2,2014-01-01 02:02:00,D
16,2,2014-01-01 02:12:00,D
26,3,2014-01-01 03:57:05,B
46,5,2014-01-01 03:58:25,A
