In [1]:
import pandas as pd
import numpy as np

from py.model import BG_NBD

  from .autonotebook import tqdm as notebook_tqdm


# Data Processing

In [2]:
raw_data = pd.read_table(
    "data/CDNOW_sample.txt",
    delim_whitespace=True,
    header=None, 
)

In [3]:
raw_data

Unnamed: 0,0,1,2,3,4
0,4,1,19970101,2,29.33
1,4,1,19970118,2,29.73
2,4,1,19970802,1,14.96
3,4,1,19971212,2,26.48
4,21,2,19970101,3,63.34
...,...,...,...,...,...
6914,23556,2356,19970726,3,45.74
6915,23556,2356,19970927,3,31.47
6916,23556,2356,19980103,2,28.98
6917,23556,2356,19980607,2,28.98


In [4]:
raw_data.columns = ["customer_id", "customer_index", "date", "quantity", "amount"]
new_data = raw_data.assign(
    new_date=lambda x: x["date"].astype("str"),
    date=lambda x: pd.to_datetime([f"{row[:4]}-{row[4:6]}-{row[6:]}" for row in x["new_date"]])
).drop(columns=["new_date"])

In [5]:
new_data

Unnamed: 0,customer_id,customer_index,date,quantity,amount
0,4,1,1997-01-01,2,29.33
1,4,1,1997-01-18,2,29.73
2,4,1,1997-08-02,1,14.96
3,4,1,1997-12-12,2,26.48
4,21,2,1997-01-01,3,63.34
...,...,...,...,...,...
6914,23556,2356,1997-07-26,3,45.74
6915,23556,2356,1997-09-27,3,31.47
6916,23556,2356,1998-01-03,2,28.98
6917,23556,2356,1998-06-07,2,28.98


In [6]:
train = new_data.query('date < "1998-01-01"')
test = new_data.query('date >= "1998-01-01"')

train

Unnamed: 0,customer_id,customer_index,date,quantity,amount
0,4,1,1997-01-01,2,29.33
1,4,1,1997-01-18,2,29.73
2,4,1,1997-08-02,1,14.96
3,4,1,1997-12-12,2,26.48
4,21,2,1997-01-01,3,63.34
...,...,...,...,...,...
6912,23556,2356,1997-06-10,2,26.73
6913,23556,2356,1997-07-19,2,29.33
6914,23556,2356,1997-07-26,3,45.74
6915,23556,2356,1997-09-27,3,31.47


In [7]:
test["date"].max()

Timestamp('1998-06-30 00:00:00')

In [8]:
train.loc[:, "amount_indicator"] = train.groupby("customer_id", as_index=False)["date"].transform(lambda x: x == x.min())
train_amount_filtered = train.loc[train["amount_indicator"] == 0, ["customer_id", "amount"]]
train_amount_filtered = train_amount_filtered.groupby("customer_id", as_index=False).agg("mean")
train_amount_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.loc[:, "amount_indicator"] = train.groupby("customer_id", as_index=False)["date"].transform(lambda x: x == x.min())


Unnamed: 0,customer_id,amount
0,4,23.723333
1,21,11.770000
2,111,75.347778
3,112,11.770000
4,114,25.550000
...,...,...
1035,23500,16.192500
1036,23506,13.970000
1037,23507,30.637500
1038,23551,44.928000


In [9]:
train_processed = train.groupby("customer_id", as_index=False).agg(
    frequency=("customer_id", "size"),
    recency=("date", lambda x: np.round((x.max() - x.min()).days / 7)),
    T=("date", lambda x: np.round((pd.to_datetime("1998-01-01") - x.min()).days / 7)),
).assign(
    frequency=lambda x: x["frequency"] - 1,
).merge(train_amount_filtered, on="customer_id")

In [10]:
train_processed

Unnamed: 0,customer_id,frequency,recency,T,amount
0,4,3,49.0,52.0,23.723333
1,21,1,2.0,52.0,11.770000
2,111,9,48.0,52.0,75.347778
3,112,1,5.0,52.0,11.770000
4,114,2,36.0,52.0,25.550000
...,...,...,...,...,...
1035,23500,8,28.0,40.0,16.192500
1036,23506,1,9.0,40.0,13.970000
1037,23507,4,38.0,40.0,30.637500
1038,23551,5,24.0,40.0,44.928000


In [11]:
customers_no_transactions_in_test = set(train["customer_id"]) - set(test["customer_id"])
customers_no_transactions_in_test

padded_test = pd.DataFrame(
    {
        "customer_id": [i for i in customers_no_transactions_in_test],
        "frequency": np.repeat(0, len(customers_no_transactions_in_test)),
        "monetary_value": np.repeat(0, len(customers_no_transactions_in_test)),
        "duration": np.round((test["date"].max() - pd.to_datetime("1998-01-01")).days / 7)
    }
)

In [12]:
test_processed = test.groupby("customer_id", as_index=False).agg(
    frequency=("customer_id", "size"),
    monetary_value=("amount", "mean")
).assign(
    duration=np.round((test["date"].max() - pd.to_datetime("1998-01-01")).days / 7),
)

test_processed = pd.concat([test_processed, padded_test])
test_processed.sort_values("customer_id")

Unnamed: 0,customer_id,frequency,monetary_value,duration
2,4,0,0.000,26.0
4,18,0,0.000,26.0
6,21,0,0.000,26.0
15,50,0,0.000,26.0
17,60,0,0.000,26.0
...,...,...,...,...
512,23537,2,19.775,26.0
1675,23551,0,0.000,26.0
513,23554,1,24.600,26.0
514,23556,2,28.980,26.0


## Model

Note: due to convergence issues with the original model (p ~ Beta(a, b)), switch to a logit-normal distribution for p instead. This isn't exactly the same thing as a Beta distribution, but it should be materially close enough.

This allows for a hierarchical parameterization that samples a lot better.

In [13]:
model = BG_NBD("stan/bg-nbd.stan", seed=1234)
model.fit(train_processed, chains=4)

05:32:21 - cmdstanpy - INFO - CmdStan start processing
chain 1 |[33m          [0m| 00:00 Status
[A

chain 1 |[33m▉         [0m| 00:05 Iteration:    1 / 2000 [  0%]  (Warmup)

[A[A
chain 1 |[33m█▎        [0m| 00:06 Iteration:  100 / 2000 [  5%]  (Warmup)

chain 1 |[33m█▊        [0m| 00:07 Iteration:  200 / 2000 [ 10%]  (Warmup)
[A

chain 1 |[33m██▎       [0m| 00:07 Iteration:  300 / 2000 [ 15%]  (Warmup)
[A

chain 1 |[33m██▋       [0m| 00:08 Iteration:  400 / 2000 [ 20%]  (Warmup)
[A

chain 1 |[33m███▏      [0m| 00:09 Iteration:  500 / 2000 [ 25%]  (Warmup)
[A

chain 1 |[33m███▋      [0m| 00:10 Iteration:  600 / 2000 [ 30%]  (Warmup)
[A

chain 1 |[33m████      [0m| 00:10 Iteration:  700 / 2000 [ 35%]  (Warmup)
[A

chain 1 |[33m████▌     [0m| 00:11 Iteration:  800 / 2000 [ 40%]  (Warmup)

[A[A
chain 1 |[33m█████     [0m| 00:12 Iteration:  900 / 2000 [ 45%]  (Warmup)
[A

chain 1 |[34m█████▉    [0m| 00:13 Iteration: 1001 / 2000 [ 50%]  (Sampling)
[A

ch

                                                                                                                                                                                                                                                                                                                                


05:32:44 - cmdstanpy - INFO - CmdStan done processing.
Exception: gamma_lpdf: Random variable[3] is 0, but must be positive finite! (in 'bg-nbd.stan', line 35, column 4 to column 44)
Exception: gamma_lpdf: Shape parameter is inf, but must be positive finite! (in 'bg-nbd.stan', line 35, column 4 to column 44)
	Exception: gamma_lpdf: Shape parameter is inf, but must be positive finite! (in 'bg-nbd.stan', line 35, column 4 to column 44)
	Exception: gamma_lpdf: Random variable[1] is 0, but must be positive finite! (in 'bg-nbd.stan', line 35, column 4 to column 44)
	Exception: gamma_lpdf: Random variable[7] is 0, but must be positive finite! (in 'bg-nbd.stan', line 35, column 4 to column 44)
Exception: gamma_lpdf: Random variable[1] is 0, but must be positive finite! (in 'bg-nbd.stan', line 35, column 4 to column 44)
	Exception: gamma_lpdf: Shape parameter is 0, but must be positive finite! (in 'bg-nbd.stan', line 35, column 4 to column 44)
Exception: gamma_lpdf: Shape parameter is inf, bu




## Ensuring Convergence

In [14]:
model.summary_table()

Unnamed: 0,Mean,MCSE,StdDev,5%,50%,95%,N_Eff,N_Eff/s,R_hat
lp__,-12274.400000,1.772640,47.352700,-12353.200000,-12274.800000,-12196.500000,713.594000,18.653100,1.003210
p_logit_sigma,1.945360,0.005725,0.190226,1.646770,1.937180,2.269850,1103.896741,28.855519,1.002113
p[936],0.503399,0.003857,0.290421,0.044294,0.506803,0.940730,5668.690000,148.178000,1.001540
z[852],0.287593,0.009638,0.740279,-1.002710,0.336814,1.373180,5899.570000,154.213000,1.001270
p[549],0.037793,0.000517,0.039258,0.002315,0.024383,0.118341,5762.900000,150.640000,1.001250
...,...,...,...,...,...,...,...,...,...
p[693],0.505360,0.003585,0.290635,0.045320,0.512735,0.941040,6572.140000,171.794000,0.999040
z[744],-0.976101,0.007885,0.651777,-2.149330,-0.929056,-0.013383,6832.700000,178.605000,0.999038
lambda[241],0.379077,0.001146,0.103770,0.224121,0.369083,0.560480,8202.990000,214.424000,0.999032
z[698],0.509512,0.011355,0.954100,-1.102800,0.566926,1.979800,7060.370000,184.556000,0.999025


In [15]:
model.diagnostics()

'Processing csv files: /tmp/tmpbvkpkvj9/bg-nbd_e0q1pqg/bg-nbd-20240204053221_1.csv, /tmp/tmpbvkpkvj9/bg-nbd_e0q1pqg/bg-nbd-20240204053221_2.csv, /tmp/tmpbvkpkvj9/bg-nbd_e0q1pqg/bg-nbd-20240204053221_3.csv, /tmp/tmpbvkpkvj9/bg-nbd_e0q1pqg/bg-nbd-20240204053221_4.csv\n\nChecking sampler transitions treedepth.\nTreedepth satisfactory for all transitions.\n\nChecking sampler transitions for divergences.\nNo divergent transitions found.\n\nChecking E-BFMI - sampler transitions HMC potential energy.\nE-BFMI satisfactory.\n\nEffective sample size satisfactory.\n\nSplit R-hat values satisfactory all parameters.\n\nProcessing complete, no problems detected.\n'

Diagnostics are fine once we move from a true Beta(a, b) distribution on p to a logit-normal distribution.

# Predictive Checking

Note: the validation set is a half year (i.e. 26 weeks).

Since lambda and p are constant through time according to the model assumptions, it doesn't matter when we start the process.

In [16]:
simulations = model.predict(26, n_jobs=4)

In [47]:
test_processed

Unnamed: 0,customer_id,frequency,monetary_value,duration
0,111,6,65.486667,26.0
1,113,2,13.380000,26.0
2,114,2,28.735000,26.0
3,166,3,11.750000,26.0
4,208,1,70.380000,26.0
...,...,...,...,...
1837,16348,0,0.000000,26.0
1838,8157,0,0.000000,26.0
1839,16359,0,0.000000,26.0
1840,16362,0,0.000000,26.0
