In [1]:
import cmdstanpy as cmd
import pandas as pd
import numpy as np

from py.model import BG_NBD

  from .autonotebook import tqdm as notebook_tqdm


# Data Processing

In [2]:
raw_data = pd.read_table(
    "data/CDNOW_sample.txt",
    delim_whitespace=True,
    header=None, 
)

In [3]:
raw_data

Unnamed: 0,0,1,2,3,4
0,4,1,19970101,2,29.33
1,4,1,19970118,2,29.73
2,4,1,19970802,1,14.96
3,4,1,19971212,2,26.48
4,21,2,19970101,3,63.34
...,...,...,...,...,...
6914,23556,2356,19970726,3,45.74
6915,23556,2356,19970927,3,31.47
6916,23556,2356,19980103,2,28.98
6917,23556,2356,19980607,2,28.98


In [4]:
raw_data.columns = ["customer_id", "customer_index", "date", "quantity", "amount"]
new_data = raw_data.assign(
    new_date=lambda x: x["date"].astype("str"),
    date=lambda x: pd.to_datetime([f"{row[:4]}-{row[4:6]}-{row[6:]}" for row in x["new_date"]])
).drop(columns=["new_date"])

In [5]:
new_data

Unnamed: 0,customer_id,customer_index,date,quantity,amount
0,4,1,1997-01-01,2,29.33
1,4,1,1997-01-18,2,29.73
2,4,1,1997-08-02,1,14.96
3,4,1,1997-12-12,2,26.48
4,21,2,1997-01-01,3,63.34
...,...,...,...,...,...
6914,23556,2356,1997-07-26,3,45.74
6915,23556,2356,1997-09-27,3,31.47
6916,23556,2356,1998-01-03,2,28.98
6917,23556,2356,1998-06-07,2,28.98


In [6]:
train = new_data.query('date < "1998-01-01"')
test = new_data.query('date >= "1998-01-01"')

train

Unnamed: 0,customer_id,customer_index,date,quantity,amount
0,4,1,1997-01-01,2,29.33
1,4,1,1997-01-18,2,29.73
2,4,1,1997-08-02,1,14.96
3,4,1,1997-12-12,2,26.48
4,21,2,1997-01-01,3,63.34
...,...,...,...,...,...
6912,23556,2356,1997-06-10,2,26.73
6913,23556,2356,1997-07-19,2,29.33
6914,23556,2356,1997-07-26,3,45.74
6915,23556,2356,1997-09-27,3,31.47


In [7]:
train.loc[:, "amount_indicator"] = train.groupby("customer_id", as_index=False)["date"].transform(lambda x: x == x.min())
train_amount_filtered = train.loc[train["amount_indicator"] == 0, ["customer_id", "amount"]]
train_amount_filtered = train_amount_filtered.groupby("customer_id", as_index=False).agg("mean")
train_amount_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.loc[:, "amount_indicator"] = train.groupby("customer_id", as_index=False)["date"].transform(lambda x: x == x.min())


Unnamed: 0,customer_id,amount
0,4,23.723333
1,21,11.770000
2,111,75.347778
3,112,11.770000
4,114,25.550000
...,...,...
1035,23500,16.192500
1036,23506,13.970000
1037,23507,30.637500
1038,23551,44.928000


In [8]:
train_processed = train.groupby("customer_id", as_index=False).agg(
    frequency=("customer_id", "size"),
    recency=("date", lambda x: np.round((x.max() - x.min()).days / 7)),
    T=("date", lambda x: np.round((pd.to_datetime("1998-01-01") - x.min()).days / 7)),
).assign(
    frequency=lambda x: x["frequency"] - 1,
).merge(train_amount_filtered, on="customer_id")

In [9]:
train_processed

Unnamed: 0,customer_id,frequency,recency,T,amount
0,4,3,49.0,52.0,23.723333
1,21,1,2.0,52.0,11.770000
2,111,9,48.0,52.0,75.347778
3,112,1,5.0,52.0,11.770000
4,114,2,36.0,52.0,25.550000
...,...,...,...,...,...
1035,23500,8,28.0,40.0,16.192500
1036,23506,1,9.0,40.0,13.970000
1037,23507,4,38.0,40.0,30.637500
1038,23551,5,24.0,40.0,44.928000


In [10]:
customers_no_transactions_in_test = set(train["customer_id"]) - set(test["customer_id"])
customers_no_transactions_in_test

padded_test = pd.DataFrame(
    {
        "customer_id": [i for i in customers_no_transactions_in_test],
        "frequency": np.repeat(0, len(customers_no_transactions_in_test)),
        "monetary_value": np.repeat(0, len(customers_no_transactions_in_test)),
        "duration": np.round((test["date"].max() - pd.to_datetime("1998-01-01")).days / 7)
    }
)

In [11]:
test_processed = test.groupby("customer_id", as_index=False).agg(
    frequency=("customer_id", "size"),
    monetary_value=("amount", "mean")
).assign(
    duration=np.round((test["date"].max() - pd.to_datetime("1998-01-01")).days / 7),
)

test_processed = pd.concat([test_processed, padded_test])
test_processed.sort_values("customer_id")

Unnamed: 0,customer_id,frequency,monetary_value,duration
2,4,0,0.000,26.0
4,18,0,0.000,26.0
6,21,0,0.000,26.0
15,50,0,0.000,26.0
17,60,0,0.000,26.0
...,...,...,...,...
512,23537,2,19.775,26.0
1675,23551,0,0.000,26.0
513,23554,1,24.600,26.0
514,23556,2,28.980,26.0


## Model

Note: due to convergence issues with the original model (p ~ Beta(a, b)), switch to a logit-normal distribution for p instead. This isn't exactly the same thing as a Beta distribution, but it should be materially close enough.

This allows for a hierarchical parameterization that samples a lot better.

In [12]:
model = BG_NBD("stan/bg-nbd.stan", seed=1234)
model.fit(train_processed, chains=4)

01:46:05 - cmdstanpy - INFO - CmdStan start processing
chain 1 |[33m          [0m| 00:00 Status
[A

[A[A

chain 1 |[33m▉         [0m| 00:05 Iteration:    1 / 2000 [  0%]  (Warmup)
chain 1 |[33m█▎        [0m| 00:06 Iteration:  100 / 2000 [  5%]  (Warmup)

[A[A
chain 1 |[33m█▊        [0m| 00:06 Iteration:  200 / 2000 [ 10%]  (Warmup)

chain 1 |[33m██▎       [0m| 00:07 Iteration:  300 / 2000 [ 15%]  (Warmup)
[A

chain 1 |[33m██▋       [0m| 00:08 Iteration:  400 / 2000 [ 20%]  (Warmup)
[A

chain 1 |[33m███▏      [0m| 00:09 Iteration:  500 / 2000 [ 25%]  (Warmup)
[A

chain 1 |[33m███▋      [0m| 00:09 Iteration:  600 / 2000 [ 30%]  (Warmup)
[A

chain 1 |[33m████      [0m| 00:10 Iteration:  700 / 2000 [ 35%]  (Warmup)
[A

chain 1 |[33m████▌     [0m| 00:11 Iteration:  800 / 2000 [ 40%]  (Warmup)
[A

chain 1 |[33m█████     [0m| 00:12 Iteration:  900 / 2000 [ 45%]  (Warmup)
[A

chain 1 |[34m█████▉    [0m| 00:13 Iteration: 1001 / 2000 [ 50%]  (Sampling)
[A

ch

                                                                                                                                                                                                                                                                                                                                


01:46:29 - cmdstanpy - INFO - CmdStan done processing.
Exception: gamma_lpdf: Random variable[3] is 0, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
Exception: gamma_lpdf: Shape parameter is inf, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
	Exception: gamma_lpdf: Shape parameter is inf, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
	Exception: gamma_lpdf: Random variable[1] is 0, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
	Exception: gamma_lpdf: Random variable[7] is 0, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
Exception: gamma_lpdf: Random variable[1] is 0, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
	Exception: gamma_lpdf: Shape parameter is 0, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
Exception: gamma_lpdf: Shape parameter is inf, bu




## Ensuring Convergence

In [13]:
model.summary_table()

Unnamed: 0,Mean,MCSE,StdDev,5%,50%,95%,N_Eff,N_Eff/s,R_hat
lp__,-12274.400000,1.772640,47.352700,-12353.200000,-12274.800000,-12196.50000,713.594000,17.536500,1.003210
p_logit_sigma,1.945360,0.005725,0.190226,1.646770,1.937180,2.26985,1103.896741,27.128102,1.002113
p[936],0.503399,0.003857,0.290421,0.044294,0.506803,0.94073,5668.690000,139.307000,1.001540
p_logit[102],-3.419230,0.016768,1.266780,-5.689920,-3.291760,-1.60948,5707.250000,140.255000,1.001330
z[852],0.287593,0.009638,0.740279,-1.002710,0.336814,1.37318,5899.570000,144.981000,1.001270
...,...,...,...,...,...,...,...,...,...
p_logit[693],-0.004374,0.022888,1.778610,-3.047620,0.050951,2.77012,6038.940000,148.406000,0.999036
p_logit[698],-0.338830,0.022905,1.872290,-3.525650,-0.247263,2.55215,6681.850000,164.206000,0.999034
lambda[241],0.379077,0.001146,0.103770,0.224121,0.369083,0.56048,8202.990000,201.587000,0.999032
z[698],0.509512,0.011355,0.954100,-1.102800,0.566926,1.97980,7060.370000,173.508000,0.999025


In [14]:
model.diagnostics()

'Processing csv files: /tmp/tmpl9mrxu1i/bg-nbdmy0j784u/bg-nbd-20240204014605_1.csv, /tmp/tmpl9mrxu1i/bg-nbdmy0j784u/bg-nbd-20240204014605_2.csv, /tmp/tmpl9mrxu1i/bg-nbdmy0j784u/bg-nbd-20240204014605_3.csv, /tmp/tmpl9mrxu1i/bg-nbdmy0j784u/bg-nbd-20240204014605_4.csv\n\nChecking sampler transitions treedepth.\nTreedepth satisfactory for all transitions.\n\nChecking sampler transitions for divergences.\nNo divergent transitions found.\n\nChecking E-BFMI - sampler transitions HMC potential energy.\nE-BFMI satisfactory.\n\nEffective sample size satisfactory.\n\nSplit R-hat values satisfactory all parameters.\n\nProcessing complete, no problems detected.\n'

# Predictive Checking

In [16]:
model.simulate_for_customer_id(4, time_interval=[0, 104])

[[6.925882033358372,
  14.466555836686222,
  23.30066676380836,
  45.67134138732557,
  66.76375828647916],
 [17.956536633464886, 36.66300975369279],
 [5.613068880861968, 14.839089468573876],
 [],
 [4.111745102635516,
  4.319752797750938,
  7.539987781008268,
  13.850770188457641,
  19.20792719300864,
  20.345450671058455,
  25.510089238593764,
  25.731091001442643,
  28.368910653589502],
 [44.879204447412654],
 [30.147592981600482,
  30.20328635964837,
  42.18940719646582,
  45.63633964146254,
  63.47618995663842,
  66.52680304992538,
  76.39258416440929,
  85.0972913897067,
  91.04776403600124,
  94.87065810956908,
  98.07675636060718],
 [4.008105994276497, 39.553465623565316],
 [9.2577039028824, 14.24027603287512, 15.906961037248863, 36.13564724112824],
 [9.305928459315798,
  23.450904296182777,
  48.63243589794011,
  64.79991384508526,
  74.89058852813284,
  86.70437026495371,
  92.0784427145238,
  99.32811620524888,
  102.21288854025067],
 [7.7180359168868184,
  31.756799407782925,