In [1]:
import cmdstanpy as cmd
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Data Processing

In [2]:
raw_data = pd.read_table(
    "data/CDNOW_sample.txt",
    delim_whitespace=True,
    header=None, 
)

In [3]:
raw_data

Unnamed: 0,0,1,2,3,4
0,4,1,19970101,2,29.33
1,4,1,19970118,2,29.73
2,4,1,19970802,1,14.96
3,4,1,19971212,2,26.48
4,21,2,19970101,3,63.34
...,...,...,...,...,...
6914,23556,2356,19970726,3,45.74
6915,23556,2356,19970927,3,31.47
6916,23556,2356,19980103,2,28.98
6917,23556,2356,19980607,2,28.98


In [4]:
raw_data.columns = ["customer_id", "customer_index", "date", "quantity", "amount"]
new_data = raw_data.assign(
    new_date=lambda x: x["date"].astype("str"),
    date=lambda x: pd.to_datetime([f"{row[:4]}-{row[4:6]}-{row[6:]}" for row in x["new_date"]])
).drop(columns=["new_date"])

In [5]:
new_data

Unnamed: 0,customer_id,customer_index,date,quantity,amount
0,4,1,1997-01-01,2,29.33
1,4,1,1997-01-18,2,29.73
2,4,1,1997-08-02,1,14.96
3,4,1,1997-12-12,2,26.48
4,21,2,1997-01-01,3,63.34
...,...,...,...,...,...
6914,23556,2356,1997-07-26,3,45.74
6915,23556,2356,1997-09-27,3,31.47
6916,23556,2356,1998-01-03,2,28.98
6917,23556,2356,1998-06-07,2,28.98


In [6]:
train = new_data.query('date < "1998-01-01"')
test = new_data.query('date >= "1998-01-01"')

train

Unnamed: 0,customer_id,customer_index,date,quantity,amount
0,4,1,1997-01-01,2,29.33
1,4,1,1997-01-18,2,29.73
2,4,1,1997-08-02,1,14.96
3,4,1,1997-12-12,2,26.48
4,21,2,1997-01-01,3,63.34
...,...,...,...,...,...
6912,23556,2356,1997-06-10,2,26.73
6913,23556,2356,1997-07-19,2,29.33
6914,23556,2356,1997-07-26,3,45.74
6915,23556,2356,1997-09-27,3,31.47


In [7]:
train["amount_indicator"] = train.groupby("customer_id", as_index=False)["date"].transform(lambda x: x == x.min())
train_amount_filtered = train.loc[train["amount_indicator"] == 0, ["customer_id", "amount"]]
train_amount_filtered = train_amount_filtered.groupby("customer_id", as_index=False).agg("mean")
train_amount_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["amount_indicator"] = train.groupby("customer_id", as_index=False)["date"].transform(lambda x: x == x.min())


Unnamed: 0,customer_id,amount
0,4,23.723333
1,21,11.770000
2,111,75.347778
3,112,11.770000
4,114,25.550000
...,...,...
1035,23500,16.192500
1036,23506,13.970000
1037,23507,30.637500
1038,23551,44.928000


In [8]:

train_processed = train.groupby("customer_id", as_index=False).agg(
    frequency=("customer_id", "size"),
    recency=("date", lambda x: np.round((x.max() - x.min()).days / 7)),
    T=("date", lambda x: np.round((pd.to_datetime("1998-01-01") - x.min()).days / 7)),
).assign(
    frequency=lambda x: x["frequency"] - 1,
).merge(train_amount_filtered, on="customer_id")

In [9]:
train_processed

Unnamed: 0,customer_id,frequency,recency,T,amount
0,4,3,49.0,52.0,23.723333
1,21,1,2.0,52.0,11.770000
2,111,9,48.0,52.0,75.347778
3,112,1,5.0,52.0,11.770000
4,114,2,36.0,52.0,25.550000
...,...,...,...,...,...
1035,23500,8,28.0,40.0,16.192500
1036,23506,1,9.0,40.0,13.970000
1037,23507,4,38.0,40.0,30.637500
1038,23551,5,24.0,40.0,44.928000


In [11]:
customers_no_transactions_in_test = set(train["customer_id"]) - set(test["customer_id"])
customers_no_transactions_in_test

padded_test = pd.DataFrame(
    {
        "customer_id": [i for i in customers_no_transactions_in_test],
        "frequency": np.repeat(0, len(customers_no_transactions_in_test)),
        "monetary_value": np.repeat(0, len(customers_no_transactions_in_test)),
        "duration": np.round((test["date"].max() - pd.to_datetime("1998-01-01")).days / 7)
    }
)

In [12]:
test_processed = test.groupby("customer_id", as_index=False).agg(
    frequency=("customer_id", "size"),
    monetary_value=("amount", "mean")
).assign(
    duration=np.round((test["date"].max() - pd.to_datetime("1998-01-01")).days / 7),
)

test_processed = pd.concat([test_processed, padded_test])
test_processed.sort_values("customer_id")

Unnamed: 0,customer_id,frequency,monetary_value,duration
2,4,0,0.000,26.0
4,18,0,0.000,26.0
6,21,0,0.000,26.0
15,50,0,0.000,26.0
17,60,0,0.000,26.0
...,...,...,...,...
512,23537,2,19.775,26.0
1675,23551,0,0.000,26.0
513,23554,1,24.600,26.0
514,23556,2,28.980,26.0


## Model

In [73]:
model = cmd.CmdStanModel(stan_file="stan/bg-nbd.stan")

21:21:08 - cmdstanpy - INFO - compiling stan file /home/braydentang/Documents/Projects/cltv/stan/bg-nbd.stan to exe file /home/braydentang/Documents/Projects/cltv/stan/bg-nbd
21:21:23 - cmdstanpy - INFO - compiled model executable: /home/braydentang/Documents/Projects/cltv/stan/bg-nbd


In [74]:
data_dict = {
    "prior_only": 0,
    "N_customers": train_processed.shape[0],
    "recency": train_processed["recency"].astype(int).to_numpy(),
    "frequency": train_processed["frequency"].astype(int).to_numpy(),
    "T_age": train_processed["T"].astype(int).to_numpy()
}

samples = model.sample(
    data_dict,
    chains=4,
    seed=1234,
)

21:21:25 - cmdstanpy - INFO - CmdStan start processing
chain 1 |[33m          [0m| 00:00 Status
[A

[A[A

chain 1 |[33m▉         [0m| 00:05 Iteration:    1 / 2000 [  0%]  (Warmup)
chain 1 |[33m█▎        [0m| 00:06 Iteration:  100 / 2000 [  5%]  (Warmup)

[A[A
chain 1 |[33m█▊        [0m| 00:06 Iteration:  200 / 2000 [ 10%]  (Warmup)

chain 1 |[33m██▎       [0m| 00:07 Iteration:  300 / 2000 [ 15%]  (Warmup)
[A

chain 1 |[33m██▋       [0m| 00:08 Iteration:  400 / 2000 [ 20%]  (Warmup)
[A

chain 1 |[33m███▏      [0m| 00:09 Iteration:  500 / 2000 [ 25%]  (Warmup)
[A

chain 1 |[33m███▋      [0m| 00:09 Iteration:  600 / 2000 [ 30%]  (Warmup)
[A

chain 1 |[33m████      [0m| 00:10 Iteration:  700 / 2000 [ 35%]  (Warmup)
[A

chain 1 |[33m████▌     [0m| 00:11 Iteration:  800 / 2000 [ 40%]  (Warmup)
[A

chain 1 |[33m█████     [0m| 00:12 Iteration:  900 / 2000 [ 45%]  (Warmup)
[A

chain 1 |[34m█████▉    [0m| 00:12 Iteration: 1001 / 2000 [ 50%]  (Sampling)
[A

ch

                                                                                                                                                                                                                                                                                                                                


21:21:48 - cmdstanpy - INFO - CmdStan done processing.
Exception: gamma_lpdf: Random variable[3] is 0, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
Exception: gamma_lpdf: Shape parameter is inf, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
	Exception: gamma_lpdf: Shape parameter is inf, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
	Exception: gamma_lpdf: Random variable[1] is 0, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
	Exception: gamma_lpdf: Random variable[7] is 0, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
Exception: gamma_lpdf: Random variable[1] is 0, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
	Exception: gamma_lpdf: Shape parameter is 0, but must be positive finite! (in 'bg-nbd.stan', line 38, column 4 to column 44)
Exception: gamma_lpdf: Shape parameter is inf, bu




In [75]:
samples.diagnose()


'Processing csv files: /tmp/tmpw05u_zlb/bg-nbd0xteympx/bg-nbd-20240202212125_1.csv, /tmp/tmpw05u_zlb/bg-nbd0xteympx/bg-nbd-20240202212125_2.csv, /tmp/tmpw05u_zlb/bg-nbd0xteympx/bg-nbd-20240202212125_3.csv, /tmp/tmpw05u_zlb/bg-nbd0xteympx/bg-nbd-20240202212125_4.csv\n\nChecking sampler transitions treedepth.\nTreedepth satisfactory for all transitions.\n\nChecking sampler transitions for divergences.\nNo divergent transitions found.\n\nChecking E-BFMI - sampler transitions HMC potential energy.\nE-BFMI satisfactory.\n\nEffective sample size satisfactory.\n\nSplit R-hat values satisfactory all parameters.\n\nProcessing complete, no problems detected.\n'

In [76]:

diagnostics = samples.summary()

In [77]:
diagnostics.sort_values("N_Eff", ascending=False)

Unnamed: 0,Mean,MCSE,StdDev,5%,50%,95%,N_Eff,N_Eff/s,R_hat
z[57],0.064245,0.009483,0.996115,-1.57407,0.059208,1.70825,11034.100000,282.093000,0.999378
z[1001],0.033726,0.009813,1.024610,-1.65687,0.051874,1.74260,10901.900000,278.713000,0.999539
z[825],0.145989,0.009968,1.035880,-1.60509,0.174069,1.81030,10798.900000,276.081000,0.999496
z[475],0.137116,0.010198,1.053880,-1.62032,0.150856,1.85080,10679.200000,273.021000,0.999557
p_logit[57],-1.215350,0.019054,1.949100,-4.42733,-1.208010,1.99295,10463.700000,267.512000,0.999463
...,...,...,...,...,...,...,...,...,...
p_logit_mu,-1.342091,0.001988,0.109554,-1.51997,-1.339300,-1.16182,3038.099758,77.670964,0.999831
gamma_beta,11.679629,0.018683,0.847577,10.32400,11.652100,13.08340,2058.175492,52.618573,1.000179
gamma_alpha,1.598303,0.002206,0.095392,1.44560,1.594340,1.75854,1870.065325,47.809416,1.000038
p_logit_sigma,1.945360,0.005725,0.190226,1.64677,1.937180,2.26985,1103.896741,28.221826,1.002113


In [87]:
np.quantile(samples.draws_pd(vars=["p"]).loc[:, "p[11]"], (0.05, 0.95))

array([0.06817143, 0.9479934 ])