In [1]:
import cmdstanpy as cmd
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Data Processing

In [2]:
raw_data = pd.read_table(
    "data/CDNOW_sample.txt",
    delim_whitespace=True,
    header=None, 
)

In [3]:
raw_data

Unnamed: 0,0,1,2,3,4
0,4,1,19970101,2,29.33
1,4,1,19970118,2,29.73
2,4,1,19970802,1,14.96
3,4,1,19971212,2,26.48
4,21,2,19970101,3,63.34
...,...,...,...,...,...
6914,23556,2356,19970726,3,45.74
6915,23556,2356,19970927,3,31.47
6916,23556,2356,19980103,2,28.98
6917,23556,2356,19980607,2,28.98


In [4]:
raw_data.columns = ["customer_id", "customer_index", "date", "quantity", "amount"]
new_data = raw_data.assign(
    new_date=lambda x: x["date"].astype("str"),
    date=lambda x: pd.to_datetime([f"{row[:4]}-{row[4:6]}-{row[6:]}" for row in x["new_date"]])
).drop(columns=["new_date"])

In [5]:
new_data

Unnamed: 0,customer_id,customer_index,date,quantity,amount
0,4,1,1997-01-01,2,29.33
1,4,1,1997-01-18,2,29.73
2,4,1,1997-08-02,1,14.96
3,4,1,1997-12-12,2,26.48
4,21,2,1997-01-01,3,63.34
...,...,...,...,...,...
6914,23556,2356,1997-07-26,3,45.74
6915,23556,2356,1997-09-27,3,31.47
6916,23556,2356,1998-01-03,2,28.98
6917,23556,2356,1998-06-07,2,28.98


In [6]:
train = new_data.query('date < "1998-01-01"')
test = new_data.query('date >= "1998-01-01"')

train

Unnamed: 0,customer_id,customer_index,date,quantity,amount
0,4,1,1997-01-01,2,29.33
1,4,1,1997-01-18,2,29.73
2,4,1,1997-08-02,1,14.96
3,4,1,1997-12-12,2,26.48
4,21,2,1997-01-01,3,63.34
...,...,...,...,...,...
6912,23556,2356,1997-06-10,2,26.73
6913,23556,2356,1997-07-19,2,29.33
6914,23556,2356,1997-07-26,3,45.74
6915,23556,2356,1997-09-27,3,31.47


In [7]:
train["amount_indicator"] = train.groupby("customer_id", as_index=False)["date"].transform(lambda x: x == x.min())
train_amount_filtered = train.loc[train["amount_indicator"] == 0, ["customer_id", "amount"]]
train_amount_filtered = train_amount_filtered.groupby("customer_id", as_index=False).agg("mean")
train_amount_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["amount_indicator"] = train.groupby("customer_id", as_index=False)["date"].transform(lambda x: x == x.min())


Unnamed: 0,customer_id,amount
0,4,23.723333
1,21,11.770000
2,111,75.347778
3,112,11.770000
4,114,25.550000
...,...,...
1035,23500,16.192500
1036,23506,13.970000
1037,23507,30.637500
1038,23551,44.928000


In [8]:

train_processed = train.groupby("customer_id", as_index=False).agg(
    frequency=("customer_id", "size"),
    recency=("date", lambda x: np.round((x.max() - x.min()).days / 7)),
    T=("date", lambda x: np.round((pd.to_datetime("1998-01-01") - x.min()).days / 7)),
).assign(
    frequency=lambda x: x["frequency"] - 1,
).merge(train_amount_filtered, on="customer_id")

In [9]:
train_processed

Unnamed: 0,customer_id,frequency,recency,T,amount
0,4,3,49.0,52.0,23.723333
1,21,1,2.0,52.0,11.770000
2,111,9,48.0,52.0,75.347778
3,112,1,5.0,52.0,11.770000
4,114,2,36.0,52.0,25.550000
...,...,...,...,...,...
1035,23500,8,28.0,40.0,16.192500
1036,23506,1,9.0,40.0,13.970000
1037,23507,4,38.0,40.0,30.637500
1038,23551,5,24.0,40.0,44.928000


In [10]:
train.query('customer_id in [4, 23551]')

Unnamed: 0,customer_id,customer_index,date,quantity,amount,amount_indicator
0,4,1,1997-01-01,2,29.33,True
1,4,1,1997-01-18,2,29.73,False
2,4,1,1997-08-02,1,14.96,False
3,4,1,1997-12-12,2,26.48,False
6903,23551,2354,1997-03-25,1,39.99,True
6904,23551,2354,1997-06-19,1,12.99,False
6905,23551,2354,1997-08-08,4,102.36,False
6906,23551,2354,1997-08-15,4,85.75,False
6907,23551,2354,1997-09-04,1,11.77,False
6908,23551,2354,1997-09-11,1,11.77,False


In [11]:
customers_no_transactions_in_test = set(train["customer_id"]) - set(test["customer_id"])
customers_no_transactions_in_test

padded_test = pd.DataFrame(
    {
        "customer_id": [i for i in customers_no_transactions_in_test],
        "frequency": np.repeat(0, len(customers_no_transactions_in_test)),
        "monetary_value": np.repeat(0, len(customers_no_transactions_in_test)),
        "duration": np.round((test["date"].max() - pd.to_datetime("1998-01-01")).days / 7)
    }
)

In [12]:
test_processed = test.groupby("customer_id", as_index=False).agg(
    frequency=("customer_id", "size"),
    monetary_value=("amount", "mean")
).assign(
    duration=np.round((test["date"].max() - pd.to_datetime("1998-01-01")).days / 7),
)

test_processed = pd.concat([test_processed, padded_test])
test_processed.sort_values("customer_id")

Unnamed: 0,customer_id,frequency,monetary_value,duration
2,4,0,0.000,26.0
4,18,0,0.000,26.0
6,21,0,0.000,26.0
15,50,0,0.000,26.0
17,60,0,0.000,26.0
...,...,...,...,...
512,23537,2,19.775,26.0
1675,23551,0,0.000,26.0
513,23554,1,24.600,26.0
514,23556,2,28.980,26.0


## Model

In [62]:
model = cmd.CmdStanModel(stan_file="stan/bg-nbd.stan")

06:52:15 - cmdstanpy - INFO - compiling stan file /home/braydentang/Documents/Projects/cltv/stan/bg-nbd.stan to exe file /home/braydentang/Documents/Projects/cltv/stan/bg-nbd
06:52:31 - cmdstanpy - INFO - compiled model executable: /home/braydentang/Documents/Projects/cltv/stan/bg-nbd


In [68]:
data_dict = {
    "prior_only": 0,
    "N_customers": train_processed.shape[0],
    "recency": train_processed["recency"].astype(int).to_numpy(),
    "frequency": train_processed["frequency"].astype(int).to_numpy(),
    "T_age": train_processed["T"].astype(int).to_numpy()
}

samples = model.sample(
    data_dict,
    chains=4,
    seed=1234,
    iter_sampling=4000
)




06:59:24 - cmdstanpy - INFO - CmdStan start processing


KeyboardInterrupt: 

In [64]:
samples.diagnose()
diagnostics = samples.summary()

In [66]:
diagnostics.sort_values("N_Eff", ascending=False)

Unnamed: 0,Mean,MCSE,StdDev,5%,50%,95%,N_Eff,N_Eff/s,R_hat
lambda[897],0.126104,0.000466,0.047790,0.057956,0.120406,0.213469,10523.700000,237.771000,0.999252
lambda[379],0.311634,0.000731,0.074604,0.199746,0.305778,0.444616,10423.600000,235.509000,0.999317
lambda[883],0.559356,0.001082,0.108332,0.394929,0.552078,0.744552,10030.800000,226.633000,0.999609
lambda[258],0.241347,0.000631,0.062942,0.148001,0.235702,0.354041,9952.050000,224.854000,0.999132
lambda[521],0.133232,0.000506,0.050443,0.062791,0.126948,0.228049,9949.110000,224.788000,0.999238
...,...,...,...,...,...,...,...,...,...
gamma_beta,11.653848,0.020125,0.851997,10.333400,11.604200,13.127500,1792.241824,40.493489,1.001521
gamma_alpha,1.595099,0.002363,0.096075,1.444300,1.592410,1.761550,1653.235959,37.352823,1.002792
beta_b,1.525892,0.024217,0.296437,1.095900,1.487250,2.050720,149.843304,3.385524,1.001134
beta_a,0.633019,0.008145,0.095726,0.488591,0.625601,0.799151,138.128626,3.120846,1.002065
