Beta-geometric/Negative Binomial Distribution Model (BG/NBD) - Forecasting Individual-Level Repeat-Buying

In [1]:
import polars as pl
import numpy as np
import altair as alt
from scipy.optimize import minimize

In [27]:
CDNOW_master = (
    pl.scan_csv(source = 'data/CDNOW/CDNOW_master.csv', 
                has_header=False, 
                separator=',', 
                schema={'CustID': pl.String,
                        'Date': pl.String,
                        'Quant': pl.Int16,
                        'Spend': pl.Float64})
    .with_columns(pl.col('Date').str.to_date('%Y%m%d'))
    .with_columns((pl.col('Date') - pl.date(1996,12,31)).dt.total_days().alias('PurchDay'))
)

display(CDNOW_master.head().collect())
display(CDNOW_master.describe())

CustID,Date,Quant,Spend,PurchDay
str,date,i16,f64,i64
"""00001""",1997-01-01,1,11.77,1
"""00002""",1997-01-12,1,12.0,12
"""00002""",1997-01-12,5,77.0,12
"""00003""",1997-01-02,2,20.76,2
"""00003""",1997-03-30,2,20.76,89


statistic,CustID,Date,Quant,Spend,PurchDay
str,str,str,f64,f64,f64
"""count""","""69659""","""69659""",69659.0,69659.0,69659.0
"""null_count""","""0""","""0""",0.0,0.0,0.0
"""mean""",,"""1997-07-02 22:36:51.401000""",2.41004,35.893648,183.942262
"""std""",,,2.333924,36.281942,159.511302
"""min""","""00001""","""1997-01-01""",1.0,0.0,1.0
"""25%""",,"""1997-02-22""",1.0,14.49,53.0
"""50%""",,"""1997-04-24""",2.0,25.98,114.0
"""75%""",,"""1997-11-07""",3.0,43.7,311.0
"""max""","""23570""","""1998-06-30""",99.0,1286.01,546.0


In [28]:
# number of customers in the database and the maximum number of transactions made by any one individual:

NumCust = CDNOW_master.select(pl.col('CustID').n_unique()).collect().item(0,0)
tmpMaxTrans = CDNOW_master.group_by('CustID').len().select('len').max().collect().item(0,0)

print(f'{NumCust = }')
print(f'{tmpMaxTrans = }')

NumCust = 23570
tmpMaxTrans = 217


In [36]:
# Initialize unique customer-level aggregates
aggregates = (
    CDNOW_master
    .group_by("CustID", "PurchDay").agg(
        pl.col("Quant").alias("Quantities"),
        pl.col("Spend").alias("Spendings")
    )
).collect()

unique_customers = aggregates.select("CustID").unique()

In [32]:
# Initialize matrices
tmp_trans_mat = np.zeros((NumCust, tmpMaxTrans + 1))
tmp_quant_mat = np.zeros((NumCust, tmpMaxTrans + 1))
tmp_spend_mat = np.zeros((NumCust, tmpMaxTrans + 1))

# Create a mapping for customer indices
customer_map = {cust_id: idx for idx, cust_id in enumerate(unique_customers.to_series().to_list())}

# Fill matrices
for row in aggregates.iter_rows():
    cust_id, day, quant, spend = row
    idx = customer_map[cust_id]
    
    # Find the next empty column for this customer
    next_col = np.argmax(tmp_trans_mat[idx, 1:] == 0) + 1
    tmp_trans_mat[idx, next_col] = day
    tmp_quant_mat[idx, next_col] = quant
    tmp_spend_mat[idx, next_col] = spend

# Add customer IDs to the first column
for cust_id, idx in customer_map.items():
    tmp_trans_mat[idx, 0] = cust_id
    tmp_quant_mat[idx, 0] = cust_id
    tmp_spend_mat[idx, 0] = cust_id

ValueError: setting an array element with a sequence.