Beta-geometric/Negative Binomial Distribution Model (BG/NBD) - Forecasting Individual-Level Repeat-Buying

In [1]:
import polars as pl
import numpy as np
import altair as alt
from scipy.optimize import minimize

In [33]:
CDNOW_master = (
    pl.scan_csv(source = 'data/CDNOW/CDNOW_master.csv', 
                has_header=False, 
                separator=',', 
                schema={'CustID': pl.String,
                        'Date': pl.String,
                        'Quant': pl.Int16,
                        'Spend': pl.Decimal(None, 5)})
    .with_columns(pl.col('Date').str.to_date('%Y%m%d'))
    .with_columns((pl.col('Date') - pl.date(1996,12,31)).dt.total_days().cast(pl.UInt16).alias('PurchDay'))
    .group_by('CustID', 'Date')
    .agg(pl.col('*').exclude('PurchDay').sum(), pl.col('PurchDay').max()) # Multiple transactions by a customer on a single day are aggregated into one
    .sort('CustID', 'Date')
    .with_columns((pl.col("CustID").cum_count().over("CustID") - 1).cast(pl.UInt16).alias("DoR"))    
)

display(CDNOW_master.head().collect())
display(CDNOW_master.describe())

CustID,Date,Quant,Spend,PurchDay,DoR
str,date,i64,"decimal[*,5]",u16,u16
"""00001""",1997-01-01,1,11.77,1,0
"""00002""",1997-01-12,6,89.0,12,0
"""00003""",1997-01-02,2,20.76,2,0
"""00003""",1997-03-30,2,20.76,89,1
"""00003""",1997-04-02,2,19.54,92,2


statistic,CustID,Date,Quant,Spend,PurchDay,DoR
str,str,str,f64,f64,f64,f64
"""count""","""67591""","""67591""",67591.0,67591.0,67591.0,67591.0
"""null_count""","""0""","""0""",0.0,0.0,0.0,0.0
"""mean""",,"""1997-07-01 11:44:58.175000""",2.483777,36.991843,182.489562,3.976387
"""std""",,,2.446038,38.143508,159.229817,9.45283
"""min""","""00001""","""1997-01-01""",1.0,0.0,1.0,0.0
"""25%""",,"""1997-02-22""",1.0,14.79,53.0,0.0
"""50%""",,"""1997-04-21""",2.0,26.73,111.0,1.0
"""75%""",,"""1997-11-06""",3.0,44.7,310.0,4.0
"""max""","""23570""","""1998-06-30""",99.0,1554.58,546.0,170.0


In [34]:
TransMAT = (
    CDNOW_master
    .collect()
    .pivot(on='DoR', index='CustID', values='PurchDay', aggregate_function='max', maintain_order=True)
    .fill_null(0)
)

QuantMAT = (
    CDNOW_master
    .collect()
    .pivot(on='DoR', index='CustID', values='Quant', aggregate_function='sum', maintain_order=True)
)

SpendMAT = (
    CDNOW_master
    .collect()
    .pivot(on='DoR', index='CustID', values='Spend', aggregate_function='sum', maintain_order=True)
    .fill_null(0)
)

In [35]:
RptSpend = (
    CDNOW_master
    .filter(pl.col('PurchDay') <= 273)
    .with_columns(pl.when(pl.col('DoR') > 0)
                  .then(pl.col('Spend'))
                  .otherwise(0)
                  .alias('Repeat Spend'))
    .group_by('CustID')
    .agg(pl.col('Repeat Spend').sum())
    .sort('CustID')
    .collect()
)

RptSpend

CustID,Repeat Spend
str,"decimal[*,5]"
"""00001""",0.00000
"""00002""",0.00000
"""00003""",40.30000
"""00004""",44.69000
"""00005""",231.88000
…,…
"""23566""",0.00000
"""23567""",0.00000
"""23568""",98.73000
"""23569""",0.00000


In [36]:
ranked = (
    TransMAT
    .select('CustID', '0')
    .with_columns(pl.col('0'))
    .with_columns(((pl.col('0') - 1) // 7 + 1).alias('Trial Week'))
)

test = (
    ranked
    .join(RptSpend, on='CustID', how='left')
    .sort('Trial Week', 'Repeat Spend', descending=[False, True], maintain_order=True)
)

test = test[9::10].select('CustID')
test.write_csv('sampledID_python.csv', include_header=False)

In [51]:
final_ids = []
for week in range(1, 13):  # Sequential interval processing
    week_data = (
        ranked
        .filter(pl.col('Trial Week') == week)
        .join(RptSpend, on='CustID', how='left')
        .sort('Repeat Spend', descending=True, maintain_order=True)  # Explicit tiebreaker
        .select('CustID')
    )
    final_ids.extend(week_data['CustID'].to_list())

# Sample every 10th ID starting from the 10th row
sampled_ids = final_ids[9::10]

with open('sampledID_python.csv', "w") as f:
    for i in sampled_ids:
        f.write(f'{i}\n')

In [52]:
py = []
with open('sampledID_python.csv', 'r') as mat:
    csvFile = mat.readlines()
    for lines in csvFile:
        line = lines.strip().split(',')
        line = int(line[0])
        line = int(line)
        py.append(line)
        
matlab = []
with open('sampledID.csv', 'r') as mat:
    csvFile = mat.readlines()
    for lines in csvFile:
        line = lines.strip().split(',')
        line = int(line[0])
        matlab.append(line)
        
cdsample = []
with open('data/CDNOW/CDNOW_sample.csv', 'r') as mat:
    csvFile = mat.readlines()
    for lines in csvFile:
        line = lines.strip().split(',')
        line = int(line[0])
        cdsample.append(line)

matlab_id = []
for id in matlab:
    id = str(id)
    if len(id) == 5:
        matlab_id.append(id)
    else:
        id = "0" * (5 - len(id)) + id
        matlab_id.append(id)

In [53]:
len(set(cdsample)-set(matlab))
len(set(cdsample) - set(py))

158

In [54]:
breaks = []
for i in range(len(matlab)):
    if int(py[i]) != matlab[i]:
        breaks.append(i)

len(breaks)

159

In [None]:
# # Vectorized Method in Numpy - Using Masks

# TransMAT = TransMAT.to_numpy()
# SpendMAT = SpendMAT.to_numpy()
# # Step 1: Calculate x (number of valid transactions)
# x = np.sum(((TransMAT[:, 2:] > 0) & (TransMAT[:, 2:] <= 273)), axis=1, dtype='int16')

# # Step 2: Create a mask to include only valid columns for each customer
# mask = ((TransMAT[:, 2:] > 0) & (TransMAT[:, 2:] <= 273))  # Exclude ID Column & Trial

# RptSpend = np.sum(SpendMAT[:,2:] * mask, axis=1, dtype='float64')

# RptSpend