Beta-geometric/Negative Binomial Distribution Model (BG/NBD) - Forecasting Individual-Level Repeat-Buying

In [1]:
import polars as pl
import numpy as np
import altair as alt
from scipy.optimize import minimize

In [2]:
CDNOW_master = (
    pl.scan_csv(source = 'data/CDNOW/CDNOW_master.csv', 
                has_header=False, 
                separator=',', 
                schema={'CustID': pl.Int32,
                        'Date': pl.String,
                        'Quant': pl.Int16,
                        'Spend': pl.Float64})
    .with_columns(pl.col('Date').str.to_date("%Y%m%d"))
    .with_columns((pl.col('Date') - pl.date(1996,12,31)).dt.total_days().cast(pl.UInt16).alias('PurchDay'))
    .with_columns((pl.col('Spend')*100).round(0).cast(pl.Int64).alias('Spend Scaled'))
    .group_by('CustID', 'Date', maintain_order=True)
    .agg(pl.col('*').exclude('PurchDay').sum(), pl.col('PurchDay').max()) # Multiple transactions by a customer on a single day are aggregated into one
    .with_columns((pl.col("CustID").cum_count().over("CustID") - 1).cast(pl.UInt16).alias("DoR"))    
)

display(CDNOW_master.head().collect())
display(CDNOW_master.describe())

CustID,Date,Quant,Spend,Spend Scaled,PurchDay,DoR
i32,date,i64,f64,i64,u16,u16
1,1997-01-01,1,11.77,1177,1,0
2,1997-01-12,6,89.0,8900,12,0
3,1997-01-02,2,20.76,2076,2,0
3,1997-03-30,2,20.76,2076,89,1
3,1997-04-02,2,19.54,1954,92,2


statistic,CustID,Date,Quant,Spend,Spend Scaled,PurchDay,DoR
str,f64,str,f64,f64,f64,f64,f64
"""count""",67591.0,"""67591""",67591.0,67591.0,67591.0,67591.0,67591.0
"""null_count""",0.0,"""0""",0.0,0.0,0.0,0.0,0.0
"""mean""",11479.968931,"""1997-07-01 11:44:58.175000""",2.483777,36.991843,3699.184255,182.489562,3.976387
"""std""",6813.132779,,2.446038,38.143508,3814.350807,159.229817,9.45283
"""min""",1.0,"""1997-01-01""",1.0,0.0,0.0,1.0,0.0
"""25%""",5516.0,"""1997-02-22""",1.0,14.79,1479.0,53.0,0.0
"""50%""",11425.0,"""1997-04-21""",2.0,26.73,2673.0,111.0,1.0
"""75%""",17269.0,"""1997-11-06""",3.0,44.7,4470.0,310.0,4.0
"""max""",23570.0,"""1998-06-30""",99.0,1554.58,155458.0,546.0,170.0


In [11]:
master_TransMAT = (
    CDNOW_master
    .collect()
    .pivot(on='DoR', index='CustID', values='PurchDay', aggregate_function='max', maintain_order=True)
    .fill_null(0)
)

master_QuantMAT = (
    CDNOW_master
    .collect()
    .pivot(on='DoR', index='CustID', values='Quant', aggregate_function='sum', maintain_order=True)
    .fill_null(0)
)

master_SpendMAT = (
    CDNOW_master
    .collect()
    .pivot(on='DoR', index='CustID', values='Spend', aggregate_function='sum', maintain_order=True)
    .fill_null(0)
)

In [4]:
RptSpend = (
    CDNOW_master
    .filter(pl.col('PurchDay') <= 273)
    .with_columns(pl.when(pl.col('DoR') > 0)
                  .then(pl.col('Spend Scaled'))
                  .otherwise(0)
                  .alias('Repeat Spend (Scaled)'))
    .group_by('CustID')
    .agg(pl.col('Repeat Spend (Scaled)').sum())
    .sort('CustID')
    .collect()
    .join(master_TransMAT.select('CustID', '0'), on='CustID', how='left')  
    .rename({'0': 'Trial Day'})
)

RptSpend

CustID,Repeat Spend (Scaled),Trial Day
i32,i64,u16
1,0,1
2,0,12
3,4030,2
4,4469,1
5,23188,1
…,…,…
23566,0,84
23567,0,84
23568,9873,84
23569,0,84


In [5]:
# Sampling technique - Python Method:
# id_df = (
#     RptSpend
#     .with_columns(((pl.col('Trial Day') - 1) // 7 + 1).alias('Trial Week'))
#     .sort(['Trial Week','Repeat Spend (Scaled)', 'CustID'], descending=[False, True, False], maintain_order=True)
# )

# sampledID = id_df[9::10].select('CustID')

# Sampling done on MATLAB - (numerical precision & sorting not deterministic)
CDNOW_sample = (
    pl.scan_csv(source='data/CDNOW/CDNOW_sample.csv',
                has_header=False,
                separator=',',
                schema={'CustID': pl.Int32,
                        'NewID': pl.Int32,
                        'Date': pl.String,
                        'Quant': pl.Int16,
                        'Spend': pl.Float64})
    .with_columns(pl.col('Date').str.to_date("%Y%m%d"))
    .with_columns((pl.col('Date') - pl.date(1996,12,31)).dt.total_days().cast(pl.UInt16).alias('PurchDay'))
    .with_columns((pl.col('Spend')*100).round(0).cast(pl.Int64).alias('Spend Scaled'))
    .group_by('CustID', 'Date', maintain_order=True)
    .agg(pl.col('*').exclude('PurchDay').sum(), pl.col('PurchDay').max())
    .with_columns((pl.col("CustID").cum_count().over("CustID") - 1).cast(pl.UInt16).alias("DoR"))      
)

Creating summaries of the 1/10th sample data given the xMAT data structure. We divide the 78 weeks in half: Period 1 is a 39-week calibration period while Period 2 is a 39-week longitudinal holdout used for model validation.

In [135]:
sample_TransMAT = (
    CDNOW_sample
    .collect()
    .pivot(on='DoR', index='CustID', values='PurchDay', aggregate_function='max', maintain_order=True)
    .fill_null(0)
)

sample_QuantMAT = (
    CDNOW_sample
    .collect()
    .pivot(on='DoR', index='CustID', values='Quant', aggregate_function='sum', maintain_order=True)
    .fill_null(0)
)

sample_SpendMAT = (
    CDNOW_sample
    .collect()
    .pivot(on='DoR', index='CustID', values='Spend Scaled', aggregate_function='sum', maintain_order=True)
    .fill_null(0)
)

# The number of repeat transactions made by each customer in each period
calwk = 273 # 39 week calibration period
NumHH = len(sample_TransMAT)

px = (
    CDNOW_sample
    .collect()
    .group_by('CustID', maintain_order=True)
    .agg(
        pl.col('PurchDay')
        .filter((pl.col('PurchDay') <= calwk) & (pl.col('DoR') > 0))
        .count()
        .alias('P1X'),

        pl.col('PurchDay')
        .filter((pl.col('PurchDay') > calwk) & (pl.col('DoR') > 0))
        .count()
        .alias('P2X')        
    )
)
# The number of CDs purchased and total spend across these repeat transactions



In [166]:
pSpend = (
    CDNOW_sample.collect()
    .join(px, on='CustID', how='left')
    .group_by('CustID', maintain_order=True)
    .agg(
        pl.col('Spend Scaled')
        .filter((pl.col('DoR') > 0) & (pl.col('DoR') < (pl.col('P1X') + 3)))
        .sum()
        .alias('P1X Spend'),
        
        pl.col('Spend Scaled')
        .filter((pl.col('DoR') > 0) & (pl.col('DoR') >= (pl.col('P1X') + 3)))
        .sum()
        .alias('P2X Spend')        
    )
)
pSpend

CustID,P1X Spend,P2X Spend
i32,i64,i64
4,7117,0
21,1177,0
50,0,0
71,0,0
86,0,0
…,…,…
23537,3955,0
23551,22464,0
23554,2460,0
23556,19123,0


In [163]:
px

CustID,P1X,P2X
i32,u32,u32
4,2,1
21,1,0
50,0,0
71,0,0
86,0,0
…,…,…
23537,0,2
23551,5,0
23554,0,1
23556,4,2


In [165]:
p2x

array([[1],
       [0],
       [0],
       ...,
       [1],
       [2],
       [0]], shape=(2357, 1))

In [161]:
# The number of repeat transactions made by each customer in each period
TransMAT = sample_TransMAT.to_numpy()
QuantMAT = sample_QuantMAT.to_numpy()
SpendMAT = sample_SpendMAT.to_numpy()

p1x = np.sum(((TransMAT[:,2:] > 0) & (TransMAT[:,2:] <= calwk)), axis=1, keepdims=True)
p2x = np.sum(((TransMAT[:,2:] > 0) & (TransMAT[:,2:] > calwk)), axis=1, keepdims=True)

# The number of CDs purchased and total spend across these repeat transactions
p1Quant = np.zeros((NumHH, 1), dtype=np.int16)
p2Quant = np.zeros((NumHH, 1), dtype=np.int16)
p1Spend = np.zeros((NumHH, 1), dtype=np.int64)
p2Spend = np.zeros((NumHH, 1), dtype=np.int64)

for i in range(NumHH):
    if p1x[i,0] == 0:
        p1Quant[i] = 0
        p1Spend[i] = 0
    else:
        p1Quant[i] = np.sum(QuantMAT[i, 2:3+p1x[i,0]])
        p1Spend[i] = np.sum(SpendMAT[i, 2:3+p1x[i,0]])
    p2Quant[i] = np.sum(QuantMAT[i,3+p1x[i,0]:])
    p2Spend[i] = np.sum(SpendMAT[i,3+p1x[i,0]:])
    
# The average spend per repeat transaction
mx = np.zeros((NumHH, 1))
tmpindx = p1x > 0
mx[tmpindx] = p1Spend[tmpindx] / p1x[tmpindx]

In [162]:
p1Spend

array([[ 7117],
       [ 1177],
       [    0],
       ...,
       [    0],
       [16225],
       [    0]], shape=(2357, 1))

When fitting models such as the Pareto/NBD and BG/NBD to these data, we also want to know the “recency” information for each customer, as well as their effective calibration period: