Beta-geometric/Negative Binomial Distribution Model (BG/NBD) - Forecasting Individual-Level Repeat-Buying

In [1]:
import polars as pl
import numpy as np
import altair as alt
from scipy.optimize import minimize

In [2]:
# Lazy query over the header-less CDNOW master file: one row per customer per
# purchase date, with the purchase day number and the repeat-order index.
CDNOW_master = (
    pl.scan_csv(
        'data/CDNOW/CDNOW_master.csv',
        has_header=False,
        separator=',',
        schema=dict(
            CustID=pl.String,
            Date=pl.String,
            Quant=pl.Int16,
            Spend=pl.Float32,
        ),
    )
    # Dates are stored as YYYYMMDD text; parse them into real dates.
    .with_columns(Date=pl.col('Date').str.to_date('%Y%m%d'))
    # Day number counted from 1996-12-31, so 1997-01-01 is day 1.
    .with_columns(
        PurchDay=(pl.col('Date') - pl.date(1996, 12, 31)).dt.total_days()
    )
    # Multiple transactions by a customer on any day are aggregated into one:
    # every non-key column except PurchDay is summed, PurchDay keeps its max.
    .group_by('CustID', 'Date')
    .agg(
        pl.exclude('PurchDay').sum(),
        pl.col('PurchDay').max(),
    )
    .sort('CustID', 'Date')
    # DoR: zero-based running count of a customer's purchase dates
    # (0 = first purchase, 1 = first repeat, ...).
    .with_columns(
        DoR=(pl.col('CustID').cum_count().over('CustID') - 1).cast(pl.UInt16)
    )
)

# Preview the first rows, then summary statistics of the whole frame.
display(CDNOW_master.head().collect())
display(CDNOW_master.describe())

CustID,Date,Quant,Spend,PurchDay,DoR
str,date,i64,f32,i64,u16
"""00001""",1997-01-01,1,11.77,1,0
"""00002""",1997-01-12,6,89.0,12,0
"""00003""",1997-01-02,2,20.76,2,0
"""00003""",1997-03-30,2,20.76,89,1
"""00003""",1997-04-02,2,19.540001,92,2


statistic,CustID,Date,Quant,Spend,PurchDay,DoR
str,str,str,f64,f64,f64,f64
"""count""","""67591""","""67591""",67591.0,67591.0,67591.0,67591.0
"""null_count""","""0""","""0""",0.0,0.0,0.0,0.0
"""mean""",,"""1997-07-01 11:44:58.175000""",2.483777,36.991844,182.489562,3.976387
"""std""",,,2.446038,38.143509,159.229817,9.45283
"""min""","""00001""","""1997-01-01""",1.0,0.0,1.0,0.0
"""25%""",,"""1997-02-22""",1.0,14.79,53.0,0.0
"""50%""",,"""1997-04-21""",2.0,26.73,111.0,1.0
"""75%""",,"""1997-11-06""",3.0,44.700001,310.0,4.0
"""max""","""23570""","""1998-06-30""",99.0,1554.580078,546.0,170.0


In [3]:
def _pivot_by_repeat_order(transactions, values, aggregate_function):
    """Spread one per-purchase measure into a customer-by-repeat-order matrix.

    Parameters
    ----------
    transactions : pl.DataFrame
        Collected transaction log containing `CustID`, `DoR` and `values`.
    values : str
        Name of the column whose entries fill the matrix cells.
    aggregate_function : str
        Aggregation passed to `pivot` for each (CustID, DoR) cell.

    Returns
    -------
    pl.DataFrame
        One row per `CustID` and one column per distinct `DoR` value, with
        cells that have no matching purchase filled with 0.
    """
    return (
        transactions
        .pivot(
            on='DoR',
            index='CustID',
            values=values,
            aggregate_function=aggregate_function,
            maintain_order=True,
        )
        .fill_null(0)
    )


# Collect the lazy query once and reuse the result: collecting separately for
# each matrix would re-read the CSV and re-run the whole pipeline three times.
_cdnow_transactions = CDNOW_master.collect()

# Purchase day of each repeat order; 0 marks "no such purchase"
# (the smallest PurchDay in the summary statistics is 1).
TransMAT = _pivot_by_repeat_order(_cdnow_transactions, 'PurchDay', 'max')

# Units bought at each repeat order; 0 where the customer made no such purchase.
QuantMAT = _pivot_by_repeat_order(_cdnow_transactions, 'Quant', 'sum')

# Amount spent at each repeat order. The summary statistics show a minimum
# Spend of 0.0, so a 0 here can be a zero-value purchase as well as a missing one.
# NOTE(review): the name looks like a typo for `SpendMAT`; it is kept unchanged
# because cells outside this view may reference it.
SpendtMAT = _pivot_by_repeat_order(_cdnow_transactions, 'Spend', 'sum')