In [1]:
import polars as pl
import numpy as np
from scipy.special import gammaln, hyp2f1
from scipy.optimize import minimize

from IPython.display import display_markdown
import matplotlib.pyplot as plt
import matplotlib_inline

matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

Source:
- ["Counting Your Customers" the Easy Way: An Alternative to the Pareto/NBD Model](https://www.brucehardie.com/abstracts/abstract-fhl_2004-04.html)
- [Implementing the BG/NBD Model for Customer Base Analysis in Excel](https://www.brucehardie.com/notes/004/)

In [2]:
CDNOW = (
    pl.scan_csv(source='data/CDNOW/CDNOW_sample.csv',
                has_header=False,
                separator=',',
                schema={'CustID': pl.Int32,
                        'ID': pl.Int32,
                        'Date': pl.String,
                        'Quant': pl.Int16,
                        'Spend': pl.Float64})
    .with_columns(pl.col('Date').str.to_date("%Y%m%d"))
    .with_columns((pl.col('Date') - pl.date(1996,12,31)).dt.total_days().cast(pl.UInt16).alias('PurchDay'))
    .with_columns((pl.col('Spend')*100).round(0).cast(pl.Int64).alias('Spend Scaled'))
    .group_by('ID', 'Date')
    .agg(pl.col('*').exclude('PurchDay').sum(), pl.col('PurchDay').max())
    .sort('ID', 'Date')
    .with_columns((pl.col("ID").cum_count().over("ID") - 1).cast(pl.UInt16).alias("DoR"))      
    .drop('CustID')
)

In [3]:
calwk = 273 # 39 week calibration period

# The number of repeat transactions made by each customer in each period
freq_x = (
    CDNOW
    .group_by('ID', maintain_order=True)
    .agg(
        pl.col('PurchDay')
        .filter((pl.col('PurchDay') <= calwk) & (pl.col('DoR') > 0))
        .count()
        .alias('P1X'), # Period 1: Calibration Period

        pl.col('PurchDay')
        .filter((pl.col('PurchDay') > calwk) & (pl.col('DoR') > 0))
        .count()
        .alias('P2X')  # Period 2: Longitudinal Holdout Period      
    )
)

# The number of CDs purchased and total spend across these repeat transactions
pSpendQuant = (
    CDNOW
    .join(freq_x, on='ID', how='left')
    .group_by('ID', maintain_order=True)
    .agg(
        
        pl.col('Spend Scaled')
        .filter((pl.col('DoR') > 0) & (pl.col('DoR') <= pl.col('P1X')) & (pl.col('P1X') != 0))
        .sum()
        .alias('P1X Spend'),
        
        pl.col('Quant')
        .filter((pl.col('DoR') > 0) & (pl.col('DoR') <= pl.col('P1X')) & (pl.col('P1X') != 0))
        .sum()
        .alias('P1X Quant'),        
        
        pl.col('Spend Scaled')
        .filter((pl.col('DoR') > 0) & (pl.col('DoR') > pl.col('P1X')))
        .sum()
        .alias('P2X Spend'),
        
        pl.col('Quant')
        .filter((pl.col('DoR') > 0) & (pl.col('DoR') > pl.col('P1X')))
        .sum()
        .alias('P2X Quant')                
    )
)

# The average spend per repeat transaction
m_x = (
    pSpendQuant
    .join(freq_x, on='ID', how='left')
    .with_columns(
        (pl.col('P1X Spend') / pl.col('P1X')).alias('m_x_calib'),
        (pl.col('P2X Spend') / pl.col('P2X')).alias('m_x_valid')
    ).fill_nan(0)
)

# time of last calibration period repeat purchase (in weeks) - Recency
ttlrp = (
    CDNOW
    .join(freq_x, on='ID', how='left')
    .with_columns(
        pl.col('PurchDay').filter(pl.col('DoR') == 0)
        .first()
        .over('ID')
        .alias('Trial Day')
    )
    .group_by('ID', maintain_order=True)
    .agg(
        pl.col('PurchDay', 'Trial Day')
        .filter(pl.col('DoR') <= pl.col('P1X'))
        .max()
        # .alias('LastPurch')
    )
    .with_columns(
        # effective calibration period (in weeks)
        ((pl.col('PurchDay') - pl.col('Trial Day')) / 7).alias('t_x'), # Time to Last Repeat Purchase - Recency
        ((calwk - pl.col('Trial Day'))/7).alias('T')
    )
    .drop('PurchDay', 'Trial Day')
)

# Time of trial purchase (in weeks)
tofp = (
    CDNOW
    .filter(pl.col('DoR') == 0)
    .with_columns((pl.col('PurchDay') / 7).alias('Time of First Purch'))
    .group_by('Time of First Purch').agg(pl.len().alias('Count'))
    .sort('Time of First Purch')
)

rfm_data = (
    m_x
    .join(other=ttlrp, on="ID", how="left")
    .rename({'P1X': 'x'})
    .select('ID', 'x', 't_x', 'T')
)

In [4]:
def bgnbd_est(rfm_data, guess={'r': 0.01, 'alpha': 0.01, 'a': 0.01, 'b':0.01}):
    
    def log_likelihood(x):
        r, alpha, a, b = x

        ln_A_1 = gammaln(rfm_data[:,0] + r) - gammaln(r) + r * np.log(alpha)
        ln_A_2 = gammaln(a + b) + gammaln(b + rfm_data[:,0]) - gammaln(b) - gammaln(a + b + rfm_data[:,0])
        ln_A_3 = -(r + rfm_data[:,0]) * np.log(alpha + rfm_data[:,2])
        ln_A_4 = np.where(rfm_data[:,0] > 0, 
                          np.log(a) - np.log(b + rfm_data[:,0] - 1) - (r + rfm_data[:,0]) * np.log(alpha + rfm_data[:,1]),
                          0)
        return -np.sum(ln_A_1 + ln_A_2 + np.log(np.exp(ln_A_3) + (rfm_data[:,0] > 0) * np.exp(ln_A_4)))
    
    bnds = [(0, np.inf) for _ in range(4)]
    return minimize(log_likelihood, x0=list(guess.values()), bounds=bnds)

result = bgnbd_est(rfm_data.select('x', 't_x', 'T').collect().to_numpy())
r, alpha, a, b = result.x
ll = result.fun

display_markdown(f'''$r$ = {r:0.4f}

$\\alpha$ = {alpha:0.4f}

$a$ = {a:0.4f}

$b$ = {b:0.4f}

Log-Likelihood = {-ll:0.4f}''', raw=True)

  np.log(a) - np.log(b + rfm_data[:,0] - 1) - (r + rfm_data[:,0]) * np.log(alpha + rfm_data[:,1]),


$r$ = 0.2426

$\alpha$ = 4.4136

$a$ = 0.7929

$b$ = 2.4258

Log-Likelihood = -9582.4292

In [5]:
r, alpha, a, b = 0.242594150135163, 4.41359212062015, 0.792919156875463, 2.4258950494563

In [6]:
forecast_horizon = (calwk * 2) // 7

t = np.arange(1/7, forecast_horizon, 1/7)
z = t / (alpha + t)
h2f1 = hyp2f1(r, b, (a + b - 1), z)
E_X_t = (a + b - 1) / (a - 1) * (1 - (alpha / (alpha + t))**r * h2f1)

tofp_array = tofp.collect().to_numpy()

num_triers = tofp_array[:, 1]
trial_week = tofp_array[:, 0]
time_trial_week = np.arange(1/7, np.max(trial_week), 1/7)

index = ((t.reshape(-1, 1) - time_trial_week) * 7).astype(np.int16)
index = np.clip(index - 1, 0, E_X_t.shape[0] - 1)

# Compute cumulative repeat sales
cum_rpt_sls = np.sum(np.tril(E_X_t[index]) * num_triers, axis=1)

# Compute weekly repeat sales
wkly_rpt_sls = np.diff(cum_rpt_sls[6::7], prepend=0)

wkly_rpt_sls

array([ 3.45882751, 11.60378582, 20.08495471, 29.14030054, 38.06999252,
       48.2700452 , 56.58029375, 64.89178284, 73.11856376, 81.7028405 ,
       89.13746435, 95.70266923, 97.04402442, 93.55891092, 90.00293954,
       86.94761736, 84.0891682 , 81.53582973, 79.39818795, 77.2646425 ,
       75.16239719, 73.30137137, 71.56133426, 69.92932431, 68.26604383,
       66.82564548, 65.57001014, 64.40697566, 63.18653243, 61.88322392,
       60.76934085, 59.70675802, 58.63398038, 57.66364098, 56.72135071,
       55.82049889, 55.09954206, 54.27533588, 53.49443501, 52.73960615,
       51.92726728, 51.21761918, 50.5323692 , 49.87015708, 49.22972731,
       48.60991888, 48.00965626, 47.42794139, 46.86384657, 46.31650818,
       45.78512102, 45.26893331, 44.76724221, 44.27938968, 43.80475896,
       43.3427712 , 42.89288251, 42.4545813 , 42.02738584, 41.610842  ,
       41.2045213 , 40.808019  , 40.4209525 , 40.04295974, 39.62375933,
       39.25877987, 38.91100698, 38.55056258, 38.21138897, 37.85

#### Computing Conditional Expectations

In [7]:
x = 2
t_x = 30.43
T = 38.86
t = 39 # the length of the period over which we wish to make the conditional forecast

a_cust = r+x
b_cust = b+x
c_cust = a+b+x-1
z_cust = t/(alpha + T + t)

h2f1_cust = hyp2f1(a_cust, b_cust, c_cust, z_cust)

E_Y_X = (a + b + x - 1) / (a - 1) * (1 - ((alpha + T) / (alpha + T + t))**(r + x) * h2f1_cust) / \
        (1 + (x > 0) * a / (b + x - 1) * ((alpha + T) / (alpha + t_x))**(r + x))

display_markdown(f'''$E(Y(t) \\mid X = x, t_{{x}}, T, r, \\alpha, a, b)$ = {E_Y_X:0.4f}''' ,raw=True)

$E(Y(t) \mid X = x, t_{x}, T, r, \alpha, a, b)$ = 1.2259