In [1]:
import polars as pl
import numpy as np
from utils import Donation
from scipy.optimize import minimize
from scipy.special import gammaln, comb, hyp2f1
from scipy.special import beta as beta_fn

import altair as alt
from IPython.display import display_markdown

**Source**: 
- [Customer-Base Analysis in a Discrete-Time Noncontractual Setting](http://www.brucehardie.com/papers/020/)
- [Implementing the BG/BB Model for Customer-Base Analysis in Excel](http://www.brucehardie.com/notes/010/)
- [Implementing the $S_{BB}-G/B$ Model in MATLAB](http://www.brucehardie.com/notes/023/)

In [2]:
# Load the donation data set and its calibration / validation summaries.
data = Donation()
rfm_summary_calib = data.p1x_data()
rfm_array_calib = rfm_summary_calib.collect().to_numpy()
rfm_summary_valid = data.p2x_data()

# One 1-D array per column of the calibration summary; the third column is
# not needed at module level (bgbb_est reads it from the array itself).
p1x, t_x, _, num_donors = rfm_array_calib.T

n = 6  # calibration-period length used by the formulas below
# Possible recency/frequency patterns in calibration period.
# Bound to a name: a bare expression in the middle of a cell is discarded.
num_rf_patterns = n * (n + 1) // 2 + 1
years = data.years

### Parameter Estimation

In [3]:
def bgbb_est(rfm_data, guess={'alpha': 1, 'beta': 0.5, 'gamma': 0.5, 'delta': 2.5}):
    """Estimate the four BG/BB parameters by maximum likelihood.

    Parameters
    ----------
    rfm_data : numpy.ndarray, shape (patterns, 4)
        One row per recency/frequency pattern. The columns are unpacked, in
        order, as frequency ``p1x``, recency ``t_x``, number of transaction
        opportunities ``n`` and the number of donors showing that pattern.
    guess : dict, optional
        Starting point for the optimiser. Only the values are used, in
        insertion order (alpha, beta, gamma, delta). The dict is never mutated.

    Returns
    -------
    scipy.optimize.OptimizeResult
        ``.x`` holds (alpha, beta, gamma, delta); ``.fun`` holds the minimised
        *negative* log-likelihood.

    Raises
    ------
    ValueError
        If ``rfm_data`` has no rows.
    """
    p1x, t_x, n, num_donors = rfm_data.T
    if n.size == 0:
        raise ValueError('rfm_data must contain at least one recency/frequency pattern')

    # Size the likelihood terms from the data instead of assuming 6 periods;
    # int() also copes with a float-typed array.
    n_max = int(n.max())
    x_col = p1x.reshape(-1, 1)
    t_col = t_x.reshape(-1, 1)
    i = np.arange(n_max)
    # Row-wise mask: only i = 0 .. n - t_x - 1 contribute to a pattern's likelihood.
    valid_i = i <= (n - t_x - 1).reshape(-1, 1)

    def log_likelihood(param):
        """Negative log-likelihood of the whole sample for one parameter vector."""
        alpha, beta, gamma, delta = param
        B_alpha_beta = beta_fn(alpha, beta)
        B_gamma_delta = beta_fn(gamma, delta)

        # Column 0: first likelihood term; columns 1..n_max: the i-indexed terms.
        L = np.zeros((n.shape[0], n_max + 1))
        L[:, 0] = beta_fn(alpha+p1x, beta+n-p1x)/B_alpha_beta * \
                  beta_fn(gamma, delta+n)/B_gamma_delta

        terms = beta_fn(alpha+x_col, beta+t_col-x_col+i)/B_alpha_beta * \
                beta_fn(gamma+1, delta+t_col+i)/B_gamma_delta
        L[:, 1:] = np.where(valid_i, terms, 0)

        return -np.sum(num_donors * np.log(np.sum(L, axis=1)))

    # Beta-function arguments must be strictly positive: at exactly 0 beta_fn
    # returns inf and the objective becomes nan, so keep the box open at 0.
    bnds = [(1e-8, np.inf) for _ in range(4)]
    return minimize(log_likelihood, x0=list(guess.values()), bounds=bnds)

# Sample parameters
# alpha = 1.20352083040498
# beta = 0.749714243061896
# gamma = 0.656712169147878
# delta = 2.78340801635898

# Fit on the calibration summary; res.fun is the negated log-likelihood,
# which is why the sign is flipped when it is reported below.
res = bgbb_est(rfm_array_calib)
(alpha, beta, gamma, delta), ll = res.x, res.fun

display_markdown(f'''$\\alpha$ = {alpha:0.4f}

$\\beta$ = {beta:0.4f}

$\\gamma$ = {gamma:0.4f}

$\\delta$ = {delta:0.4f}

Log-Likelihood = {-ll:0.4f}''', raw=True)

$\alpha$ = 1.2035

$\beta$ = 0.7497

$\gamma$ = 0.6568

$\delta$ = 2.7838

Log-Likelihood = -33225.5813

### Likelihood Function

Likelihood function for a randomly chosen customer with purchase history ($x, t_{x}, n$)

In [4]:
# Normalising beta functions at the fitted parameters (reused by later cells).
B_alpha_beta = beta_fn(alpha, beta)
B_gamma_delta = beta_fn(gamma, delta)

# First likelihood term, one value per recency/frequency pattern.
A1 = beta_fn(alpha+p1x, beta+n-p1x)/B_alpha_beta * beta_fn(gamma, delta+n)/B_gamma_delta
# i-indexed terms: one row per i, one column per pattern. The range follows
# n instead of a literal 6 so the cell stays correct if n changes.
i = np.arange(n).reshape(-1,1)
A1a = beta_fn(alpha+p1x, beta+t_x-p1x+i)/B_alpha_beta * beta_fn(gamma+1, delta+t_x+i)/B_gamma_delta
# Only i = 0 .. n - t_x - 1 contribute for a given pattern.
A1a = np.where(i <= (n - t_x - 1), A1a, 0)
L = A1 + np.sum(A1a, axis=0)

### In-Sample Model Fit Plot

In [5]:
# Model-implied distribution of the number of repeat transactions over the
# n calibration periods, evaluated at x = 0 .. n.
x = np.arange(n+1)
A1 = comb(n,x)*beta_fn(alpha+x, beta+n-x)/B_alpha_beta*beta_fn(gamma, delta+n)/B_gamma_delta
i = np.arange(n).reshape(-1,1)  # one row per i, one column per x
A2 = comb(i,x)*beta_fn(alpha+x, beta+i-x)/B_alpha_beta * beta_fn(gamma+1, delta+i)/B_gamma_delta
# Terms with i < x must vanish. comb(i, x) is 0 there, but beta_fn is then
# evaluated at beta+i-x <= 0 and can return inf, and 0*inf = nan would poison
# the sum. Mask those terms explicitly instead of relying on the product.
A2 = np.where(i >= x, A2, 0)
P_X_n = A1 + np.sum(A2,axis=0)

# Scale the probabilities by the total number of donors to get expected counts.
model_repeat_calib = pl.DataFrame({'Model': P_X_n * np.sum(rfm_array_calib[:,3])})

# hstack is positional: it relies on the grouped frame having one row per
# x = 0 .. n, sorted by P1X, so it lines up with P_X_n.
actual_model_repeat_calib = (
    rfm_summary_calib
    .group_by('P1X')
    .agg((pl.col('Count').sum()).alias('Actual'))
    .sort('P1X')
    .collect()
    .hstack(model_repeat_calib)
    .unpivot(on=['Actual', 'Model'], index='P1X', value_name='No of people', variable_name='Actual Vs Estimated')
)

(
    alt.Chart(actual_model_repeat_calib).mark_bar()
    .encode(x=alt.X('P1X:O', title='No. of repeat transactions', axis=alt.Axis(labelAngle=0)), 
         y=alt.Y('No of people:Q', title='No. of people'), 
         color='Actual Vs Estimated:N', 
         xOffset='Actual Vs Estimated')
    .properties(
        width=650,
        height=250,
        title='Predicted vs. Actual Frequency of Repeat Transactions (Calibration Period) in 1996 to 2001'
    ).configure_view(stroke=None).configure_axisY(grid=False).configure_axisX(grid=False)   
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


### Validation Period Model Fit Plot

In [6]:
# Model-implied distribution of the number of repeat transactions in the
# n_star periods that follow the n calibration periods.
n_star = 5
x_star = np.arange(n_star+1)

A1 = comb(n_star,x_star)*beta_fn(alpha+x_star, beta+n_star-x_star)/B_alpha_beta * beta_fn(gamma, delta+n+n_star)/B_gamma_delta
# Extra probability mass that applies to x_star == 0 only.
A1 += np.where(x_star == 0, 1 - beta_fn(gamma, delta+n)/B_gamma_delta, 0)
i = np.arange(n_star).reshape(-1,1)  # one row per i, one column per x_star
A2 = comb(i,x_star)*beta_fn(alpha+x_star, beta+i-x_star)/B_alpha_beta * beta_fn(gamma+1, delta+n+i)/B_gamma_delta
# Terms with i < x_star must vanish. comb() is 0 there, but beta_fn is then
# evaluated at beta+i-x_star <= 0 and can return inf, and 0*inf = nan would
# poison the sum. Mask those terms explicitly.
A2 = np.where(i >= x_star, A2, 0)
P_X_n_star = A1 + np.sum(A2, axis=0)

# NOTE(review): column 2 of the validation summary is taken to be the donor
# count ('Count' in the group_by below) - confirm against Donation.p2x_data().
valid_repeat_count = rfm_summary_valid.collect().to_numpy()[:,2]
model_repeat_valid = pl.DataFrame({'Model': P_X_n_star * np.sum(valid_repeat_count)})

# hstack is positional: it relies on one row per x_star = 0 .. n_star, sorted by P2X.
actual_model_repeat_valid = (
    rfm_summary_valid
    .group_by('P2X')
    .agg((pl.col('Count').sum()).alias('Actual'))
    .sort('P2X')
    .collect()
    .hstack(model_repeat_valid)
    .unpivot(on=['Actual', 'Model'], index='P2X', value_name='No of people', variable_name='Actual Vs Estimated')
)

(
    alt.Chart(actual_model_repeat_valid).mark_bar()
    .encode(x=alt.X('P2X:O', title='No. of repeat transactions', axis=alt.Axis(labelAngle=0)), 
         y=alt.Y('No of people:Q', title='No. of people'), 
         color='Actual Vs Estimated:N', 
         xOffset='Actual Vs Estimated')
    .properties(
        width=650,
        height=250,
        title='Predicted vs. Actual Frequency of Repeat Transactions (Validation Period) in 2002-2006'
    ).configure_view(stroke=None).configure_axisY(grid=False).configure_axisX(grid=False)   
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


### Tracking Plots

In [7]:
# Observed totals per year: column sums over every column except 'ID' and '1995'.
act_yearly_repeat = (
    data.data
    .select(pl.col('*').exclude('ID', '1995'))
    .sum()
    .collect()
    .to_numpy()
)
act_cum_repeat = np.cumsum(act_yearly_repeat)

# Expected cumulative repeat transactions per donor after 1, 2, ... periods.
n_trans = np.arange(1, len(years))
A1 = alpha / (alpha + beta)
A2 = 1/(gamma-1)
A3 = np.exp(gammaln(delta+n_trans+1)+gammaln(gamma+delta)-gammaln(delta)-gammaln(gamma+delta+n_trans))
E_X_n = A1 * (delta*A2-A2*A3)

# Scale by the number of donors; differencing the cumulative curve gives the annual values.
est_cum_repeat = np.sum(rfm_array_calib[:,3]) * E_X_n
est_yearly_repeat = np.diff(est_cum_repeat, prepend=0)

# Long-format frames (one row per year and series) for the charts below.
yearly_repeat = pl.DataFrame({
    'Year': years[1:],
    'Actual': act_yearly_repeat.flatten(),
    'Model': est_yearly_repeat.flatten(),
})
yearly_repeat = yearly_repeat.unpivot(
    on=['Actual', 'Model'], index='Year', variable_name='Actual Vs Model', value_name='Repeat Trans'
)
cum_repeat = pl.DataFrame({
    'Year': years[1:],
    'Actual': act_cum_repeat.flatten(),
    'Model': est_cum_repeat.flatten(),
})
cum_repeat = cum_repeat.unpivot(
    on=['Actual', 'Model'], index='Year', variable_name='Actual Vs Model', value_name='Repeat Trans'
)

# Line chart: annual repeat transactions, actual vs. model.
(
    alt.Chart(yearly_repeat)
    .mark_line()
    .encode(
        x=alt.X('Year', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('Repeat Trans', title='No. of repeat transactions'),
        strokeDash='Actual Vs Model',
    )
    .properties(
        width=650,
        height=250,
        title='Predicted vs. Actual Annual Repeat Transactions',
    )
    .configure_view(stroke=None)
    .configure_axisY(grid=False)
    .configure_axisX(grid=False)
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [8]:
# Grouped bars: annual repeat transactions, actual next to model.
(
    alt.Chart(yearly_repeat)
    .mark_bar()
    .encode(
        x=alt.X('Year', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('Repeat Trans', title='No. of repeat transactions'),
        color='Actual Vs Model',
        xOffset='Actual Vs Model',
    )
    .properties(
        width=650,
        height=250,
        title='Predicted vs. Actual Annual Repeat Transactions',
    )
    .configure_view(stroke=None)
    .configure_axisY(grid=False)
    .configure_axisX(grid=False)
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [9]:
# Line chart: cumulative repeat transactions, actual vs. model.
(
    alt.Chart(cum_repeat)
    .mark_line()
    .encode(
        x=alt.X('Year', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('Repeat Trans', title='Cumulative no. of repeat transactions'),
        strokeDash='Actual Vs Model',
    )
    .properties(
        width=650,
        height=250,
        title='Predicted vs. Actual Cumulative Repeat Transactions',
    )
    .configure_view(stroke=None)
    .configure_axisY(grid=False)
    .configure_axisX(grid=False)
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [10]:
# Grouped bars: cumulative repeat transactions, actual next to model.
(
    alt.Chart(cum_repeat)
    .mark_bar()
    .encode(
        x=alt.X('Year', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('Repeat Trans', title='Cumulative no. of repeat transactions'),
        color='Actual Vs Model',
        xOffset='Actual Vs Model',
    )
    .properties(
        width=650,
        height=250,
        title='Predicted vs. Actual Cumulative Repeat Transactions',
    )
    .configure_view(stroke=None)
    .configure_axisY(grid=False)
    .configure_axisX(grid=False)
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


### Conditional Expectations

In [24]:
# Expected number of transactions in the n_star periods after the calibration
# period, per recency/frequency pattern, compared with the observed totals.
# Depends on L, B_alpha_beta and the fitted parameters from earlier cells.
n_star = 5

# A2 * A3 / L is evaluated element-wise over the patterns in rfm_array_calib.
# A3 divides by (gamma - 1), so it is undefined when gamma == 1.
A2 = beta_fn(alpha+p1x+1, beta+n-p1x)/B_alpha_beta
A3 = delta/(gamma-1)*np.exp(gammaln(gamma+delta)-gammaln(delta+1))*(np.exp(gammaln(1+delta+n)-\
     gammaln(gamma+delta+n))-np.exp(gammaln(1+delta+n+n_star)-gammaln(gamma+delta+n+n_star)))

# Per-donor conditional expectation for each pattern.
ce = A2 * A3 / L

# Expected total for each pattern = per-donor expectation * donors with that pattern.
exp_total = ce * num_donors

# NOTE(review): the two hstack calls append exp_total / ce positionally, so they
# assume the grouped frame sorted by (t_x, P1X) descending has exactly the same
# row order as rfm_array_calib - confirm against Donation.p1x_data().
ce_df = (
    data.rfm_data()
    .group_by('P1X', 't_x', 'np1x')
    .agg(pl.col('P2X').sum().alias('Actual Total'))
    .sort(['t_x', 'P1X'], descending=True)
    .collect()
    .hstack([pl.Series('Exp Total', exp_total)])
    .hstack([pl.Series('Conditional Expectation', ce)])
)

# Actual total 2002-2006 donations by p1x / tx
# After the pivot, column 0 holds the P1X index values and the remaining
# columns hold one t_x value each; missing combinations are filled with 0.
actual_ce_mat = ce_df.sort('t_x').pivot(index='P1X', on='t_x', values='Actual Total').sort('P1X').fill_null(0).to_numpy()
p1x_frequency = actual_ce_mat[:,0]
actual_ce_mat = actual_ce_mat[:,1:]

# Expected total 2002-2006 donations by p1x / tx
est_ce_mat = ce_df.sort('t_x').pivot(index='P1X', on='t_x', values='Exp Total').sort('P1X').fill_null(0).to_numpy()[:,1:]

# Number of Donors
num_donors_mat = rfm_summary_calib.collect().sort('t_x').pivot(index='P1X', on='t_x', values='Count').sort('P1X').fill_null(0).to_numpy()[:,1:]

# CE by Frequency
# Row sums collapse over recency: total transactions / total donors per frequency level.
actual_ce_freq = np.sum(actual_ce_mat, axis=1) / np.sum(num_donors_mat, axis=1)
est_ce_freq = np.sum(est_ce_mat, axis=1) / np.sum(num_donors_mat, axis=1)
ce_freq = pl.DataFrame({'x': p1x_frequency, 'Actual': actual_ce_freq, 'Model': est_ce_freq})
ce_freq = ce_freq.unpivot(index='x', on=['Actual', 'Model'], variable_name='Actual Vs Model', value_name='CE by Freq')

# CE by Recency
# Column sums collapse over frequency: total transactions / total donors per recency level.
# NOTE(review): the year labels are sliced with the number of *frequency* levels;
# this presumably works because both axes have the same number of levels - verify.
actual_ce_rec = np.sum(actual_ce_mat, axis=0) / np.sum(num_donors_mat, axis=0)
est_ce_rec = np.sum(est_ce_mat, axis=0) / np.sum(num_donors_mat, axis=0)
ce_rec = pl.DataFrame({'t_x': years[:len(p1x_frequency)], 'Actual': actual_ce_rec, 'Model': est_ce_rec}).with_columns(pl.col('t_x').cast(pl.Int16))
ce_rec = ce_rec.unpivot(index='t_x', on=['Actual', 'Model'], variable_name='Actual Vs Model', value_name='CE by Rec')

In [25]:
# Line chart: conditional expectation by calibration-period frequency, actual vs. model.
(
    alt.Chart(ce_freq)
    .mark_line()
    .encode(
        x=alt.X('x', title='No. of repeat transactions (1996-2001)', axis=alt.Axis(labelAngle=0, values=np.arange(7))),
        y=alt.Y('CE by Freq', title='No. of repeat transactions (2002–2006)'),
        strokeDash='Actual Vs Model',
    )
    .properties(
        width=650,
        height=250,
        title='Predicted vs. Actual Conditional Expectations of Repeat Transactions in 2002–2006 as a Function of Frequency',
    )
    .configure_view(stroke=None)
    .configure_axisY(grid=False)
    .configure_axisX(grid=False)
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [26]:
# Line chart: conditional expectation by year of last transaction, actual vs. model.
(
    alt.Chart(ce_rec)
    .mark_line()
    .encode(
        x=alt.X('t_x', title='Year of last transaction', axis=alt.Axis(labelAngle=0, values=np.arange(1995, 2002, 1), format='.0f')),
        y=alt.Y('CE by Rec', title='No. of repeat transactions (2002–2006)'),
        strokeDash='Actual Vs Model',
    )
    .properties(
        width=650,
        height=250,
        title='Predicted vs. Actual Conditional Expectations of Repeat Transactions in 2002–2006 as a Function of Recency',
    )
    .configure_view(stroke=None)
    .configure_axisY(grid=False)
    .configure_axisX(grid=False)
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [33]:
# Lookup from recency index (position in `years`) to the calendar year as an int.
tx_year_map = dict(enumerate(int(year) for year in years))

# Recency x frequency table of the per-donor conditional expectation.
(
    ce_df
    .with_columns(pl.col("t_x").replace(tx_year_map).alias("Year"))
    .sort('Year')
    .pivot(on='Year', index='P1X', values='Conditional Expectation')
    .sort('P1X')
    .style
    .tab_header(
        title="Expected Number of Repeat Transactions in 2002–2006",
        subtitle='as a Function of Recency and Frequency',
    )
    .tab_stub(rowname_col="P1X")
    .tab_stubhead(label='P1X')
    .fmt_number(decimals=2)
    .tab_spanner(label="Year of last transaction", columns=years[:7])
    .data_color(domain=[0, 4], palette=["white", "rebeccapurple"], na_color="white")
    .sub_missing(columns=pl.col('*'), missing_text="")
)

Expected Number of Repeat Transactions in 2002–2006,Expected Number of Repeat Transactions in 2002–2006,Expected Number of Repeat Transactions in 2002–2006,Expected Number of Repeat Transactions in 2002–2006,Expected Number of Repeat Transactions in 2002–2006,Expected Number of Repeat Transactions in 2002–2006,Expected Number of Repeat Transactions in 2002–2006,Expected Number of Repeat Transactions in 2002–2006
as a Function of Recency and Frequency,as a Function of Recency and Frequency,as a Function of Recency and Frequency,as a Function of Recency and Frequency,as a Function of Recency and Frequency,as a Function of Recency and Frequency,as a Function of Recency and Frequency,as a Function of Recency and Frequency
P1X,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction
P1X,1995,1996,1997,1998,1999,2000,2001
0,0.07,,,,,,
1,,0.09,0.31,0.59,0.84,1.02,1.15
2,,,0.12,0.54,1.06,1.44,1.67
3,,,,0.22,1.03,1.8,2.19
4,,,,,0.58,2.03,2.71
5,,,,,,1.81,3.23
6,,,,,,,3.75


### P(Alive) as a Function of Recency and Frequency

In [14]:
# Numerator of P(alive): two beta-function ratios, written out via log-gamma.
# Dividing by the pattern likelihood L (from the likelihood cell) gives the probability.
A1 = (
    np.exp(gammaln(alpha+p1x)+gammaln(beta+n-p1x)-gammaln(alpha+beta+n)) / B_alpha_beta
    * np.exp(gammaln(gamma)+gammaln(delta+n+1)-gammaln(gamma+delta+n+1)) / B_gamma_delta
)
P_alive = A1 * L**-1

# Lookup from recency index (position in `years`) to the calendar year as an int.
tx_year_map = dict(enumerate(int(year) for year in years))

# Recency x frequency table of P(alive); the hstack is positional, so P_alive
# must be in the same row order as rfm_summary_calib.
(
    rfm_summary_calib
    .with_columns(pl.col("t_x").replace(tx_year_map).alias("Year"))
    .collect()
    .hstack(pl.DataFrame({'P(Alive)': P_alive}))
    .sort('Year')
    .pivot(on='Year', index='P1X', values='P(Alive)')
    .sort('P1X')
    .style
    .tab_header(title="P(Alive in 2002) as a Function of Recency and Frequency")
    .tab_stub(rowname_col="P1X")
    .tab_stubhead(label='P1X')
    .fmt_number(decimals=2)
    .tab_spanner(label="Year of last transaction", columns=years[:7])
    .data_color(domain=[0, 1], palette=["white", "rebeccapurple"], na_color="white")
    .sub_missing(columns=pl.col('*'), missing_text="")
)

P(Alive in 2002) as a Function of Recency and Frequency,P(Alive in 2002) as a Function of Recency and Frequency,P(Alive in 2002) as a Function of Recency and Frequency,P(Alive in 2002) as a Function of Recency and Frequency,P(Alive in 2002) as a Function of Recency and Frequency,P(Alive in 2002) as a Function of Recency and Frequency,P(Alive in 2002) as a Function of Recency and Frequency,P(Alive in 2002) as a Function of Recency and Frequency
P1X,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction
P1X,1995,1996,1997,1998,1999,2000,2001
0,0.11,,,,,,
1,,0.07,0.25,0.48,0.68,0.83,0.93
2,,,0.07,0.3,0.59,0.8,0.93
3,,,,0.1,0.44,0.77,0.93
4,,,,,0.2,0.7,0.93
5,,,,,,0.52,0.93
6,,,,,,,0.93


### Posterior Mean of P as a Function of Recency and Frequency

In [15]:
# Posterior mean via the likelihood evaluated at shifted parameters:
# alpha is shifted by l and gamma by m (l=1, m=0 here).
l = 1
m = 0
alphal = alpha + l
gammam = gamma + m

B_alphal_beta = beta_fn(alphal, beta)
B_gammam_delta = beta_fn(gammam, delta)

# Same likelihood expression as in the likelihood cell, with the shifted parameters.
A1 = beta_fn(alphal+p1x, beta+n-p1x)/B_alphal_beta * beta_fn(gammam, delta+n)/B_gammam_delta
# The range follows n instead of a literal 6 so the cell stays correct if n changes.
i = np.arange(n).reshape(-1,1)
A2 = beta_fn(alphal+p1x, beta+t_x-p1x+i)/B_alphal_beta * beta_fn(gammam+1, delta+t_x+i)/B_gammam_delta
# Only i = 0 .. n - t_x - 1 contribute for a given pattern.
A2 = np.where(i <= (n - t_x - 1), A2, 0)
L_lm = A1 + np.sum(A2, axis=0)

# Ratio of normalising constants times the ratio of shifted to original likelihood.
E_P_Theta = (B_alphal_beta/B_alpha_beta) * (B_gammam_delta/B_gamma_delta) * (L_lm/L)

# tx_year_map is defined in an earlier table cell. The hstack is positional,
# so E_P_Theta must be in the same row order as rfm_summary_calib.
(
    rfm_summary_calib
    .with_columns(pl.col("t_x").replace(tx_year_map).alias("Year"))
    .collect()
    .hstack(pl.DataFrame({'E_P_Theta': E_P_Theta}))
    .sort('Year')
    .pivot(on='Year', index='P1X', values='E_P_Theta')
    .sort('P1X')
    .style.tab_header(title="Posterior Mean of P as a Function of Recency and Frequency")
    .tab_stub(rowname_col="P1X")
    .tab_stubhead(label='P1X')
    .fmt_number(decimals=2)   
    .tab_spanner(
        label="Year of last transaction",
        columns=years[:7]
    ).data_color(
        domain=[0.2, 1],
        palette=["white", "rebeccapurple"],
        na_color="white",
    ).sub_missing(
        columns=pl.col('*'),
        missing_text=""
    )    
)

Posterior Mean of P as a Function of Recency and Frequency,Posterior Mean of P as a Function of Recency and Frequency,Posterior Mean of P as a Function of Recency and Frequency,Posterior Mean of P as a Function of Recency and Frequency,Posterior Mean of P as a Function of Recency and Frequency,Posterior Mean of P as a Function of Recency and Frequency,Posterior Mean of P as a Function of Recency and Frequency,Posterior Mean of P as a Function of Recency and Frequency
P1X,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction,Year of last transaction
P1X,1995,1996,1997,1998,1999,2000,2001
0,0.49,,,,,,
1,,0.66,0.44,0.34,0.3,0.28,0.28
2,,,0.75,0.54,0.44,0.41,0.4
3,,,,0.8,0.61,0.54,0.53
4,,,,,0.82,0.68,0.65
5,,,,,,0.83,0.78
6,,,,,,,0.91


Prior and Selected Posterior Distributions of (a) $P$ and (b) $\Theta$

Probability of Being Active in 2002–2006 as a Function of Recency and Frequency

### Discounted Expected Residual Transactions (DERT)

In [16]:
d = 0.1  # discount rate

# DERT per recency/frequency pattern: a beta-function factor discounted by (1 + d),
# times a Gauss hypergeometric term divided by the pattern likelihood L.
A1 = (
    beta_fn(alpha+p1x+1, beta+n-p1x) / B_alpha_beta
    * beta_fn(gamma, delta+n+1) / (B_gamma_delta * (1 + d))
)
A2 = hyp2f1(1, delta+n+1, gamma+delta+n+1, 1/(1+d)) / L
A1 * A2

array([5.90979814, 5.08939012, 4.26898211, 3.44857409, 2.62816608,
       1.80775807, 2.85511859, 3.19701231, 2.8421641 , 2.27259363,
       1.60901298, 0.91843569, 1.62930189, 1.66557547, 1.3219255 ,
       0.35212864, 0.84429959, 0.93523279, 0.18760064, 0.49488438,
       0.13496306, 0.11475603])