In [46]:
import polars as pl
import numpy as np
from scipy.optimize import minimize
from scipy.special import gammaln, comb
from utils import donation_data, rfm_data, p1x_data

import altair as alt
from IPython.display import display_markdown
from great_tables import style, loc

In [47]:
rfm_summary = p1x_data()
rfm_summary.collect()

P1X,t_x,np1x,Count
i8,i32,i32,u32
6,6,6,1203
5,6,6,728
4,6,6,512
3,6,6,357
2,6,6,234
…,…,…,…
1,3,6,129
2,2,6,613
1,2,6,277
1,1,6,1091


In [48]:
# Possible recency/frequency patterns 
n = 6
n * (n + 1)/2 + 1

22.0

### Parameter Estimation

In [49]:
def bgbb_est(rfm_data, guess={'alpha': 1, 'beta': 0.5, 'gamma': 0.5, 'delta': 2.5}):
    p1x, t_x, n, num_donors = [*rfm_data.T]

    def log_likelihood(param):
        alpha, beta, gamma, delta = param
        B_alpha_beta = np.exp(gammaln(alpha)+gammaln(beta)-gammaln(alpha+beta))
        B_gamma_delta = np.exp(gammaln(gamma)+gammaln(delta)-gammaln(gamma+delta))
        
        L = np.zeros((n.shape[0], n[0]+1))
        L[:,0] = np.exp(gammaln(alpha+p1x)+gammaln(beta+n-p1x)-gammaln(alpha+beta+n))/B_alpha_beta* \
                        np.exp(gammaln(gamma)+gammaln(delta+n)-gammaln(gamma+delta+n))/B_gamma_delta
        
        i = np.arange(6)
        L[:,1:] = np.exp(gammaln(alpha+p1x.reshape(-1,1))+gammaln(beta+t_x.reshape(-1,1)-p1x.reshape(-1,1)+i)-gammaln(alpha+beta+t_x.reshape(-1,1)+i))/B_alpha_beta* \
                        np.exp(gammaln(gamma+1)+gammaln(delta+t_x.reshape(-1,1)+i)-gammaln(gamma+delta+t_x.reshape(-1,1)+i+1))/B_gamma_delta        
        
        n_t_x_1 = n - t_x - 1
        L[:,1:] = np.where(i <= n_t_x_1.reshape(-1,1), L[:,1:], 0)

        return -np.sum(num_donors * np.log(np.sum(L, axis=1)))
    
    bnds = [[0, np.inf] for _ in range(4)]
    return minimize(log_likelihood, x0=list(guess.values()), bounds=bnds)

# Sample parameters
# alpha = 1.20352083040498
# beta = 0.749714243061896
# gamma = 0.656712169147878
# delta = 2.78340801635898

rfm_array = rfm_summary.collect().to_numpy()
res = bgbb_est(rfm_array)
alpha, beta, gamma, delta = res.x
ll = res.fun

display_markdown(f'''$\\alpha$ = {alpha:0.4f}

$\\beta$ = {beta:0.4f}

$\\gamma$ = {gamma:0.4f}

$\\delta$ = {delta:0.4f}

Log-Likelihood = {-ll:0.4f}''', raw=True)

$\alpha$ = 1.2035

$\beta$ = 0.7497

$\gamma$ = 0.6568

$\delta$ = 2.7838

Log-Likelihood = -33225.5813

### In-Sample Model Fit Plot

In [50]:
B_alpha_beta = np.exp(gammaln(alpha)+gammaln(beta)-gammaln(alpha+beta))
B_gamma_delta = np.exp(gammaln(gamma)+gammaln(delta)-gammaln(gamma+delta))

n = 6 
x = np.arange(n+1)
P_X_n = np.zeros((n+1, n+1))

P_X_n[:,0] = comb(n,x)*np.exp(gammaln(alpha+x)+gammaln(beta+n-x)-gammaln(alpha+beta+n))/B_alpha_beta*np.exp(gammaln(gamma)+gammaln(delta+n)-gammaln(gamma+delta+n))/B_gamma_delta
i = np.arange(n)
P_X_n[:,1:] = comb(i,x.reshape(-1,1))*np.exp(gammaln(alpha+x.reshape(-1,1))+gammaln(beta+i-x.reshape(-1,1))-gammaln(alpha+beta+i))/B_alpha_beta*\
              np.exp(gammaln(gamma+1)+gammaln(delta+i)-gammaln(gamma+delta+i+1))/B_gamma_delta
P_X_n = np.sum(P_X_n, axis=1)

model_repeat = pl.DataFrame({'Model': P_X_n * np.sum(rfm_array[:,3])})

actual_model_repeat = (
    rfm_summary
    .group_by('P1X')
    .agg((pl.col('Count').sum()).alias('Actual'))
    .sort('P1X')
    .collect()
    .hstack(model_repeat)
    .unpivot(on=['Actual', 'Model'], index='P1X', value_name='No of people', variable_name='Actual Vs Estimated')
)

(
    alt.Chart(actual_model_repeat).mark_bar()
    .encode(x=alt.X('P1X:O', title='No. of repeat transactions', axis=alt.Axis(labelAngle=0)), 
         y=alt.Y('No of people:Q', title='No. of people'), 
         color='Actual Vs Estimated:N', 
         xOffset='Actual Vs Estimated')
    .properties(
        width=650,
        height=250,
        title='Predicted vs. Actual Frequency of Repeat Transactions'
    ).configure_view(stroke=None).configure_axisY(grid=False).configure_axisX(grid=False)   
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


### Tracking Plots

In [51]:
act_yearly_repeat = donation_data().select(pl.col('*').exclude('ID', '1995')).sum().collect().to_numpy()
act_cum_repeat = act_yearly_repeat.cumsum()


years = donation_data().select(pl.col('*').exclude('ID')).collect().columns
A1 = alpha / (alpha + beta)
A2 = 1/(gamma-1)
n = np.arange(1, len(years))
A3 = np.exp(gammaln(delta+n+1)+gammaln(gamma+delta)-gammaln(delta)-gammaln(gamma+delta+n))
E_X_n = A1 * (delta*A2-A2*A3)

est_cum_repeat = np.sum(rfm_array[:,3]) * E_X_n
est_yearly_repeat = np.diff(est_cum_repeat, prepend=0)

yearly_repeat = pl.DataFrame({'Year': years[1:], 'Actual': act_yearly_repeat.flatten(), 'Model': est_yearly_repeat.flatten()})
yearly_repeat = yearly_repeat.unpivot(on=['Actual', 'Model'], index='Year', variable_name='Actual Vs Model', value_name='Repeat Trans')
cum_repeat = pl.DataFrame({'Year': years[1:], 'Actual': act_cum_repeat.flatten(), 'Model': est_cum_repeat.flatten()})
cum_repeat = cum_repeat.unpivot(on=['Actual', 'Model'], index='Year', variable_name='Actual Vs Model', value_name='Repeat Trans')

(
    alt.Chart(yearly_repeat).mark_line().encode(
        x=alt.X('Year', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('Repeat Trans', title='No. of repeat transactions'),
        strokeDash='Actual Vs Model'
    ).properties(
            width=650,
            height=250,
            title='Predicted vs. Actual Annual Repeat Transactions'
    ).configure_view(stroke=None).configure_axisY(grid=False).configure_axisX(grid=False)  
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [52]:
(
    alt.Chart(yearly_repeat).mark_bar().encode(
        x=alt.X('Year', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('Repeat Trans', title='No. of repeat transactions'),
        color='Actual Vs Model',
        xOffset='Actual Vs Model'
    ).properties(
            width=650,
            height=250,
            title='Predicted vs. Actual Annual Repeat Transactions'
    ).configure_view(stroke=None).configure_axisY(grid=False).configure_axisX(grid=False)  
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [53]:
(
    alt.Chart(cum_repeat).mark_line().encode(
        x=alt.X('Year', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('Repeat Trans', title='Cumulative no. of repeat transactions'),
        strokeDash='Actual Vs Model'
    ).properties(
            width=650,
            height=250,
            title='Predicted vs. Actual Cumulative Repeat Transactions'
    ).configure_view(stroke=None).configure_axisY(grid=False).configure_axisX(grid=False)  
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [54]:
(
    alt.Chart(cum_repeat).mark_bar().encode(
        x=alt.X('Year', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('Repeat Trans', title='Cumulative no. of repeat transactions'),
        color='Actual Vs Model',
        xOffset='Actual Vs Model'
    ).properties(
            width=650,
            height=250,
            title='Predicted vs. Actual Cumulative Repeat Transactions'
    ).configure_view(stroke=None).configure_axisY(grid=False).configure_axisX(grid=False)  
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


### Conditional Expectations

In [55]:
p1x, t_x, n, num_donors = [*rfm_array.T]
n_star = 5

B_alpha_beta = np.exp(gammaln(alpha)+gammaln(beta)-gammaln(alpha+beta))
B_gamma_delta = np.exp(gammaln(gamma)+gammaln(delta)-gammaln(gamma+delta))

A2 = np.exp(gammaln(alpha+p1x+1)+gammaln(beta+n-p1x)-gammaln(alpha+beta+n+1))/B_alpha_beta
A3 = delta/(gamma-1)*np.exp(gammaln(gamma+delta)-gammaln(delta+1))*(np.exp(gammaln(1+delta+n.max())-\
     gammaln(gamma+delta+n.max()))-np.exp(gammaln(1+delta+n.max()+n_star)-gammaln(gamma+delta+n.max()+n_star)))

L = np.zeros((n.shape[0], n[0]+1))
L[:,0] = np.exp(gammaln(alpha+p1x)+gammaln(beta+n-p1x)-gammaln(alpha+beta+n))/B_alpha_beta* \
                np.exp(gammaln(gamma)+gammaln(delta+n)-gammaln(gamma+delta+n))/B_gamma_delta

i = np.arange(6)
L[:,1:] = np.exp(gammaln(alpha+p1x.reshape(-1,1))+gammaln(beta+t_x.reshape(-1,1)-p1x.reshape(-1,1)+i)-gammaln(alpha+beta+t_x.reshape(-1,1)+i))/B_alpha_beta* \
                np.exp(gammaln(gamma+1)+gammaln(delta+t_x.reshape(-1,1)+i)-gammaln(gamma+delta+t_x.reshape(-1,1)+i+1))/B_gamma_delta        

n_t_x_1 = n - t_x - 1
L[:,1:] = np.where(i <= n_t_x_1.reshape(-1,1), L[:,1:], 0)

ce = A2 * A3 / np.sum(L, axis=1)

exp_total = ce * num_donors

ce_df = (
    rfm_data()
    .group_by('P1X', 't_x', 'np1x')
    .agg(pl.col('P2X').sum().alias('Actual Total'))
    .sort(['t_x', 'P1X'], descending=True)
    .collect()
    .hstack([pl.Series('Exp Total', exp_total)])
)

# Actual total 2002-2006 donations by p1x / tx
actual_ce_mat = ce_df.sort('t_x').pivot(index='P1X', on='t_x', values='Actual Total').sort('P1X').fill_null(0).to_numpy()
p1x_frequency = actual_ce_mat[:,0]
actual_ce_mat = actual_ce_mat[:,1:]

# Expected total 2002-2006 donations by p1x / tx
est_ce_mat = ce_df.sort('t_x').pivot(index='P1X', on='t_x', values='Exp Total').sort('P1X').fill_null(0).to_numpy()[:,1:]

# Number of Donors
num_donors_mat = rfm_summary.collect().sort('t_x').pivot(index='P1X', on='t_x', values='Count').sort('P1X').fill_null(0).to_numpy()[:,1:]

# CE by Frequency
actual_ce_freq = np.sum(actual_ce_mat, axis=1) / np.sum(num_donors_mat, axis=1)
est_ce_freq = np.sum(est_ce_mat, axis=1) / np.sum(num_donors_mat, axis=1)
ce_freq = pl.DataFrame({'x': p1x_frequency, 'Actual': actual_ce_freq, 'Model': est_ce_freq})
ce_freq = ce_freq.unpivot(index='x', on=['Actual', 'Model'], variable_name='Actual Vs Model', value_name='CE by Freq')

# CE by Recency
actual_ce_rec = np.sum(actual_ce_mat, axis=0) / np.sum(num_donors_mat, axis=0)
est_ce_rec = np.sum(est_ce_mat, axis=0) / np.sum(num_donors_mat, axis=0)
ce_rec = pl.DataFrame({'t_x': years[:len(p1x_frequency)], 'Actual': actual_ce_rec, 'Model': est_ce_rec}).with_columns(pl.col('t_x').cast(pl.Int16))
ce_rec = ce_rec.unpivot(index='t_x', on=['Actual', 'Model'], variable_name='Actual Vs Model', value_name='CE by Rec')

In [56]:
(
    alt.Chart(ce_freq).mark_line().encode(
        x=alt.X('x', title='No. of repeat transactions (1996-2001)', axis=alt.Axis(labelAngle=0, values=np.arange(7))),
        y=alt.Y('CE by Freq', title='No. of repeat transactions (2002–2006)'),
        strokeDash='Actual Vs Model'
    ).properties(
            width=650,
            height=250,
            title='Predicted vs. Actual Conditional Expectations of Repeat Transactions in 2002–2006 as a Function of Frequency'
    ).configure_view(stroke=None).configure_axisY(grid=False).configure_axisX(grid=False)  
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [57]:
(
    alt.Chart(ce_rec).mark_line().encode(
        x=alt.X('t_x', title='Year of last transaction', axis=alt.Axis(labelAngle=0, values=np.arange(1995, 2002, 1), format='.0f')),
        y=alt.Y('CE by Rec', title='No. of repeat transactions (2002–2006)'),
        strokeDash='Actual Vs Model'
    ).properties(
            width=650,
            height=250,
            title='Predicted vs. Actual Conditional Expectations of Repeat Transactions in 2002–2006 as a Function of Recency'
    ).configure_view(stroke=None).configure_axisY(grid=False).configure_axisX(grid=False)  
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


### Posterior Mean of P as a Function of Recency and Frequency

In [69]:
l = 1
m = 0

alphal = alpha + l
gammam = gamma + m

B_alphal_beta = np.exp(gammaln(alphal)+gammaln(beta)-gammaln(alphal+beta))
B_gammam_delta = np.exp(gammaln(gammam)+gammaln(delta)-gammaln(gammam+delta))

L_lm = np.zeros((n.shape[0], n[0]+1))
L_lm[:,0] = np.exp(gammaln(alphal+p1x)+gammaln(beta+n-p1x)-gammaln(alphal+beta+n))/B_alphal_beta* \
                np.exp(gammaln(gammam)+gammaln(delta+n)-gammaln(gammam+delta+n))/B_gammam_delta

i = np.arange(6)
L_lm[:,1:] = np.exp(gammaln(alphal+p1x.reshape(-1,1))+gammaln(beta+t_x.reshape(-1,1)-p1x.reshape(-1,1)+i)-gammaln(alphal+beta+t_x.reshape(-1,1)+i))/B_alphal_beta* \
                np.exp(gammaln(gammam+1)+gammaln(delta+t_x.reshape(-1,1)+i)-gammaln(gammam+delta+t_x.reshape(-1,1)+i+1))/B_gammam_delta        

n_t_x_1 = n - t_x - 1
L_lm[:,1:] = np.where(i <= n_t_x_1.reshape(-1,1), L_lm[:,1:], 0)

E_P_Theta = B_alphal_beta/B_alpha_beta*B_gammam_delta/B_gamma_delta*np.sum(L_lm, axis=1)/np.sum(L, axis=1)

tx_year_map = {tx: int(year) for tx, year in enumerate(years)}



(
    rfm_summary
    .with_columns(pl.col("t_x").replace(tx_year_map).alias("Year"))
    .collect()
    .hstack(pl.DataFrame({'E_P_Theta': E_P_Theta}))
    .sort('Year')
    .pivot(on='Year', index='P1X', values='E_P_Theta')
    .sort('P1X')
    .style.tab_header(title="Posterior Mean of P as a Function of Recency and Frequency")
    .tab_stub(rowname_col="P1X")
    .tab_stubhead(label='P1X')
    .fmt_number(decimals=2)   
    .tab_spanner(
        label="Year of last transaction",
        columns=years[:7]
    )     
    .tab_style(
        style=style.text(color=pl.when(pl.col('*').is_null())
                                 .then(pl.lit('white'))
                                 .otherwise(pl.lit('black'))),
        locations=loc.body(columns=years[:7])
    )    
    .data_color(
        domain=[0, 1],
        palette=["white", "rebeccapurple"],
        na_color="white",
    )
)

DuplicateError: the name 'literal' is duplicate

It's possible that multiple expressions are returning the same default column name. If this is the case, try renaming the columns with `.alias("new_name")` to avoid duplicate column names.