In [1]:
import polars as pl
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy.optimize import minimize
from scipy.special import gammaln
from utils import CDNOW_sample, rfm_summary, modified_silverman

import altair as alt
from IPython.display import display_markdown

In [2]:
CDNOW = CDNOW_sample()

# The Gamma-Gamma model is fit on repeat buyers only, so keep the rows with P1X > 0.
rfm_data = rfm_summary(CDNOW).filter(pl.col('P1X') > 0)

# Materialise the model inputs as one NumPy matrix (column order: P1X, t_x, T, zbar).
rfm_data_array = (
    rfm_data
    .select('P1X', 't_x', 'T', 'zbar')
    .collect()
    .to_numpy()
)
x, t_x, T = (rfm_data_array[:, col] for col in range(3))  # x is the frequency
# Monetary value, rescaled by 1/100 (presumably cents -> currency units; confirm against rfm_summary).
zbar = rfm_data_array[:, 3] / 100

The Gamma-Gamma model assumes that there is no relationship between the monetary value and the purchase frequency. We can check this assumption by calculating the correlation between the average spend and the frequency of purchases.

In [3]:
corr_data = rfm_data.select('P1X', 'zbar').collect()
(
    corr_data.corr()
    # Label each row of the correlation matrix with the matching column name.
    .with_columns(pl.Series("index", corr_data.columns))
    .style
    .tab_header(title="Correlations Between Frequency & Monetary Value")
    .tab_stub(rowname_col="index")
    .fmt_number(decimals=3)
)

# The correlation comes out at roughly 0.11, which in practice is considered low enough
# to proceed with the model.

Correlations Between Frequency & Monetary Value,Correlations Between Frequency & Monetary Value,Correlations Between Frequency & Monetary Value
Unnamed: 0_level_1,P1X,zbar
P1X,1.0,0.114
zbar,0.114,1.0


In [4]:
# Summary statistics for the average spend per repeat transaction
# (zbar is divided by 100, matching the rescaling used for the model inputs).
summary = rfm_data.select(pl.col('zbar') / 100).describe()
summary

# Note: the distribution of the observed individual means is heavily right-skewed.

statistic,zbar
str,f64
"""count""",946.0
"""null_count""",0.0
"""mean""",35.077848
"""std""",30.283506
"""min""",2.99
"""25%""",15.76
"""50%""",27.54
"""75%""",41.79
"""max""",299.63381


Probability density estimate of the sample

In [5]:
# Evaluation grid of average transaction values: 2.5, 5.0, ..., 300.0
m = np.arange(2.5, 301, 2.5)

# Boundary correction: estimate the density on the log scale
m_log, zbar_log = np.log(m), np.log(zbar)

# Bandwidth is chosen from the log-transformed sample
bw = modified_silverman(zbar_log)
print('Kernel Smoothing Bandwidth:', bw)

Kernel Smoothing Bandwidth: 0.18800626075684287


In [6]:
# Kernel density estimate of the sample, fitted on the log scale
# Method 1 - scikit-learn (expects 2-D inputs, hence the added axis)
kde = KernelDensity(kernel='gaussian', bandwidth=bw)
kde.fit(zbar_log[:, np.newaxis])
log_density = kde.score_samples(m_log[:, np.newaxis])
# Change of variables back to the original scale: f(z) = g(log z) / z
f = np.exp(log_density) / m

# Method 2 - statsmodels (alternative, left disabled)
# import statsmodels.api as sm
# kde = sm.nonparametric.KDEUnivariate(zbar_log)
# kde.fit(kernel='gau', bw=bw)
# f_log = kde.evaluate(m_log)
# f = f_log / m

The chart below shows the distribution of average spend per (repeat) transaction across the 946 individuals who made a repeat transaction in the calibration period. Each customer’s average is computed across a (typically very) small number of transactions.

In [7]:
# Line chart of the kernel density estimate over the evaluation grid
(
    alt.Chart(pl.DataFrame({'Average Transaction Value (z)': m, 'f(z)': f}))
    .mark_line()
    .encode(
        x=alt.X('Average Transaction Value (z)'),
        y=alt.Y('f(z)', scale=alt.Scale(domain=[0, 0.04])),
    )
    .properties(
        title='Observed distribution of average transaction values across customers',
        width=500,
        height=400,
    )
    # Strip the view border and both sets of grid lines
    .configure_view(stroke=None)
    .configure_axisY(grid=False)
    .configure_axisX(grid=False)
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


### Parameter Estimation

In [8]:
def gammagamma(x, zbar, guess=None):
    """Fit the Gamma-Gamma spend model by maximum likelihood.

    Parameters
    ----------
    x : array-like
        Number of transactions per customer. Must be strictly positive,
        because the likelihood takes ``log(x)``.
    zbar : array-like
        Average transaction value per customer, same length as ``x``.
        Must be strictly positive, because the likelihood takes ``log(zbar)``.
    guess : mapping, optional
        Starting values keyed by ``'p'``, ``'q'`` and ``'gamma'``. Values are
        looked up by key, so the order of the keys does not matter.
        Defaults to 0.01 for each parameter.

    Returns
    -------
    scipy.optimize.OptimizeResult
        ``res.x`` holds ``[p, q, gamma]`` and ``res.fun`` holds the minimised
        *negative* log-likelihood.

    Raises
    ------
    KeyError
        If ``guess`` is missing one of ``'p'``, ``'q'`` or ``'gamma'``.
    """
    if guess is None:
        guess = {'p': 0.01, 'q': 0.01, 'gamma': 0.01}
    # Read the starting point by name rather than by dict order, so a guess
    # written as {'gamma': ..., 'q': ..., 'p': ...} is not silently permuted.
    x0 = [guess['p'], guess['q'], guess['gamma']]

    x = np.asarray(x, dtype=float)
    zbar = np.asarray(zbar, dtype=float)

    # These terms do not depend on the parameters; compute them once instead
    # of on every objective evaluation.
    log_x = np.log(x)
    log_zbar = np.log(zbar)
    total_spend = x * zbar

    def log_likelihood(param):
        """Return the negative log-likelihood at ``param = (p, q, gamma)``."""
        p, q, gamma = param
        px = p * x

        ll = (
            gammaln(px + q) - gammaln(px) - gammaln(q)
            + q * np.log(gamma)
            + (px - 1) * log_zbar
            + px * log_x
            - (px + q) * np.log(gamma + total_spend)
        )

        return -np.sum(ll)

    # All three parameters must stay strictly positive.
    bnds = [(1e-6, np.inf)] * 3

    return minimize(log_likelihood, x0=x0, bounds=bnds, method='L-BFGS-B')
        
# Fit the model on the repeat-purchase data and unpack the estimates
res = gammagamma(x, zbar)
p, q, gamma = res.x
ll = res.fun  # objective at the optimum, i.e. the *negative* log-likelihood

# Sample parameters (for comparison with the fitted values)
# p = 6.24983547654959
# q = 3.7441106896737
# gamma = 15.4423198312514

# The sign of ll is flipped below so the reported figure is the log-likelihood itself
display_markdown(f'''$p$ = {p:0.4f}

$q$ = {q:0.4f}

$\\gamma$ = {gamma:0.4f}

Log-Likelihood = {-ll:0.4f}''', raw=True)


$p$ = 6.2492

$q$ = 3.7442

$\gamma$ = 15.4446

Log-Likelihood = -4055.9177

The distribution of the mean transaction value when each customer's mean is computed across $x \to \infty$ transactions, i.e. the distribution of the unobserved mean $\zeta$

In [9]:
# Integer grid 1, 2, ..., 300 for the unobserved mean transaction value
zeta = np.arange(1, 301)
# Density of zeta implied by the fitted parameters:
# (p*gamma)^q * zeta^(-q-1) * exp(-p*gamma/zeta) / Gamma(q)
f_zeta = (
    (p * gamma)**q
    * zeta**(-q-1)
    * np.exp(-p*gamma/zeta)
    / np.exp(gammaln(q))
)

(
    alt.Chart(pl.DataFrame({'Unobserved mean transaction value (ζ)': zeta, 'f(ζ)': f_zeta}))
    .mark_line()
    .encode(
        x=alt.X('Unobserved mean transaction value (ζ)'),
        y=alt.Y('f(ζ)', scale=alt.Scale(domain=[0, 0.04])),
    )
    .properties(
        title='Distribution of the (unobserved) mean transaction value (ζ)',
        width=500,
        height=400,
    )
    # Strip the view border and both sets of grid lines
    .configure_view(stroke=None)
    .configure_axisY(grid=False)
    .configure_axisX(grid=False)
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting
