In [1]:
import polars as pl
import numpy as np
import altair as alt
import pandas as pd
from scipy.optimize import minimize

In [2]:
kiwi_lf = pl.scan_csv(source="data/kiwibubbles/kiwibubbles_tran.csv",
                      has_header=False,
                      separator=",",
                      schema={'ID': pl.UInt16,
                              'Market': pl.UInt8,
                              'Week': pl.Int16,
                              'Day': pl.Int16,
                              'Units': pl.Int16})

kiwi_lf_m2 = (kiwi_lf.filter(pl.col('Market') == 2).drop('Market'))
num_panellists_m2 = 1499


kiwi_lf_m2 = (
    kiwi_lf_m2
    .sort(by='ID')
    .with_columns((pl.col("ID").cum_count().over("ID") - 1).cast(pl.UInt16).alias("DoR"))    
)

In [3]:
def shift_week(group_df):    
    week_arr = group_df["Week"].sort().to_numpy().copy()  # Sort array to handle duplicates systematically
    for i in range(1, len(week_arr)):
        if week_arr[i] <= week_arr[i - 1]: # If duplicate or less, increment by 1
            week_arr[i] = week_arr[i - 1] + 1
    return group_df.with_columns(pl.Series("shWeek", week_arr))

shifted_lf = (
    kiwi_lf_m2
    .group_by('ID')
    .map_groups(shift_week, schema={'Week': pl.Int16, 
                                    'shWeek':pl.Int16,
                                    'DoR':pl.UInt16,
                                    'Units':pl.Int16,
                                    'Day':pl.Int16,
                                    'ID':pl.UInt16})
)

In [4]:
week_range, dor_range = np.meshgrid(np.arange(1, 53, dtype='int16'), np.arange(0, 12, dtype='uint16'))
dummy_lf = pl.DataFrame({'shWeek': week_range.reshape(-1), 'DoR': dor_range.reshape(-1)})

sh_agg_trans = (
    shifted_lf
    .collect()
    .group_by('shWeek', 'DoR')
    .agg(pl.len().alias('Count'))
)

shweek_total_trans = (
    sh_agg_trans
    .group_by('shWeek')
    .agg(pl.col('Count').sum().alias('Total')) 
)

sh_agg_trans_longform = (
    dummy_lf
    .join(sh_agg_trans, on=['shWeek', 'DoR'], how='left')
    .join(shweek_total_trans, on='shWeek', how='left')
    .fill_null(0)
)

In [5]:
sh_agg_trans_wideform = (
    sh_agg_trans_longform
    .pivot(on='DoR', index='shWeek', values='Count')
    .join(shweek_total_trans, on='shWeek', how='left')
)

col_total = sh_agg_trans_wideform.select(pl.col('*').exclude('shWeek').sum())

display(sh_agg_trans_wideform)
display(col_total)

shWeek,0,1,2,3,4,5,6,7,8,9,10,11,Total
i16,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
1,8,0,0,0,0,0,0,0,0,0,0,0,8
2,6,1,0,0,0,0,0,0,0,0,0,0,7
3,2,1,0,0,0,0,0,0,0,0,0,0,3
4,16,1,0,0,0,0,0,0,0,0,0,0,17
5,8,4,0,0,0,0,0,0,0,0,0,0,12
…,…,…,…,…,…,…,…,…,…,…,…,…,…
48,1,1,1,1,0,0,0,1,0,0,0,0,5
49,4,0,0,0,0,2,0,1,1,0,0,0,8
50,0,2,0,0,0,0,0,1,2,1,1,1,8
51,0,1,0,0,0,0,0,1,0,0,0,0,2


0,1,2,3,4,5,6,7,8,9,10,11,Total
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
139,52,31,23,17,14,9,8,6,4,2,1,306


In [6]:
sh_cum_trans_longform = sh_agg_trans_longform.with_columns(pl.col('Count').cum_sum().over('DoR').alias('Cum DoR'))
sh_cum_trans_wideform = sh_cum_trans_longform.pivot(on='DoR', index='shWeek', values='Cum DoR')

display(sh_cum_trans_wideform)

shWeek,0,1,2,3,4,5,6,7,8,9,10,11
i16,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
1,8,0,0,0,0,0,0,0,0,0,0,0
2,14,1,0,0,0,0,0,0,0,0,0,0
3,16,2,0,0,0,0,0,0,0,0,0,0
4,32,3,0,0,0,0,0,0,0,0,0,0
5,40,7,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…
48,133,48,30,22,17,12,9,5,3,2,1,0
49,137,48,30,22,17,14,9,6,4,2,1,0
50,137,50,30,22,17,14,9,7,6,3,2,1
51,137,51,30,22,17,14,9,8,6,3,2,1


In [29]:
modified_cum_trans = (
    sh_cum_trans_wideform
    .with_columns(
        pl.sum_horizontal(pl.col(f'{i}' for i in range(2, 12)).alias('AR(t)')),
        pl.sum_horizontal(pl.exclude('shWeek').alias('S(t)'))
    )
    .rename({'shWeek': 'Week (t)', '0': 'T(t)', '1': 'FR(t)'})
    .select('Week (t)', 'T(t)', 'FR(t)', 'AR(t)', 'S(t)')
)


In [47]:
def fr_model(fr: np.ndarray, eligible: np.ndarray, weeks: np.ndarray, guess=[0.05, 0.05]):
    
    def least_square(x):
        p_1, theta_FR = x[0], x[1]
        t = weeks
        t_0 = weeks.reshape(-1,1)

        p_fr_trial = p_1 * (1 - np.exp(-theta_FR * (t - t_0)))
        p_fr_trial = np.triu(p_fr_trial)     
        
        pred_cum_fr = p_fr_trial.T @ eligible    
        
        return np.sum((pred_cum_fr - fr)**2)
        
    return minimize(least_square, guess, bounds=[(0, np.inf), (0, np.inf)])

fr_array = (
    modified_cum_trans
    .select('T(t)', 'FR(t)', 'Week (t)')
    .to_numpy().transpose()         
)

eligible, fr, weeks = np.diff(fr_array[0], prepend=0), fr_array[1], fr_array[2]

result = fr_model(fr, eligible, weeks)
p_1, theta_FR, sse = result.x[0], result.x[1], result.fun

t = np.arange(1, 52+1, 1)

p_fr_trial = p_1 * (1 - np.exp(-theta_FR * (t - 1)))
p_fr_trial

array([0.        , 0.13478782, 0.21997487, 0.27381382, 0.30784049,
       0.32934564, 0.34293707, 0.35152698, 0.35695587, 0.36038698,
       0.36255547, 0.36392597, 0.36479214, 0.36533957, 0.36568554,
       0.3659042 , 0.3660424 , 0.36612974, 0.36618494, 0.36621983,
       0.36624188, 0.36625581, 0.36626462, 0.36627019, 0.3662737 ,
       0.36627593, 0.36627733, 0.36627822, 0.36627878, 0.36627914,
       0.36627936, 0.3662795 , 0.36627959, 0.36627965, 0.36627968,
       0.36627971, 0.36627972, 0.36627973, 0.36627974, 0.36627974,
       0.36627974, 0.36627974, 0.36627974, 0.36627974, 0.36627974,
       0.36627975, 0.36627975, 0.36627975, 0.36627975, 0.36627975,
       0.36627975, 0.36627975])

In [48]:
len(p_fr_trial)

52

In [7]:
def ar_model(ar, eligibles, weeks, j, guess=[0.5, 0.5, 0.5]):

    def least_square(x):
        p_inf, gamma, theta_AR = x[0], x[1], x[2]
        t, t_0 = weeks, weeks.reshape(-1, 1)
        
        sse = np.zeros(len(j))

        for i, DoR in enumerate(j):
            p_j = p_inf * (1 - np.exp(-gamma * DoR))
            p_ar = p_j * (1 - np.exp(-theta_AR * (t - t_0)))
            p_ar = np.triu(p_ar)
            p_ar[:DoR-1] = 0

            pred_cum_ar = p_ar.T @ eligibles[i]
            sse[i] = np.sum((pred_cum_ar - ar[i])**2)

        return np.sum(sse)

    return minimize(least_square, guess, bounds=[(0,np.inf),(0,np.inf),(0,np.inf)])

In [51]:
mod_cum_ar_trans = (
    sh_cum_trans_wideform
    .rename({'shWeek': 'Week'})
    .select('1', '2', '3', '4', '5', 'Week')
    .to_numpy().transpose()
)

eligibles = np.diff(mod_cum_ar_trans[:-2], prepend=0, axis=1)
ar = mod_cum_ar_trans[1:-1]
weeks = mod_cum_ar_trans[-1]
j = np.arange(2, 6, 1)

result = ar_model(ar, eligibles, weeks, j)
p_inf, gamma, theta_AR, sse = result.x[0], result.x[1], result.x[2], result.fun

j = np.arange(2, 5, 1)
p_j = p_inf * (1 - np.exp(-gamma * j))
p_j

t = np.arange(1, 53, 1)


p_ar = p_j[0] * (1 - np.exp(-theta_AR * (t - 2)))
p_ar

array([-0.1760198 ,  0.        ,  0.13949571,  0.25004609,  0.33765727,
        0.40708915,  0.46211392,  0.50572106,  0.54027972,  0.56766747,
        0.58937226,  0.60657331,  0.62020514,  0.63100837,  0.63956993,
        0.64635497,  0.65173212,  0.65599351,  0.65937066,  0.66204705,
        0.66416809,  0.66584901,  0.66718115,  0.66823686,  0.66907352,
        0.66973657,  0.67026203,  0.67067847,  0.67100849,  0.67127003,
        0.67147731,  0.67164157,  0.67177175,  0.67187492,  0.67195668,
        0.67202147,  0.67207282,  0.67211351,  0.67214577,  0.67217132,
        0.67219158,  0.67220763,  0.67222035,  0.67223043,  0.67223842,
        0.67224476,  0.67224977,  0.67225375,  0.6722569 ,  0.6722594 ,
        0.67226138,  0.67226295])

In [49]:
len(p_ar)

52

In [66]:
test = pl.DataFrame({'Week': np.arange(1,53,1), '1': p_fr_trial, '2': p_j[0] * (1 - np.exp(-theta_AR * (t - 2))), '3': (p_j[1] * (1 - np.exp(-theta_AR * (t - 3)))), '4': p_j[2] * (1 - np.exp(-theta_AR * (t - 4)))})

alt.Chart(test).transform_fold(['1', '2', '3', '4'], as_=['key', 'value']).mark_line().encode(x='Week', y=alt.Y('value:Q', scale=alt.Scale(domain=[0,0.8])), color='key:N')