In [1]:
import polars as pl
import numpy as np
import altair as alt
import pandas as pd

In [2]:
# Lazily scan the raw transaction log. The file has no header row, so column
# names and dtypes are supplied explicitly through `schema`.
kiwi_lf = pl.scan_csv(source="data/kiwibubbles/kiwibubbles_tran.csv",
                      has_header=False,
                      separator=",",
                      schema={'ID': pl.UInt16,
                              'Market': pl.UInt8,
                              'Week': pl.Int16,
                              'Day': pl.Int16,
                              'Units': pl.Int16})

# Restrict to Market 2; the Market column is constant afterwards, so drop it.
kiwi_lf_m2 = (kiwi_lf.filter(pl.col('Market') == 2).drop('Market'))
# NOTE(review): hard-coded and not derived from the data in this cell --
# presumably the Market 2 panel size; confirm against the data source.
num_panellists_m2 = 1499


# Depth of repeat (DoR): 0 for a panellist's first transaction, 1 for the
# second, and so on. DoR is assigned by row position within each ID, so the
# rows must be in chronological order first. Sorting on ID alone does not
# guarantee the relative order of rows that share an ID, hence the explicit
# Week/Day keys (maintain_order keeps the file order for exact ties).
kiwi_lf_m2 = (
    kiwi_lf_m2
    .sort('ID', 'Week', 'Day', maintain_order=True)
    .with_columns((pl.col("ID").cum_count().over("ID") - 1).cast(pl.UInt16).alias("DoR"))
)

In [3]:
def shift_week(group_df):
    """Add a strictly increasing ``shWeek`` column to one group of transactions.

    The group's rows are ordered by ``Week``; whenever a week is equal to or
    lower than the (already shifted) week before it, it is pushed to one more
    than that previous value, so every row ends up in its own week.

    Parameters
    ----------
    group_df : pl.DataFrame
        Rows of a single group; must contain a ``Week`` column.

    Returns
    -------
    pl.DataFrame
        The rows of ``group_df`` sorted by ``Week`` (ties keep their incoming
        order) with an extra ``shWeek`` column.
    """
    # Sort the whole frame, not just the Week values: the shifted weeks are
    # attached positionally, so they must line up with rows in the same order.
    ordered_df = group_df.sort("Week", maintain_order=True)
    week_arr = ordered_df["Week"].to_numpy().copy()  # copy -> writable buffer
    for i in range(1, len(week_arr)):
        if week_arr[i] <= week_arr[i - 1]:  # duplicate or overtaken by an earlier shift
            week_arr[i] = week_arr[i - 1] + 1
    return ordered_df.with_columns(pl.Series("shWeek", week_arr))

# map_groups runs a Python function, so the lazy engine cannot infer the
# result columns; they are declared here by name and dtype.
shifted_schema = {
    'Week': pl.Int16,
    'shWeek': pl.Int16,
    'DoR': pl.UInt16,
    'Units': pl.Int16,
    'Day': pl.Int16,
    'ID': pl.UInt16,
}

# Apply the week shift independently to every panellist's transactions.
shifted_lf = kiwi_lf_m2.group_by('ID').map_groups(shift_week, schema=shifted_schema)

In [4]:
# Dense (shWeek, DoR) grid -- weeks 1..52 crossed with depths 0..11 -- so that
# combinations with no transactions still appear (as zeros) further down.
week_range, dor_range = np.meshgrid(
    np.arange(1, 53, dtype='int16'),
    np.arange(0, 12, dtype='uint16'),
)
dummy_lf = pl.DataFrame({'shWeek': week_range.ravel(), 'DoR': dor_range.ravel()})

# Number of transactions observed in each (shWeek, DoR) cell.
sh_agg_trans = shifted_lf.collect().group_by(['shWeek', 'DoR']).agg(Count=pl.len())

# Number of transactions per week, summed over all depths.
shweek_total_trans = sh_agg_trans.group_by(['shWeek']).agg(Total=pl.col('Count').sum())

# Long form: one row per grid cell carrying its count and the weekly total;
# cells without any matching transactions are filled with 0.
sh_agg_trans_longform = (
    dummy_lf
    .join(sh_agg_trans, how='left', on=['shWeek', 'DoR'])
    .join(shweek_total_trans, how='left', on='shWeek')
    .fill_null(0)
)

In [5]:
# Wide form: one row per shifted week, one column per depth of repeat, plus
# the weekly total across depths.
sh_agg_trans_wideform = (
    sh_agg_trans_longform
    .pivot(on='DoR', index='shWeek', values='Count')
    .join(shweek_total_trans, on='shWeek', how='left')
    # Weeks with no transactions have no row in shweek_total_trans, so the
    # left join leaves Total null; show 0 to agree with the count columns.
    .fill_null(0)
)

# Column sums over all weeks (every column except the week index).
col_total = sh_agg_trans_wideform.select(pl.col('*').exclude('shWeek').sum())

display(sh_agg_trans_wideform)
display(col_total)

shWeek,0,1,2,3,4,5,6,7,8,9,10,11,Total
i16,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
1,8,0,0,0,0,0,0,0,0,0,0,0,8
2,6,1,0,0,0,0,0,0,0,0,0,0,7
3,2,1,0,0,0,0,0,0,0,0,0,0,3
4,16,1,0,0,0,0,0,0,0,0,0,0,17
5,8,4,0,0,0,0,0,0,0,0,0,0,12
…,…,…,…,…,…,…,…,…,…,…,…,…,…
48,1,1,1,1,0,0,0,1,0,0,0,0,5
49,4,0,0,0,0,2,0,1,1,0,0,0,8
50,0,2,0,0,0,0,0,1,2,1,1,1,8
51,0,1,0,0,0,0,0,1,0,0,0,0,2


0,1,2,3,4,5,6,7,8,9,10,11,Total
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
139,52,31,23,17,14,9,8,6,4,2,1,306


In [6]:
# Cumulative number of transactions by week within each depth of repeat.
# cum_sum accumulates in row order, so order the rows by week inside each DoR
# explicitly instead of relying on the order left behind by earlier joins.
sh_cum_trans_longform = (
    sh_agg_trans_longform
    .sort('DoR', 'shWeek')
    .with_columns(pl.col('Count').cum_sum().over('DoR').alias('Cum DoR'))
)
sh_cum_trans_wideform = sh_cum_trans_longform.pivot(on='DoR', index='shWeek', values='Cum DoR')

display(sh_cum_trans_wideform)

shWeek,0,1,2,3,4,5,6,7,8,9,10,11
i16,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
1,8,0,0,0,0,0,0,0,0,0,0,0
2,14,1,0,0,0,0,0,0,0,0,0,0
3,16,2,0,0,0,0,0,0,0,0,0,0
4,32,3,0,0,0,0,0,0,0,0,0,0
5,40,7,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…
48,133,48,30,22,17,12,9,5,3,2,1,0
49,137,48,30,22,17,14,9,6,4,2,1,0
50,137,50,30,22,17,14,9,7,6,3,2,1
51,137,51,30,22,17,14,9,8,6,3,2,1


In [63]:
# Materialise the shifted transactions once: collecting re-runs the Python
# shift_week function for every group, so the result is reused below.
shifted_df = shifted_lf.collect()

# Calculate Time Since Last Purchase (in weeks): within each panellist, the
# gap between a transaction's shifted week and the previous one. It is null
# for a panellist's first transaction.
tslp_df = (
    shifted_df
    .select('shWeek', 'DoR', 'ID')
    .sort('ID', 'shWeek')
    .with_columns(
        (pl.col('shWeek') - pl.col('shWeek').shift(1)).over('ID').alias('TSLP')
    )
)

# Aggregate purchases by depth and week: distinct panellists per (DoR, shWeek),
# with DoR then moved up by one so a row lines up with the *next* depth.
j_1_tj_1 = (
    shifted_df
    .group_by('DoR', 'shWeek')
    .agg(pl.col("ID").n_unique().alias("Count"))
    .sort('DoR', 'shWeek')
    .with_columns((pl.col('DoR') + 1).alias('DoR'))
)

# Grid of every (DoR, shWeek, TSLP) combination with TSLP in 1..(53 - shWeek),
# ordered with DoR outermost, then week, then TSLP.
dor_values = np.arange(0, 12, dtype='uint16')
week_values = np.arange(1, 53, dtype='int16')
tslp_per_week = 53 - week_values  # number of TSLP values per week: 52, 51, ..., 1
week_col = np.repeat(week_values, tslp_per_week)
tslp_col = np.concatenate([np.arange(1, n + 1, dtype='int16') for n in tslp_per_week])

# Create the DataFrame
full_dor_week = pl.DataFrame({
    'DoR': np.repeat(dor_values, len(week_col)),
    'shWeek': np.tile(week_col, len(dor_values)),
    'TSLP': np.tile(tslp_col, len(dor_values)),
}).cast({'DoR': pl.UInt16, 'shWeek': pl.Int16, 'TSLP': pl.Int16})

# Attach the counts to the grid. The grid's DoR runs 0..11, so only the
# DoR == 0 rows (which have no previous depth to draw from) need excluding.
joined1 = (
    full_dor_week
    .join(j_1_tj_1, on=['DoR', 'shWeek'], how='left')
    .filter(pl.col('DoR') != 0)
)

display(tslp_df)
display(j_1_tj_1)
display(joined1)

shWeek,DoR,ID,TSLP
i16,u16,u16,i16
49,0,20001,
14,0,20002,
24,0,20003,
49,0,20004,
6,0,20005,
…,…,…,…
26,5,20136,12
37,6,20136,11
13,0,20137,
18,0,20138,


DoR,shWeek,Count
u16,i16,u32
1,1,8
1,2,6
1,3,2
1,4,16
1,5,8
…,…,…
10,50,1
10,52,1
11,45,1
11,50,1


DoR,shWeek,TSLP,Count
u16,i16,i16,u32
1,1,1,8
1,1,2,8
1,1,3,8
1,1,4,8
1,1,5,8
…,…,…,…
11,50,2,1
11,50,3,1
11,51,1,
11,51,2,


In [67]:
# Distinct panellists in every observed (DoR, shWeek, TSLP) combination.
test = (
    tslp_df
    .group_by(['DoR', 'shWeek', 'TSLP'])
    .agg(Count=pl.col('ID').n_unique())
    .sort(['DoR', 'shWeek'])
)

# Put the observed counts on the eligibility grid (they arrive as
# 'Count_right' because the grid already has a 'Count' column), treat missing
# cells as 0, and accumulate within each (DoR, shWeek) group. Only DoR == 1
# is kept for display.
# NOTE(review): in `test`, shWeek is the week of the purchase itself, while the
# 'Count' already on joined1 comes from the previous depth's purchases --
# confirm that joining both on shWeek is the intended alignment.
joined2 = (
    joined1
    .join(test, how='left', on=['DoR', 'shWeek', 'TSLP'])
    .fill_null(0)
    .with_columns(
        pl.col('Count_right')
        .cum_sum()
        .over('DoR', 'shWeek')
        .alias('Cum Count by Week')
    )
    .filter(pl.col('DoR') == 1)
)
joined2.pivot(on='TSLP', index='shWeek', values='Cum Count by Week')

shWeek,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52
i16,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
3,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,,
4,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,,,
5,2,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
48,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
49,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
51,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Full rectangular grid: weeks 1..52 x depths 0..11 x TSLP 1..52.
week_range, dor_range, tslp_range = np.meshgrid(
    np.arange(1, 53, dtype='int16'),
    np.arange(0, 12, dtype='uint16'),
    np.arange(1, 53, dtype='int16'),
)
full_dor_week = pl.DataFrame({
    'DoR': dor_range.ravel(),
    'shWeek': week_range.ravel(),
    'TSLP': tslp_range.ravel(),
})

# Align the grid with the per-(DoR, shWeek) counts, treat missing values as 0,
# then accumulate the counts within every (DoR, shWeek) group.
eligibility = (
    pl.concat([full_dor_week, j_1_tj_1], how='align')
    .fill_null(0)
    .with_columns(
        pl.col("Count")
        .cum_sum()
        .over('DoR', 'shWeek')
        .alias('Cum Count by Week')
    )
)

eligibility #.filter(pl.col('DoR')==1).pivot(on='TSLP', index='shWeek', values='Cum Count by Week').sort('shWeek')

In [None]:

# Distinct panellists per (DoR, shWeek, TSLP). This is aggregated from tslp_df:
# `test` is already aggregated to these keys and no longer has an 'ID' column,
# so counting IDs on it would fail.
eligibility_counts = (
    tslp_df
    .group_by("DoR", 'shWeek', 'TSLP')
    .agg(pl.col("ID").n_unique().alias("Count"))
    .sort("DoR", 'shWeek')
    # NOTE(review): this takes the Count of the next row in the sorted frame,
    # without any grouping -- confirm that is the intended "Eligible" value.
    .with_columns(pl.col("Count").shift(-1).fill_null(0).alias("Eligible"))
)

# Full rectangular grid: weeks 1..52 x depths 0..11 x TSLP 1..52.
week_range, dor_range, tslp_range = np.meshgrid(np.arange(1, 53, dtype='int16'), np.arange(0, 12, dtype='uint16'), np.arange(1, 53, dtype='int16'))
full_dor_week = pl.DataFrame({'DoR': dor_range.reshape(-1), 'shWeek': week_range.reshape(-1), 'TSLP': tslp_range.reshape(-1)})

# Align the counts onto the grid, fill the gaps with 0 and accumulate within
# each (DoR, shWeek) group.
eligibility_grid = (
    pl.concat([full_dor_week, eligibility_counts], how='align')
    .fill_null(0)
    .with_columns(pl.col("Count").cum_sum().over('DoR', 'shWeek').alias('Cum Count by Week'))
)

# NOTE(review): `purchase_counts` is not defined in any cell visible here; this
# cell cannot run on a fresh kernel until it is (re)defined above.
eligibility = (
    pl.concat([eligibility_grid, purchase_counts], how='align')
    .filter(pl.col('DoR') == 1)
    .sort('TSLP')
)

eligibility.pivot(on='TSLP', index='shWeek', values='Cum Count by Week').sort('shWeek')