Beta-geometric/Negative Binomial Distribution Model (BG/NBD) - Forecasting Individual-Level Repeat-Buying

In [2]:
import polars as pl
import numpy as np

In [3]:
# Lazy transaction log: one row per (customer, purchase date).
# NOTE(review): 'DoR' is a running count per customer in row order, and the explicit
# sort below is disabled — confirm the CSV is already ordered by date within customer.
CDNOW_master = (
    pl.scan_csv(
        source='data/CDNOW/CDNOW_master.csv',
        has_header=False,
        separator=',',
        schema={
            'CustID': pl.Int32,
            'Date': pl.String,
            'Quant': pl.Int16,
            'Spend': pl.Float64,
        },
    )
    # Parse the yyyymmdd text into a real date before doing date arithmetic on it.
    .with_columns(pl.col('Date').str.to_date("%Y%m%d"))
    .with_columns(
        # Day number counted from 1996-12-31, so 1997-01-01 is day 1.
        (pl.col('Date') - pl.date(1996, 12, 31)).dt.total_days().cast(pl.UInt16).alias('PurchDay'),
        # Spend multiplied by 100 and rounded, stored as an integer.
        (pl.col('Spend') * 100).round(0).cast(pl.Int64).alias('Spend Scaled'),
    )
    # Multiple transactions by a customer on a single day are aggregated into one.
    .group_by('CustID', 'Date', maintain_order=True)
    .agg(
        pl.all().exclude('PurchDay').sum(),
        pl.col('PurchDay').max(),
    )
    # .sort('CustID', 'Date')
    # DoR: 0 for a customer's first row, 1 for the second, and so on.
    .with_columns(
        (pl.col("CustID").cum_count().over("CustID") - 1).cast(pl.UInt16).alias("DoR")
    )
)

display(CDNOW_master.head().collect())
display(CDNOW_master.describe())

CustID,Date,Quant,Spend,Spend Scaled,PurchDay,DoR
i32,date,i64,f64,i64,u16,u16
1,1997-01-01,1,11.77,1177,1,0
2,1997-01-12,6,89.0,8900,12,0
3,1997-01-02,2,20.76,2076,2,0
3,1997-03-30,2,20.76,2076,89,1
3,1997-04-02,2,19.54,1954,92,2


statistic,CustID,Date,Quant,Spend,Spend Scaled,PurchDay,DoR
str,f64,str,f64,f64,f64,f64,f64
"""count""",67591.0,"""67591""",67591.0,67591.0,67591.0,67591.0,67591.0
"""null_count""",0.0,"""0""",0.0,0.0,0.0,0.0,0.0
"""mean""",11479.968931,"""1997-07-01 11:44:58.175000""",2.483777,36.991843,3699.184255,182.489562,3.976387
"""std""",6813.132779,,2.446038,38.143508,3814.350807,159.229817,9.45283
"""min""",1.0,"""1997-01-01""",1.0,0.0,0.0,1.0,0.0
"""25%""",5516.0,"""1997-02-22""",1.0,14.79,1479.0,53.0,0.0
"""50%""",11425.0,"""1997-04-21""",2.0,26.73,2673.0,111.0,1.0
"""75%""",17269.0,"""1997-11-06""",3.0,44.7,4470.0,310.0,4.0
"""max""",23570.0,"""1998-06-30""",99.0,1554.58,155458.0,546.0,170.0


In [4]:
# Materialise the lazy query once and reuse it: calling .collect() separately for
# each pivot would re-run the whole CSV scan and aggregation three times.
cdnow_transactions = CDNOW_master.collect()

# Purchase-day matrix: one row per customer, one column per 'DoR' value
# (column names are the DoR values as strings, e.g. '0' for the first purchase).
TransMAT = (
    cdnow_transactions
    .pivot(on='DoR', index='CustID', values='PurchDay', aggregate_function='max', maintain_order=True)
    # 0-based position of each customer row in the pivoted frame.
    .with_columns(pl.arange(0, pl.len()).alias('Original Index'))
)

# Quantity matrix with the same layout; missing purchases stay null.
QuantMAT = (
    cdnow_transactions
    .pivot(on='DoR', index='CustID', values='Quant', aggregate_function='sum', maintain_order=True)
)

# Spend matrix with the same layout; missing purchases are filled with 0.
SpendMAT = (
    cdnow_transactions
    .pivot(on='DoR', index='CustID', values='Spend', aggregate_function='sum', maintain_order=True)
    .fill_null(0)
)

In [5]:
# Only purchases made on or before this day number are counted.
# NOTE(review): 273 = 39 * 7, presumably a 39-week window — confirm the intent.
cutoff_day = 273

# Repeat purchases (DoR > 0) contribute their scaled spend with the sign flipped;
# a customer's first purchase (DoR == 0) contributes 0.
# NOTE(review): the negation looks intended to make an ascending sort list the
# biggest repeat spenders first — confirm.
signed_repeat_spend = (
    pl.when(pl.col('DoR') > 0)
    .then(-pl.col('Spend Scaled'))
    .otherwise(0)
)

# One row per customer with the summed signed repeat spend, ordered by CustID.
RptSpend = (
    CDNOW_master
    .filter(pl.col('PurchDay') <= cutoff_day)
    .with_columns(signed_repeat_spend.alias('Repeat Spend'))
    .group_by('CustID')
    .agg(pl.col('Repeat Spend').sum())
    .sort('CustID')
    .collect()
)

RptSpend

CustID,Repeat Spend
i32,i64
1,0
2,0
3,-4030
4,-4469
5,-23188
…,…
23566,0
23567,0
23568,-9873
23569,0


In [6]:
# Attach each customer's first-purchase day (pivot column '0' of TransMAT) and
# prepend a 1-based row number named 'index'.
first_purchase_day = TransMAT.select('CustID', '0')

RptSpendJoined = (
    RptSpend
    .join(first_purchase_day, on='CustID', how='left')
    .with_row_index(name='index', offset=1)
)

RptSpendJoined

index,CustID,Repeat Spend,0
u32,i32,i64,u16
1,1,0,1
2,2,0,12
3,3,-4030,2
4,4,-4469,1
5,5,-23188,1
…,…,…,…
23566,23566,0,84
23567,23567,0,84
23568,23568,-9873,84
23569,23569,0,84


In [29]:
# Running list of customer IDs, filled one first-purchase week at a time.
# Series.append mutates id_df in place, so the return value is not needed.
id_df = pl.Series('CustID', [], dtype=pl.Int32)

for week in range(1, 13):
    # Customers whose first purchase day falls in (7*(week-1), 7*week].
    week_start = 7 * (week - 1)
    week_end = 7 * week
    in_week = (pl.col("0") > week_start) & (pl.col("0") <= week_end)

    # Order by CustID first, then stably by 'Repeat Spend' ascending, so ties in
    # spend keep ascending CustID order.
    spends = (
        RptSpendJoined
        .filter(in_week)
        .sort('CustID')
        .select(
            pl.col('CustID', 'Repeat Spend')
            .sort_by('Repeat Spend', descending=False, maintain_order=True)
        )
    )

    spends.write_csv(f'testtrier/Python/spends_{week}.csv', include_header=False)

    id_df.append(spends.to_series(0))

# Every 10th ID, starting with the 10th.
sampledID = id_df[9::10]

id_df.to_frame().write_csv('testtrier/id_main_python.csv', include_header=False)

with open('sampledID_python.csv', "w") as f:
    f.writelines(f'{cust_id}\n' for cust_id in sampledID)

In [25]:
import pandas as pd

# Accumulates (CustID, Repeat Spend) rows one first-purchase week at a time.
# DataFrame.extend mutates id_df in place, so the return value is not needed.
id_df = pl.DataFrame(schema={'CustID': pl.Int32, 'Repeat Spend': pl.Int64})

for week in range(1, 13):
    # Customers whose first purchase day falls in (7*(week-1), 7*week].
    first_purchase_in_week = (pl.col("0") > 7 * (week - 1)) & (pl.col("0") <= 7 * week)

    # Filter the cohort directly. The previous version converted the filtered rows
    # to pandas and then re-selected the same rows via is_in() on the unique row
    # index, which produced exactly this frame at extra cost.
    # Within the cohort: 'Repeat Spend' ascending, ties broken by CustID descending.
    spends = (
        RptSpendJoined
        .filter(first_purchase_in_week)
        .sort(['Repeat Spend', 'CustID'], descending=[False, True], maintain_order=True)
    )

    id_df.extend(spends.select('CustID', 'Repeat Spend'))

# Every 10th customer ID, starting with the 10th.
sampledID = id_df.select('CustID').to_series()[9::10]

with open('sampledID_python.csv', "w") as f:
    for cust_id in sampledID:
        f.write(f'{cust_id}\n')

In [46]:
# Week number (1-based, 7-day buckets) of each customer's first purchase day,
# which is pivot column '0' of TransMAT.
TrialWeek = TransMAT.select(
    'CustID',
    ((pl.col('0') - 1) // 7 + 1).alias('Trial Week'),
    '0',
)

# All sort keys ascending: trial week, then 'Repeat Spend', then CustID.
id_df = (
    TrialWeek
    .join(RptSpend, on='CustID', how='left')
    .sort(['Trial Week', 'Repeat Spend', 'CustID'], descending=False, maintain_order=True)
)

# Every 10th customer, starting with the 10th row of the sorted frame.
sampledID = id_df.select('CustID')[9::10]

sampledID.write_csv('sampledID_python.csv', include_header=False)

In [30]:
def _read_first_column_ints(path):
    """Read a comma-separated text file and return its first column as ints.

    Parameters
    ----------
    path : str
        Path of the file to read.

    Returns
    -------
    list[int]
        The first comma-separated field of every non-blank line, in file order.

    Raises
    ------
    ValueError
        If the first field of a non-blank line is not an integer.
    """
    values = []
    with open(path, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                # Skip blank lines (e.g. a trailing empty line) instead of
                # failing on int('').
                continue
            values.append(int(stripped.split(',')[0]))
    return values


# Customer IDs sampled by this notebook, by the file 'sampledID.csv', and listed
# in the CDNOW sample file, respectively.
py = _read_first_column_ints('sampledID_python.csv')
matlab = _read_first_column_ints('sampledID.csv')
cdsample = _read_first_column_ints('data/CDNOW/CDNOW_sample.csv')

In [31]:
# IDs present in cdsample but absent from py.
test = set(cdsample) - set(py)
idx = list(test)

# Show those customers with their repeat spend, first-purchase day ('0') and
# trial week, ordered by 'Repeat Spend' ascending.
(
    RptSpend
    .join(TransMAT, on='CustID', how='left')
    .filter(pl.col('CustID').is_in(idx))
    .sort('Repeat Spend', descending=False)
    .select('CustID', 'Repeat Spend', '0')
    .with_columns(((pl.col('0') - 1) // 7 + 1).alias('Trial Week'))
)

CustID,Repeat Spend,0,Trial Week
i32,i64,u16,u16
8500,-19149,33,5
11574,-14785,42,6
21086,-8997,74,11
2790,-7222,12,2
20136,-6979,73,11
…,…,…,…
16285,-997,58,9
9204,-977,34,5
12853,-679,48,7
14206,-578,51,8


In [32]:
# IDs present in py but absent from cdsample.
test = set(py) - set(cdsample)
idx = list(test)

# Show those customers with their repeat spend, first-purchase day ('0') and
# trial week, ordered by 'Repeat Spend' ascending.
(
    RptSpend
    .join(TransMAT, on='CustID', how='left')
    .filter(pl.col('CustID').is_in(idx))
    .sort('Repeat Spend', descending=False)
    .select('CustID', 'Repeat Spend', '0')
    .with_columns(((pl.col('0') - 1) // 7 + 1).alias('Trial Week'))
)

CustID,Repeat Spend,0,Trial Week
i32,i64,u16,u16
8136,-19149,31,5
11363,-14785,41,6
20295,-8997,71,11
2916,-7222,12,2
21693,-6979,77,11
…,…,…,…
16579,-997,59,9
7925,-977,30,5
13342,-679,48,7
6301,-578,60,9


In [None]:
# # Vectorized Method in Numpy - Using Masks

# TransMAT = TransMAT.to_numpy()
# SpendMAT = SpendMAT.to_numpy()
# # Step 1: Calculate x (number of valid transactions)
# x = np.sum(((TransMAT[:, 2:] > 0) & (TransMAT[:, 2:] <= 273)), axis=1, dtype='int16')

# # Step 2: Create a mask to include only valid columns for each customer
# mask = ((TransMAT[:, 2:] > 0) & (TransMAT[:, 2:] <= 273))  # Exclude ID Column & Trial

# RptSpend = np.sum(SpendMAT[:,2:] * mask, axis=1, dtype='float64')

# RptSpend