In [1]:
from src.dataset import Dataset
from src.postprocess_data import MinInteractionsFilter, IdsEncoder, SplitTrainValTest

data = Dataset('ml-1m').get_data()

min_iterations_filter = MinInteractionsFilter()
ids_encoder = IdsEncoder()

data = min_iterations_filter.transform(data)
data = ids_encoder.fit_transform(data)

n_users = len(data['user_id'].unique())
n_items = len(data['item_id'].unique())
n_interactions = len(data)
print(f'users: {n_users}')
print(f'items: {n_items}')
print(f'interactions: {n_interactions}')
print(f'density: {(n_interactions / n_users / n_items) * 100:.2f}%')

splitter = SplitTrainValTest()
train_df, val_df, test_df = splitter.transform(data)

data

users: 6034
items: 3125
interactions: 574376
density: 3.05%


user_id,item_id,event_ts
i64,i64,i64
0,924,978300760
0,2685,978300275
0,1835,978824291
0,1015,978302039
0,2218,978300719
0,515,978302268
0,714,978301368
0,516,978824268
0,733,978301752
0,1876,978302281


In [2]:
from src.postprocess_data import create_sparse_dataset, create_positives_dataset

train_dataset = create_sparse_dataset(train_df)
train_positives = create_positives_dataset(train_df)
val_positives = create_positives_dataset(val_df)
test_positives = create_positives_dataset(test_df)

print(f'{len(train_positives.keys())} users in train')
print(f'{len(val_positives.keys())} users in val')
print(f'{len(test_positives.keys())} users in test')

4006 users in train
1579 users in val
1784 users in test


In [3]:
import faiss
import random
import numpy as np
from collections import defaultdict
from implicit.bpr import BayesianPersonalizedRanking

from src.metrics import user_recall, user_ap

def linear_lr(epoch, n_epochs, lr_start, lr_end):
    p = epoch / n_epochs
    return lr_start * (1 - p) + lr_end * p

def cosine_annealing_warm_restart(epoch, n_epochs, lr_start, lr_end, t_i):
    # https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.CosineAnnealingWarmRestarts.html#torch.optim.lr_scheduler.CosineAnnealingWarmRestarts
    t_cur = epoch % t_i
    # modification to make lr after restart closer to lr_end 
    linear_coef = (n_epochs - epoch) / n_epochs
    return lr_end + linear_coef * (lr_start - lr_end) * (1 + np.cos(np.pi * t_cur / t_i)) / 2

class CallbackClass:
    def __init__(self):
        pass
        
    def callback_fn(self, epoch, *_):        
#         model.learning_rate = cosine_annealing_warm_restart(
#             epoch, model.iterations, LR_START, LR_END, RESTART_EPOCHS)
#         model.learning_rate = linear_lr(epoch, model.iterations, LR_START, LR_END)
        pass


SEED = 42
DIM = 128
LR_START = 1e-2
LR_END = 1e-3
EPOCHS = 300
RESTART_EPOCHS = 100
REG_FACTOR = 1e-2

def set_seed():
    np.random.seed(SEED)
    random.seed(SEED)

def get_model():
    return BayesianPersonalizedRanking(
        iterations=EPOCHS, factors=(DIM - 1), random_state=SEED,
        learning_rate=LR_START, regularization=REG_FACTOR
    )

set_seed()
model = get_model()
model.iterations = 0
model.fit(train_dataset, callback=CallbackClass().callback_fn)

user_factors = model.user_factors.copy()
item_factors = model.item_factors.copy()

0it [00:00, ?it/s]

In [4]:
recs_by_seed = []
models_by_seed = []

seeds = np.random.choice(10_000, 10)
print(seeds)

for SEED in seeds:
    print('-'*25)
    print(f'SEED = {SEED}')
    np.random.seed(SEED)
    random.seed(SEED)
    
    model = get_model()
    model.user_factors = user_factors.copy()
    model.item_factors = item_factors.copy()
    
    fit_callback = CallbackClass()
    model.fit(train_dataset, callback=fit_callback.callback_fn)

    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]
    
    recs_by_seed.append(recs)
    models_by_seed.append(model)

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))

[7270  860 5390 5191 5734 6265  466 4426 5578 8322]
-------------------------
SEED = 7270


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(SEED)


  0%|          | 0/300 [00:00<?, ?it/s]

Recall@5 0.0235
MAP@5 0.167
Recall@10 0.0411
MAP@10 0.1353
-------------------------
SEED = 860


  return np.sum([


  0%|          | 0/300 [00:00<?, ?it/s]

Recall@5 0.0233
MAP@5 0.1641
Recall@10 0.0418
MAP@10 0.135
-------------------------
SEED = 5390


  0%|          | 0/300 [00:00<?, ?it/s]

Recall@5 0.0227
MAP@5 0.1673
Recall@10 0.04
MAP@10 0.1336
-------------------------
SEED = 5191


  0%|          | 0/300 [00:00<?, ?it/s]

Recall@5 0.0224
MAP@5 0.1622
Recall@10 0.0406
MAP@10 0.1334
-------------------------
SEED = 5734


  0%|          | 0/300 [00:00<?, ?it/s]

Recall@5 0.023
MAP@5 0.1682
Recall@10 0.0411
MAP@10 0.1355
-------------------------
SEED = 6265


  0%|          | 0/300 [00:00<?, ?it/s]

Recall@5 0.0224
MAP@5 0.1674
Recall@10 0.0408
MAP@10 0.1341
-------------------------
SEED = 466


  0%|          | 0/300 [00:00<?, ?it/s]

Recall@5 0.0233
MAP@5 0.1674
Recall@10 0.0405
MAP@10 0.1339
-------------------------
SEED = 4426


  0%|          | 0/300 [00:00<?, ?it/s]

Recall@5 0.0228
MAP@5 0.1676
Recall@10 0.0414
MAP@10 0.1358
-------------------------
SEED = 5578


  0%|          | 0/300 [00:00<?, ?it/s]

Recall@5 0.0233
MAP@5 0.1689
Recall@10 0.0415
MAP@10 0.1368
-------------------------
SEED = 8322


  0%|          | 0/300 [00:00<?, ?it/s]

Recall@5 0.0236
MAP@5 0.17
Recall@10 0.0427
MAP@10 0.1373


In [5]:
k = 20
for i in range(1, len(recs_by_seed)):
    intersection_count = []
    for prev_recs, cur_recs in zip(
        recs_by_seed[0][list(train_positives.keys())],
        recs_by_seed[i][list(train_positives.keys())]
    ):
        intersection_count.append(len(set(cur_recs[:k]).intersection(prev_recs)))
    print(np.mean(intersection_count) / k)

0.9886669995007489
0.9879181228157764
0.9872566150773838
0.987156764852721
0.987855716425362
0.9870943584623065
0.987443834248627
0.9879056415376934
0.9889540688966549


In [6]:
user_tau = [model.user_factors - user_factors for model in models_by_seed]
item_tau = [model.item_factors - item_factors for model in models_by_seed]

In [7]:
best_init_alpha, best_score = None, None
for init_alpha in np.linspace(0.01, 0.5, 100):
    new_item_factors = item_factors.copy()
    new_user_factors = user_factors.copy()

    alpha = init_alpha
    for tau in item_tau:
        new_item_factors += alpha * tau

    alpha = init_alpha
    for tau in user_tau:
        new_user_factors += alpha * tau

    index = faiss.IndexFlatIP(DIM)
    index.add(new_item_factors)
    recs = index.search(new_user_factors, 30)[1]

    print(init_alpha)
    for k in [5, 10]:
        recall_list = []
        for user_id, y_true in val_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            recall_list.append(user_recall(y_pred, y_true, k))
        print(k, np.mean(recall_list))
        
        if k == 10:
            if best_score is None or best_score < np.mean(recall_list):
                best_init_alpha, best_score = init_alpha, np.mean(recall_list)
    print('-'*25)

0.01
5 0.03458366021707631
10 0.058864378615636356
-------------------------
0.014949494949494949
5 0.035196662138405374
10 0.05868251139058492
-------------------------
0.0198989898989899
5 0.035351591386880175
10 0.05847477881814231
-------------------------
0.02484848484848485
5 0.035094779107870414
10 0.058847781882582585
-------------------------
0.029797979797979796
5 0.03420458024653849
10 0.057633498450128655
-------------------------
0.03474747474747475
5 0.03425606114077181
10 0.05712097185551786
-------------------------
0.039696969696969696
5 0.03412511791527526
10 0.05697601388713112
-------------------------
0.04464646464646465
5 0.03389789908978116
10 0.05644853817587903
-------------------------
0.049595959595959596
5 0.03366165027668469
10 0.05607936216133426
-------------------------
0.05454545454545454
5 0.033434380959208025
10 0.0560672194726428
-------------------------
0.059494949494949496
5 0.03328880786452526
10 0.055904872098816775
-------------------------
0.0

10 0.055635781664309804
-------------------------
0.46040404040404037
5 0.032002398221868227
10 0.055614456210283714
-------------------------
0.46535353535353535
5 0.032002398221868227
10 0.055614456210283714
-------------------------
0.4703030303030303
5 0.032002398221868227
10 0.055614456210283714
-------------------------
0.47525252525252526
5 0.032002398221868227
10 0.055614456210283714
-------------------------
0.4802020202020202
5 0.032002398221868227
10 0.055614456210283714
-------------------------
0.4851515151515151
5 0.032002398221868227
10 0.055604241497010716
-------------------------
0.4901010101010101
5 0.032002398221868227
10 0.055577853487722144
-------------------------
0.495050505050505
5 0.032002398221868227
10 0.055577853487722144
-------------------------
0.5
5 0.032002398221868227
10 0.055577853487722144
-------------------------


In [8]:
from src.metrics import user_recall, user_ap


init_alpha = best_init_alpha
print(f'init_alpha = {init_alpha}')

new_item_factors = item_factors.copy()
new_user_factors = user_factors.copy()

alpha = init_alpha
for tau in item_tau:
    new_item_factors += alpha * tau
    
alpha = init_alpha
for tau in user_tau:
    new_user_factors += alpha * tau

index = faiss.IndexFlatIP(DIM)
index.add(new_item_factors)
recs = index.search(new_user_factors, 50)[1]

for k in [5, 10]:
    map_list = []
    recall_list = []
    for user_id, y_true in test_positives.items():
        y_pred = [
            item_id for item_id in recs[user_id]
            if item_id not in train_positives.get(user_id, set())
        ]
        map_list.append(user_ap(y_pred, y_true, k))
        recall_list.append(user_recall(y_pred, y_true, k))
    print(f'Recall@{k}', round(np.mean(recall_list), 4))
    print(f'MAP@{k}', round(np.nanmean(map_list), 4))

init_alpha = 0.01
Recall@5 0.0281
MAP@5 0.1958
Recall@10 0.0502
MAP@10 0.1563


In [9]:
for model in models_by_seed:
    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
    print('-'*25)

Recall@5 0.0235
MAP@5 0.167
Recall@10 0.0411
MAP@10 0.1353
-------------------------
Recall@5 0.0233
MAP@5 0.1641
Recall@10 0.0418
MAP@10 0.135
-------------------------
Recall@5 0.0227
MAP@5 0.1673
Recall@10 0.04
MAP@10 0.1336
-------------------------
Recall@5 0.0224
MAP@5 0.1622
Recall@10 0.0406
MAP@10 0.1334
-------------------------
Recall@5 0.023
MAP@5 0.1682
Recall@10 0.0411
MAP@10 0.1355
-------------------------
Recall@5 0.0224
MAP@5 0.1674
Recall@10 0.0408
MAP@10 0.1341
-------------------------
Recall@5 0.0233
MAP@5 0.1674
Recall@10 0.0405
MAP@10 0.1339
-------------------------
Recall@5 0.0228
MAP@5 0.1676
Recall@10 0.0414
MAP@10 0.1358
-------------------------
Recall@5 0.0233
MAP@5 0.1689
Recall@10 0.0415
MAP@10 0.1368
-------------------------
Recall@5 0.0236
MAP@5 0.17
Recall@10 0.0427
MAP@10 0.1373
-------------------------


In [10]:
new_user_factors = np.mean([model.user_factors for model in models_by_seed], axis=0)
new_item_factors = np.mean([model.item_factors for model in models_by_seed], axis=0)

index = faiss.IndexFlatIP(DIM)
index.add(new_item_factors)
recs = index.search(new_user_factors, 50)[1]

for k in [5, 10]:
    map_list = []
    recall_list = []
    for user_id, y_true in test_positives.items():
        y_pred = [
            item_id for item_id in recs[user_id]
            if item_id not in train_positives.get(user_id, set())
        ]
        map_list.append(user_ap(y_pred, y_true, k))
        recall_list.append(user_recall(y_pred, y_true, k))
    print(f'Recall@{k}', round(np.mean(recall_list), 4))
    print(f'MAP@{k}', round(np.nanmean(map_list), 4))

Recall@5 0.0231
MAP@5 0.169
Recall@10 0.0425
MAP@10 0.1362
