In [1]:
import sys
sys.path.insert(1, '../..')

In [2]:
import polars as pl
from src.dataset import Dataset
from src.postprocess_data import MinInteractionsFilter, IdsEncoder, SplitTrainValTest

data = Dataset('ml-1m').get_data()

min_iterations_filter = MinInteractionsFilter()
ids_encoder = IdsEncoder()

data = min_iterations_filter.transform(data)
data = ids_encoder.fit_transform(data)

n_users = len(data['user_id'].unique())
n_items = len(data['item_id'].unique())
n_interactions = len(data)
print(f'users: {n_users}')
print(f'items: {n_items}')
print(f'interactions: {n_interactions}')
print(f'density: {(n_interactions / n_users / n_items) * 100:.2f}%')

splitter = SplitTrainValTest()
train_df, val_df, test_df = splitter.transform(data)

data

users: 6034
items: 3125
interactions: 574376
density: 3.05%


user_id,item_id,event_ts
i64,i64,i64
0,924,978300760
0,2685,978300275
0,1835,978824291
0,1015,978302039
0,2218,978300719
0,515,978302268
0,714,978301368
0,516,978824268
0,733,978301752
0,1876,978302281


In [2]:
from src.postprocess_data import create_sparse_dataset, create_positives_dataset

train_dataset = create_sparse_dataset(train_df)
train_positives = create_positives_dataset(train_df)
val_positives = create_positives_dataset(val_df)
test_positives = create_positives_dataset(test_df)

print(f'{len(train_positives.keys())} users in train')
print(f'{len(val_positives.keys())} users in val')
print(f'{len(test_positives.keys())} users in test')

4006 users in train
1579 users in val
1784 users in test


In [3]:
import faiss
import random
import numpy as np
from collections import defaultdict
from implicit.bpr import BayesianPersonalizedRanking

from src.metrics import user_recall, user_ap, user_hitrate


SEED = 42
DIM = 128
LR_START = 1e-2
LR_END = 1e-3
EPOCHS = 100
RESTART_EPOCHS = 100
REG_FACTOR = 1e-2

def set_seed():
    np.random.seed(SEED)
    random.seed(SEED)

def get_model():
    return BayesianPersonalizedRanking(
        iterations=EPOCHS, factors=(DIM - 1), random_state=SEED,
        learning_rate=LR_START, regularization=REG_FACTOR
    )

set_seed()
model = get_model()
model.iterations = 0
model.fit(train_dataset)

user_factors = model.user_factors.copy()
item_factors = model.item_factors.copy()

0it [00:00, ?it/s]

In [4]:
recs_by_seed = []
models_by_seed = []

set_seed()
seeds = np.random.choice(10_000, 5)
print(seeds)

for SEED in seeds:
    print('-'*25)
    print(f'SEED = {SEED}')
    np.random.seed(SEED)
    random.seed(SEED)
    
    model = get_model()
    model.user_factors = user_factors.copy()
    model.item_factors = item_factors.copy()
    
    model.fit(train_dataset)
    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]
    
    recs_by_seed.append(recs)
    models_by_seed.append(model)

    for k in [5, 10]:
        map_list = []
        recall_list = []
        hitrate_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
            hitrate_list.append(min(1, user_hitrate(y_pred, y_true, k)))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
#         print(f'Hitrate@{k}', round(np.nanmean(hitrate_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))

[7270  860 5390 5191 5734]
-------------------------
SEED = 7270


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(SEED)


  0%|          | 0/100 [00:00<?, ?it/s]

Recall@5 0.025
MAP@5 0.1636
Recall@10 0.0463
MAP@10 0.1359
-------------------------
SEED = 860


  return np.sum([


  0%|          | 0/100 [00:00<?, ?it/s]

Recall@5 0.0249
MAP@5 0.1676
Recall@10 0.0447
MAP@10 0.1359
-------------------------
SEED = 5390


  0%|          | 0/100 [00:00<?, ?it/s]

Recall@5 0.0244
MAP@5 0.1693
Recall@10 0.0444
MAP@10 0.1378
-------------------------
SEED = 5191


  0%|          | 0/100 [00:00<?, ?it/s]

Recall@5 0.0238
MAP@5 0.1673
Recall@10 0.0433
MAP@10 0.1353
-------------------------
SEED = 5734


  0%|          | 0/100 [00:00<?, ?it/s]

Recall@5 0.0253
MAP@5 0.1699
Recall@10 0.0436
MAP@10 0.1353


In [5]:
k = 20
for i in range(1, len(recs_by_seed)):
    intersection_count = []
    for prev_recs, cur_recs in zip(
        recs_by_seed[0][list(train_positives.keys())],
        recs_by_seed[i][list(train_positives.keys())]
    ):
        intersection_count.append(len(set(cur_recs[:k]).intersection(prev_recs)))
    print(np.mean(intersection_count) / k)

0.9715551672491263
0.9722790813779332
0.9740014977533699
0.9718172740888666


In [6]:
user_tau = [model.user_factors - user_factors for model in models_by_seed]
item_tau = [model.item_factors - item_factors for model in models_by_seed]

In [7]:
best_init_alpha, best_score = None, None
for init_alpha in np.linspace(0.01, 0.5, 100):
    new_item_factors = item_factors.copy()
    new_user_factors = user_factors.copy()

    alpha = init_alpha
    for tau in item_tau:
        new_item_factors += alpha * tau

    alpha = init_alpha
    for tau in user_tau:
        new_user_factors += alpha * tau

    index = faiss.IndexFlatIP(DIM)
    index.add(new_item_factors)
    recs = index.search(new_user_factors, 30)[1]

    print(init_alpha)
    for k in [5, 10]:
        recall_list = []
        for user_id, y_true in val_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            recall_list.append(user_recall(y_pred, y_true, k))
        print(k, np.mean(recall_list))
        
        if k == 10:
            if best_score is None or best_score < np.mean(recall_list):
                best_init_alpha, best_score = init_alpha, np.mean(recall_list)
    print('-'*25)

0.01
5 0.035073708495553926
10 0.05726263576920514
-------------------------
0.014949494949494949
5 0.035135233798859244
10 0.05586761176112283
-------------------------
0.0198989898989899
5 0.035206086893078754
10 0.05793919580876071
-------------------------
0.02484848484848485
5 0.03513937714935027
10 0.05812995484293186
-------------------------
0.029797979797979796
5 0.035380568213470354
10 0.05891599774885971
-------------------------
0.03474747474747475
5 0.03540558273908174
10 0.05946858127644968
-------------------------
0.039696969696969696
5 0.03571344834238526
10 0.059660226998930165
-------------------------
0.04464646464646465
5 0.035943142583201505
10 0.059872980808071345
-------------------------
0.049595959595959596
5 0.03655203081083577
10 0.05981654781590254
-------------------------
0.05454545454545454
5 0.03675613055901673
10 0.05991805401402379
-------------------------
0.059494949494949496
5 0.03705817414401153
10 0.059871449170388495
-------------------------
0.

0.4802020202020202
5 0.033117033937759346
10 0.05528971139669236
-------------------------
0.4851515151515151
5 0.033117033937759346
10 0.05528971139669236
-------------------------
0.4901010101010101
5 0.033117033937759346
10 0.055301890477902474
-------------------------
0.495050505050505
5 0.033117033937759346
10 0.05528896573865909
-------------------------
0.5
5 0.033117033937759346
10 0.05528896573865909
-------------------------


In [8]:
from src.metrics import user_recall, user_ap


init_alpha = best_init_alpha
print(f'init_alpha = {init_alpha}')

new_item_factors = item_factors.copy()
new_user_factors = user_factors.copy()

alpha = init_alpha
for tau in item_tau:
    new_item_factors += alpha * tau
    
alpha = init_alpha
for tau in user_tau:
    new_user_factors += alpha * tau

index = faiss.IndexFlatIP(DIM)
index.add(new_item_factors)
recs = index.search(new_user_factors, 50)[1]

for k in [5, 10]:
    map_list = []
    recall_list = []
    for user_id, y_true in test_positives.items():
        y_pred = [
            item_id for item_id in recs[user_id]
            if item_id not in train_positives.get(user_id, set())
        ]
        map_list.append(user_ap(y_pred, y_true, k))
        recall_list.append(user_recall(y_pred, y_true, k))
    print(f'Recall@{k}', round(np.mean(recall_list), 4))
    print(f'MAP@{k}', round(np.nanmean(map_list), 4))

init_alpha = 0.05454545454545454
Recall@5 0.0307
MAP@5 0.1947
Recall@10 0.0532
MAP@10 0.1563


In [9]:
for model in models_by_seed:
    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
    print('-'*25)

Recall@5 0.025
MAP@5 0.1636
Recall@10 0.0463
MAP@10 0.1359
-------------------------
Recall@5 0.0249
MAP@5 0.1676
Recall@10 0.0447
MAP@10 0.1359
-------------------------
Recall@5 0.0244
MAP@5 0.1693
Recall@10 0.0444
MAP@10 0.1378
-------------------------
Recall@5 0.0238
MAP@5 0.1673
Recall@10 0.0433
MAP@10 0.1353
-------------------------
Recall@5 0.0253
MAP@5 0.1699
Recall@10 0.0436
MAP@10 0.1353
-------------------------


In [10]:
print('Just averaging weights')
new_user_factors = np.mean([model.user_factors for model in models_by_seed], axis=0)
new_item_factors = np.mean([model.item_factors for model in models_by_seed], axis=0)

index = faiss.IndexFlatIP(DIM)
index.add(new_item_factors)
recs = index.search(new_user_factors, 50)[1]

for k in [5, 10]:
    map_list = []
    recall_list = []
    for user_id, y_true in test_positives.items():
        y_pred = [
            item_id for item_id in recs[user_id]
            if item_id not in train_positives.get(user_id, set())
        ]
        map_list.append(user_ap(y_pred, y_true, k))
        recall_list.append(user_recall(y_pred, y_true, k))
    print(f'Recall@{k}', round(np.mean(recall_list), 4))
    print(f'MAP@{k}', round(np.nanmean(map_list), 4))

Just averaging weights
Recall@5 0.0252
MAP@5 0.1716
Recall@10 0.0449
MAP@10 0.1363
