In [1]:
import sys
sys.path.insert(1, '../..')

In [2]:
from src.dataset import Dataset
from src.postprocess_data import MinInteractionsFilter, IdsEncoder, SplitTrainValTest

data = Dataset('ml-1m').get_data()

min_iterations_filter = MinInteractionsFilter()
ids_encoder = IdsEncoder()

data = min_iterations_filter.transform(data)
data = ids_encoder.fit_transform(data)

n_users = len(data['user_id'].unique())
n_items = len(data['item_id'].unique())
n_interactions = len(data)
print(f'users: {n_users}')
print(f'items: {n_items}')
print(f'interactions: {n_interactions}')
print(f'density: {(n_interactions / n_users / n_items) * 100:.2f}%')

splitter = SplitTrainValTest()
train_df, val_df, test_df = splitter.transform(data)

data

100% [..........................................................................] 5917549 / 5917549users: 6034
items: 3125
interactions: 574376
density: 3.05%


user_id,item_id,event_ts
i64,i64,i64
0,924,978300760
0,2685,978300275
0,1835,978824291
0,1015,978302039
0,2218,978300719
0,515,978302268
0,714,978301368
0,516,978824268
0,733,978301752
0,1876,978302281


In [2]:
from src.postprocess_data import create_sparse_dataset, create_positives_dataset

train_positives = create_positives_dataset(train_df)
val_positives = create_positives_dataset(val_df)
test_positives = create_positives_dataset(test_df)

print(f'{len(train_positives.keys())} users in train')
print(f'{len(val_positives.keys())} users in val')
print(f'{len(test_positives.keys())} users in test')

4006 users in train
1579 users in val
1784 users in test


In [3]:
import faiss
import random
import numpy as np
from collections import defaultdict
from implicit.bpr import BayesianPersonalizedRanking

from src.metrics import user_recall, user_ap

SEED = 42
DIM = 128
LR_START = 1e-2
LR_END = 1e-3
EPOCHS = 100
RESTART_EPOCHS = 100
REG_FACTOR = 1e-2

def set_seed():
    np.random.seed(SEED)
    random.seed(SEED)

def get_model():
    return BayesianPersonalizedRanking(
        iterations=EPOCHS, factors=(DIM - 1), random_state=SEED,
        learning_rate=LR_START, regularization=REG_FACTOR
    )

train_dataset = create_sparse_dataset(train_df)

set_seed()
model = get_model()
model.iterations = 0
model.fit(train_dataset)

user_factors = model.user_factors.copy()
item_factors = model.item_factors.copy()

0it [00:00, ?it/s]

In [4]:
recs_by_seed = []
models_by_seed = []

set_seed()
seeds = np.random.choice(10_000, 5)
print(seeds)

for SEED in seeds:
    print('-'*25)
    print(f'SEED = {SEED}')
    np.random.seed(SEED)
    random.seed(SEED)
    
    train_dataset = create_sparse_dataset(train_df.sample(frac=0.5))
    
    model = get_model()
    model.user_factors = user_factors.copy()
    model.item_factors = item_factors.copy()
    
    model.fit(train_dataset)

    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]
    
    recs_by_seed.append(recs)
    models_by_seed.append(model)

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))

[7270  860 5390 5191 5734]
-------------------------
SEED = 7270


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(SEED)


  0%|          | 0/100 [00:00<?, ?it/s]

Recall@5 0.0245
MAP@5 0.1595
Recall@10 0.043
MAP@10 0.1291
-------------------------
SEED = 860


  return np.sum([


  0%|          | 0/100 [00:00<?, ?it/s]

Recall@5 0.0211
MAP@5 0.1423
Recall@10 0.0387
MAP@10 0.115
-------------------------
SEED = 5390


  0%|          | 0/100 [00:00<?, ?it/s]

Recall@5 0.0178
MAP@5 0.1176
Recall@10 0.0374
MAP@10 0.1022
-------------------------
SEED = 5191


  0%|          | 0/100 [00:00<?, ?it/s]

Recall@5 0.022
MAP@5 0.1488
Recall@10 0.0405
MAP@10 0.1274
-------------------------
SEED = 5734


  0%|          | 0/100 [00:00<?, ?it/s]

Recall@5 0.0212
MAP@5 0.1362
Recall@10 0.0397
MAP@10 0.1167


In [5]:
k = 20
for i in range(1, len(recs_by_seed)):
    intersection_count = []
    for prev_recs, cur_recs in zip(
        recs_by_seed[0][list(train_positives.keys())],
        recs_by_seed[i][list(train_positives.keys())]
    ):
        intersection_count.append(len(set(cur_recs[:k]).intersection(prev_recs)))
    print(np.mean(intersection_count) / k)

0.5650399400898652
0.5585996005991014
0.5611707438841738
0.5663879181228157


In [6]:
user_tau = [model.user_factors - user_factors for model in models_by_seed]
item_tau = [model.item_factors - item_factors for model in models_by_seed]

In [7]:
best_init_alpha, best_score = None, None

for init_alpha in np.linspace(0.01, 0.5, 50):
    new_item_factors = item_factors.copy()
    new_user_factors = user_factors.copy()

    alpha = init_alpha
    for tau in item_tau:
        new_item_factors += alpha * tau

    alpha = init_alpha
    for tau in user_tau:
        new_user_factors += alpha * tau

    index = faiss.IndexFlatIP(DIM)
    index.add(new_item_factors)
    recs = index.search(new_user_factors, 50)[1]

    print(init_alpha)
    for k in [5, 10]:
        recall_list = []
        for user_id, y_true in val_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            recall_list.append(user_recall(y_pred, y_true, k))
        print(k, np.mean(recall_list))
        
        if k == 10:
            if best_score is None or best_score < np.mean(recall_list):
                best_init_alpha, best_score = init_alpha, np.mean(recall_list)
    print('-'*25)

0.01
5 0.0341584050299873
10 0.06261571531161582
-------------------------
0.02
5 0.034763408311993156
10 0.061323408358067114
-------------------------
0.03
5 0.03557019551017427
10 0.062494609784533944
-------------------------
0.04
5 0.0357552935583294
10 0.06215775145753682
-------------------------
0.05
5 0.0365504404161195
10 0.06211684189489575
-------------------------
0.060000000000000005
5 0.03630399194705492
10 0.06171764692997892
-------------------------
0.06999999999999999
5 0.03488331958484504
10 0.062067607642517605
-------------------------
0.08
5 0.033988840749600666
10 0.06163791801531713
-------------------------
0.09
5 0.0335383466780452
10 0.05961661823398387
-------------------------
0.09999999999999999
5 0.033666406024730686
10 0.05889426639195951
-------------------------
0.11
5 0.033753226947904606
10 0.0582222945882442
-------------------------
0.12
5 0.033479050308847826
10 0.05783531211273893
-------------------------
0.13
5 0.033328390694760755
10 0.057474

In [8]:
init_alpha = best_init_alpha
print(f'init_alpha = {best_init_alpha}')
new_item_factors = item_factors.copy()
new_user_factors = user_factors.copy()

alpha = init_alpha
for tau in item_tau:
    new_item_factors += alpha * tau
    
alpha = init_alpha
for tau in user_tau:
    new_user_factors += alpha * tau

index = faiss.IndexFlatIP(DIM)
index.add(new_item_factors)
recs = index.search(new_user_factors, 50)[1]

for k in [5, 10]:
    map_list = []
    recall_list = []
    for user_id, y_true in test_positives.items():
        y_pred = [
            item_id for item_id in recs[user_id]
            if item_id not in train_positives.get(user_id, set())
        ]
        map_list.append(user_ap(y_pred, y_true, k))
        recall_list.append(user_recall(y_pred, y_true, k))
    print(f'Recall@{k}', round(np.mean(recall_list), 4))
    print(f'MAP@{k}', round(np.nanmean(map_list), 4))

init_alpha = 0.01
Recall@5 0.0304
MAP@5 0.184
Recall@10 0.055
MAP@10 0.1512


In [9]:
for model in models_by_seed:
    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
    print('-'*25)

Recall@5 0.0245
MAP@5 0.1595
Recall@10 0.043
MAP@10 0.1291
-------------------------
Recall@5 0.0211
MAP@5 0.1423
Recall@10 0.0387
MAP@10 0.115
-------------------------
Recall@5 0.0178
MAP@5 0.1176
Recall@10 0.0374
MAP@10 0.1022
-------------------------
Recall@5 0.022
MAP@5 0.1488
Recall@10 0.0405
MAP@10 0.1274
-------------------------
Recall@5 0.0212
MAP@5 0.1362
Recall@10 0.0397
MAP@10 0.1167
-------------------------


In [10]:
print('Just averaging weights')
new_user_factors = np.mean([model.user_factors for model in models_by_seed], axis=0)
new_item_factors = np.mean([model.item_factors for model in models_by_seed], axis=0)

index = faiss.IndexFlatIP(DIM)
index.add(new_item_factors)
recs = index.search(new_user_factors, 50)[1]

for k in [5, 10]:
    map_list = []
    recall_list = []
    for user_id, y_true in test_positives.items():
        y_pred = [
            item_id for item_id in recs[user_id]
            if item_id not in train_positives.get(user_id, set())
        ]
        map_list.append(user_ap(y_pred, y_true, k))
        recall_list.append(user_recall(y_pred, y_true, k))
    print(f'Recall@{k}', round(np.mean(recall_list), 4))
    print(f'MAP@{k}', round(np.nanmean(map_list), 4))

Just averaging weights
Recall@5 0.0262
MAP@5 0.1688
Recall@10 0.0446
MAP@10 0.1363
