In [1]:
from src.dataset import Dataset
from src.postprocess_data import MinInteractionsFilter, IdsEncoder, SplitTrainValTest
import polars as pl
import os

os.environ['KMP_DUPLICATE_LIB_OK']='True'

data = Dataset('ml-1m').get_data()

positive_data = data.filter(pl.col("rating") >= 4)

min_iterations_filter = MinInteractionsFilter()
ids_encoder = IdsEncoder()

positive_data = min_iterations_filter.transform(positive_data)
positive_data = ids_encoder.fit_transform(positive_data)

n_users = len(positive_data['user_id'].unique())
n_items = len(positive_data['item_id'].unique())
n_interactions = len(positive_data)
print(f'users: {n_users}')
print(f'items: {n_items}')
print(f'interactions: {n_interactions}')
print(f'density: {(n_interactions / n_users / n_items) * 100:.2f}%')

positive_data

users: 6034
items: 3125
interactions: 574376
density: 3.05%


user_id,item_id,event_ts
i64,i64,i64
0,924,978300760
0,2685,978300275
0,1835,978824291
0,1015,978302039
0,2218,978300719
0,515,978302268
0,714,978301368
0,516,978824268
0,733,978301752
0,1876,978302281


In [2]:
negative_data = data.filter(pl.col("rating") < 4)

min_iterations_filter = MinInteractionsFilter()
ids_encoder = IdsEncoder()

negative_data = min_iterations_filter.transform(negative_data)
negative_data = ids_encoder.fit_transform(negative_data)

n_users = len(negative_data['user_id'].unique())
n_items = len(negative_data['item_id'].unique())
n_interactions = len(negative_data)
print(f'users: {n_users}')
print(f'items: {n_items}')
print(f'interactions: {n_interactions}')
print(f'density: {(n_interactions / n_users / n_items) * 100:.2f}%')

negative_data

users: 5822
items: 3295
interactions: 423571
density: 2.21%


user_id,item_id,event_ts
i64,i64,i64
0,576,978302109
0,736,978301968
0,959,978302268
0,2219,978824268
0,1892,978302205
0,612,978300760
0,1910,978300103
0,627,978824268
1,569,978299351
1,2166,978300051


In [3]:
splitter = SplitTrainValTest()
positive_train_df, positive_val_df, positive_test_df = splitter.transform(positive_data)
negative_train_df, negative_val_df, negative_test_df = splitter.transform(negative_data)
print(f"POSITIVE TRAIN: {len(positive_train_df)}")
print(f"NEGATIVE TRAIN: {len(negative_train_df)}")

POSITIVE TRAIN: 344626
NEGATIVE TRAIN: 254143


In [4]:
from src.postprocess_data import create_sparse_dataset, create_positives_dataset

train_positive_dataset = create_sparse_dataset(positive_train_df)

train_positives = create_positives_dataset(positive_train_df)
val_positives = create_positives_dataset(positive_val_df)
test_positives = create_positives_dataset(positive_test_df)


train_negative_dataset = create_sparse_dataset(negative_train_df)

train_negatives = create_positives_dataset(negative_train_df)
val_negatives = create_positives_dataset(negative_val_df)
test_negatives = create_positives_dataset(negative_test_df)

print(f'{len(train_positives.keys())} positive users in train')
print(f'{len(val_positives.keys())} positive users in val')
print(f'{len(test_positives.keys())} positive users in test')

print(f'{len(train_negatives.keys())} negative users in train')
print(f'{len(val_negatives.keys())} negative users in val')
print(f'{len(test_negatives.keys())} negative users in test')

4006 positive users in train
1579 positive users in val
1784 positive users in test
4113 negative users in train
1433 negative users in val
1636 negative users in test


## POSITIVE TRAINING

In [5]:
import faiss
import random
import numpy as np
from collections import defaultdict
from implicit.bpr import BayesianPersonalizedRanking

from src.metrics import user_recall, user_ap

def linear_lr(epoch, n_epochs, lr_start, lr_end):
    p = epoch / n_epochs
    return lr_start * (1 - p) + lr_end * p

def cosine_annealing_warm_restart(epoch, n_epochs, lr_start, lr_end, t_i):
    t_cur = epoch % t_i
    linear_coef = (n_epochs - epoch) / n_epochs
    return lr_end + linear_coef * (lr_start - lr_end) * (1 + np.cos(np.pi * t_cur / t_i)) / 2

class CallbackClass:
    def __init__(self):
        pass
        
    def callback_fn(self, epoch, *_):        
        pass


SEED = 42
DIM = 128
LR_START = 1e-2
LR_END = 1e-3
EPOCHS = 200
RESTART_EPOCHS = 100
REG_FACTOR = 1e-2

def set_seed():
    np.random.seed(SEED)
    random.seed(SEED)

def get_model():
    return BayesianPersonalizedRanking(
        iterations=EPOCHS, factors=(DIM - 1), random_state=SEED,
        learning_rate=LR_START, regularization=REG_FACTOR
    )

set_seed()
model_pos = get_model()
model_pos.iterations = 0
model_pos.fit(train_positive_dataset, callback=CallbackClass().callback_fn)

user_factors = model_pos.user_factors.copy()
item_factors = model_pos.item_factors.copy()
print("GET INIT WEIGTS of user_factors and item_factors")

0it [00:00, ?it/s]

GET INIT WEIGTS of user_factors and item_factors


In [6]:
recs_by_seed_pos = []
models_by_seed_pos = []

seeds = np.random.choice(10_000, 3)
print(seeds)

for SEED in seeds:
    print('-'*25)
    print(f'SEED = {SEED}')
    np.random.seed(SEED)
    random.seed(SEED)
    
    model_pos = get_model()
    model_pos.user_factors = user_factors.copy()
    model_pos.item_factors = item_factors.copy()
    
    fit_callback = CallbackClass()
    model_pos.fit(train_positive_dataset, callback=fit_callback.callback_fn)

    index = faiss.IndexFlatIP(DIM)
    index.add(model_pos.item_factors)
    recs = index.search(model_pos.user_factors, 50)[1]
    
    recs_by_seed_pos.append(recs)
    models_by_seed_pos.append(model_pos)

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))

[7270  860 5390]
-------------------------
SEED = 7270


  0%|          | 0/200 [00:00<?, ?it/s]

Recall@5 0.0225
MAP@5 0.1619
Recall@10 0.0407
MAP@10 0.1326
-------------------------
SEED = 860


  return np.sum([


  0%|          | 0/200 [00:00<?, ?it/s]

Recall@5 0.024
MAP@5 0.1661
Recall@10 0.0413
MAP@10 0.1319
-------------------------
SEED = 5390


  0%|          | 0/200 [00:00<?, ?it/s]

Recall@5 0.0233
MAP@5 0.1679
Recall@10 0.0408
MAP@10 0.1331


In [7]:
k = 20
for i in range(1, len(recs_by_seed_pos)):
    intersection_count = []
    for prev_recs, cur_recs in zip(
        recs_by_seed_pos[0][list(train_positives.keys())],
        recs_by_seed_pos[i][list(train_positives.keys())]
    ):
        intersection_count.append(len(set(cur_recs[:k]).intersection(prev_recs)))
    print(np.mean(intersection_count) / k)

0.983936595107339
0.9845232151772342


In [8]:
user_tau_pos = [model.user_factors - user_factors for model in models_by_seed_pos]
item_tau_pos = [model.item_factors - item_factors for model in models_by_seed_pos]

In [11]:
best_init_alpha_pos, best_score_pos = None, None
for init_alpha in np.linspace(0.01, 0.5, 20):
    new_item_factors = item_factors.copy()
    new_user_factors = user_factors.copy()

    alpha = init_alpha
    for tau in item_tau_pos:
        new_item_factors += alpha * tau

    alpha = init_alpha
    for tau in user_tau_pos:
        new_user_factors += alpha * tau

    index = faiss.IndexFlatIP(DIM)
    index.add(new_item_factors)
    recs = index.search(new_user_factors, 30)[1]

    print(init_alpha)
    for k in [5, 10]:
        recall_list = []
        for user_id, y_true in val_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            recall_list.append(user_recall(y_pred, y_true, k))
        print(k, np.mean(recall_list))
        
        if k == 10:
            if best_score_pos is None or best_score_pos < np.mean(recall_list):
                best_init_alpha_pos, best_score_pos = init_alpha, np.mean(recall_list)
    print('-'*25)

print(f'best_init_alpha_pos = {best_init_alpha_pos}, best_score = {best_score_pos}')

0.01
5 0.033127536926199505
10 0.0580031027422643
-------------------------
0.035789473684210524
5 0.035412855931014495
10 0.05797277801229127
-------------------------
0.06157894736842105
5 0.03542225843361235
10 0.058110381139235334
-------------------------
0.08736842105263157
5 0.03448265072783405
10 0.0581329401270715
-------------------------
0.1131578947368421
5 0.034687193119630845
10 0.05692110888830973
-------------------------
0.13894736842105262
5 0.03426633253096597
10 0.05636168014403502
-------------------------
0.16473684210526315
5 0.03382710126484319
10 0.05567146500162848
-------------------------
0.19052631578947368
5 0.033677529604128606
10 0.05537356318697776
-------------------------
0.2163157894736842
5 0.0336421608093864
10 0.05515649756564783
-------------------------
0.24210526315789474
5 0.03386480821155919
10 0.05552193005698348
-------------------------
0.26789473684210524
5 0.03343527098614666
10 0.05537540781351499
-------------------------
0.29368421052

## NEGATIVE TRAINING

In [5]:
import faiss
import random
import numpy as np
from collections import defaultdict
from implicit.bpr import BayesianPersonalizedRanking

from src.metrics import user_recall, user_ap

def linear_lr(epoch, n_epochs, lr_start, lr_end):
    p = epoch / n_epochs
    return lr_start * (1 - p) + lr_end * p

def cosine_annealing_warm_restart(epoch, n_epochs, lr_start, lr_end, t_i):
    t_cur = epoch % t_i
    linear_coef = (n_epochs - epoch) / n_epochs
    return lr_end + linear_coef * (lr_start - lr_end) * (1 + np.cos(np.pi * t_cur / t_i)) / 2

class CallbackClass:
    def __init__(self):
        pass
        
    def callback_fn(self, epoch, *_):        
        pass


SEED = 42
DIM = 128
LR_START = 1e-2
LR_END = 1e-3
EPOCHS = 200
RESTART_EPOCHS = 100
REG_FACTOR = 1e-2

def set_seed():
    np.random.seed(SEED)
    random.seed(SEED)

def get_model():
    return BayesianPersonalizedRanking(
        iterations=EPOCHS, factors=(DIM - 1), random_state=SEED,
        learning_rate=LR_START, regularization=REG_FACTOR
    )

set_seed()
model = get_model()
model.iterations = 0
model.fit(train_negative_dataset, callback=CallbackClass().callback_fn)

user_factors = model.user_factors.copy()
item_factors = model.item_factors.copy()
print("GET INIT WEIGTS of user_factors and item_factors")

0it [00:00, ?it/s]

GET INIT WEIGTS of user_factors and item_factors


In [6]:
recs_by_seed = []
models_by_seed = []

seeds = np.random.choice(10_000, 3)
print(seeds)

for SEED in seeds:
    print('-'*25)
    print(f'SEED = {SEED}')
    np.random.seed(SEED)
    random.seed(SEED)
    
    model = get_model()
    model.user_factors = user_factors.copy()
    model.item_factors = item_factors.copy()
    
    fit_callback = CallbackClass()
    model.fit(train_negative_dataset, callback=fit_callback.callback_fn)

    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]
    
    recs_by_seed.append(recs)
    models_by_seed.append(model)

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in val_negatives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_negatives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))

[7270  860 5390]
-------------------------
SEED = 7270


  0%|          | 0/200 [00:00<?, ?it/s]

Recall@5 0.0148
MAP@5 0.073
Recall@10 0.023
MAP@10 0.0514
-------------------------
SEED = 860


  return np.sum([


  0%|          | 0/200 [00:00<?, ?it/s]

Recall@5 0.0139
MAP@5 0.0723
Recall@10 0.0274
MAP@10 0.0545
-------------------------
SEED = 5390


  0%|          | 0/200 [00:00<?, ?it/s]

Recall@5 0.014
MAP@5 0.0724
Recall@10 0.0248
MAP@10 0.0519


In [7]:
k = 20
for i in range(1, len(recs_by_seed)):
    intersection_count = []
    for prev_recs, cur_recs in zip(
        recs_by_seed[0][list(train_negatives.keys())],
        recs_by_seed[i][list(train_negatives.keys())]
    ):
        intersection_count.append(len(set(cur_recs[:k]).intersection(prev_recs)))
    print(np.mean(intersection_count) / k)

0.9660709944079746
0.9663384390955507


In [8]:
user_tau = [model.user_factors - user_factors for model in models_by_seed]
item_tau = [model.item_factors - item_factors for model in models_by_seed]

In [16]:
best_init_alpha, best_score = None, None
for init_alpha in np.linspace(0.01, 0.5, 100):
    new_item_factors = item_factors.copy()
    new_user_factors = user_factors.copy()

    alpha = init_alpha
    for tau in item_tau:
        new_item_factors -= alpha * tau

    alpha = init_alpha
    for tau in user_tau:
        new_user_factors -= alpha * tau

    index = faiss.IndexFlatIP(DIM)
    index.add(new_item_factors)
    recs = index.search(new_user_factors, 30)[1]

    print(init_alpha)
    for k in [5, 10]:
        recall_list = []
        for user_id, y_true in val_positives.items():
            if len(recs) <= user_id:
                continue
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_negatives.get(user_id, set())
            ]
            recall_list.append(user_recall(y_pred, y_true, k))
        print(k, np.mean(recall_list))
        
        if k == 10:
            if best_score is None or best_score < np.mean(recall_list):
                best_init_alpha, best_score = init_alpha, np.mean(recall_list)
    print('-'*25)
    
print(f'best_init_alpha = {best_init_alpha}, best_score = {best_score}')

0.01
5 0.0009246317011848879
10 0.0010848150945288442
-------------------------
0.014949494949494949
5 0.0009143910457829422
10 0.0013083825447552145
-------------------------
0.0198989898989899
5 0.0011960653808744024
10 0.0014046547303720209
-------------------------
0.02484848484848485
5 0.0012647548396972539
10 0.0014838138293870892
-------------------------
0.029797979797979796
5 0.0013136202237568149
10 0.0015250294534006781
-------------------------
0.03474747474747475
5 0.0013695787950354736
10 0.0016291272804522352
-------------------------
0.039696969696969696
5 0.0013465302225084432
10 0.0016958741326850788
-------------------------
0.04464646464646465
5 0.0013371574952391761
10 0.0018033884224270963
-------------------------
0.049595959595959596
5 0.001331873000640008
10 0.001773687678380902
-------------------------
0.05454545454545454
5 0.0013315401923196205
10 0.001864607253914346
-------------------------
0.059494949494949496
5 0.001333315165821625
10 0.0018419091238653

10 0.0014559373954430544
-------------------------
0.4505050505050505
5 0.000722325554867759
10 0.0014559373954430544
-------------------------
0.45545454545454545
5 0.0007276610223881004
10 0.0014559373954430544
-------------------------
0.46040404040404037
5 0.0007276610223881004
10 0.0014559373954430544
-------------------------
0.46535353535353535
5 0.0007276610223881004
10 0.0014559373954430544
-------------------------
0.4703030303030303
5 0.0007276610223881004
10 0.0014542306195427302
-------------------------
0.47525252525252526
5 0.0007276610223881004
10 0.0014542306195427302
-------------------------
0.4802020202020202
5 0.0007276610223881004
10 0.0014542306195427302
-------------------------
0.4851515151515151
5 0.0007276610223881004
10 0.001458968833236168
-------------------------
0.4901010101010101
5 0.0007276610223881004
10 0.001458968833236168
-------------------------
0.495050505050505
5 0.0007297359264237887
10 0.001458968833236168
-------------------------
0.5
5 0.00

In [17]:
from src.metrics import user_recall, user_ap


init_alpha = best_init_alpha
print(f'init_alpha = {init_alpha}')

new_item_factors = item_factors.copy()
new_user_factors = user_factors.copy()

alpha = init_alpha
for tau in item_tau:
    new_item_factors -= alpha * tau

alpha = init_alpha
for tau in user_tau:
    new_user_factors -= alpha * tau


index = faiss.IndexFlatIP(DIM)
index.add(new_item_factors)
recs = index.search(new_user_factors, 50)[1]

for k in [5, 10]:
    map_list = []
    recall_list = []
    for user_id, y_true in test_positives.items():
        if len(recs) <= user_id:
            continue
        y_pred = [
            item_id for item_id in recs[user_id]
            if item_id not in train_negatives.get(user_id, set())
        ]
        map_list.append(user_ap(y_pred, y_true, k))
        recall_list.append(user_recall(y_pred, y_true, k))
    print(f'Recall@{k}', round(np.mean(recall_list), 4))
    print(f'MAP@{k}', round(np.nanmean(map_list), 4))

init_alpha = 0.15848484848484848
Recall@5 0.0008
MAP@5 0.0056
Recall@10 0.0014
MAP@10 0.0035


## Объединение позитива и негатива

In [18]:
from src.metrics import user_recall, user_ap


init_alpha = best_init_alpha_pos
init_beta = best_init_alpha
print(f'init_alpha = {init_alpha}')

new_item_factors = item_factors.copy()
new_user_factors = user_factors.copy()

# positive step
alpha = init_alpha
for tau in item_tau_pos:
    new_item_factors += alpha * tau
# anti-negative step
beta = init_beta
for tau in item_tau:
    new_item_factors -= beta * tau

# positive step
alpha = init_alpha
for tau in user_tau_pos:
    new_user_factors += alpha * tau
# anti-negative step
beta = init_beta
for tau in user_tau:
    new_user_factors -= beta * tau

index = faiss.IndexFlatIP(DIM)
index.add(new_item_factors)
recs = index.search(new_user_factors, 50)[1]

for k in [5, 10]:
    map_list = []
    recall_list = []
    for user_id, y_true in test_positives.items():
        y_pred = [
            item_id for item_id in recs[user_id]
            if item_id not in train_positives.get(user_id, set())
        ]
        map_list.append(user_ap(y_pred, y_true, k))
        recall_list.append(user_recall(y_pred, y_true, k))
    print(f'Recall@{k}', round(np.mean(recall_list), 4))
    print(f'MAP@{k}', round(np.nanmean(map_list), 4))

init_alpha = 0.08736842105263157


ValueError: operands could not be broadcast together with shapes (3295,128) (3125,128) (3295,128) 